*===============================================================================
*
*	TEXAS INSTRUMENTS, INC.		
*
*	Complex FIR
*
*	Revision Date:	07/20/98
*	
*	USAGE	This routine is C Callable and can be called as:
*		
*		void cfirf(float *x, float *h, float *y, int numH, int numY);
*
*		x is pointer to array holding the input floating point array
*		h is pointer to array holding the coefficient floating point
*		   array
*		y is pointer to array holding the output floating point array
*		numH is the number of complex coefficients
*		numY is the number of complex output values
*
*		If the routine is not to be used as a C callable function,
*		then you need to initialize values for all of the parameters
*		passed to the function since these are assumed to be in
*		registers as defined by the calling convention of the
*		compiler, (refer to the TMS320C6x Optimizing C Compiler
*		User's Guide).
*
*	C CODE
*		This is the C equivalent for the assembly code.  Note that
*		the assembly code is hand optimized and restrictions may
*		apply.
*
*		void cfirf(float *x, float *h, float *y, int numH, int numY)
*		{
*		    int i, j;
*		    float real;
*		    float imag;
*		    for(j=0; j < numY*2; j+=2)
*		    {
*			imag = 0;
*			real = 0;
*			for(i=0; i < numH*2; i+=2)
*			{
*			    real += x[i+j] * h[i] - x[i+j+1] * h[i+1];
*			    imag += x[i+j] * h[i+1] + x[i+j+1] * h[i];
*			}
*			y[j]   = real;
*			y[j+1] = imag;
*		    }
*		}
*
*	DESCRIPTION
*
*		This routine implements a block complex FIR filter.  There are
*		"numH" complex filter coefficients, "numY" complex output
*		samples, and "numH+numY-1" complex input samples.  The
*		coefficients need to be placed in the "h" array in reverse
*		order {h(numH-1), ... , h(1), h(0)} and the array "x" starts
*		at x(-numH+1) and ends at x(numY-1).  The routine calculates
*		y(0) through y(numY-1) using the following formula:
*
*		y(n) = h(0)*x(n) + h(1)*x(n-1) + ... + h(numH-1)*x(n-numH+1)
*
*		where n = {0, 1, ... , numY-1}.
*
*	TECHNIQUES
*
*		1.  LDDW instructions are used to load a real and imaginary SP 
*		    floating point value simultaneously for the x and h arrays.
*		2.  The inner loop is unrolled once and software pipelined.
*		3.  No extraneous loads are performed since a load counter is
*		    used.
*		4.  The first 5 cycles of the inner loop prolog are
*		    conditionally scheduled in parallel with the outer loop.
*		    This increases the code size by 5 words, but
*		    improves the cycle time.
*	
*	ASSUMPTIONS
*
*		1.  Little Endian is assumed for LDDW instructions.
*		2.  The number of coefficients (numH) must be a multiple of 2
*		    and greater than or equal to 4 (4, 6, 8, ...).
*		3.  Since single assignment of registers is not used,
*		    interrupts should be disabled before this function is 
*		    called.
*		4.  The values in the arrays x, y, h are complex and are stored
*		    as {real0, imag0, ..., realN, imagN}.
*		
*	MEMORY NOTE
*
*		The x and h arrays should be placed on double-word boundaries.
*
*       ARGUMENTS PASSED
*
*		x	 ->  A4 = ptr_x
*		h	 ->  B4 = ptr_h
*		y	 ->  A6 = ptr_y
*		numH	 ->  B6
*		numY	 ->  A8
*
*	CYCLES
*
*		((2*numH) + 14)*numY + 17 + (numY-1)   with C overhead
*		((2*numH) + 14)*numY +	9 + (numY-1)   without C overhead
*
*		The (numY-1) value is due to bank hits with the outer loop
*		STW instructions in parallel with the inner loop prolog.
*
*	NOTATIONS
*
*		e = Inner Loop Epilog
*		o = Outer Loop
*		p = Inner Loop Prolog
*		f = Function Epilog or Prolog
*
*===============================================================================
SP	   .set    B15
	   .text
	   .def    _cfirf
*-------------------------------------------------------------------------------
*	_cfirf -- block complex FIR, one complex output per outer-loop pass.
*
*	REGISTER USE (as set up by the code below)
*
*		A0	= output write pointer (copy of ptr_y, post-incremented
*			  by each STW)
*		A1	= x pointer for the current output; advanced by 8 (one
*			  complex sample) on every outer-loop pass
*		A2	= outer loop counter, starts at numY
*		A3	= saved ptr_h, used to rewind B4 on every outer-loop pass
*		A4	= running x load pointer
*		B4	= running h load pointer
*		B0	= 1 on entry so the [!B0] stores are skipped on the first
*			  pass; afterwards the inner loop branch counter
*		B1	= inner loop load counter (guards the [ B1] LDDWs)
*		B2	= numH - 4, reload value for B0 and B1
*		A5,B5	= products
*		A10,B10 = accumulators (saved on entry, restored on exit)
*
*	NOTE(review): every ADDSP in the kernel accumulates into the same
*	A10/B10 pair, and the outer loop then reads A10/B10 on four consecutive
*	cycles as four different partial sums.  This appears to rely on the
*	multi-cycle ADDSP latency -- confirm against the C67x CPU reference
*	before changing the order or spacing of any instruction here.
*-------------------------------------------------------------------------------
_cfirf:

	   STW	   .D2T1   A10,*SP--[2]    ; f push A10 (SP moves down 2 words)
||	   MV	   .S1	   A4,A1	   ; f A1 = ptr_x
||	   SUB	   .S2	   B6,4,B2	   ; f B2 = numH - 4
||	   MV	   .D1	   A6,A0	   ; f A0 = ptr_y
||	   MV	   .L1	   A8,A2	   ; f A2 = numY

	   STW	   .D2T2   B10,*+SP[1]	   ; f push B10
||	   MV	   .L1X    B4,A3	   ; f A3 = ptr_h
||	   MVK	   .S2	   1,B0 	   ; f B0 = 1: no stores the first time

*** BEGIN Benchmark Timing
B_START:
;** --------------------------------------------------------------------------*
	   ; PIPED LOOP PROLOG (first pass only; later passes use the
	   ; conditional copy that follows the outer-loop branch)
	   LDDW    .D2T1   *B4++(16),A9:A8 ; load h(i)

	   LDDW    .D1T2   *A4++(16),B9:B8 ; load x(i+j)

	   LDDW    .D1T2   *-A4(8),B7:B6   ; load x(i+j+1)

	   LDDW    .D2T1   *-B4(8),A7:A6   ; load h(i+1)

	   LDDW    .D2T1   *B4++(16),A9:A8 ; @ load h(i)
O_LOOP:
	   LDDW    .D1T2   *A4++(16),B9:B8 ; @ load x(i+j)
||	   MV	   .S2	   B2,B1	   ; o load cntr = numH - 4

	   MPYSP   .M1X    B8,A9,A5	   ; x_real * h_imag
||	   MPYSP   .M2X    B9,A8,B5	   ; x_imag * h_real
||	   LDDW    .D1T2   *-A4(8),B7:B6   ; @ load x(i+j+1)

	   MPYSP   .M1X    B9,A9,A5	   ; x_imag * h_imag
||	   MPYSP   .M2X    B8,A8,B5	   ; x_real * h_real
||	   LDDW    .D2T1   *-B4(8),A7:A6   ; @ load h(i+1)
|| [!B0]   STW	   .D1	   B5,*A0++	   ; o store previous y(j).real

	   MPYSP   .M1X    B7,A7,A5	   ; x_imag1 * h_imag1
||	   MPYSP   .M2X    B7,A6,B5	   ; x_imag1 * h_real1
|| [ B1]   LDDW    .D2T1   *B4++(16),A9:A8 ; @@ load h(i)
|| [!B0]   STW	   .D1	   B10,*A0++	   ; o store previous y(j).imag
||	   MV	   .S2	   B2,B0	   ; o inner loop cntr = numH - 4
||	   B	   .S1	   LOOP 	   ; branch to loop

	   MPYSP   .M1X    B6,A7,A5	   ; x_real1 * h_imag1
||	   MPYSP   .M2X    B6,A6,B5	   ; x_real1 * h_real1
|| [ B1]   LDDW    .D1T2   *A4++(16),B9:B8 ; @@ load x(i+j)
||	   ZERO    .S1	   A10		   ; o accumulator = 0
||	   ZERO    .S2	   B10		   ; o accumulator = 0

;** --------------------------------------------------------------------------*
LOOP:        ; PIPED LOOP KERNEL (4 cycles, two complex taps per pass)

	   ADDSP   .L1	   A10,A5,A10	   ; ad_acc += x_real * h_imag
||	   ADDSP   .L2	   B10,B5,B10	   ; bc_acc += x_imag * h_real
||	   MPYSP   .M1X    B8,A9,A5	   ; @ x_real * h_imag
||	   MPYSP   .M2X    B9,A8,B5	   ; @ x_imag * h_real
|| [ B1]   LDDW    .D1T2   *-A4(8),B7:B6   ; @@ load x(i+j+1)

	   ADDSP   .L1	   A10,A5,A10	   ; bd_acc += x_imag * h_imag
||	   ADDSP   .L2	   B10,B5,B10	   ; ac_acc += x_real * h_real
||	   MPYSP   .M1X    B9,A9,A5	   ; @ x_imag * h_imag
||	   MPYSP   .M2X    B8,A8,B5	   ; @ x_real * h_real
|| [ B1]   LDDW    .D2T1   *-B4(8),A7:A6   ; @@ load h(i+1)
|| [ B1]   SUB	   .S2	   B1,2,B1	   ; adjust load counter (stops at 0)

	   ADDSP   .L1	   A10,A5,A10	   ; bd_acc1 += x_imag1 * h_imag1
||	   ADDSP   .L2	   B10,B5,B10	   ; bc_acc1 += x_imag1 * h_real1
||	   MPYSP   .M1X    B7,A7,A5	   ; @ x_imag1 * h_imag1
||	   MPYSP   .M2X    B7,A6,B5	   ; @ x_imag1 * h_real1
|| [ B1]   LDDW    .D2T1   *B4++(16),A9:A8 ; @@@ load h(i)
|| [ B0]   B	   .S1	   LOOP 	   ; branch to loop
|| [ B0]   SUB	   .S2	   B0,2,B0	   ; decrement inner loop counter

	   ADDSP   .L1	   A10,A5,A10	   ; ad_acc1 += x_real1 * h_imag1
||	   ADDSP   .L2	   B10,B5,B10	   ; ac_acc1 += x_real1 * h_real1
||	   MPYSP   .M1X    B6,A7,A5	   ; @ x_real1 * h_imag1
||	   MPYSP   .M2X    B6,A6,B5	   ; @ x_real1 * h_real1
|| [ B1]   LDDW    .D1T2   *A4++(16),B9:B8 ; @@@ load x(i+j)

;** branch to loop    --------------------------------------------------------*
	   ; Outer loop: combine the partial sums and set up the next output.
	   ADD	   .S1	   A1,8,A1	   ; o set up pointer x(j+1)
||	   ADDSP   .L2X    A10,B10,B5	   ; o ad_acc + bc_acc

	   MV	   .S1	   A1,A4	   ; o reset x load pointer
||	   SUBSP   .L2X    B10,A10,B6	   ; o ac_acc - bd_acc

   [ A2]   SUB	   .D1	   A2,01,A2	   ; o decrement outer loop counter
||	   MV	   .S1	   A10,A8	   ; o A8 = A10 (bd_acc1, used below)
||	   MV	   .D2	   B10,B8	   ; o B8 = B10 (bc_acc1, used below)
||	   MV	   .S2X    A3,B4	   ; o reset h load pointer to ptr_h

	   ADDSP   .L2X    B8,A10,B10	   ; o bc_acc1 + ad_acc1
||	   SUBSP   .L1X    B10,A8,A10	   ; o ac_acc1 - bd_acc1
|| [ A2]   B	   .S1	   O_LOOP	   ; o branch to outer loop

;** Parallel prolog starts ---------------------------------------------------*
	   ; These five packets run while the O_LOOP branch is pending; the
	   ; loads are guarded by [ A2] so they are skipped after the last
	   ; output.

   [ A2]   LDDW    .D2T1   *B4++(16),A9:A8 ; p load h(i)

   [ A2]   LDDW    .D1T2   *A4++(16),B9:B8 ; p load x(i+j)

   [ A2]   LDDW    .D1T2   *-A4(8),B7:B6   ; p load x(i+j+1)

   [ A2]   LDDW    .D2T1   *-B4(8),A7:A6   ; p load h(i+1)
||	   ADDSP   .L2X    B6,A10,B5	   ; o y_real = (ac_acc - bd_acc)
					   ;	      + (ac_acc1 - bd_acc1)
   [ A2]   LDDW    .D2T1   *B4++(16),A9:A8 ; p load h(i)
||	   ADDSP   .L2	   B5,B10,B10	   ; o y_imag = (ad_acc + bc_acc)
					   ;	      + (bc_acc1 + ad_acc1)
					   ; NOTE(review): presumably B5 read
					   ; here is still ad_acc + bc_acc, the
					   ; y_real ADDSP issued one cycle
					   ; earlier not having written back
					   ; yet -- confirm ADDSP latency.
           ; BRANCH OCCURS
;** --------------------------------------------------------------------------*
	   ; Function epilog.  NOTE(review): the last y(j) stores below read
	   ; B5/B10 a few cycles after the pops were issued; this ordering
	   ; appears to depend on ADDSP/LDW delay slots -- do not reorder.
	   LDW	   .D2	   *+SP[1],B10	   ; f pop B10

	   LDW	   .D2	   *++SP[2],A10    ; f pop A10 (SP moves back up)

	   STW	   .D1	   B5,*A0++	   ; o store last y(j).real

	   STW	   .D1	   B10,*A0++	   ; o store last y(j).imag
B_END:
*** END Benchmark Timing
	   B	   .S2	   B3		   ; f return
	   NOP		   5		   ; f wait for branch
           ; BRANCH OCCURS
