*===============================================================================
*
*	TEXAS INSTRUMENTS, INC.		
*
*	FIR
*
*	Revision Date:	02/23/98
*	
*	USAGE	This routine is C Callable and can be called as:
*		
*		void fir(float *x, float *h, float *y, int numH, int numY);
*
*		x is pointer to array holding the input floating point array
*		h is pointer to array holding the coefficient floating point
*		   array
*		y is pointer to array holding the output floating point array
*		numH is the number of coefficients
*		numY is the number of output values
*
*		If the routine is not to be used as a C callable function,
*		then you need to initialize values for all of the parameters
*		passed to the function since these are assumed to be in
*		registers as defined by the calling convention of the
*		compiler, (refer to the TMS320C6x Optimizing C Compiler
*		User's Guide).
*
*	C CODE
*		This is the C equivalent for the assembly code.  Note that
*		the assembly code is hand optimized and restrictions may
*		apply.
*
*		void fir(float *x, float *h, float *y, int numH, int numY)
*		{
*		   int i, j;
*		   float sum;
*
*		   for(j=0; j < numY; j++)
*		   {
*		      sum = 0;
*		      for(i=0; i < numH; i++)
*		      {
*			 sum += x[i+j] * h[i];
*		      }
*		      y[j] = sum;
*		   }
*		}
*
*	DESCRIPTION
*
*		This routine implements a block FIR filter.  There are "numH"
*		filter coefficients, "numY" output samples, and "numH+numY-1"
*		input samples.	The coefficients need to be placed in the "h"
*		array in reverse order {h(numH-1), ... , h(1), h(0)} and the
*		array "x" starts at x(-numH+1) and ends at x(numY-1).  The
*		routine calculates y(0) through y(numY-1) using the following
*		formula:
*
*		y(n) = h(0)*x(n) + h(1)*x(n-1) + ... + h(numH-1)*x(n-numH+1)
*
*		where n = {0, 1, ... , numY-1}.
*
*	TECHNIQUES
*
*		1.  LDDW instructions are used to load two SP floating point 
*		    values simultaneously for the x and h arrays.
*		2.  The outer loop is unrolled 4 times.
*		3.  The inner loop is unrolled 2 times and software pipelined.
*		4.  The variables prod1, prod3, prod5, and prod7 share A9.
*		    The variables prod0, prod2, prod4, and prod6 share B6.
*		    The variables sum1, sum3, sum5, and sum7 share A7.
*		    The variables sum0, sum2, sum4, and sum6 share B7.
*		    This multiple assignment is possible since the variables
*		    are always read just once on the first cycle that they
*		    are available.
*		5.  The first 8 cycles of the inner loop prolog are
*		    conditionally scheduled in parallel with the outer loop.
*		    This increases the code size by 14 words, but
*		    improves the cycle time.
*		6.  A load counter is used so that an epilog is not needed.
*		    No extraneous loads are performed.
*		7.  The variables ptr_h and ptr_x are conditionally reset on
*		    the last cycle of the kernel loop when the loop counter
*		    reaches zero.  Since the loop counter is zero for the last
*		    two iterations, the instructions decrement the pointers
*		    by half of the desired value on each iteration.
*	
*	ASSUMPTIONS
*
*		1.  Little Endian is assumed for LDDW instructions.
*		2.  The number of coefficients (numH) must be a multiple of 2
*		    and greater than or equal to 4 (4, 6, 8, ...).
*		3.  The number of outputs (numY) must be a multiple of 4
*		    and greater than or equal to 4 (4, 8, 12, ...).
*		4.  Since single assignment of registers is not used,
*		    interrupts should be disabled before this function is 
*                   called.
*		
*	MEMORY NOTE
*
*		The y, x, and h arrays should be placed on the same double-
*		word boundary (i.e. y, x and h on even double-word boundaries)
*		to prevent internal data memory bank hits.
*
*       ARGUMENTS PASSED
*
*		x	 ->  A4
*		h	 ->  B4
*		y	 ->  A6 = ptr_y
*		numH	 ->  B6
*		numY	 ->  A8
*
*	CYCLES
*
*		((2*numH) + 10)*(numY/4) + 8	    with C overhead
*		((2*numH) + 10)*(numY/4) + 8	    without C overhead
*
*	NOTATIONS
*
*		f = Function Prolog or Epilog
*		o = Outer Loop
*		p = Inner Loop Prolog
*
*===============================================================================
	.global _fir
	.text
_fir:
*-------------------------------------------------------------------------------
* Block FIR: each pass of the outer loop produces four consecutive outputs,
* and each pass of the inner loop consumes two coefficients (h1:h0).
*
* Register roles, as assigned by the code below:
*   A0	  ireset = numH - 4, reload value for both inner counters
*   A1	  lcntr, load counter; predicates the loads issued from the loops
*   A2	  ocntr, outer-loop counter; starts at numY - 4 and steps by -4
*   A3	  x4
*   A5:A4   h1:h0 (A4 holds x on entry)
*   A6	  ptr_y
*   A7	  sum1/sum3/sum5/sum7, then temp1 and temp3
*   A8	  numY on entry, then ptr_h
*   A9	  prod1/prod3/prod5/prod7
*   B1:B0   x1:x0
*   B2	  icntr, inner-loop branch counter
*   B3	  return address (target of the final branch)
*   B5:B4   x3:x2 (B4 holds h on entry)
*   B6	  numH on entry, then prod0/prod2/prod4/prod6; also briefly a copy
*	  of ptr_y used to address the second store
*   B7	  sum0/sum2/sum4/sum6, then temp2 and temp4
*   B8	  ptr_x
*   B9	  xreset = 2*numH - 8
*
* In the existing comments, "f", "o" and "p" follow the NOTATIONS list in the
* file header.  The "@" marks appear to count how many inner-loop iterations
* ahead of the current one an instruction is working (software pipelining).
*
* NOTE(review): there are no NOPs anywhere; correctness relies on every result
* being read on exactly the cycle it becomes available (C6x load, multiply,
* add and branch delay slots).  Do not insert, remove or reorder execute
* packets without re-deriving the schedule.
*-------------------------------------------------------------------------------
*** BEGIN Benchmark Timing ***
B_START:
* Prolog Begins ****************************************************************
* The first x pair is fetched through A4 itself.  numY (A8) and numH (B6) are
* read here, in the same packet that overwrites A8 with ptr_h.
	LDDW	.D1	*A4++[1],B1:B0	; load x1:x0 from memory
||	MV	.L1X	B4,A8		; f ptr_h = h
||	SUB	.S1	A8,4,A2 	; f ocntr = numY - 4
||	SHL	.S2	B6,1,B9 	; f B9 = (numH) << 1

* A4 was post-incremented by the load above, so ptr_x starts one doubleword
* past x.  A4 is copied in the same packet that issues the load into A5:A4.
	LDDW	.D1	*A8++[1],A5:A4	; load h1:h0 from memory
||	MV	.S2X	A4,B8		; f ptr_x = x
||	SUB	.S1X	B6,4,A0 	; f ireset = numH - 4

* x3:x2 is read at ptr_x without advancing it; the next iteration reloads the
* same doubleword as its x1:x0.
	LDDW	.D2	*B8,B5:B4	; load x3:x2 from memory
||	MV	.L2X	A0,B2		; icntr = ireset
||	MV	.L1	A0,A1		; lcntr = ireset
||	SUB	.S2	B9,8,B9 	; f xreset = B9 - 8

	LDW	.D2	*+B8[2],A3	; load x4 from memory

* Loads for the second inner-loop iteration.  These are unconditional, so the
* prolog always fetches two iterations' worth of data (four coefficients).
	LDDW	.D2	*B8++[1],B1:B0	; @ load x1:x0 from memory

	LDDW	.D1	*A8++[1],A5:A4	; @ load h1:h0 from memory

* First products.  Odd-numbered products go to A9 and even-numbered ones to B6.
	LDDW	.D2	*B8,B5:B4	; @ load x3:x2 from memory
||	MPYSP	.M1X	B1,A5,A9	; prod1 = x1 * h1
||	MPYSP	.M2X	B0,A4,B6	; prod0 = x0 * h0

	LDW	.D2	*+B8[2],A3	; @ load x4 from memory
||	MPYSP	.M1X	B4,A5,A9	; prod3 = x2 * h1
||	MPYSP	.M2X	B1,A4,B6	; prod2 = x1 * h0

* Outer-loop entry.  The branch here has no predicate: the packets that follow
* (the rest of OLOOP and the first pass of LOOP) run in its delay slots, and
* the branch then starts the second pass of LOOP.  Further loads are issued
* only while lcntr (A1) is non-zero.
OLOOP:
   [A1] LDDW	.D2	*B8++[1],B1:B0	; @@ load x1:x0 from memory
||	MPYSP	.M1X	B5,A5,A9	; prod5 = x3 * h1
||	MPYSP	.M2X	B4,A4,B6	; prod4 = x2 * h0
||	B	.S1	LOOP		; always branch to LOOP (no predicate)

* A7 and B7 are cleared once for all eight running sums: each ADDSP result
* lands after the following three ADDSPs have read the register, so the first
* four ADDSPs on each side all see zero.
   [A1] LDDW	.D1	*A8++[1],A5:A4	; @@ load h1:h0 from memory
||	MPYSP	.M1	A3,A5,A9	; prod7 = x4 * h1
||	MPYSP	.M2X	B5,A4,B6	; prod6 = x3 * h0
||	ZERO	.S1	A7		; sum1 = 0
||	ZERO	.S2	B7		; sum0 = 0
**** Loop Begins ***************************************************************
* Kernel: four packets per pass.  Each packet accumulates the product pair
* issued one pass earlier and issues the product pair for the next pass.
* The four packets serve outputs 0, 1, 2 and 3 of the current group in turn.
LOOP:
   [A1] LDDW	.D2	*B8,B5:B4	; @@ load x3:x2 from memory
||	MPYSP	.M1X	B1,A5,A9	; @ prod1 = x1 * h1
||	MPYSP	.M2X	B0,A4,B6	; @ prod0 = x0 * h0
||	ADDSP	.L1	A7,A9,A7	; sum1 = prod1 + sum1
||	ADDSP	.L2	B7,B6,B7	; sum0 = prod0 + sum0

* lcntr only counts down while non-zero; once it reaches zero every [A1] load
* is suppressed, so no data beyond the required input is fetched.
   [A1] LDW	.D2	*+B8[2],A3	; @@ load x4 from memory
||	MPYSP	.M1X	B4,A5,A9	; @ prod3 = x2 * h1
||	MPYSP	.M2X	B1,A4,B6	; @ prod2 = x1 * h0
||	ADDSP	.L1	A7,A9,A7	; sum3 = prod3 + sum3
||	ADDSP	.L2	B7,B6,B7	; sum2 = prod2 + sum2
|| [A1] SUB	.S1	A1,2,A1 	; if(lcntr) lcntr -= 2

* The loop-back branch is issued here, before the last packet of the pass, so
* that it takes effect after its delay slots.
   [A1] LDDW	.D2	*B8++[1],B1:B0	; @@@ load x1:x0 from memory
||	MPYSP	.M1X	B5,A5,A9	; @ prod5 = x3 * h1
||	MPYSP	.M2X	B4,A4,B6	; @ prod4 = x2 * h0
||	ADDSP	.L1	A7,A9,A7	; sum5 = prod5 + sum5
||	ADDSP	.L2	B7,B6,B7	; sum4 = prod4 + sum4
|| [B2] B	.S1	LOOP		; if(icntr) branch to LOOP

* When icntr is zero, both pointers are pulled back by xreset.  Per TECHNIQUES
* item 7 in the header, icntr is zero on the last two passes, so the total
* pull-back is 2*xreset.
   [A1] LDDW	.D1	*A8++[1],A5:A4	; @@@ load h1:h0 from memory
||	MPYSP	.M1	A3,A5,A9	; @ prod7 = x4 * h1
||	MPYSP	.M2X	B5,A4,B6	; @ prod6 = x3 * h0
||	ADDSP	.L1	A7,A9,A7	; sum7 = prod7 + sum7
||	ADDSP	.L2	B7,B6,B7	; sum6 = prod6 + sum6
|| [B2] SUB	.D2	B2,2,B2 	; if(icntr) icntr -= 2
||[!B2] SUB	.S2	B8,B9,B8	; o if(!icntr) ptr_x -= xreset
||[!B2] SUB	.S1X	A8,B9,A8	; o if(!icntr) ptr_h -= xreset
**** Loop Ends *****************************************************************
* Outer-loop tail: eight packets.  The first four combine each even/odd pair
* of partial sums into one output; the last four store those outputs.
* Every "p" instruction is predicated on ocntr (A2) and repeats the function
* prolog for the next group of four outputs; on the final pass (ocntr == 0)
* none of them execute.
* The extra 16 subtracted from ptr_h, on top of the kernel's two xreset
* pull-backs, is intended to bring it back to the start of h.  ptr_x gets no
* such adjustment, which leaves it four samples further on for the next group.
	ADDSP	.L1X	B7,A7,A7	; o temp1 = sum0 + sum1
|| [A2] SUB	.D1	A8,16,A8	; o ptr_h -= 16
|| [A2] LDDW	.D2	*B8++[1],B1:B0	; p load x1:x0 from memory

	ADDSP	.L2X	B7,A7,B7	; o temp2 = sum2 + sum3
|| [A2] LDDW	.D1	*A8++[1],A5:A4	; p load h1:h0 from memory

* Exactly one of the two branches is taken.  Either way the five packets
* that follow run in its delay slots, so all four stores complete before
* control reaches OLOOP or the caller.
	ADDSP	.L1X	B7,A7,A7	; o temp3 = sum4 + sum5
|| [A2] B	.S1	OLOOP		; o if(ocntr) branch to OLOOP
||[!A2] B	.S2	B3		; f if(!ocntr) return
|| [A2] LDDW	.D2	*B8,B5:B4	; p load x3:x2 from memory

	ADDSP	.L2X	B7,A7,B7	; o temp4 = sum6 + sum7
|| [A2] LDW	.D2	*+B8[2],A3	; p load x4 from memory

* temp1 goes to the first output slot and ptr_y skips ahead two words.
* B6 receives the pre-increment ptr_y so the next packet can store temp2 into
* the second slot through the B side.
	STW	.D1	A7,*A6++[2]	; o store temp1
|| [A2] LDDW	.D2	*B8++[1],B1:B0	; p load x1:x0 from memory
||	MV	.S2X	A6,B6		; o B6 = A6

	STW	.D2	B7,*+B6[1]	; o store temp2
|| [A2] LDDW	.D1	*A8++[1],A5:A4	; p load h1:h0 from memory
|| [A2] MV	.S1	A0,A1		; p lcntr = ireset
|| [A2] MV	.S2X	A0,B2		; p icntr = ireset

* B6 is reused as a product destination from here on; the store above has
* already read it as an address.
	STW	.D1	A7,*A6++[1]	; o store temp3
|| [A2] LDDW	.D2	*B8,B5:B4	; p load x3:x2 from memory
|| [A2] MPYSP	.M1X	B1,A5,A9	; p prod1 = x1 * h1
|| [A2] MPYSP	.M2X	B0,A4,B6	; p prod0 = x0 * h0

* After this store ptr_y has advanced four words in total.
	STW	.D1	B7,*A6++[1]	; o store temp4
|| [A2] SUB	.S1	A2,4,A2 	; o if(ocntr) ocntr -= 4
|| [A2] LDW	.D2	*+B8[2],A3	; p load x4 from memory
|| [A2] MPYSP	.M1X	B4,A5,A9	; p prod3 = x2 * h1
|| [A2] MPYSP	.M2X	B1,A4,B6	; p prod2 = x1 * h0
* Outer Loop Ends **************************************************************
B_END:
*** END Benchmark Timing ***
