*===============================================================================
*
*	TEXAS INSTRUMENTS, INC.
*
*	FIR4
*
*	Revision Date: 4/17/97
*
*	USAGE	This routine is C Callable and can be called as:
*
*		void fir4(short *x, short *h, short *y, int N, int M)
*		
*		x = input array
*		h = coefficient array
*		y = output array
*		N = number of coefficients (MULTIPLE of 4 >= 8)
*		M = number of output samples (M EVEN >= 2)
*
*		If routine is not to be used as a C callable function
*		then all instructions relating to stack should be removed.
*		Refer to comments of individual instructions.  You will also
*		need to initialize values for all of the values passed as these
*		are assumed to be in registers as defined by the calling 
*		convention of the compiler, (refer to the C compiler reference
*		guide).
*
*	C Code 	This is the C equivalent of the Assembly Code without 
*		restrictions.
*
*		Note that the assembly code is hand optimized and restrictions 
*		may apply
*
*		void fir4(short x[], short h[], short y[], int N, int M)
*		{
*			int i, j, sum;
*			
*			for (j = 0; j < M; j++) {
*				sum = 0;
*				for (i = 0; i < N; i++)
*					sum += x[i + j] * h[i];
*				y[j] = sum >> 15;
*			}
*		}
*
*	DESCRIPTION
*		This FIR assumes the number of filter coefficients is a multiple
*		of 4 and the number of output samples is a multiple of 2.  It
*		operates on 16-bit data with a 32-bit accumulate.  This
*		routine has no memory hits regardless of where x, h, and y 
*		arrays are located in memory.  The filter is M output samples
*		and N coefficients.  The assembly routine performs 2 output
*		samples at a time.
*
*
*	TECHNIQUES
*		The inner loop is unrolled four times thus the number of 
*		filter coefficients must be a multiple of four.  The outer
*		loop is unrolled twice so the number of output samples must
*		be a multiple of 2.
*
*		If an odd number of output samples is needed or possible, the
*		final store can either be removed or conditionally executed 
*		depending on whether M is even or odd.  This code would have to 
*		be added to the existing code.
*
*		The outer loop, like the inner loop, is software pipelined as
*		well.  e, o, and p in the comments of the individual 
*		instructions correspond to the epilogue, outer loop, and 
*		prologue respectively.  
*
*		Refer to FIR example in the optimizing assembly chapter of
*		the programmer's guide for more information.
*
*
*	ASSUMPTIONS
*		N MULTIPLE of 4 >= 8
*		M EVEN >= 2
*
*
*	MEMORY NOTE
*		This code has no memory hits regardless of where x and h are
*		located in memory.
*
*	CYCLES	M*(N+8)/2+6
*
*===============================================================================
	.global _fir4
	.text
*-------------------------------------------------------------------------------
* _fir4 -- 16-bit FIR filter producing two output samples per outer pass.
*
* Register usage (derived from the instructions below):
*
*   Inputs   A4 = x, B4 = h, A6 = y, B6 = N, A8 = M, B3 = return address
*   Saved    B10, B11, B12 are pushed on entry and popped before returning
*
*   A4, B1   x load pointers; both step by 2 elements per load
*   B4, A8   h load pointers (h[0],h[2],... and h[1],h[3],...)
*   A6, B11  y store pointers for y[j] and y[j+1]; both step by 2 elements
*   A9, B9   sum0 (accumulator for y[j]) and sum1 (accumulator for y[j+1])
*   A2       outer loop counter, initialised from M >> 1
*   B12      N >> 2, kept for the whole call
*   B2       inner loop counter, reloaded with B12 - 2 on every outer pass
*   A3, B10  byte counts subtracted from A4 / B4 to rewind the x and h
*            pointers at the end of each outer pass (2*N+10 and 2*N+8)
*
*   A8 is reused as an h pointer once M has been read, and B6 is reused as
*   a load/product register once N has been read.
*
* Comment markers: e / o / p are epilogue, outer loop and prologue.  The
* leading '*' characters in a comment presumably count how many inner-loop
* iterations ahead of the oldest one that instruction belongs to.
*
* The schedule relies on exact instruction latencies: several registers are
* both a source and the destination of an operation still in flight, and
* branches are issued several packets before the point marked "branch occurs
* here".  Do not reorder or insert instructions without re-deriving the
* schedule.
*
* NOTE(review): the run-ahead loads are unconditional, so the routine appears
* to read a few halfwords past the data the C model uses (roughly up to
* x[j+N+6] and h[N+3], judging by the rewind constants).  Confirm that the
* buffers are padded or that such reads are harmless on the target.
*-------------------------------------------------------------------------------
_fir4:
	STW	.D2	B10,*B15--	; push register (for c-callable func)

*** BEGIN Benchmark Timing ***
B_START

	STW	.D2	B11,*B15--	; push register (for c-callable func)
||	SHR	.S1	A8,1,A2		; A2 = M/2: outer loop counter (2 outputs per pass)
||	SHL	.S2	B6,1,B10	; B10 = 2*N: filter length in bytes

	STW	.D2	B12,*B15--	; push register (for c-callable func)
||	ADD	.L1X	B10,10,A3	; A3 = 2*N+10: bytes to rewind x pointer each outer loop
||	ADD	.S2	B10,8,B10	; B10 = 2*N+8: bytes to rewind h pointer each outer loop
||	ADD	.L2X	A6,2,B11	; set up pointer to y[1]

* First-pass prologue: start the x and h loads for the first pair of outputs.
	LDH	.D1	*A4++,B8	; x0 = x[j]
||	ADD	.L2X	A4,4,B1		; set up pointer to x[j+2]
||	ADD	.L1X	B4,2,A8		; set up pointer to h[1]
||	SHR	.S2	B6,2,B12	; B12 = N/4: base value for inner loop counter
||[A2]	SUB	.S1	A2,1,A2		; decrement outer loop counter

  	LDH	.D2	*B1++[2],B0	; x2 = x[j+i+2]
||	LDH	.D1	*A4++[2],A0	; x1 = x[j+i+1]

  	LDH	.D1	*A8++[2],B6	; h1 = h[i+1]
||	LDH	.D2	*B4++[2],A1	; h0 = h[i]

  	LDH	.D1	*A4++[2],A5	; x3 = x[j+i+3]
||	LDH	.D2	*B1++[2],B5	; x0 = x[j+i+4]

* Outer loop entry: clear both accumulators, reload the inner counter and
* finish priming the inner-loop pipeline.
OUTLOOP:
  	LDH	.D2	*B4++[2],A7	; h2 = h[i+2]
||	LDH	.D1	*A8++[2],B8	; h3 = h[i+3]
||	ZERO	.L1	A9		; zero out sum0
||	ZERO	.L2	B9		; zero out sum1

  	LDH	.D2	*B1++[2],B0	;* x2 = x[j+i+2]
||	LDH	.D1	*A4++[2],A0	;* x1 = x[j+i+1]
||	SUB	.S2	B12,2,B2	; B2 = N/4 - 2: inner loop counter

	LDH	.D1	*A8++[2],B6	;* h1 = h[i+1]
||	LDH	.D2	*B4++[2],A1	;* h0 = h[i]

  	MPY	.M1X	B8,A1,A0	; x0 * h0
||	MPY	.M2X	A0,B6,B6	; x1 * h1
||	LDH	.D1	*A4++[2],A5	;* x3 = x[j+i+3]
||	LDH	.D2	*B1++[2],B5	;* x0 = x[j+i+4]

  [B2]	B	.S1	LOOP		; branch to loop
||	MPY	.M2	B0,B6,B7	; x2 * h1
||	MPY	.M1	A0,A1,A1	; x1 * h0
||	LDH	.D2	*B4++[2],A7	;* h2 = h[i+2]
||	LDH	.D1	*A8++[2],B8	;* h3 = h[i+3]
||[B2]	SUB	.S2	B2,1,B2		;* decrement loop counter

  	ADD	.L1	A0,A9,A9	; sum0 += x0 * h0
||	MPY	.M2X	A5,B8,B8	; x3 * h3
||	MPY	.M1X	B0,A7,A5	; x2 * h2
||	LDH	.D2	*B1++[2],B0	;** x2 = x[j+i+2]
||	LDH	.D1	*A4++[2],A0	;** x1 = x[j+i+1]

* Inner loop kernel: four execute packets that together accumulate four
* coefficients into each of sum0 and sum1 (eight multiplies per pass).
LOOP:
  	ADD	.L2X	A1,B9,B9	; sum1 += x1 * h0
||	ADD	.L1X	B6,A9,A9	; sum0 += x1 * h1
||	MPY	.M2	B5,B8,B7	; x0 * h3
||	MPY	.M1	A5,A7,A7	; x3 * h2
||	LDH	.D1	*A8++[2],B6	;** h1 = h[i+1]
||	LDH	.D2	*B4++[2],A1	;** h0 = h[i]

  	ADD	.L2	B7,B9,B9	; sum1 += x2 * h1
||	ADD	.L1	A5,A9,A9	; sum0 += x2 * h2
||	MPY	.M1X	B5,A1,A0	;* x0 * h0
||	MPY	.M2X	A0,B6,B6	;* x1 * h1
||	LDH	.D1	*A4++[2],A5	;** x3 = x[j+i+3]
||	LDH	.D2	*B1++[2],B5	;** x0 = x[j+i+4]

  	ADD	.L2X	A7,B9,B9	; sum1 += x3 * h2
||	ADD	.L1X	B8,A9,A9	; sum0 += x3 * h3
||[B2]	B	.S1	LOOP		;* branch to loop
||	MPY	.M2	B0,B6,B7	;* x2 * h1
||	MPY	.M1	A0,A1,A1	;* x1 * h0
||	LDH	.D2	*B4++[2],A7	;** h2 = h[i+2]
||	LDH	.D1	*A8++[2],B8	;** h3 = h[i+3]
||[B2]	SUB	.S2	B2,1,B2		;** decrement loop counter

  	ADD	.L2	B7,B9,B9	; sum1 += x0 * h3
||	ADD	.L1	A0,A9,A9	;* sum0 += x0 * h0
||	MPY	.M2X	A5,B8,B8	;* x3 * h3
||	MPY	.M1X	B0,A7,A5	;* x2 * h2
||	LDH	.D2	*B1++[2],B0	;*** x2 = x[j+i+2]
||	LDH	.D1	*A4++[2],A0	;*** x1 = x[j+i+1]
	; inner loop branch occurs here

* Inner-loop epilogue (e) overlapped with the outer-loop bookkeeping (o) and
* the prologue loads (p) for the next pair of outputs.  The x and h pointers
* are rewound first so that the prologue loads start from the next window.
  	ADD	.L2X	A1,B9,B9	;e sum1 += x1 * h0
||	ADD	.L1X	B6,A9,A9	;e sum0 += x1 * h1
||	MPY	.M2	B5,B8,B7	;e x0 * h3
||	MPY	.M1	A5,A7,A7	;e x3 * h2
||	SUB	.D1	A4,A3,A4	;o reset x pointer to x[j]
||	SUB	.D2	B4,B10,B4	;o reset h pointer to h[0]
||[A2]	B	.S1	OUTLOOP		;o branch to outer loop

  	ADD	.D2	B7,B9,B9	;e sum1 += x2 * h1
||	ADD	.L1	A5,A9,A9	;e sum0 += x2 * h2
||	LDH	.D1	*A4++,B8	;p x0 = x[j]
||	ADD	.L2X	A4,4,B1		;p set up pointer to x[j+2]
||	ADD	.S1X	B4,2,A8		;p set up pointer to h[1]

  	ADD	.L2X	A7,B9,B9	;e sum1 += x3 * h2
||	ADD	.L1X	B8,A9,A9	;e sum0 += x3 * h3
||  	LDH	.D2	*B1++[2],B0	;p x2 = x[j+i+2]
||	LDH	.D1	*A4++[2],A0	;p x1 = x[j+i+1]
||[A2]	SUB	.S1	A2,1,A2		;o decrement outer loop counter

  	ADD	.L2	B7,B9,B9	;e sum1 += x0 * h3
||	SHR	.S1	A9,15,A9	;e sum0 >> 15
||	LDH	.D1	*A8++[2],B6	;p h1 = h[i+1]
||	LDH	.D2	*B4++[2],A1	;p h0 = h[i]

	SHR	.S2	B9,15,B9	;e sum1 >> 15
||	LDH	.D1	*A4++[2],A5	;p x3 = x[j+i+3]
||	LDH	.D2	*B1++[2],B5	;p x0 = x[j+i+4]

	STH	.D1	A9,*A6++[2]	;e y[j] = sum0 >> 15
||	STH	.D2	B9,*B11++[2]	;e y[j+1] = sum1 >> 15
	; outer loop branch occurs here
B_END:
*** END Benchmark Timing ***

* Restore the saved registers in reverse push order.  The return branch is
* issued alongside the second pop; the last pop and the NOP 4 follow it.

	LDW	.D2	*++B15,B12	; pop register (for c-callable func)

	LDW	.D2	*++B15,B11	; pop register (for c-callable func)
||	B	.S2	B3		; return

	LDW	.D2	*++B15,B10	; pop register (for c-callable func)

	NOP	4
