*===============================================================================
*
*	TEXAS INSTRUMENTS, INC.
*
*	LMSFIR8
*
*	Revision Date: 4/17/97
*
*	USAGE	This routine is C Callable and can be called as:
*
*		void lmsfir8(short *x,short *h,short *y,int N,short *d,
*			     short ar,short M)
*		
*		x =  input array
*		h =  coefficient array
*		y =  output array
*		N =  number of coefficients (MULTIPLE of 8 >= 8)
*		d =  desired output array
*		ar = adaptive rate
*		M =  number of output samples
*
*		If routine is not to be used as a C callable function
*		then all instructions relating to stack should be removed.
*		Refer to comments of individual instructions.  You will also
*		need to initialize values for all of the values passed as these
*		are assumed to be in registers as defined by the calling 
*		convention of the compiler, (refer to the C compiler reference
*		guide).
*
*	C Code 	This is the C equivalent of the Assembly Code without 
*		restrictions.
*
*		Note that the assembly code is hand optimized and restrictions 
*		may apply
*
*		void lmsfir8(short *x,short *h,short *y,int N,short *d,
*			     short ar,short M)
*		{
*		int i,j;
*		int sum;
*		short error = 0;
*
*		for (i = 0; i < M; i++) {
*
*			for (j = 0; j < N; j++) {
*			     h[j] = h[j] + ((((ar*error)>>15)*x[i-1+j])>>15);
*				}
*
*			sum = 0;
*			for (j = 0; j < N; j++) {
*			     sum += h[j] * x[i+j];
*				}
*
*			sum >>= 15;
*			*y++ = sum;
*
*			error = d[i] - sum;
*		
*			}
*		}
*
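*	DESCRIPTION
*		Adaptive FIR filter with an LMS coefficient update.  For each
*		of the M output samples, every coefficient h[j] is first
*		adjusted by ((((ar * error) >> 15) * x[i-1+j]) >> 15), where
*		error is the (d - y) difference left by the previous sample
*		and is 0 for the first sample.  The N-tap dot product of h
*		and x is then formed, shifted right by 15, stored to y, and
*		subtracted from d[i] to give the error for the next sample.
*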
*
*	TECHNIQUES
*		The inner loop is unrolled eight times to allow update of
*		previous stages coefficients to occur in the same inner loop.
*
*	ASSUMPTIONS
*		N MULTIPLE of 8
*
*	MEMORY NOTE
*		This code has no memory hits regardless of where x and h are
*		located in memory.  h must start on a word boundary
*
*	CYCLES	M * ((9/8) * N + 15) + 5
*
*===============================================================================

	.global _lmsfir8
	.bss	stack, 52, 4		; 13-word register save area, word aligned
	.text

*
*	Entry point and register save.
*
*	"stack" is not the C run-time stack: it is a fixed 52-byte area in
*	.bss holding 13 words (A10-A15, B10-B15 and the return address B3).
*	It is only ever accessed with STW/LDW, so its alignment is requested
*	explicitly (third .bss operand) instead of relying on the assembler
*	default.  Because the save area is static, the routine is not
*	re-entrant.
*
*	Save-area layout (word index):
*		[0]  A15   [1]  B15   [2]  A14   [3]  B14
*		[4]  A13   [5]  B13   [6]  A12   [7]  B12
*		[8]  A11   [9]  B11   [10] A10   [11] B10
*		[12] B3 (return address)
*
*	A0 and B0 both receive the address of the save area so that the A-side
*	and B-side stores can issue in the same execute packet on .D1 and .D2.
*
_lmsfir8:


	MVK	.S1	stack,	A0		; A0 = &stack, low half
||	MVK	.S2	stack,	B0		; B0 = &stack, low half

	MVKH	.S1	stack,	A0		; A0 = &stack, high half
||	MVKH	.S2	stack,	B0		; B0 = &stack, high half

	STW	.D1	A15,	*+A0[0]		; save A15 in word 0
||	STW	.D2	B15,	*+B0[1]		; save B15 in word 1

	STW	.D1	A14,	*+A0[2]		; save A14 in word 2
||	STW	.D2	B14,	*+B0[3]		; save B14 in word 3

	STW	.D1	A13,	*+A0[4]		; save A13 in word 4
||	STW	.D2	B13,	*+B0[5]		; save B13 in word 5

	STW	.D1	A12,	*+A0[6]		; save A12 in word 6
||	STW	.D2	B12,	*+B0[7]		; save B12 in word 7

	STW	.D1	A11,	*+A0[8]		; save A11 in word 8
||	STW	.D2	B11,	*+B0[9]		; save B11 in word 9

	STW	.D1	A10,	*+A0[10]	; save A10 in word 10 (A10 still holds M)
||	STW	.D2	B10,	*+B0[11]	; save B10 in word 11

	STW	.D1	B3,	*+A0[12]	; save return address in word 12

*** BEGIN Benchmark Timing ***
*
*	One-time setup before the first output sample.
*
*	Arguments as this code reads them:
*		A4 = x   B4 = h   A6 = y   B6 = N   A8 = d   B8 = ar   A10 = M
*
*	Register state on exit from this section:
*		A2  = outer loop counter (M, decremented once here if non-zero)
*		B14 = N                     B15 = ar
*		B4  = x pointer             B5  = h pointer
*		A1  = h pointer + 1 word (second coefficient pair)
*		A3  = 0, B3 = 0  so that the first Error (A3 - B3) is zero
*		AMR = 0101h + ((32 - LMBD(1,N)) << 16)
*
*	The 0101h part selects circular addressing with block-size field BK0
*	for A4 and B4; the value placed in bits 16 and up is the BK0 field.
*	Example: N = 8 gives LMBD = 28, BK0 = 4, i.e. a 2^(4+1) = 32-byte
*	block.
*
*	NOTE(review): B14 and B15 are overwritten with N and ar here.  In the
*	TI C run-time model these are the data-page and stack pointers, so the
*	routine presumably must not be interrupted while it runs -- confirm
*	with the callers' interrupt policy.
*
B_START

	MV	.D1	A10,A2		; Outer Loop Count M
||	MV	.D2	B6,B14		; Copy Number of Coefs
||	LMBD	.L2	1,B6,B6		; Left most bit on nCoefs
||	MVK	.S2	32,B0		;

	MV	.D2	B4,B5		; Coefs Buffer Pointer
||	SUB	.L2	B0,B6,B6	; N where 2^(N+1) block size of circ buff
||	ZERO	.L1	A3		; Init desired to zero so Error = 0
||	MVK	.S2	0101h,B9	; Used to set A4,B4 circular addr mode

	SHL	.S2	B6,16,B6	; move block-size exponent up to bit 16 (BK0)
||	MV	.L2X	A4,B4		; Buffer pointer
||	MV	.L1X	B5,A1		; Coefs Buffer Pointer

	ADD	.L2	B6,B9,B9	; Used to set circular addr mode BK size

	MVC	.S2	B9,AMR		; Set A4,B4, circular addr mode w/ BK0
||	ZERO	.L2	B3		; Init output to zero so Error = 0
||	MV	.D2	B8,B15		; Copy Adaptive Rate
||	ADDAW	.D1	A1,1,A1		; Offset to Coefs Buffer Pointer
||[A2]	SUB		A2,1,A2		; Decrement outer loop counter M


*
*	KERNEL: start of one outer-loop pass (one output sample) and prolog of
*	the software-pipelined inner loop.
*
*	Each pass
*	  1. forms Error = Desired (A3) - Output (B3) and the Q15 product
*	     ar*Error, kept in B6 with a copy in A5;
*	  2. updates the coefficients eight at a time: phK = (ar*Error) * x,
*	     where h(0) of a group is paired with the sample just before the
*	     group (A11), h(1) with x(i), ... h(7) with x(i+6), as in the C
*	     reference  h[j] += ... * x[i-1+j];
*	  3. multiplies the updated coefficients with x(i)..x(i+7) and adds the
*	     products into four partial sums.
*
*	Register roles visible in this section:
*		A4, B4	x pointers (circular mode was selected through AMR)
*		B5, A1	h pointers; every LDW fetches a pair of 16-bit coefs
*		B0	inner loop counter, N/8
*		B11	y0		A10	y2
*		B12/B9	y1 (alternates)	A12/A9	y3 (alternates)
*	Several registers (A7, A11, A12, A15, B8, B9, B10, ...) are reused for
*	different values a few cycles apart, relying on load and multiply
*	results arriving after their delay slots.  The order of the execute
*	packets must therefore not be changed.
*
*	Trailing comments that start with ";*" appear to mark instructions that
*	already belong to the next group of eight taps.
*
*	The updated coefficient pairs are packed with one product in the upper
*	halfword (CLR of bits 0-15) and one in the lower halfword (SHRU by 16),
*	then added to the coefficient pair with ADD2.
*
KERNEL:

  	LDH	.D1	*++A4[3],A0	; x(i+3)
||	LDH	.D2	*B4++,B2	; x(i)
||	SUB	.L2x	A3,B3,B1	; Error = Desired - Output
||	SHR	.S2	B14,3,B0	; Number of Coefs / 8

  	LDH	.D1	*--A4,A3	; x(i+2)
||	LDH	.D2	*B4++[4],B3	; x(i+1)
||	SMPY	.M2	B1,B15,B6	; Error*ar
||	ZERO		B11		; initialize y0 to zero

	ZERO		A10		; initialize y2 to zero
||  	LDH	.D1	*--A4[3],A11	; x(i-1) Previous state

  	LDH	.D1	*++A4[7],A13	; x(i+6)
||	LDH	.D2	*B4--,B1	; x(i+5)
||	SHR	.S2	B6,16,B6	; Q15 format ar*error

  	LDW	.D1	*A1++[2],A11	; h(3) & h(2)
||	LDW	.D2	*B5++[2],B10	; h(1) & h(0)
||	MV	.L1X	B6,A5		; Q15 format ar*error

  	SMPY	.M2	B6,B2,B10	; ph1 = ar * error * x(i)
||	SMPY	.M1	A5,A0,A7	; ph4 = ar * error * x(i+3)
||	LDW	.D1	*A1++[2],A15	; h(7) & h(6)
||	LDW	.D2	*B5++[2],B7	; h(5) & h(4)

  	SMPY	.M2	B6,B3,B12	; ph2 = ar * error * x(i+1)
||	SMPY	.M1	A5,A3,A7	; ph3 = ar * error * x(i+2)
||	LDH	.D2	*B4++[4],B8	; x(i+4)
||	ADDAH	.D1	A4,1,A4		; *x pointer update

  	CLR	.S2	B10,0,15,B9	; psh1 = ph1 w/ lower 16 bits cleared
||	SHRU	.S1	A7,16,A14	; psh4 = ph4 >> 16 (ph4 moved to lower halfword)
||	SMPY	.M1	A5,A11,A7	; ph0 = ar * error * x(i-1); A11 is the "Previous state" load

  	SHRU	.S2	B12,16,B8	; psh2 = ph2 >> 16
||	CLR	.S1	A7,0,15,A15	; psh3 = ph3 w/ lower 16 bits cleared
||	LDH	.D1	*A4++[4],A11	; x(i+7)

  	SMPY	.M2	B6,B1,B9	; ph6 = ar * error * x(i+5)
||	SMPY	.M1	A5,A13,A9	; ph7 = ar * error * x(i+6)
||	ADD	.L1X	A15,B8,A12	; ph3 & ph2
||	LDH	.D1	*A4--,A0	;* x(i+3)
||	LDH	.D2	*B4++,B2	;* x(i)
||	SHRU	.S2X	A7,16,B12	; psh0 = ph0 >> 16

  	ADD2	.S1	A12,A11,A7	; h(3) += ph3 & h(2) += ph2
||	LDH	.D1	*A4++[4],A3	;* x(i+2)
||	LDH	.D2	*B4++[4],B3	;* x(i+1)
||	ADD	.L2	B9,B12,B13	; ph1 & ph0

  	ADD2	.S2	B13,B10,B13	; h(1) += ph1 & h(0) += ph0
||	SMPY	.M2	B6,B8,B9	; ph5 = ar * error * x(i+4)
||	CLR	.S1	A9,0,15,A9	; psh7 = ph7 w/ lower 16 bits cleared
||	STW	.D1	A7,*-A1[4]	; store h(3) & h(2)
||[B0]	ADD	.L2	-1,B0,B0	; dec loop counter
||	MPY	.M1	A3,A7,A7	; py2 = x(i+2) * h(2)
||	ZERO		A12		; initialize y3 to zero
||	ZERO		B12		; initialize y1 to zero

*	The branch below is taken when the counter just reached zero (N == 8).
*	Its delay slots are the five packets that follow, so the remainder of
*	this section still executes and only OUTLOOP is skipped.
  	SHRU	.S2	B9,16,B10	; psh6 = ph6 >> 16 (ph6 moved to lower halfword)
||	MPYLH	.M1	A0,A7,A9	; py3 = x(i+3) * h(3)
||	MPY	.M2	B2,B13,B10	; py0 = x(i) * h(0)
||	LDH	.D1	*A4++,A13	;* x(i+6)
||	LDH	.D2	*B4--,B1	;* x(i+5)
||[!B0] B		END		; if # coefs is 8 skip over loop

  	ADD		A7,A10,A10	; y2 += py2,
||	CLR	.S2	B9,0,15,B9	; psh5 = ph5 w/ lower 16 bits cleared
||	MPYLH	.M2	B3,B13,B10	; py1 = x(i+1) * h(1)
||	ADD	.L1X	A9,B10,A7	; ph7 & ph6
||	LDW	.D1	*A1++[2],A11	;* h(3) & h(2)
||	SMPY	.M1	A5,A11,A7	;* ph0 = ar * error * x(i-1) of next group (A11 = x(i+7) loaded above)
||	LDW	.D2	*B5++[2],B10	;* h(1) & h(0)

  	ADD		B10,B11,B11	; y0 += py0,
||	ADD		A9,A12,A9	; y3 += py3,
||	ADD2	.S1	A7,A15,A15	; h(7) += ph7 & h(6) += ph6
||	ADD	.L2X	B9,A14,B9	; ph5 & ph4
||	SMPY	.M2	B6,B2,B10	;* ph1 = ar * error * x(i)
||	SMPY	.M1	A5,A0,A7	;* ph4 = ar * error * x(i+3)
||	LDW	.D1	*A1++[2],A15	;* h(7) & h(6)
||	LDW	.D2	*B5++[2],B7	;* h(5) & h(4)

  	ADD		B10,B12,B9	; y1 += py1,
||	ADD2	.S2	B9,B7,B7	; h(5) += ph5 & h(4) += ph4
||	SMPY	.M2	B6,B3,B12	;* ph2 = ar * error * x(i+1)
||	SMPY	.M1	A5,A3,A7	;* ph3 = ar * error * x(i+2)
||	SHRU	.S1	A7,16,A12	;* psh0 = ph0 >> 16
||	LDH	.D2	*B4++[4],B8	;* x(i+4)
||	LDH	.D1	*A4++[4],A11	;* x(i+7)

  	STW	.D2	B13,*-B5[8]	; store h(1) & h(0)
||	STW	.D1	A15,*-A1[6]	; store h(7) & h(6)
||	MPY	.M1	A13,A15,A7	; py6 = x(i+6) * h(6)
||	MPYLH	.M2	B1,B7,B7	; py5 = x(i+5) * h(5)
||	CLR	.S2	B10,0,15,B10	;* psh1 = ph1 w/ lower 16 bits cleared
||	SHRU	.S1	A7,16,A14	;* psh4 = ph4 >> 16 (ph4 moved to lower halfword)

  	STW	.D2	B7,*-B5[6]	; store h(5) & h(4)
||	MPYLH	.M1	A11,A15,A7	; py7 = x(i+7) * h(7)
||	MPY	.M2	B8,B7,B8	; py4 = x(i+4) * h(4)
||	SHRU	.S2	B12,16,B8	;* psh2 = ph2 >> 16
||	CLR	.S1	A7,0,15,A15	;* psh3 = ph3 w/ lower 16 bits cleared
||	ADD	.L2X	B10,A12,B13	;* ph1 & ph0

*
*	OUTLOOP: steady-state body of the software-pipelined inner loop.
*
*	Nine execute packets process one group of eight taps.  The loop
*	counter B0 is decremented in the third packet and tested by the branch
*	in the fourth packet; the five packets after the branch are its delay
*	slots, so the loop closes (or falls through to END) after the last
*	packet of this section.
*
*	Three groups are in flight at once.  Judging by the existing comment
*	markers:
*		(no star)  partial-sum accumulation for the oldest group
*		;*         coefficient update / multiplies for the next group
*		;**        x and h loads and ar*error products for the group
*		           after that
*
*	Accumulators: y0 = B11, y2 = A10, y1 alternates B12 <-> B9 and
*	y3 alternates A12 <-> A9.  A12, A7, B8, B9 and B10 also carry
*	short-lived intermediate values between those uses, so the packet
*	order must not be changed.
*
OUTLOOP:
  	ADD		B7,B9,B12	; y1 += py5,
||	ADD		A7,A10,A10	; y2 += py6,
||	SMPY	.M2	B6,B1,B9	;* ph6 = ar * error * x(i+5)
||	SMPY	.M1	A5,A13,A9	;* ph7 = ar * error * x(i+6)
||	ADD	.L1X	A15,B8,A12	;* ph3 & ph2
||	LDH	.D1	*A4--,A0	;** x(i+3)
||	LDH	.D2	*B4++,B2	;** x(i)

  	ADD		B8,B11,B11	; y0 += py4,
||	ADD		A7,A9,A12	; y3 += py7,
||	ADD2	.S1	A12,A11,A7	;* h(3) += ph3 & h(2) += ph2
||	LDH	.D1	*A4++[4],A3	;** x(i+2)
||	LDH	.D2	*B4++[4],B3	;** x(i+1)

  	ADD2	.S2	B13,B10,B13	;* h(1) += ph1 & h(0) += ph0
||	SMPY	.M2	B6,B8,B9	;* ph5 = ar * error * x(i+4)
||	CLR	.S1	A9,0,15,A9	;* psh7 = ph7 w/ lower 16 bits cleared
||	STW	.D1	A7,*-A1[4]	;* store h(3) & h(2)
||[B0]	ADD	.L2	-1,B0,B0	;* dec loop counter
||	MPY	.M1	A3,A7,A7	;* py2 = x(i+2) * h(2)

  [B0]	B	.S1	OUTLOOP		; for OUTLOOP
||	SHRU	.S2	B9,16,B10	;* psh6 = ph6 >> 16 (ph6 moved to lower halfword)
||	MPYLH	.M1	A0,A7,A9	;* py3 = x(i+3) * h(3)
||	MPY	.M2	B2,B13,B10	;* py0 = x(i) * h(0)
||	LDH	.D1	*A4++,A13	;** x(i+6)
||	LDH	.D2	*B4--,B1	;** x(i+5)

  	ADD		A7,A10,A10	;* y2 += py2,
||	CLR	.S2	B9,0,15,B9	;* psh5 = ph5 w/ lower 16 bits cleared
||	MPYLH	.M2	B3,B13,B10	;* py1 = x(i+1) * h(1)
||	ADD	.L1X	A9,B10,A7	;* ph7 & ph6
||	LDW	.D1	*A1++[2],A11	;** h(3) & h(2)
||	SMPY	.M1	A5,A11,A7	;** ph0 = ar * error * x(i-1) of that group (A11 = last x(i+7) load)
||	LDW	.D2	*B5++[2],B10	;** h(1) & h(0)

  	ADD		B10,B11,B11	;* y0 += py0,
||	ADD		A9,A12,A9	;* y3 += py3,
||	ADD2	.S1	A7,A15,A15	;* h(7) += ph7 & h(6) += ph6
||	ADD	.L2X	B9,A14,B9	;* ph5 & ph4
||	SMPY	.M2	B6,B2,B10	;** ph1 = ar * error * x(i)
||	SMPY	.M1	A5,A0,A7	;** ph4 = ar * error * x(i+3)
||	LDW	.D1	*A1++[2],A15	;** h(7) & h(6)
||	LDW	.D2	*B5++[2],B7	;** h(5) & h(4)

  	ADD		B10,B12,B9	;* y1 += py1,
||	ADD2	.S2	B9,B7,B7	;* h(5) += ph5 & h(4) += ph4
||	SMPY	.M2	B6,B3,B12	;** ph2 = ar * error * x(i+1)
||	SMPY	.M1	A5,A3,A7	;** ph3 = ar * error * x(i+2)
||	SHRU	.S1	A7,16,A12	;** psh0 = ph0 >> 16
||	LDH	.D2	*B4++[4],B8	;** x(i+4)
||	LDH	.D1	*A4++[4],A11	;** x(i+7)

  	STW	.D2	B13,*-B5[8]	;* store h(1) & h(0)
||	STW	.D1	A15,*-A1[6]	;* store h(7) & h(6)
||	MPY	.M1	A13,A15,A7	;* py6 = x(i+6) * h(6)
||	MPYLH	.M2	B1,B7,B7	;* py5 = x(i+5) * h(5)
||	CLR	.S2	B10,0,15,B10	;** psh1 = ph1 w/ lower 16 bits cleared
||	SHRU	.S1	A7,16,A14	;** psh4 = ph4 >> 16 (ph4 moved to lower halfword)

  	STW	.D2	B7,*-B5[6]	;* store h(5) & h(4)
||	MPYLH	.M1	A11,A15,A7	;* py7 = x(i+7) * h(7)
||	MPY	.M2	B8,B7,B8	;* py4 = x(i+4) * h(4)
||	SHRU	.S2	B12,16,B8	;** psh2 = ph2 >> 16
||	CLR	.S1	A7,0,15,A15	;** psh3 = ph3 w/ lower 16 bits cleared
||	ADD	.L2X	B10,A12,B13	;** ph1 & ph0



*
*	END: tail of one outer-loop pass.
*
*	- Adds the last four products (py4..py7) into y0..y3, combines the four
*	  partial sums, shifts the total right by 15 and stores it through A6
*	  (*y++).  The unshifted-then-shifted sum stays in B3 and is the
*	  "Output" used for the next Error.
*	- Loads the next desired sample through A8 (*d++) into A3.
*	- Rewinds the pointers that the pipelined loads over-advanced:
*	    h: B5 goes back N halfwords plus 4 words, A1 = B5 + 1 word
*	    x: B4 goes back N halfwords plus 7 halfwords, A4 = B4
*	  The inner loop advances both pointers by N + 8 halfwords per pass, so
*	  h returns to its start and x ends up one sample further on.
*	- Branches back to KERNEL while A2 is non-zero.  The branch is issued
*	  in the first packet; the remaining five packets of this section are
*	  its delay slots, so all of them execute before control transfers.
*
*	NOTE(review): A2 is only ever tested with [A2] (here and in the setup
*	code), so a call with M = 0 still appears to run one full pass and
*	write one output -- confirm callers never pass M = 0.
*
END:
  	ADD	.L2	B7,B9,B12	; y1 += py5,
||	ADD	.L1	A7,A10,A10	; y2 += py6,
||[A2]	B	.S2	KERNEL	;
||	SUBAH	.D2	B5,B14,B5	; reset *h pointer
||	LDH	.D1	*A8++,A3	; Load Desired

  	ADD		B8,B11,B11	; y0 += py4,
||	ADD		A7,A9,A12	; y3 += py7,
||	SUBAH	.D2	B4,B14,B4	; reset *x pointer
||[A2]	SUB		A2,1,A2		; Decrement outer loop counter M

	ADD		B11,B12,B12	; y01 = y0 + y1
||	ADD		A10,A12,A12	; y23 = y2 + y3
||	SUBAW	.D2	B5,4,B5		; reset *h pointer

	ADD		A12,B12,B3	; sum = y0 + y1 + y2 + y3
||	MV	.L1X	B5,A1		; copy *h pointer
||	SUBAH	.D2	B4,7,B4		; reset *x pointer

	SHR	.S2	B3,15,B3	; sum >>= 15
||	ADDAW	.D1	A1,1,A1		; reset *h pointer

	STH	.D1	B3,*A6++	; Store output to *y++
||	MV	.L1X	B4,A4		; copy *x pointer
||	MV	.L1X	B4,A4		; copy *x pointer

B_END:
*** END Benchmark Timing ***
*
*	Epilogue: restore linear addressing, reload the saved registers from
*	the static save area "stack" and return to the caller.
*
*	AMR was programmed for circular addressing on A4/B4 before the loop.
*	Compiled C code expects linear addressing (AMR = 0) at function
*	boundaries, so AMR is cleared here before returning.  The ZERO and MVC
*	use .L2/.S2 slots that were free in the existing execute packets, so
*	no cycle is added.
*
*	Scheduling notes:
*	- The return address is reloaded first so that B3 is valid by the time
*	  the branch in the sixth packet reads it (a load has four delay
*	  slots).
*	- The branch is followed by one load packet and NOP 4, which fill its
*	  five delay slots and let the final loads of A10/B10 complete before
*	  control reaches the caller.
*
	MVK	.S1	stack,		A8	; A8 = &stack, low half
||	MVK	.S2	stack,		B8	; B8 = &stack, low half
||	ZERO	.L2	B0			; B0 = 0, written to AMR below

	MVKH	.S1	stack,		A8	; A8 = &stack, high half
||	MVKH	.S2	stack,		B8	; B8 = &stack, high half

	LDW	.D1	*+A8[12],	B3	; reload return address
||	MVC	.S2	B0,AMR			; back to linear addressing for the caller

	LDW	.D1	*+A8[0],	A15	; restore A15
||	LDW	.D2	*+B8[1],	B15	; restore B15

	LDW	.D1	*+A8[2],	A14	; restore A14
||	LDW	.D2	*+B8[3],	B14	; restore B14

	LDW	.D1	*+A8[4],	A13	; restore A13
||	LDW	.D2	*+B8[5],	B13	; restore B13

	LDW	.D1	*+A8[6],	A12	; restore A12
||	LDW	.D2	*+B8[7],	B12	; restore B12

	LDW	.D1	*+A8[8],	A11	; restore A11
||	LDW	.D2	*+B8[9],	B11	; restore B11
||	B	.S2	B3			; return (B3 has arrived by now)

	LDW	.D1	*+A8[10],	A10	; restore A10 (in branch delay slot)
||	LDW	.D2	*+B8[11],	B10	; restore B10 (in branch delay slot)

	NOP		4			; remaining branch delay slots


