*===============================================================================
*
*	TEXAS INSTRUMENTS, INC.
*
*	IIR
*
*	Revision Date: 05/13/97
*
*	USAGE 	This routine is C Callable and can be called as:
*
*		void iir(short *oPtr, short *iPtr, short *inPtr, short *b,
*			 short *a, int M)
*
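*		oPtr 	= output array; oPtr[0..3] are read as the four
*			  previous outputs, results are written from oPtr[4] on
*		iPtr 	= input array; iPtr[0..3] are the four previous
*			  inputs, the new samples start at iPtr[4]
*		inPtr	= second copy of the outputs, written from inPtr[0] on
*		a	= auto-regressive (output side) filter coefs, a[1..4] used
*		b	= moving-average (input side) filter coefs, b[0..4] used
*		M	= length (number of output samples)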
*
*		If routine is not to be used as a C callable function
*		then all instructions relating to stack should be removed.
*		Refer to comments of individual instructions.  You will also
*		need to initialize values for all of the values passed as these
*		are assumed to be in registers as defined by the calling 
*		convention of the compiler, (refer to the C compiler reference
*		guide).
*
*	C CODE 	This is the C equivalent of the assembly code without
*		restrictions.  Note that the assembly code is hand optimized and
*		restrictions may apply.
*
*		void iir(short *oPtr, short *iPtr, short *inPtr, short *b,
*			 short *a, int M)
* 
*			{
*			int j,i;
*			int sum;
* 
*			for (i=0; i<M; i++){
*				sum = b[0] * iPtr[4+i];
*				for (j = 1; j <= 4; j++) 
*					sum += b[j]*iPtr[4+i-j]-a[j]*oPtr[4+i-j];
*				oPtr[4+i] = (sum >> 15);
*				inPtr[i] = oPtr[4+i];
*				} 
*			}
*
*
*	DESCRIPTION
*		The iir performs an Auto-regressive moving-average (ARMA) filter
*		with 4 auto-regressive filter coefficients and 5 moving-average
*		filter coefficients for M output samples. The output vector is
*		stored in two locations.  This routine is used as a high pass
*		filter in the VSELP vocoder.  All data is assumed to be 16-bit.
*		To avoid memory hits oPtr must be aligned on the next halfword
*		boundary following the alignment of iPtr.
*
*	TECHNIQUES
*		The inner loop is completely unrolled and software pipelined
*		(i.e. each time the 5 cycle loop "LOOP" is executed the inner
*		loop of the C code is executed.)
*
*	MEMORY NOTE
*		To avoid memory hits oPtr must be aligned on the next halfword
*		boundary following the alignment of iPtr. Otherwise there is a
*		total of M memory hits (one per outer loop iteration).
*
*	CYCLES	M*5 + 16
*		for M = 160 -> 816 cycles or 4.08 usec (at a 5 ns cycle time)
* 
*===============================================================================
	.global _iir
	.text
*
*	_iir - 4th order IIR (ARMA) filter, C callable.
*
*	Register usage, as read from the code below:
*		On entry: A4 = oPtr, B4 = iPtr, A6 = inPtr, B6 = b, A8 = a, B8 = M
*		B3  = return address, B15 = stack pointer
*		A10, A11, B10, B11 are saved on the stack and restored on exit
*		B10 = b[0]
*		B9  = b[2]:b[1]   B5 = b[4]:b[3]   (upper:lower halfword)
*		A7  = a[2]:a[1]   A5 = a[4]:a[3]   (upper:lower halfword)
*		A1  = loop counter (starts at M)
*		B2  = 0 during the first pass of LOOP, 1 afterwards; it gates the
*		      accumulate/scale/store half of the pipeline
*		B1  = holds off the in-loop store to inPtr until the third pass
*
*	NOTE(review): the routine appears to assume M >= 1.  With M = 0 the
*	branch to LOOP is not taken, but LOOP still executes once by fall
*	through and the unconditional STH after the loop writes one halfword
*	to inPtr[0].  Confirm callers never pass M <= 0.
*
_iir:

*	Save the four preserved registers.  B15 is used through .D2 and a copy
*	of it in A1 is used through .D1 so two words can be stored per cycle.
	STW	.D2	A10,*B15--	; A10 -> [SP], then SP -= 1 word
||	MV	.L1X	B15,A1		; A1 = SP as it was on entry

	STW	.D1	A11,*--A1[2]	; A1 -= 2 words, A11 -> [entry SP - 8]
||	STW	.D2	B10,*B15--[2]	; B10 -> [entry SP - 4], then SP -= 2 words

*	NOTE(review): no post-decrement here, so B15 is left pointing at the
*	word that holds B11 - confirm this is safe if an interrupt can occur.
	STW	.D2	B11,*B15	; B11 -> [entry SP - 12], SP unchanged

*** BEGIN Benchmark Timing ***
B_START

*	Load the coefficients and pack them two per register.  The lower
*	halves (a[1], b[1], a[3], b[3]) are loaded unsigned so that the OR
*	below cannot disturb the upper halfword.
	LDH	.D2	*B6,B10		; B10 = b[0]

	LDH	.D1	*+A8[2],A7	; A7 = a[2]
||	LDH	.D2	*+B6[2],B9	; B9 = b[2]
||	ADD	.L2	8,B4,B4		; B4 = &iPtr[4] (8 bytes = 4 halfwords)

	LDHU	.D1	*+A8[1],A5	; A5 = a[1], zero extended
||	LDHU	.D2	*+B6[1],B5	; B5 = b[1], zero extended
||	MV	.S1	B8,A1		; A1 = M, the loop counter

*	A5/B5 are load targets again one cycle after the a[1]/b[1] loads: the
*	first value is consumed by the OR into A7/B9 before the second arrives.
	LDH	.D1	*+A8[4],A5	; A5 = a[4] (lands after a[1] has been used)
||	LDH	.D2	*+B6[4],B5	; B5 = b[4] (lands after b[1] has been used)

	LDHU	.D1	*+A8[3],A8	; A8 = a[3], zero extended; last use of A8 as a pointer
||	LDHU	.D2	*+B6[3],B8	; B8 = b[3], zero extended; M was already copied to A1

*	Prime the pipeline: fetch iPtr[4..0] and oPtr[0..3] for the first output
*	while the coefficient words are being assembled.
	LDH	.D2	*B4--,B0	; get iPtr[4]
||	LDH	.D1	*A4++[2],A0	; get oPtr[0], A4 -> oPtr[2]
||	ADD	.L1	8,A4,A11	; A11 = oPtr + 8; NOTE(review): A11 is not read again in this routine
||	MVK	.S2	1,B1		; B1 = 1: in-loop inPtr store disabled for now

	LDH	.D2	*B4--,B0	; get iPtr[3]
||	LDH	.D1	*A4--,A9	; get oPtr[2], A4 -> oPtr[1]
||	SHL	.S1	A7,16,A7	; put a[2] in 16MSBs
||	SHL	.S2	B9,16,B9	; put b[2] in 16MSBs

	LDH	.D2	*B4--,B0	; get iPtr[2]
||	LDH	.D1	*A4++[2],A0	; get oPtr[1], A4 -> oPtr[3]
||	OR	.L1	A7,A5,A7	; A7 = a[2]:a[1]
||	OR	.L2	B9,B5,B9	; B9 = b[2]:b[1]

	LDH	.D2	*B4--,B0	; get iPtr[1]
||	LDH	.D1	*A4--[2],A9	; get oPtr[3], A4 -> oPtr[1]
||	SHL	.S1	A5,16,A5	; put a[4] in 16MSBs
||	SHL	.S2	B5,16,B5	; put b[4] in 16MSBs

	LDH	.D2	*B4++[5],B0	; get iPtr[0], B4 -> iPtr[5] for the next output
||	OR	.L1	A5,A8,A5	; A5 = a[4]:a[3]
||	OR	.L2	B5,B8,B5	; B5 = b[4]:b[3]
||	MVK	.S2	0,B2		; B2 = 0: marks the first pass of LOOP
|| [A1] B	.S1	LOOP		; taken after the five packets of LOOP have run once

*	Five-cycle software-pipelined kernel.  Each pass overlaps three outputs:
*	  unstarred comments - finish accumulating, scale and store an output
*	  "*"  comments      - multiplies and first adds for the following output
*	  "**" comments      - loads for the output after that
*	(stage meaning of the star markers inferred from the data flow - confirm)
*	Every branch to LOOP takes effect five packets after it issues, i.e.
*	one full pass later, so LOOP runs M+1 times for M outputs.
*	NOTE(review): because the loads run ahead, the last passes read iPtr
*	halfwords beyond iPtr[M+3] (up to iPtr[M+5] by my trace); the values
*	are never used, but the memory must be readable.
LOOP:
  [B2]	ADD	.S1	A2,A10,A2	; o2*a2+o1*a3+o0*a4
||[B2]	ADD	.L2	B11,B7,B11	; i4*b0+i3*b1+i2*b2+i1*b3
||	MPY	.M2	B0,B10,B8	;* iPtr[4] * b[0]
||	MPYLH	.M1	A0,A5,A8	;* oPtr[0] * a[4]
||	LDH	.D2	*B4--,B0	;** get iPtr[4]
||	LDH	.D1	*A4++,A0	;** get oPtr[0]

  [B2]	ADD	.S1	A2,A8,A2	; o3*a1+o2*a2+o1*a3+o0*a4 (A8 still holds o3*a1 here)
||[B2]	ADD	.S2	B11,B7,B11	; i4*b0+i3*b1+i2*b2+i1*b3+i0*b4
||	MPY	.M2	B0,B9,B7	;* iPtr[3] * b[1]
||	MPYLH	.M1	A9,A7,A3	;* oPtr[2] * a[2]
||	LDH	.D2	*B4--,B0	;** get iPtr[3]
||[!B2]	LDH	.D1	*+A4[1],A9	;* first pass only: get oPtr[2] from memory (later passes get it from the SHR below)
|| [A1] ADD	.L1	-1,A1,A1	; decrement loop counter, stopping at 0

  [B2]	SUB	.L1X	B11,A2,A2	; sum = input-side terms - output-side terms
||	MPYLH	.M2	B0,B9,B7	;* iPtr[2] * b[2]
||	MPY	.M1	A0,A5,A10	;* oPtr[1] * a[3]
||	LDH	.D2	*B4--,B0	;** get iPtr[2]
||	LDH	.D1	*A4,A0		;** get oPtr[1]

  [B2]	SHR	.S1	A2,15,A9	; A9 = sum >> 15 = new oPtr[4]
||[!B1] STH	.D1	A9,*A6++	; store the output from the previous pass (old A9) to *inPtr++
||	MPY	.M2	B0,B5,B7	;* iPtr[1] * b[3]
||	ADD	.L2	B8,B7,B11	;* i4*b0+i3*b1
||	LDH	.D2	*B4--,B0	;** get iPtr[1]
||[B1]	SUB	.S2	B1,B2,B1	; B1 = B1 - B2: stays 1 in pass 1, becomes 0 in pass 2

  [B2]	STH	.D1	A9,*+A4[1]	; store new oPtr[4] into the oPtr array
||	MPYLH	.M2	B0,B5,B7	;* iPtr[0] * b[4]
||	MPY	.M1	A7,A9,A8	;* oPtr[3] * a[1] (oPtr[3] = output just scaled into A9)
||	ADD	.L1	A3,A8,A2	;* o2*a2+o0*a4 (A8 read before the MPY above rewrites it)
||	ADD	.S2	B11,B7,B11	;* i4*b0+i3*b1+i2*b2
||	LDH	.D2	*B4++[5],B0	;** get iPtr[0], step B4 on to the next output's iPtr[4]
|| [A1] B	.S1	LOOP		; for (i=0; i<M; i++)
||[!B2] ADD	.L2	1,B2,B2		; B2 = 1 once the first pass is done
; LOOP ends here

*	The in-loop inPtr store runs one pass behind, so the last output is
*	still in A9 and is written here.
	STH	.D1	A9,*A6		; store final output to *inPtr
||	MV	.L1X	B15,A1		; A1 = SP, second pointer for the restores

B_END:
*** END Benchmark Timing ***

*	Restore the saved registers in the reverse layout of the prologue and
*	return.  The second packet and the NOP 4 occupy the branch delay slots.
	LDW	.D2	*B15++[2],B11	; B11 <- [SP], then SP += 2 words
||	LDW	.D1	*++A1,A11	; A1 += 1 word, A11 <- [A1]
||	B	.S2	B3		; return from call

	LDW	.D2	*B15++,B10	; B10 <- [SP], then SP += 1 word (back to entry value)
||	LDW	.D1	*++A1[2],A10	; A1 += 2 words, A10 <- [entry SP]

	NOP	4			; remaining delay slots of the return branch
