*===============================================================================
*
*      TEXAS INSTRUMENTS, INC.
*
*      IIR
*
*      Revision Date:  04/09/98
*	
*      USAGE   This routine is C Callable and can be called as:
*		
*      void iir(float *x, float *y, float *b, float *a, int order, int numY)
*
*		x is pointer to array holding the input floating point array
*		y is pointer to array holding the output floating point array
*		b is pointer to array holding the input coefficient floating
*		   point array
*		a is pointer to array holding the output coefficient floating
*		   point array
*		order is the order of the IIR
*		numY is the number of output values
*
*		If the routine is not to be used as a C callable function,
*		then you need to initialize values for all of the parameters
*		passed to the function since these are assumed to be in
*		registers as defined by the calling convention of the
*		compiler, (refer to the TMS320C6x Optimizing C Compiler
*		User's Guide).
*
*      C CODE
*		This is the C equivalent for the assembly code.  Note that
*		the assembly code is hand optimized and restrictions may
*		apply.
*
*               void 
*               iir(float *x, float *y, float *b, float *a, int order, int numY)
*               {
*	           int i;
*	           for(i=0; i<(numY-order); i+=2)
*	           {
*	             int j;
*	             float sum0, sum1, sum2, sum3;
*
*	             sum0 = sum1 = sum2 = sum3 = 0;
*	             for(j=0; j<order; j+=2)
*	             {
*	               sum0 += x[i+j]  *b[j]   - y[i+j]  *a[j];
*	               sum1 += x[i+j+1]*b[j+1] - y[i+j+1]*a[j+1];
*	               sum2 += x[i+j+1]*b[j]   - y[i+j+1]*a[j];
*	               sum3 += x[i+j+2]*b[j+1] - y[i+j+2]*a[j+1];
*	             }
*	             y[i+order] = sum0+sum1+x[i+order]*b[order];
*	             y[i+order+1] = sum2+sum3-y[i+order]*a[order-1]+
*                                   x[i+order+1]*b[order];
*	           }
*               }
*
*      DESCRIPTION
*
*		This routine implements a block IIR filter.  There are "order+1"
*		"a" filter coefficients and "order+1" "b" filter coefficients, 
*               "numY" input samples, and "numY" output samples. The  
*               coefficients need to be placed in the arrays in reverse order: 
*		{a(order), ..., a(1), a(0)} and {b(order), ..., b(1), b(0)}.
*               The first "order" values of the y array are not modified by the  
*		routine {y(0), y(1), ..., y(order-1)}.	It is best if the
*		remaining "numY-order" values in the y array are set to zero
*		prior to calling the function since a non-zero y(order) value
*		will generate an incorrect y(order+1) output value.
*
*		The routine calculates y(order) through y(numY-1) using 
*               the following formula:
*
*		a(0)*y(n) = b(0)*x(n) + b(1)*x(n-1) + ... + b(order)*x(n-order) +
*				      - a(1)*y(n-1) - ... - a(order)*y(n-order)
*
*		where n = {order, order+1, ... , numY-1} and a(0) is assumed
*		to be equal to 1.  You can force a(0) equal to 1 by dividing
*		the other coefficients by a(0):
*
*		y(n) = b'(0)*x(n) + b'(1)*x(n-1) + ... + b'(order)*x(n-order) +
*				  - a'(1)*y(n-1) - ... - a'(order)*y(n-order)
*
*		The value of a(0) is loaded, but not used.
*
*      TECHNIQUES
*
*		1.  LDDW instructions are used to load two SP floating point 
*		    values simultaneously.
*		2.  The outer loop is unrolled 2 times.
*		3.  The inner loop is unrolled 2 times and software pipelined. 
*		4.  The variables sum0, sum1, sum2, and sum3 share B0.
*		    The variables sum4, sum5, sum6, and sum7 share A0.
*		    This multiple assignment is possible since the variables
*		    are always read just once on the first cycle that they
*		    are available.
*		5.  The first 12 cycles of the inner loop prolog are
*		    conditionally scheduled in parallel with the outer loop.
*		    This increases the code size by 33 words, but improves
*		    the cycle time.
*		6.  A load counter is used so that an epilog is not needed.
*		    No extraneous loads are performed.
*	
*      ASSUMPTIONS
*
*		1.  Little Endian is assumed for LDDW instructions.
*		2.  The order must be a multiple of 2 and greater than or
*		    equal to 4 (4, 6, 8, ...).
*		3.  The number of outputs (numY) must be a multiple of 2
*		    and greater than or equal to 2 + order
*		    (2 + order, 4 + order, 6 + order, ...).
*		4.  Since single assignment of registers is not used,
*		    interrupts should be disabled before this function is 
*                   called.
*		
*      MEMORY NOTE
*
*		The x and y arrays must be aligned on opposite (even and
*		odd) double word (64-bit) boundaries to avoid memory bank hits.
*
*		The a and b arrays must also be aligned on opposite (even
*		and odd) double word (64-bit) boundaries to avoid memory bank
*		hits.
*
*      ARGUMENTS PASSED
*
*		x	 ->  A4 = ptr_x
*		y	 ->  B4 = ptr_y
*		b	 ->  A6 = ptr_b
*		a	 ->  B6 = ptr_a
*		order	 ->  A8
*		numY	 ->  B8
*
*	CYCLES
*
*		(order + 10)*(numY-order) + 15	    with C overhead
*		(order + 10)*(numY-order) + 12	    without C overhead
*
*	NOTATIONS
*
*		f = Function Prolog or Epilog
*		o = Outer Loop
*		p = Inner Loop Prolog
*
*===============================================================================
	.global _iir
	.text
;-------------------------------------------------------------------------------
; _iir -- block IIR filter entry point.  Arguments arrive in A4, B4, A6, B6,
;	  A8 and B8 as listed under ARGUMENTS PASSED in the file header.
;
; Register roles, as set up by the code below:
;	A4 / B4	ptr_x / ptr_y
;	A6 / B6	ptr_b / ptr_a
;	A3	&ptr_b[order], later overwritten with the value bb = ptr_b[order]
;	A7	p_order = order - 4 (reused as a copy of SP on the return path)
;	B1	reset = order << 2, subtracted from the four pointers when the
;		outer loop continues ([A2] packets)
;	A2	ocntr, outer loop counter
;	A1	icntr, inner loop counter; also holds the ptr_y copy used by
;		the two output stores
;	B2	lcntr, load counter that predicates the pipelined loads
;	A5 / B5	products;  A0 / B0 partial sums
;	A13	copy of SP during the prolog so that .D1 can also push registers
;
; Stack use: six words.  A13 is stored at the entry SP, then SP drops by six
; words and A12, A11, B11, A10, B10 are stored at word offsets 5, 4, 3, 2, 1.
; The [!A2] loads near the end of the routine restore them before returning.
;-------------------------------------------------------------------------------
_iir:
	STW	.D2	A13,*B15--[6]	; f push A13 to stack
||	ADDAW	.D1	A6,A8,A3	; f A3 = &ptr_b[order]
||	SUB	.S1	A8,4,A7 	; f p_order = order - 4
||	SHL	.S2X	A8,2,B1 	; f reset = order << 2

	STW	.D2	A12,*+B15[5]	; f push A12
||	MV	.S1X	B15,A13 	; f A13 = B15 = SP

	STW	.D1	A11,*+A13[4]	; f push A11
||	STW	.D2	B11,*+B15[3]	; f push B11

*** BEGIN Benchmark Timing ***
B_START:

* Prolog Begins ****************************************************************
; NOTE(review): the SUB below reads order (A8) and numY (B8) in the same packet
; that issues loads targeting A8 and B8; this appears to rely on the loaded
; data arriving only after the load delay slots -- confirm against the CPU guide.
	LDDW	.D1	*A4++,A9:A8	; load x1:x0 from memory
||	LDDW	.D2	*B4++,B9:B8	; load y1:y0 from memory
||	SUB	.L1X	B8,A8,A2	; f A2 = numY - order

	LDDW	.D1	*A6++,A11:A10	; load bb1:bb0 from memory
||	LDDW	.D2	*B6++,B11:B10	; load aa1:aa0 from memory
||	SUB	.L1	A2,2,A2 	; f ocntr = A2 - 2

	LDDW	.D1	*A4,A13:A12	; load x3:x2 from memory
||	LDW	.D2	*B4,B7		; load y2 from memory

; NOTE(review): A13 is still used as the SP copy here although a load into
; A13:A12 was issued in the previous packet; likewise A10/B10 are pushed after
; loads into them were issued -- presumably the old values are still present
; because of load delay slots; confirm.
	STW	.D1	A10,*+A13[2]	; f push A10
||	STW	.D2	B10,*+B15[1]	; f push B10

	LDDW	.D1	*A4++,A9:A8	; @ load x1:x0 from memory
||	LDDW	.D2	*B4++,B9:B8	; @ load y1:y0 from memory

	LDDW	.D1	*A6++,A11:A10	; @ load bb1:bb0 from memory
||	LDDW	.D2	*B6++,B11:B10	; @ load aa1:aa0 from memory

	LDDW	.D1	*A4,A13:A12	; @ load x3:x2 from memory
||	LDW	.D2	*B4,B7		; @ load y2 from memory
||	MPYSP	.M1	A8,A10,A5	; prod0 = x0 * bb0
||	MPYSP	.M2	B8,B10,B5	; prod1 = y0 * aa0

	MPYSP	.M1	A9,A11,A5	; prod2 = x1 * bb1
||	MPYSP	.M2	B9,B11,B5	; prod3 = y1 * aa1
||	LDW	.D1	*A3,A3		; f bb = *A3 = ptr_b[order]
||	MV	.L2X	A7,B2		; f lcntr = p_order

; From here on the loads are predicated on lcntr (B2), so no loads are issued
; once the counter has reached zero.
   [B2] LDDW	.D1	*A4++,A9:A8	; @@ load x1:x0 from memory
|| [B2] LDDW	.D2	*B4++,B9:B8	; @@ load y1:y0 from memory
||	MPYSP	.M1	A9,A10,A5	; prod4 = x1 * bb0
||	MPYSP	.M2	B9,B10,B5	; prod5 = y1 * aa0

   [B2] LDDW	.D1	*A6++,A11:A10	; @@ load bb1:bb0 from memory
|| [B2] LDDW	.D2	*B6++,B11:B10	; @@ load aa1:aa0 from memory
||	MPYSP	.M1	A12,A11,A5	; prod6 = x2 * bb1
||	MPYSP	.M2	B7,B11,B5	; prod7 = y2 * aa1

   [B2] LDDW	.D1	*A4,A13:A12	; @@ load x3:x2 from memory
|| [B2] LDW	.D2	*B4,B7		; @@ load y2 from memory
||	SUBSP	.L1X	A5,B5,A0	; sum4 = prod0 - prod1
||	MPYSP	.M1	A8,A10,A5	; @ prod0 = x0 * bb0
||	MPYSP	.M2	B8,B10,B5	; @ prod1 = y0 * aa0

	SUBSP	.L1X	A5,B5,A0	; sum5 = prod2 - prod3
||	MPYSP	.M1	A9,A11,A5	; @ prod2 = x1 * bb1
||	MPYSP	.M2	B9,B11,B5	; @ prod3 = y1 * aa1
|| [B2] SUB	.S2	B2,2,B2 	; if (lcntr) lcntr -= 2

; OLOOP is the re-entry point for each outer iteration: it finishes the
; inner-loop prolog, reloads icntr from p_order and clears the B0 accumulator.
OLOOP:
   [B2] LDDW	.D1	*A4++,A9:A8	; @@@ load x1:x0 from memory
|| [B2] LDDW	.D2	*B4++,B9:B8	; @@@ load y1:y0 from memory
||	SUBSP	.L1X	A5,B5,A0	; sum6 = prod4 - prod5
||	MPYSP	.M1	A9,A10,A5	; @ prod4 = x1 * bb0
||	MPYSP	.M2	B9,B10,B5	; @ prod5 = y1 * aa0
||	B	.S2	LOOP		; always branch to LOOP (this B has no predicate)

   [B2] LDDW	.D1	*A6++,A11:A10	; @@@ load bb1:bb0 from memory
|| [B2] LDDW	.D2	*B6++,B11:B10	; @@@ load aa1:aa0 from memory
||	SUBSP	.L1X	A5,B5,A0	; sum7 = prod6 - prod7
||	MPYSP	.M1	A12,A11,A5	; @ prod6 = x2 * bb1
||	MPYSP	.M2	B7,B11,B5	; @ prod7 = y2 * aa1
||	MV	.S1	A7,A1		; icntr = p_order
||	ZERO	.S2	B0		; sum0 = sum1 = sum2 = sum3 = 0
***** Loop Begins **************************************************************
LOOP:
   [B2] LDDW	.D1	*A4,A13:A12	; @@@ load x3:x2 from memory
|| [B2] LDW	.D2	*B4,B7		; @@@ load y2 from memory
||	SUBSP	.L1X	A5,B5,A0	; @ sum4 = prod0 - prod1
||	ADDSP	.L2X	A0,B0,B0	; sum0 += sum4
||	MPYSP	.M1	A8,A10,A5	; @@ prod0 = x0 * bb0
||	MPYSP	.M2	B8,B10,B5	; @@ prod1 = y0 * aa0

	SUBSP	.L1X	A5,B5,A0	; @ sum5 = prod2 - prod3
||	ADDSP	.L2X	A0,B0,B0	; sum1 += sum5
||	MPYSP	.M1	A9,A11,A5	; @@ prod2 = x1 * bb1
||	MPYSP	.M2	B9,B11,B5	; @@ prod3 = y1 * aa1
|| [B2] SUB	.S2	B2,2,B2 	; if (lcntr) lcntr -= 2

   [B2] LDDW	.D1	*A4++,A9:A8	; @@@@ load x1:x0 from memory
|| [B2] LDDW	.D2	*B4++,B9:B8	; @@@@ load y1:y0 from memory
||	SUBSP	.L1X	A5,B5,A0	; @ sum6 = prod4 - prod5
||	ADDSP	.L2X	A0,B0,B0	; sum2 += sum6
||	MPYSP	.M1	A9,A10,A5	; @@ prod4 = x1 * bb0
||	MPYSP	.M2	B9,B10,B5	; @@ prod5 = y1 * aa0
|| [A1] SUB	.S1	A1,2,A1 	; if (icntr) icntr -= 2
|| [A1] B	.S2	LOOP		; if (icntr) branch to LOOP

   [B2] LDDW	.D1	*A6++,A11:A10	; @@@@ load bb1:bb0 from memory
|| [B2] LDDW	.D2	*B6++,B11:B10	; @@@@ load aa1:aa0 from memory
||	SUBSP	.L1X	A5,B5,A0	; @ sum7 = prod6 - prod7
||	ADDSP	.L2X	A0,B0,B0	; sum3 += sum7
||	MPYSP	.M1	A12,A11,A5	; @@ prod6 = x2 * bb1
||	MPYSP	.M2	B7,B11,B5	; @@ prod7 = y2 * aa1
***** Loop Ends ****************************************************************
; Outer-loop tail: combine the partial sums, add the bb (ptr_b[order]) terms,
; store two outputs through A1, and then either rewind the pointers and restart
; the inner-loop prolog ([A2] packets) or restore the saved registers and
; return through B3 ([!A2] packets).
	MV	.S2	B0,B10		; o B10 = sum0
||	MV	.S1X	B4,A1		; o A1 = ptr_y

	ADDSP	.L2	B0,B10,B0	; o B0 = sum0 + sum1
||	MPYSP	.M1	A12,A3,A5	; o A5 = x2 * bb
|| [A2] SUB	.S1X	A4,B1,A4	; o ptr_x -= reset
|| [A2] SUB	.S2	B6,B1,B6	; o ptr_a -= reset

	MV	.S2	B0,B10		; o B10 = sum2
||[!A2] MV	.S1X	B15,A7		; f A7 = B15
||[!A2] LDW	.D2	*+B15[5],A12	; f pop A12

	ADDSP	.L2	B0,B10,B5	; o B5 = sum2 + sum3
||	MPYSP	.M1	A13,A3,A0	; o A0 = x3 * bb
|| [A2] SUB	.S1X	A6,B1,A6	; o ptr_b -= reset
|| [A2] SUB	.S2	B4,B1,B4	; o ptr_y -= reset
||[!A2] LDW	.D2	*+B15[1],B10	; f pop B10
||[!A2] LDW	.D1	*+A7 [2],A10	; f pop A10

   [A2] ADD	.S2	B4,8,B4 	; o ptr_y += 8
|| [A2] ADD	.S1	A4,8,A4 	; o ptr_x += 8

	ADDSP	.L2X	A5,B0,B0	; o B0 += A5
||[!A2] LDW	.D2	*+B15[3],B11	; f pop B11
||[!A2] LDW	.D1	*+A7 [4],A11	; f pop A11

   [A2] MV	.S2X	A7,B2		; o lcntr = p_order
|| [A2] LDDW	.D1	*A4++,A9:A8	; p load x1:x0 from memory
|| [A2] LDDW	.D2	*B4++,B9:B8	; p load y1:y0 from memory

	ADDSP	.L2X	A0,B5,B5	; o B5 += A0
||[!A2] ZERO	.S2	B2		; f if(!A2) lcntr = 0
|| [A2] LDDW	.D1	*A6++,A11:A10	; p load bb1:bb0 from memory
|| [A2] LDDW	.D2	*B6++,B11:B10	; p load aa1:aa0 from memory

   [A2] LDDW	.D1	*A4,A13:A12	; p load x3:x2 from memory
|| [A2] LDW	.D2	*B4,B7		; p load y2 from memory

	STW	.D1	B0,*A1		; o store B0
||	MPYSP	.M2	B11,B0,B0	; o B0 *= aa1

   [A2] LDDW	.D1	*A4++,A9:A8	; p load x1:x0 from memory
|| [A2] LDDW	.D2	*B4++,B9:B8	; p load y1:y0 from memory

   [A2] LDDW	.D1	*A6++,A11:A10	; p load bb1:bb0 from memory
|| [A2] LDDW	.D2	*B6++,B11:B10	; p load aa1:aa0 from memory

   [A2] B	.S1	OLOOP		; o if (ocntr) branch to OLOOP
||[!A2] B	.S2	B3		; f if (!ocntr) return
|| [A2] LDDW	.D1	*A4,A13:A12	; p load x3:x2 from memory
|| [A2] LDW	.D2	*B4,B7		; p load y2 from memory
|| [A2] MPYSP	.M1	A8,A10,A5	; p prod0 = x0 * bb0
|| [A2] MPYSP	.M2	B8,B10,B5	; p prod1 = y0 * aa0

	SUBSP	.L2	B5,B0,B0	; o B0 = B5 - B0
|| [A2] MPYSP	.M1	A9,A11,A5	; p prod2 = x1 * bb1
|| [A2] MPYSP	.M2	B9,B11,B5	; p prod3 = y1 * aa1
||[!A2] LDW	.D2	*++B15[6],A13	; f pop A13

   [B2] LDDW	.D1	*A4++,A9:A8	; p load x1:x0 from memory
|| [B2] LDDW	.D2	*B4++,B9:B8	; p load y1:y0 from memory
|| [A2] MPYSP	.M1	A9,A10,A5	; p prod4 = x1 * bb0
|| [A2] MPYSP	.M2	B9,B10,B5	; p prod5 = y1 * aa0

   [B2] LDDW	.D1	*A6++,A11:A10	; p load bb1:bb0 from memory
|| [B2] LDDW	.D2	*B6++,B11:B10	; p load aa1:aa0 from memory
|| [A2] MPYSP	.M1	A12,A11,A5	; p prod6 = x2 * bb1
|| [A2] MPYSP	.M2	B7,B11,B5	; p prod7 = y2 * aa1

   [B2] LDDW	.D1	*A4,A13:A12	; p load x3:x2 from memory
|| [B2] LDW	.D2	*B4,B7		; p load y2 from memory
|| [A2] SUBSP	.L1X	A5,B5,A0	; p sum4 = prod0 - prod1
|| [A2] MPYSP	.M1	A8,A10,A5	; p prod0 = x0 * bb0
|| [A2] MPYSP	.M2	B8,B10,B5	; p prod1 = y0 * aa0

	STW	.D1	B0,*+A1[1]	; o store B0 at A1[1] (second output of the pair)
|| [A2] SUB	.S1	A2,2,A2 	; o if (ocntr) ocntr -= 2
|| [A2] SUBSP	.L1X	A5,B5,A0	; p sum5 = prod2 - prod3
|| [A2] MPYSP	.M1	A9,A11,A5	; p prod2 = x1 * bb1
|| [A2] MPYSP	.M2	B9,B11,B5	; p prod3 = y1 * aa1
|| [B2] SUB	.S2	B2,2,B2 	; p if (lcntr) lcntr -= 2
* Outer Loop Ends **************************************************************
B_END:
*** END Benchmark Timing ***
