*===============================================================================
*
*	TEXAS INSTRUMENTS, INC.		
*
*	AUTOCORRELATION
*
*	Revision Date: 4/30/98
*	
*	USAGE	
*
*		This routine is C Callable and can be called as:
*		
*		void autocor( float *acl, const float *inp, int M, int N)
*
*		acl[]	---	Resulting array of autocorrelation 
*		inp[]	---	Input array of autocorrelation
*		M	---	Length of autocorrelation  vector  (MULTIPLE of 4)
*		N	---	{ Length of Input array (inp[]) vector  - M  }
*				(MULTIPLE of 2)
*
*		If routine is not to be used as a C callable function then
*		you need to initialize values for all of the values passed
*		as these are assumed to be in registers as defined by the 
*		calling convention of the compiler, (refer to the C compiler
*		reference guide).
*
*		ARGUMENTS PASSED   ->   REGISTER
*		---------------------------------
*		acl                ->   A4
*		inp                ->   B4
*		M                  ->   A6
*		N                  ->   B6
*
*	C CODE
*
*		This is the C equivalent of the assembly code.  Note that
*		the assembly code is hand optimized and restrictions may
*		apply.
*
*		void autocor( float *acl, const float *inp, int M, int N)
*		{
*			int i,k;
*			float sum;
*
*			for (i = 0; i < M; i++)
*			{
*				sum = 0;
*				for (k = M; k < N+M; k++)
*				{
*					sum += inp[k] * inp[k-i];
*				}
*				acl[i] = sum ;
*			}
*		}
*
*	DESCRIPTION
*
*		This routine performs the autocorrelation of the input array inp.
*		It is assumed that the length of the input array, inp, is a
*		multiple of 2 and the length of the output array, acl, is a 		
*		multiple of 4.  The assembly routine performs 4 output samples
*		at a time. 
*		
*	TECHNIQUES
*
*		The inner loop is unrolled twice. The length of 
*		the input array must be a multiple of 2.  The outer
*		loop is unrolled four times so the length of output array must
*		be a multiple of 4.
*
*		The outer loop is conditionally executed in parallel with the
*		inner loop.  This allows for a zero overhead outer loop.
*	
*	ASSUMPTIONS
*
*		N is a multiple of 2 and greater than 4
*		M is a multiple of 4 and greater than 4
*		inp is aligned on even doubleword boundary
*		acl is offset by a word from inp alignment
*		inp is assumed to be padded with M zeros starting from location 0
*		
*	MEMORY NOTE
*
*		No Memory bank hits if inp and acl alignment assumptions apply
*
*	CYCLES
*
*		(N/2)*M + (M/2)*5 + 9
*
*	NOTATIONS
*
*		f = Function Prolog or Epilog
*		o = Outer Loop
*		p = Inner Loop Prolog
*
*================================================================================

	.global	_autocor
	.text

_autocor:

; Register roles, as established by the instructions below:
;   A4   acl store pointer: set to &acl[M], then moved back 16 bytes on
;        every outer-loop pass (outputs are written four at a time)
;   B4   inp read pointer for the lagged operands (advanced 8 bytes at a time)
;   A7   working copy of B4 used by the lagged loads
;   A0   read pointer for inp[k+1]:inp[k], starting at &inp[M]
;   A3   inp at first, then a saved copy of &inp[M] used to reset A0
;   A1   outer loop counter, starts at M - 4 and decreases by 4
;   A2   inner loop counter, starts at N - 4 and decreases by 2
;   B2   load counter inside the kernel; reused as the store pointer in
;        the outer-loop epilog, then reloaded as the load counter
;   A6   M on entry; reused as the A-side accumulator once M is consumed
;   B10  B-side accumulator (pushed on entry, popped before return)
;   B6   N on entry; holds N - 4 after the prolog
;   A5/B5            product registers feeding the accumulators
;   A9:A8, B1:B0, B7, B9:B8   loaded input operands
;
; A line starting with || issues in parallel with the instruction above it.

; BEGIN BENCHMARK TIMING

	STW	.D2	B10,*B15--(4)	; f push B10
||	MV	.L1X	B4,A3		; f temp = inp
||	ADDAW	.D1	A4,A6,A4	; f acl = &acl[M]
||	ADD	.L2	B4,8,B4		; f inp += 8

;*-----------------------------------------------------------------------------*
; Inner-loop prolog for the first outer pass: start the loads and the first
; multiplies, and set up the loop counters, before entering LOOP1.

	LDDW	.D2	*B4++,B1:B0	; p @ (k3:k2) inp[k-(i-3)]:inp[k-(i-2)]
||	ADDAW	.D1	A3,A6,A0	; f inp0 = &inp[M]

	LDDW	.D1	*A0++,A9:A8	; p @ (kk1:kk0) inp[k+1]:inp[k]
||	MV	.L1X	B4,A7		; f inp1 = inp
||	MV	.S1	A0,A3		; f temp = inp0

	LDW     .D1T2	*-A7(12),B7	; p @ (k1) inp[k-(i-1)] (or *-inp1[3] )
||	SUB	.S1	A6,4,A1		; f (outer loop counter) cntr1 = M - 4

	LDDW	.D1T2	*A7,B9:B8	; p @ (k5:k4) inp[k-(i-5)]:inp[k-(i-4)]

	LDDW	.D1T2	*A7++,B1:B0	; p @@ (k3:k2) inp[k-(i-3)]:inp[k-(i-2)]

	LDDW	.D1	*A0++,A9:A8	; p @@ (kk1:kk0) inp[k+1]:inp[k]
||	SUB	.L1X	B6,4,A2		; p (inner loop counter) cntr2 = N - 4 
||	SUB	.L2	B6,4,B6		; f N = N - 4

	LDW	.D1T2	*-A7(12),B7	; p @@ (k1) inp[k-(i-1)] (or *-inp1[3] )
||	MPYSP	.M1X	A8,B0,A5	; p prod2 = inp[k]*inp[k-(i-2)]
||	MPYSP	.M2X	A9,B1,B5	; p prod6 = inp[k+1]*inp[k-(i-3)]

	LDDW	.D1T2	*A7,B9:B8	; p @@ (k5:k4) inp[k-(i-5)]:inp[k-(i-4)]
||	MPYSP	.M1X	A8,B7,A5	; p prod1 = inp[k]*inp[k-(i-1)]
||	MPYSP	.M2X	A9,B0,B5	; p prod5 = inp[k+1]*inp[k-(i-2)]
||	SHRU	.S2	B6,1,B2		; p (load counter) lcntr = (N - 4)/2


LOOP1:		; OUTER LOOP
; Entry point for each group of four outputs. These two packets finish the
; inner-loop prolog, clear both accumulators and advance inp by 8 bytes.
; From here on A6 is an accumulator; M was already consumed in the prolog.

  [B2]	LDDW	.D1T2	*A7++,B1:B0	; @@@ (k3:k2) inp[k-(i-3)]:inp[k-(i-2)]
||	MPYSP	.M1X	A8,B1,A5	; prod3 = inp[k]*inp[k-(i-3)]
||	MPYSP	.M2X	A9,B8,B5	; prod7 = inp[k+1]*inp[k-(i-4)]
||	B	.S2	LOOP2		; Branch to inner loop


  [B2]	LDDW	.D1	*A0++,A9:A8	; @@@ (kk1:kk0) inp[k+1]:inp[k]
||	MPYSP	.M1X	A8,B8,A5	; prod4 = inp[k]*inp[k-(i-4)]
||	MPYSP	.M2X	A9,B9,B5	; prod8 = inp[k+1]*inp[k-(i-5)]
||	ZERO	.L1	A6		; sum1 = sum2 = sum3 = sum4 = 0 
||	ZERO	.L2	B10		; sum5 = sum6 = sum7 = sum8 = 0
||	ADD	.D2	8,B4,B4		; inp = inp + 8

;*-----------------------------------------------------------------*

LOOP2:        ; KERNEL
; Four-packet kernel. Loads are predicated on the load counter B2, and the
; loop-back branch and its counter update are predicated on A2.

  [B2]	LDW	.D1T2	*-A7(12),B7	; @@@ (k1) inp[k-(i-1)] (or *-inp1[3] )
||	MPYSP	.M1X	A8,B0,A5	; prod2 = inp[k]*inp[k-(i-2)]
||	MPYSP	.M2X	A9,B1,B5	; prod6 = inp[k+1]*inp[k-(i-3)]
||	ADDSP	.L1	A6,A5,A6	; sum2 = sum2 + prod2
||	ADDSP	.L2	B10,B5,B10	; sum6 = sum6 + prod6

  [B2]	LDDW	.D1T2	*A7,B9:B8	; @@@ (k5:k4) inp[k-(i-5)]:inp[k-(i-4)]
||	MPYSP	.M1X	A8,B7,A5	; prod1 = inp[k]*inp[k-(i-1)]
||	MPYSP	.M2X	A9,B0,B5	; prod5 = inp[k+1]*inp[k-(i-2)]
||	ADDSP	.L1	A6,A5,A6	; sum1 = sum1 + prod1
||	ADDSP	.L2	B10,B5,B10	; sum5 = sum5 + prod5
||[B2]	SUB	.S2	B2,1,B2		; lcntr = lcntr - 1


  [B2]	LDDW	.D1T2	*A7++,B1:B0	; @@@@ (k3:k2) inp[k-(i-3)]:inp[k-(i-2)]
||	MPYSP	.M1X	A8,B1,A5	; prod3 = inp[k]*inp[k-(i-3)]
||	MPYSP	.M2X	A9,B8,B5	; prod7 = inp[k+1]*inp[k-(i-4)]
||	ADDSP	.L1	A6,A5,A6	; sum3 = sum3 + prod3
||	ADDSP	.L2	B10,B5,B10	; sum7 = sum7 + prod7
||[A2]	SUB	.S1	A2,2,A2		; cntr2 = cntr2 - 2
||[A2]	B	.S2	LOOP2

  [B2]	LDDW	.D1	*A0++,A9:A8	; @@@@ (kk1:kk0) inp[k+1]:inp[k]
||	MPYSP	.M1X	A8,B8,A5	; prod4 = inp[k]*inp[k-(i-4)]
||	MPYSP	.M2X	A9,B9,B5	; prod8 = inp[k+1]*inp[k-(i-5)]
||	ADDSP	.L1	A6,A5,A6	; sum4 = sum4 + prod4
||	ADDSP	.L2	B10,B5,B10	; sum8 = sum8 + prod8

;*-----------------------------------------------------------------*
; Outer-loop epilog: combine the A-side and B-side partial sums and store
; four results. Every [A1] instruction is the inner-loop prolog for the next
; outer pass; it runs in parallel and is skipped once A1 reaches zero.

	ADDSP	.L1	A6,B10,A6	; o sum2 = sum2 + sum6
||	MV	.S1	A3,A0		; p inp0 = temp ( = &inp[M])
||[A1]	LDDW	.D2	*B4++,B1:B0	; p @ (k3:k2) inp[k-(i-3)]:inp[k-(i-2)]

	ADDSP	.L2X	A6,B10,B10	; o sum5 = sum1 + sum5
||[A1]	LDDW	.D1	*A0++,A9:A8	; p @ (kk1:kk0) inp[k+1]:inp[k]
||[A1]	MV	.S1X	B4,A7		; p inp1 = inp

	ADDSP	.L1	A6,B10,A6	; o sum3 = sum3 + sum7
||[A1]	B	.S2	LOOP1		; o Branch to outer loop
||[A1]	LDW	.D1T2	*-A7(12),B7	; p @ (k1) inp[k-(i-1)] (or *-inp1[3] )

	ADDSP	.L1	A6,B10,A6	; o sum4 = sum4 + sum8
||	MV	.S2X	A4,B2		; o B2 = acl (B2 reused as the store pointer)
||	SUB	.S1	A4,16,A4	; o acl = acl - 16
||[A1]	LDDW	.D1T2	*A7,B9:B8	; p @ (k5:k4) inp[k-(i-5)]:inp[k-(i-4)]

;*

	STW	.D2T1	A6,*--B2[2]	; o acl[i-2] = sum2
||[A1]	LDDW	.D1T2	*A7++,B1:B0	; p @@ (k3:k2) inp[k-(i-3)]:inp[k-(i-2)]

	STW	.D2	B10,*++B2[1]	; o acl[i-1] = sum5
||[A1]	LDDW	.D1	*A0++,A9:A8	; p @@ (kk1:kk0) inp[k+1]:inp[k]
||[A1]	MV	.S1X	B6,A2		; p cntr2 = N - 4 (B6 was reduced by 4 in the function prolog)

	STW	.D2T1	A6,*--B2[2]	; o acl[i-3] = sum3
||[A1]	LDW	.D1T2	*-A7(12),B7   	; p @@ (k1) inp[k-(i-1)] (or *-inp1[3] )
||[A1]	MPYSP	.M1X	A8,B0,A5	; p prod2 = inp[k]*inp[k-(i-2)]
||[A1]	MPYSP	.M2X	A9,B1,B5	; p prod6 = inp[k+1]*inp[k-(i-3)]

	STW	.D2T1	A6,*-B2[1]	; o acl[i-4] = sum4
||[A1]	SUB	.L1	A1,4,A1		; o cntr1 = cntr1 - 4
||[A1]	LDDW	.D1T2	*A7,B9:B8	; p @@ (k5:k4) inp[k-(i-5)]:inp[k-(i-4)]
||[A1]	MPYSP	.M1X	A8,B7,A5	; p prod1 = inp[k]*inp[k-(i-1)]
||[A1]	MPYSP	.M2X	A9,B0,B5	; p prod5 = inp[k+1]*inp[k-(i-2)]
||[A1]	SHRU	.S2	B6,1,B2		; p (load counter) lcntr = (N - 4)/2

;**
; BRANCH TO OUTER LOOP OCCURS
; END OF BENCHMARK TIMING
;*------------------------------------------------------------------*
; Return to the caller through B3. The LDW that pops B10 and the NOP 4
; sit in the branch delay slots, so B10 is restored before control
; leaves this routine.

	B	.S2	B3
	LDW	.D2	*++B15(4),B10	; pop B10
	NOP		4

; BRANCH TO CALLING FUNCTION OCCURS