*===============================================================================
*
*	TEXAS INSTRUMENTS, INC.		
*
*	AUTOCORRELATION
*
*	Revision Date:  04/16/97
*	
*	USAGE	This routine is C Callable and can be called as:
*		
*		void autcor(short ac[], short sd[], int N, int M)
*
*		ac[] --- Resulting array of autocorrelation 
*               sd[] --- Input array of autocorrelation
*		N    --- Length of Input array vector (sd[]) - M (MULTIPLE of 8)
*		M    --- Length of autocorrelation (MULTIPLE of 2)
*
*		If routine is not to be used as a C callable function then
*		you need to initialize values for all of the values passed
*		as these are assumed to be in registers as defined by the 
*		calling convention of the compiler, (refer to the C compiler
*		reference guide).
*
*	C CODE
*		This is the C equivalent of the assembly code.  Note that
*		the assembly code is hand optimized and restrictions may
*		apply.
*
*		void autcor(short ac[],short sd[], int N, int M)
*		{
*		int i,k,sum;
*
*		for (i = 0; i < M; i++){
*		sum = 0;
*			for (k = M; k < N+M; k++)
*				sum += sd[k] * sd[k-i];
*			ac[i] = (sum >> 15);
*			}
*		}
*
*
*	DESCRIPTION
*
*		This routine performs the autocorrelation of the input array sd.
*		It is assumed that the length of the input array, sd, is a
*		multiple of 8 and the length of the output array, ac, is a
*		multiple of 2.  The assembly routine performs 2 output samples
*		at a time.  This is typically used in vselp code.
*		
*	TECHNIQUES
*
*		The inner loop is unrolled eight times thus the length of 
*		the input array must be a multiple of eight.  The outer
*		loop is unrolled twice so the length of output array must
*		be a multiple of 2.
*
*		The outer loop is conditionally executed in parallel with the
*		inner loop.  This allows for a zero overhead outer loop.
*	
*	ASSUMPTIONS
*
*         	N is a multiple of 8 
*		M is a multiple of 2
*		sd[0] is on a word boundary
*		
*	MEMORY NOTE
*
*		One memory hit occurs every fourth outer loop cycle (2Nth inner
*		loop cycle) or M/4 times.
*
*	CYCLES
*
*		(N/2)*M + 16 + M/4
*
*================================================================================

********* ASSEMBLY CODE: *******************
	.global _autcor
	.text
*-------------------------------------------------------------------------------
* _autcor -- autocorrelation, two output lags per pass of the inner loop.
*
* Inputs (as used by the code below):
*	A4 = ac   (output array)          B4 = sd  (input array)
*	A6 = N    (inner length)          B6 = M   (number of lags)
*	B3 = return address
*
* Register save/restore:
*	A10-A15 and B10-B14 are stored to the stack on entry and reloaded
*	before returning.  A register must be stored no later than the execute
*	packet that overwrites it (inside one packet the store still reads the
*	old value; one packet later it reads the new one).  A14 is overwritten
*	by the SHL at B_START and B14 by the ADD in the following packet, so
*	both are stored in the B_START packet; A13 and B13 use the later slots.
*
* Stack slots relative to B15 on entry (SP0), in bytes:
*	B10 @ SP0      A10 @ SP0-4     B11 @ SP0-8     A11 @ SP0-12
*	B12 @ SP0-16   A12 @ SP0-20    B14 @ SP0-24    A14 @ SP0-28
*	B13 @ SP0-32   A13 @ SP0-36    A15 @ SP0-44
*
* NOTE(review): during the loop B15 = SP0-32, so the A13 and A15 slots lie
* below B15.  Presumably the caller runs this with interrupts disabled;
* confirm before using it where an interrupt could push onto the stack.
*-------------------------------------------------------------------------------
_autcor:

	SUB	.L1x	B15,4,A9	; A9 = second stack pointer for A-side pushes

	STW	.D1	A10,*A9--[2]	; push A10 on stack 
||	STW	.D2	B10,*B15--[2]	; push B10 on stack

	STW	.D1	A11,*A9--[2]	; push A11 on stack
||	STW	.D2	B11,*B15--[2]	; push B11 on stack

	STW	.D1	A12,*A9--[2]	; push A12 on stack
||	STW	.D2	B12,*B15--[2]	; push B12 on stack

*** BEGIN Benchmark Timing ***
B_START:
	SHL	.S1	A6,1,A14	; for inner lp setup; N Hwords
||	MPY	.M2X	A6,B6,B0	; N*M
||	STW	.D1	A14,*A9--[2]	; push A14 (old value; SHL in this packet overwrites it)
||	STW	.D2	B14,*B15--[2]	; push B14 (must precede the ADD to B14 below)

	ADD	.L2X	2,A4,B14	; a[i+1] other reg file
||	ADDAH	.D2	B4,B6,B3	; sd[k] & sd[k+1]
||	MV	.L1X	B3,A13		; store return pointer
||	STW	.D1	A13,*A9--[2]	; push A13 (old value; MV in this packet overwrites it)

	MV	.L1X	B3,A3		; sd[k] & sd[k+1]
||	SUB	.L2	B3,4,B4		; sd[k-i-2] & sd[k-i-1]
||	SHR	.S1	A6,3,A0		; for inner lp setup; N/8
||	STW	.D1	A15,*A9		; push A15 on stack
||	STW	.D2	B13,*B15	; push B13 on stack (B13 not yet written)

	LDW	.D1	*A3++[2],A5	; sd[k] & sd[k+1]
||	MV	.L1	A3,A15		; sd[k-i] & sd[k-i+1]
||	MV	.S1	A0,A1		; Set inner loop count

	LDW	.D2	*B4++[2],B5	; sd[k-i-2]& sd[k-i-1]
||	LDW	.D1	*A15++[2],A6	; sd[k-i]& sd[k-i+1]
||	SHL	.S2X	A6,1,B1		; for inner lp setup; N Hwords
||	MPY	.M1	A8,0,A8		; initialize to zero
||	MPY	.M2	B9,0,B9		; initialize to zero
||	MVK	.S1	1,A2		; initialize count
||	ZERO	.L2	B2		; initialize to zero
||	ADD	.L1	A0,-1,A0	;

	LDW	.D2	*++B3[1],B6	; sd[k+2] & sd[k+3]
||	MVK	.S1	1,A2		; initialize priming cnt
||	MPY	.M1	A7,0,A7		; initialize to zero
||	MPY	.M2	B11,0,B11	; initialize to zero
||	ZERO	.D1	A5		; initialize to zero
||	ZERO	.L2	B6		; initialize to zero
||	SUB	.L1X	0,B10,A11	; initialize to zero (1st cycle 1st)
||	SUB	.S2	0,B8,B12	; initialize to zero (1st cycle 1st)

LOOP1:
	ADD	.L1X	B10,A11,A11	; tmp_aca2 +s= p2a2
|| [A2] ADD	.L2	B8,B12,B8	; tmp_acb0 +s= p2b0
||	MPY	.M1	A5,A6,A9	; p2a4 = sd[k+4]*sd[k-i+4]
||	MPYLH	.M2X	A5,B7,B8	; p2b4 = sd[k+4]*sd[k-i+3]
||	LDW	.D2	*B4++[2],B7	;*sd[k-i+2]&sd[k-i+3]
||[!A2] SHR	.S2	B12,15,B13	; tmp_acb >> 15 (matches C: sum >> 15)

	ADD	.L1X	B11,A11,A11	; tmp_aca3 +s= p2a3
||	ADD	.L2X	A7,B8,B12	; tmp_acb1 +s= p2b1
||	MPY	.M2	B6,B7,B10	; p2a6 = sd[k+6]*sd[k-i+6]
||	MPYHL	.M1	A5,A6,A7	; p2b5 = sd[k+5]*sd[k-i+4]
||	LDW	.D1	*A3++[2],A5	;* sd[k+4] & sd[k+5]
||[!A1] ADD	.S1	A0,1,A1		; reset inner lp cntr
||[!B2]	SHR	.S2	B0,4,B0		; M*N/16
||[!B2]	ADDAW	.D2	B3,2,B3		; sd[k+2] & sd[k+3]

	ADD	.L2X	A8,B12,B12	; tmp_acb2 +s= p2b2
||	ADD	.L1	A9,A11,A9	; tmp_aca4 +s= p2a4
||	MPYH	.M2	B6,B7,B11	; p2a7 = sd[k+7]*sd[k-i+7]
||	MPYLH	.M1X	B6,A6,A8	; p2b6 = sd[k+6]*sd[k-i+5]
||[!A2] STH	.D2	B13,*B14++[2]	; ac[i+1] = (tmp_acb >> 15)
||	LDW	.D1	*A15++[2],A6	;*sd[k-i+4]& sd[k-i+5]
|| [B0] B	.S2	LOOP1		; Branch inner most loop
|| [A1] ADD	.S1	-1,A1,A1	; dec lp cntr

	ADD	.L2	B9,B12,B12	; tmp_acb3 +s= p2b3
|| [B2]	ADD	.L1	A10,A9,A11	; tmp_aca5 +s= p2a5
||	MPYHL	.M2	B6,B7,B9	; p2b7 = sd[k+7]*sd[k-i+6]
||	MPYH	.M1	A5,A6,A10	;* p2a1 = sd[k+1]*sd[k-i+1]
||	LDW	.D2	*B3++[2],B6	;* sd[k+6] & sd[k+7]
|| [B2]	SUB	.S1	A1,A0,A2	; dec lp cntr
||[!A2] STH	.D1	A12,*A4++[2]	; ac[i] = (tmp_aca >> 15)

	ADD	.L1X	B10,A11,A11	; tmp_aca6 +s= p2a6
||	ADD	.L2	B8,B12,B8	; tmp_acb4 +s= p2b4
||	MPY	.M1	A5,A6,A9	;* p2a0 = sd[k]*sd[k-i]
||	MPYLH	.M2X	A5,B5,B8	;* p2b0 = sd[k]*sd[k-i-1]
||	LDW	.D2	*B4,B7		;*sd[k-i+6]&sd[k-i+7]
||[!A1] SUB	.S2	B4,B1,B4	; reset ptr
||[!A1] SUB	.S1	A15,A14,A15	; reset ptr
||[!A1] SUB	.D1	A3,A14,A3	; reset ptr

	ADD	.L1X	B11,A11,A11	; tmp_aca7 +s= p2a7
||	ADD	.L2X	A7,B8,B12	; tmp_acb5 +s= p2b5
||	MPY	.M2	B6,B7,B10	;* p2a2 = sd[k+2]*sd[k-i+2]
||	MPYHL	.M1	A5,A6,A7	;* p2b1 = sd[k+1]*sd[k-i]
||	LDW	.D1	*A3++[2],A5	;** sd[k] & sd[k+1]
||[!A1] SUB	.D2	B3,B1,B3	; reset ptr
||[!A1] SUB	.S2	B4,4,B4		; reset ptr
||[!A1] SUB	.S1	A15,4,A15	; reset ptr

	ADD	.L2X	A8,B12,B12	; tmp_acb6 +s= p2b6
|| [A2] ADD	.L1	A9,A11,A9	;* tmp_aca0 +s= p2a0
||	MPYH	.M2	B6,B7,B11	;* p2a3 = sd[k+3]*sd[k-i+3]
||	MPYLH	.M1X	B6,A6,A8	;* p2b2 = sd[k+2]*sd[k-i+1]
||	LDW	.D2	*B4++[2],B5	;**sd[k-i-2]&sd[k-i-1]
||	LDW	.D1	*A15++[2],A6	;**sd[k-i]& sd[k-i+1]
||[!A2] SHR	.S1	A11,15,A12	; tmp_aca >> 15 (matches C: sum >> 15)
||[!B2] MVK	.S2	1,B2		;

	ADD	.L2	B9,B12,B12	; tmp_acb7 +s= p2b7
||	ADD	.L1	A10,A9,A11	;* tmp_aca1 +s= p2a1
||	MPYHL	.M2	B6,B7,B9	;* p2b3 = sd[k+3]*sd[k-i+2]
||	MPYH	.M1	A5,A6,A10	;* p2a5 = sd[k+5]*sd[k-i+5]
||	LDW	.D2	*B3++[2],B6	;** sd[k+2] & sd[k+3]
|| [B0] ADD	.S2	-1,B0,B0	; dec outer lp cntr

; LOOP1 ENDS HERE

	SHR	.S2	B12,15,B13	; final tmp_acb >> 15

	STH	.D1	A12,*A4		; last ac[i]   = (tmp_aca >> 15)
||	STH	.D2	B13,*B14	; last ac[i+1] = (tmp_acb >> 15)
||	SUB	.L1x	B15,12,A4	; A4 = address of the A15 slot (lowest A-side push)

B_END:
*** END Benchmark Timing ***

	LDW	.D1	*A4++[2], A15	; pop A15 off stack
||	LDW	.D2	*B15++[2], B13	; pop B13 off stack

	LDW	.D1	*A4++[2], A13	; pop A13 (lands after the MV below has read A13)
||	LDW	.D2	*B15++[2], B14	; pop B14 off stack

	LDW	.D1	*A4++[2], A14	; pop A14 off stack
||	LDW	.D2	*B15++[2], B12	; pop B12 off stack
||	MV	.L2x	A13,B3		; get return pointer (A13 still holds it here)

	LDW	.D1	*A4++[2], A12	; pop A12 off stack
||	LDW	.D2	*B15++[2], B11	; pop B11 off stack

	LDW	.D1	*A4++[2], A11	; pop A11 off stack
||	LDW	.D2	*B15, B10	; pop B10 off stack
||	B	.S2	B3		; Return from call

	LDW	.D1	*A4,A10		; pop A10

	NOP		4		; fill remaining branch delay slots

