*===============================================================================
*
*	TEXAS INSTRUMENTS, INC.
*
*	TI retains all rights, title and interest in this code and only
*	authorizes the use of this code on TI TMS320 DSPs manufactured by TI.
*
*	Linear Time, Small Lookup Table: Bit Reversal
*
*	Revision Date: 2/26/98
*
*	USAGE	This routine is C Callable and can be called as:
*
*		void bitrev(int *x, short *index, int n);
*		
*		x	= Input Array to be Bit-Reversed
*		n	= Number of points in array (must be a power of 2)
*		index	= Array of ~sqrt(n) created by the routine
*			  digitrev_index
*			  found below to allow the fast implementation of the
*			  bit-reversal
*
*		If routine is not to be used as a C callable function
*		then all instructions relating to stack should be removed.
*		Refer to comments of individual instructions.  You will also
*		need to initialize values for all of the values passed as these
*		are assumed to be in registers as defined by the calling 
*		convention of the compiler, (refer to the C compiler reference
*		guide).
*
*	C Code 	This is the C equivalent of the Assembly Code without 
*		restrictions.  Note that the assembly code is hand optimized and
*		restrictions may apply
*
*	TI retains all rights, title and interest in this code and only
*	authorizes the use of this code on TI TMS320 DSPs manufactured by TI.
*
*	void bitrev(int *x, short *index, int n){
*	
*		int		i;
*		short		i0, i1, i2, i3;
*		short		j0, j1, j2, j3;
*		int		xi0, xi1, xi2, xi3;
*		int		xj0, xj1, xj2, xj3;
*		short		t;
*		int		a, b, ia, ib, ibs;
*		int		mask;
*		int		nbits, nbot, ntop, ndiff, n2, halfn;
*		short	*xs	= (short *) x;
*	
*		nbits = 0;
*		i = n;	
*		while (i > 1){
*			i = i >> 1;
*			nbits++;}
*	
*		nbot	= nbits >> 1;
*		ndiff	= nbits & 1;
*		ntop	= nbot + ndiff;
*		n2		= 1 << ntop;
*		mask	= n2 - 1;
*		halfn	= n >> 1;
*		
*		for	(i0 = 0; i0 < halfn; i0 += 2) {
*			b	= i0 & mask;
*			a	= i0 >> nbot;
*			if (!b) ia 	= index[a];
*			ib	= index[b];
*			ibs	= ib << nbot;
*			
*			j0	= ibs + ia;
*			t	= i0 < j0;
*			xi0	= x[i0];
*			xj0	= x[j0];
*	
*			if (t){x[i0] = xj0;
*				x[j0] = xi0;}
*		
*			i1	= i0 + 1;
*			j1	= j0 + halfn;
*			xi1	= x[i1];
*			xj1	= x[j1];
*			x[i1] = xj1;
*			x[j1] = xi1;
*			
*			i3	= i1 + halfn;
*			j3	= j1 + 1;
*			xi3	= x[i3];
*			xj3	= x[j3];
*			if (t){x[i3] = xj3;
*				x[j3] = xi3;}
*		}
*	}
*	
*	DESCRIPTION
*		This routine performs the bit-reversal of the input array x[],
*		where x[] is an array of n complex pairs of 16-bit data.
*		This requires the index array generated by the digitrev_index
*		routine below.
*		This index should be generated at compile time, not by the DSP.
*
*	ASSUMPTIONS
*		n is a power of 2
*
*	NOTE: If n <= 4K one can use the char (8-bit) data type for
*		the "index" variable. This would require changing the LDH when
*		loading index values in the assembly routine to LDB. This would
*		further reduce the size of the Index Table by half its size.
*
*	CYCLES	(n/4 + 2)*7 + 14;
*
*************************************************
* Use This Routine To Generate the Index Table for
* Bit/Digit Reversing of Radix-2 and Radix-4 Routines
*************************************************
* This routine calculates the index for digitrev of
* length n (length of index is 2^(radix*ceil(k/radix)) where n = 2^k);
* in other words:
* Either:sqrt(n) when n=2^even# Or: sqrt(2)*sqrt(n) when n=2^odd# [radix 2]
*	 sqrt(n) when n=4^even# Or: sqrt(4)*sqrt(n) when n=4^odd# [radix 4]
* Note: the variable "radix" is 2 for radix-2 and 4 for radix-4
*
*************************************************
*	 
*	void digitrev_index(short *index, int n, int radix){
*	
*		int		i,j,k;
*		short	nbits, nbot, ntop, ndiff, n2, raddiv2; 
*	
*		nbits = 0;
*		i = n;	
*		while (i > 1){
*			i = i >> 1;
*			nbits++;
*		}
*	
*		raddiv2	= radix >> 1;
*		nbot	= nbits >> raddiv2;
*		nbot	= nbot << (raddiv2 - 1);
*		ndiff	= nbits & raddiv2;
*		ntop	= nbot + ndiff;
*		n2		= 1 << ntop;
*	
*		index[0] = 0;
*		for ( i = 1, j = n2/radix + 1; i < n2 - 1; i++){
*			index[i] = j - 1;
*			for (k = n2/radix; k*(radix-1) < j; k /= radix)
*					j -= k*(radix-1);
*			j += k;
*		}
*		index[n2 - 1] = n2 - 1;
*	}
*
*************************************************

	.global _bitrev
	.text

*-------------------------------------------------------------------------------
* _bitrev -- in-place bit reversal, C prototype:
*		void bitrev(int *x, short *index, int n)
*
* Registers read on entry by the code below:
*	A4  = x		(also copied to B8 so both data paths can address x)
*	B4  = index	(also copied to A5 so both data paths can address index)
*	A6  = n		(only read in the prologue; reused for j1 inside the loop)
*	B3  = return address
*	B15 = stack pointer (SP)
*
* Long-lived registers set up by the prologue:
*	A0  = nbot	B13 = mask	B6 = halfn	A3 = i0
*	B2  = loop counter (n/4 + 2)	B1 = priming counter (1)
*
* Saved and restored: A10, A11, A12, A15, B10, B11, B12, B13.
* A15 is used as a second stack pointer so that one register can be pushed
* through .D1 and one through .D2 in the same cycle.  Save-area layout, as
* byte offsets from the entry value of B15:
*	  0: A15    -4: B10    -8: A10   -12: B11
*	-16: A11   -20: B12   -24: A12   -28: B13
* B15 is SP-32 while the loop runs and is back at its entry value on return.
*
* Comment markers inside the loop: instructions tagged ";*" or ";**" are
* pipelined ahead of the untagged store instructions.
* NOTE(review): from the load/multiply latencies it looks like ";*" works on
* the element pair whose stores are issued in the next pass and ";**" on the
* pair after that -- confirm before rescheduling anything.
*-------------------------------------------------------------------------------
_bitrev:

		LMBD	.L1	1,	A6,	A1	; leftzeros = lmbd(1, n)
||		MV	.L2X	A4,	B8		; copy x
||		MVK	.S2	31,	B0		; constant 31
||		STW	.D2	A15,	*B15--		; push A15
||		SUB	.S1X	B15,	8,	A15	; second stack pointer = SP - 8

		SUB	.L1X	B0,	A1,	A8	; nbits = 31 - leftzeros
||		SHR	.S2X	A6,	1,	B6	; halfn = n >> 1
||		ZERO	.S1	A3			; i0 = 0
||		STW	.D1	A10,	*A15--[2]	; push A10
||		STW	.D2	B10,	*B15--[2]	; push B10

		SHR	.S1	A8,	1,	A0	; nbot = nbits >> 1
||		AND	.L1	A8,	1,	A11	; ndiff = nbits & 1
||		SHR	.S2	B6,	1,	B5	; n/4 (= halfn >> 1)
||		STW	.D1	A11,	*A15--[2]	; push A11 (caller's value)
||		STW	.D2	B11,	*B15--[2]	; push B11

		ADD	.D1	A0,	A11,	A11	; ntop = nbot + ndiff
||		MVK	.S1	1,	A2		; constant 1
||		ADD	.L2	2,	B5,	B2	; loop count = n/4 + 2
||		MVK	.S2	1,	B1		; setup priming count
||		MV	.L1X	B4,	A5		; copy index

		SHL	.S1	A2,	A11,	A1 	; n2 = 1 << ntop
||		ZERO	.L1	A10			; zero A10
||		STW	.D1	A12,	*A15		; push A12
||		STW	.D2	B12,	*B15--[2]	; push B12

		SUB	.L2X	A1,	1,	B13	; mask = n2 - 1
||		ZERO	.L1	A1			; prevent stores on first iteration
||		STW	.D2	B13,	*B15--		; push B13 (caller's value)

; Pipeline priming: index computation and first loads for i0 = 0.
		SHR	.S1	A3,	A0,	A11	;** a = i0 >> nbot
||		AND	.L2X	A3,	B13,	B0	;** b = i0 & mask

		LDH	.D2	*B4[B0],	B0	;** ib = index[b]
||		ADD	.L2X	A3,	1,	B5	;** i1 = i0 + 1

		ADD		B5,	B6,	B7	;** i3 = i1 + halfn

		LDW	.D2	*B8[B7],	B9	;** xi3 = x[i3]
||		ZERO	.D1	A12			; ia = 0 for the first pass

; Loop kernel: 7 execute packets per pass.  The branch in the second packet
; takes effect after the seventh, so the kernel repeats back-to-back.
; A1/A2 carry t for the conditional swaps; B1 suppresses the unconditional
; x[i1]/x[j1] stores on the first pass, before any data has been loaded.
LOOP:
	[A1]	STW	.D2	B9,	*B8[B0]		; if (t) x[j3] = xi3
||	[B2]	SUB		B2,	1,	B2	; decrement loop counter
||		MPY	.M1	A1,	1,	A2	; copy t
||		LDW	.D1	*A4[A3],	A11	;* xi0 = x[i0]

	[A1]	STW	.D1	A11,	*A4[A10]	; if (t) x[j0] = xi0
||	[B2]	B	.S2	LOOP			; for loop
||		SHL	.S1X	B0,	A0,	A10	;* ibs = ib << nbot
||		ADD		A3,	2,	A3	;* ai0 += 2
||		MPY	.M2	B5,	1,	B10	;* copy ai1
||		LDW	.D2	*B8[B5],	B11	;* xi1 = x[i1]
||		MPY	.M1	A3,	1,	A9	;* copy ai0

	[!B1]	STW	.D2	A11,	*B8[B10]	; x[i1] = xj1
||	[!B1]	STW	.D1	B11,	*A4[A6]		; x[j1] = xi1
||		ADD		A10,	A12,	A10	;* j0 = ibs + ia
||		SHR	.S1	A3,	A0,	A11	;** a = i0 >> nbot
||		AND	.L2X	A3,	B13,	B0	;** b = i0 & mask

		ADD	.L1X	A10,	B6,	A6	;* j1 = j0 + halfn
||		MPY	.M2	B7,	1,	B12	;* copy ai3
||	[B1]	SUB		B1,	1,	B1	; decrement priming counter
||		LDH	.D2	*B4[B0],	B0	;** ib = index[b]
||		ADD	.L2X	A3,	1,	B5	;** i1 = i0 + 1
||	[!B0]	LDH	.D1	*A5[A11],	A12	;** if (!b) ia = index[a]

; NOTE(review): with a priming count of 1, B1 has already been decremented to
; zero by the packet above on the first pass, so the [B1] MPY below does not
; appear to execute; A1 is instead held at zero by the prologue ZERO and by
; the first CMPLT (0 < 0).  Confirm before changing the priming count.
	[A1]	STW	.D2	B0,	*B8[B12]	; if (t) x[i3] = xj3
||		ADD	.L2X	A6,	1,	B0	;* j3 = j1 + 1
||	[!B1]	CMPLT	.L1	A9,	A10,	A1	;* t = i0 < j0
||		LDW	.D1	*A4[A6],	A11	;* xj1 = x[j1]
||	[B1]	MPY	.M1	A4,	0,	A1	; prime conditional store
||		ADD		B5,	B6,	B7	;** i3 = i1 + halfn

		LDW	.D1	*A4[A10],	A7	;* xj0 = x[j0]
||		LDW	.D2	*B8[B7],	B9	;** xi3 = x[i3]

	[A2]	STW	.D1	A7,	*A4[A8]		; if (t) x[i0] = xj0
||		LDW	.D2	*B8[B0],	B0	;* xj3 = x[j3]
||		MPY	.M1	A9,	1,	A8	;* copy ai0 again

; Epilogue: restore saved registers in reverse order and return.  The final
; loads complete during the NOP 4, which also fills the remaining delay
; slots of the branch to B3.
END_LOOP:
		LDW	.D1	*A15,	A12		; pop A12
||		LDW	.D2	*++B15,	B13		; pop B13

		LDW	.D1	*++A15[2],	A11	; pop A11
||		LDW	.D2	*++B15[2],	B12	; pop B12

		LDW	.D1	*++A15[2],	A10	; pop A10
||		LDW	.D2	*++B15[2],	B11	; pop B11
||		B	.S2	B3			; return

		LDW	.D1	*++A15,		B10	; pop B10
||		LDW	.D2	*++B15[3],	A15	; pop A15; B15 back to entry value

		NOP	4
