*==============================================================================
*
*	TEXAS INSTRUMENTS ,INC.
*
*	VITERBI V32 PSTN TRELLIS DECODER
*
*	Revision Date:  06/18/97
*
*	USAGE  This routine is C callable and can be called as
*	
*		short vitv32(short old[], short next[], short d[])
*
*		old    --- state metrics at previous instant	(input)
*		next   --- state metrics at current instant	(output)
*		d      --- computed Euclidean distances between (input)
*			   input data and closest points in the 
*			   constellation
*
*	C CODE
*		This is the C equivalent of the assembly code. Note that
*		the assembly code is hand optimized and restrictions may
*		apply.
*
*		short vitv32(short old[], short next[], short d[])
*		{
*			int             i, j, k;
*			int             a, max, path, index;
*			int             tr = 0;
*			for (i = 0; i < 2; i++) {
*			   for (j = 0; j < 4; j++) {
*			      max = old[i * 4] - d[i * 4 + j % 4];
*			      path = 0;
*			      index = j;
*			      for (k = 1; k < 4; k++) {
*			         index = -index;
*				 a = old[k + i * 4] - d[i*4 + (4+k+index) % 4];
*				 if (a > max) {
*				    max = a;
*				    path = k;
*				 }
*			      }
*			      next[j * 2 + i] = max;
*			      tr = tr << 2 | path;
*			   }  
*			}
*			return tr;
*		}
*	
*	DESCRIPTION
*
*		This routine implements one stage of Viterbi V32 trellis 
*	        decoder based on soft-decision Viterbi decoding technique.
*		The code is of rate 2/3 and of constraint length K = 4.
*	
*		Current metrics are computed by subtracting the Euclidean
*		distance from the old state metrics.  The new metric with
*		the maximum value will have the shortest distance and the
*		most likely path for the output.  The path taken will be
*		the return value of the function.
*
*	TECHNIQUES
*		
*		The k loop is completely unrolled.
*
*	ASSUMPTIONS
*		1.  Rate -> 2/3
*		2.  K = 4
*
*	MEMORY NOTE:
*
*		There are no memory hits. d[] must be aligned on a 4 halfword
*		(eight byte) boundary for circular addressing.
*	
*	CYCLES		
*		64
*==============================================================================
	
	.global _vitv32
	.text

*------------------------------------------------------------------------------
* _vitv32 -- one stage of the V.32 Viterbi trellis decoder (C callable)
*
*	short vitv32(short old[], short next[], short d[])
*
* Registers on entry (as consumed by the code below):
*	A4  = old[]	copied to A8 / B6; A4 itself is then reused as the
*			tr (survivor path) accumulator
*	B4  = next[]	store pointer, advanced 2 halfwords per store
*	A6  = d[]	A5, B5 and B7 are derived from it (d+2, d+4, d+6 bytes)
*	B3  = return address,  B15 = stack pointer
*
* Registers on exit:
*	A4  = tr, sign-extended from its low 16 bits
*	AMR = 0 (linear addressing restored)
*	A10, A11, B10, B11, B12 are pushed on entry and popped on exit.
*
* Working registers:
*	A11/A3/B9/B12	old0 / old1 / old2 / old3 for the current i
*	A6/A5/B5/B7	pointers for d0 / d1 / d2 / d3 (circular, BK0)
*	A7/A9/B8/B11	candidate metrics a0 / a1 / a2 / a3
*	A1/B1/B2	compare results t0 / t2 / t1
*	A2		priming counter: while non-zero the store and the tr
*			update are suppressed
*	B0		AMR image, then inner (j) loop counter, then i flag
*	A10		outer-loop flag (1 on the first pass, 0 on the second)
*
* SCHEDULING NOTE: the j loop is software pipelined and relies on the
* delay slots of LDH, MPY and B.  B10 and A9 are deliberately reused: each
* is a load destination and also holds a computed value (max / a1) at other
* cycles.  Do not reorder or insert packets between the prologue and the
* return branch without re-deriving the schedule.  The '*', '**' and '***'
* markers in the comments presumably flag work belonging to later
* iterations.
*------------------------------------------------------------------------------

*** BEGIN Benchmark Timing ***
_vitv32:
	MV	.L1X	B15,	A1		; A1 = second stack pointer (A side)
||	STW	.D2	B10,	*B15--[2]	; push B10 on stack
||	ADD	.L2X	4,	A6,	B5	; d[i * 4 + (6 + j) % 4]
||	MVK	.S1	1,	A10		; i = 1 (outer loop runs once more)

	STW	.D1	A10,	*--A1[1]	; push A10 on stack
||	STW	.D2	B11,	*B15--[2]	; push B11 on stack
||	MV	.L1	A4,	A8		; A8 = old (A4 becomes tr later)
||	MVK	.S2	4414h,	B0		; AMR image, low half: circular
						; mode (BK0) for A5, A6, B5, B7

	ADD	.L2X	4,	A4,	B6	; B6 = &old[2]
||	ADD	.L1	2,	A6,	A5	; d[i * 4 + (5 - j) % 4]
||	STW	.D1	A11,	*--A1[2]	; push A11 on stack
||	STW	.D2	B12,	*B15--		; push B12 on stack

	LDH	.D1	*A8++, A11		; old0 = old[0 + i * 4]
||	LDH	.D2	*B6++, B9		; old2 = old[2 + i * 4]
||	MVKH	.S2	24414h, B0		; AMR image, high half: BK0 field
						; = 2 -> 8-byte block (4 halfwords)
||	ADD	.L2X	6,	A6,	B7	; d[i * 4 + (7 - j) % 4]


	LDH	.D1	*A8++[3],	A3	; old1 = old[1 + i * 4]
||	LDH	.D2	*B6++[3],	B12	; old3 = old[3 + i * 4]
||	B	.S1	JLOOP			; for
||	MVC	.S2	B0,	AMR		; set addressing modes

	LDH	.D2	*B7--,	B10		;*** d3 = d[i * 4 + (7 - j) % 4]
||	LDH	.D1	*A6++,	A9		;*** d0 = d[i * 4 + (4 + j) % 4]
||	MVK	.S2	4,	B0		; j < 4
||	MVK	.S1	2,	A2		; setup priming

JLOOP:
	CMPGT	.L2X	B8,	A7,	B2	; t1 = (a2 > a0)
||[!A2]	SHL	.S1	A4,	2,	A4	; tr <<= 2
||	SUB	.S2	B12,	B10,	B11	;* a3 = old3 - d3
||	LDH	.D2	*B5++,	B10		;** d2 = d[i * 4 + (6 + j) % 4]
||	LDH	.D1	*A5--,	A0		;** d1 = d[i * 4 + (5 - j) % 4]
||[A2]	MPY	.M2	0,	B2,	B2	; prevent extraneous OR's during
						; priming

  [B2]	MV	.S1X	B1,	A1		; if (t1) t0 = t2
||[!B2]	MV	.S2X	A7,	B10		; if (!t1) max = a0
||[B2]	MV	.L2	B8,	B10		; if (t1)  max = a2
||	SUB	.L1	A3,	A0,	A9	;* a1 = old1 - d1
||	SUB	.D2	B9,	B10,	B8	;* a2 = old2 - d2
||	SUB	.D1	A11,	A9,	A7	;* a0 = old0 - d0

  [!A2]	STH	.D2	B10,	*B4++[2]	; next[j * 2 + i] = max
||[!A2]	ADD	.D1	A1,	A4,	A4	; tr |= t0
||	CMPGT	.L1	A9,	A7,	A1	;* t0 = (a1 > a0)
||	CMPGT	.L2	B11,	B8,	B1	;* t2 = (a3 > a2)
||[B0]	B	.S2	JLOOP			;* for j
||[A2]	SUB	.S1	A2,	1,	A2	; decrement priming

  [B2]	ADD	.L1	2,	A4,	A4	; tr |= t1 << 1
||[A1]	MV	.S1	A9,	A7		;* if (t0)  a0 = a1
||[B1]	MV	.L2	B11,	B8		;* if (t2)  a2 = a3
||	LDH	.D2	*B7--,	B10		;*** d3 = d[i * 4 + (7 - j) % 4]
||	LDH	.D1	*A6++,	A9		;*** d0 = d[i * 4 + (4 + j) % 4]
||[B0]	SUB	.S2	B0,	1,	B0	; j++

JLOOP_END:
	LDH	.D1	*A8++, A11		; old0 = old[0 + i * 4]
||	LDH	.D2	*B6++, B9		; old2 = old[2 + i * 4]
||	ADD	.L1	4,	A5,	A5	; d[i * 4 + (5 - j) % 4]
||	ADD	.L2	12,	B5,	B5	; d[i * 4 + (6 + j) % 4]
||	MVK	.S1	1,	A2		; setup priming
||	MV	.S2X	A10,	B0		; get outer loop count

	LDH	.D1	*A8,		A3	; old1 = old[1 + i * 4]
||	LDH	.D2	*B6,		B12	; old3 = old[3 + i * 4]
						; NOTE(review): on the exit pass
						; B6 is at old+8, so this reads
						; one halfword past old[7]; the
						; value is overwritten by the pop
||	ADD	.L1	2,	A6,	A6	; d[i * 4 + (4 + j) % 4]
||	ADD	.L2	14,	B7,	B7	; d[i * 4 + (7 - j) % 4]
||[B0]	B	.S1	JLOOP			; for i
||	MPY	.M1	0,	A10,	A10	; i < 2 (clear flag: last pass)

	LDH	.D2	*B7--,	B10		;*** d3 = d[i * 4 + (7 - j) % 4]
||	LDH	.D1	*A6++,	A9		;*** d0 = d[i * 4 + (4 + j) % 4]

	LDH	.D2	*B5++,	B10		;** d2 = d[i * 4 + (6 + j) % 4]
||	LDH	.D1	*A5--,	A0		;** d1 = d[i * 4 + (5 - j) % 4]
||[!B0]	MV	.L1X	B15,	A1		; exit path: A1 = stack pointer copy

	SUB	.S2	B4,	14,	B4	; next[j * 2 + i]: rewind to next[1]
||[!B0]	LDW	.D2	*++B15,	B12		; pop B12 off stack

  [B0]	B	.S1	JLOOP			; for
||[!B0]	LDW	.D1	*++A1[2],	A11	; pop A11 off stack
||[!B0]	LDW	.D2	*++B15[2],	B11	; pop B11 off stack

	LDH	.D2	*B7--,	B10		;*** d3 = d[i * 4 + (7 - j) % 4]
||	LDH	.D1	*A6++,	A9		;*** d0 = d[i * 4 + (4 + j) % 4]
||	MVK	.S2	3,	B0		; j < 4

*** END Benchmark Timing ***

	LDW	.D1	*++A1[2],	A10	; pop A10 off stack
||	LDW	.D2	*++B15[2],	B10	; pop B10 off stack
||	B	.S2	B3			; return; 5 delay slots follow

* The five branch delay slots are used to hand clean state back to the
* caller.  All writes to A4 have completed before this point.
	ZERO	.L2	B0			; B0 = 0: linear-mode AMR image
||	EXT	.S1	A4,	16, 16,	A4	; keep the 16 tr bits, sign-extend;
						; drops stale bits of the old[]
						; pointer left in bits 31..16

	MVC	.S2	B0,	AMR		; restore linear addressing for
						; A5, A6, B5, B7 before returning

	NOP	3				; remaining delay slots
	

	
