******************************************************************************/
*
*	TEXAS INSTRUMENTS, INC.
*
*	WEIGHTED VECTOR SUM
*
*	Revision Date: 04/02/97
*
*	USAGE This routine is C callable and can be called as
* 
*	      void w_vec(short a[],short b[],short m,short c[],short n)
*
*	      a[] --- vector being weighted 	(input)
*	      b[] --- summation vector	    	(input)
*	      m   --- weighting factor		(input) 
*	      c[] --- output vector		(output) 
*	      n   --- dimension of the vectors	(input) 
*
*	      If the routine is not to be used as a C callable function,
*	      then all instructions relating to stack should be removed.
*	      Refer to comments of individual instructions. You will also
*	      need to initialize values for all the values passed as these
*	      are assumed to be in registers as defined by the calling
*	      convention of the compiler, (refer to the C compiler reference
*	      guide.)
*
*	C CODE
*	      This is the C equivalent of the Assembly Code without the 
*	      assumptions listed below. Note that the assembly code is hand
*             optimized and assumptions apply.
*
*		void w_vec(short a[],short b[],short m,short c[],short n)
*		{
*			short i;
*
*			for (i=0; i<n; i++) {
*				c[i] = ((m * a[i]) >> 15) + b[i];
*			}
*		}
*
*	DESCRIPTION
*		This routine is used to obtain the weighted vector sum.
*		Both the inputs and output are 16-bit numbers.
*
*	TECHNIQUES
*	      1. Loading the inputs a word at a time (two 16-bit elements per
*	         LDW) to double the performance.
*	      2. Using AND (.L) instead of EXTU (.S) to obtain b[2*i] from the
*	         word containing b[2*i+1] and b[2*i], to reduce the
*		 demand on the .S units.
*
*	ASSUMPTIONS
*		n>=3
*		vectors a and b should be aligned on word boundary.
*	
*	MEMORY NOTE
*		If vectors a and b are not aligned on the same word boundary,
*		memory bank hits will occur once every two cycles.
*
*	CYCLES		n + 10	(even n)
*			n + 11	(odd n)
*
*	PERFORMANCE COMMENTS:
*		Limited by 6 ALUs/cycle.
********************************************************************

********* ASSEMBLY CODE: *******************

	.global _w_vec
	.text
;-----------------------------------------------------------------------
; _w_vec -- c[i] = ((m * a[i]) >> 15) + b[i]   for i = 0 .. n-1
;
; Register roles on entry, as the code below uses them:
;   A4 = pointer to a[]      B4 = pointer to b[]
;   A6 = m (low 16 bits)     B6 = pointer to c[]
;   A8 = n                   B3 = address branched to on exit
;                                 (presumably the caller's return address)
;
; Working registers (all overwritten):
;   B0      loop counter: n-3 for odd n, n-4 for even n, stepped by 2
;   B1      n & 1 (nonzero when n is odd)
;   A0      0x0000FFFF mask used to isolate the low halfword of a b word
;   A8      after the 2nd packet: c + 2 bytes, i.e. pointer to c[1]
;   B2, A2  words loaded from a[] and b[] (two elements each)
;   B5, A5  products m*a[i], m*a[i+1];  B7, A7 the same shifted right by 15
;   A3, A1  b[i], b[i+1];               B9, A9 results for c[i], c[i+1]
;   A4, B4, B6 are advanced by the post-increment addressing.
;
; Structure: the loop is software pipelined.  Each pass over the two
; execute packets at LOOP finishes one pair of outputs (even-indexed via
; the B side / B6, odd-indexed via the A side / A8) while starting work
; for later pairs.  In the trailing comments, the number of '*' after ';'
; tells how many pairs ahead of the unstarred copy that instruction works.
;
; Timing relied on (C6000 pipeline): LDW result usable 5 packets after
; issue, MPY result 2 packets after issue, a branch takes effect 6
; packets after issue.  Do not reorder or insert packets.
;
; NOTE: the LDWs are unconditional and run ahead of the stores, so the
; routine reads memory beyond a[n-1] and b[n-1] (with no loop branch
; taken, six words are read from a[] and five from b[]).  The extra data
; is never stored to c[].
;-----------------------------------------------------------------------
_w_vec:

****	begin benchmark timing   ***
B_START:
; ---- Prolog: prime the pipeline and set up counter, mask, pointers ----
  	LDW	.D1	*A4++,B2	; a[i] & a[i+1]
||	AND	.L2X	A8,1,B1		; B1 = n & 1: nonzero if n is odd

; A8 (n) is read by the SUB in the same packet that the ADD overwrites it.
	SUB	.L2X	A8,3,B0		; set up loop counter: B0 = n - 3
||	ADD	.L1X	B6,2,A8		; set pointer to c[i+1] (A8 = c + 2 bytes)

  	LDW	.D2	*B4++,A2	; b[i] & b[i+1]
||	LDW	.D1	*A4++,B2	;* a[i] & a[i+1]
||[!B1]	SUB	.L2	B0,1,B0		; even n: B0 = n - 4, so B0 is even either way

	MVK	.S1	-1,A0		; set to all 1s

  	LDW	.D2	*B4++,A2	;* b[i] & b[i+1]
||	LDW	.D1	*A4++,B2	;** a[i] & a[i+1]
||	MVKH	.S1	0,A0		; clear upper 16 bits: A0 = 0x0000FFFF

  	MPY	.M2X	B2,A6,B5	; m * a[i]   (low halfword of B2)

; First loop branch is issued here so that it lands after the prolog.
  	MPYHL	.M1X	B2,A6,A5	; m * a[i+1] (high halfword of B2)
||[B0]	B	.S1	LOOP		; branch to loop
||	LDW	.D2	*B4++,A2	;** b[i] & b[i+1]
||	LDW	.D1	*A4++,B2	;*** a[i] & a[i+1]

  	SHR	.S2	B5,15,B7	; (m * a[i]) >> 15
||	AND	.L1	A2,A0,A3	; b[i] (low halfword, zero-extended)
||	MPY	.M2X	B2,A6,B5	;* m * a[i]
||[B0]	SUB	.L2	B0,2,B0		; decrement loop counter (stops at 0)

  	SHR	.S1	A2,16,A1	; b[i+1] (high halfword)
||	ADD	.L2X	B7,A3,B9	; c[i] = (m * a[i]) >> 15 + b[i]
||	MPYHL	.M1X	B2,A6,A5	;* m * a[i+1]
||[B0]	B	.S2	LOOP		;* branch to loop
||	LDW	.D2	*B4++,A2	;*** b[i] & b[i+1]
||	LDW	.D1	*A4++,B2	;**** a[i] & a[i+1]

  	SHR	.S1	A5,15,A7	; (m * a[i+1]) >> 15
||	STH	.D2	B9,*B6++[2]	; store c[i]; B6 advances 2 halfwords
||	SHR	.S2	B5,15,B7	;* (m * a[i]) >> 15
||[B0]	SUB	.L2	B0,2,B0		; decrement loop counter
||	AND	.L1	A2,A0,A3	;* b[i]
||	MPY	.M2X	B2,A6,B5	;** m * a[i]

; ---- Kernel: two packets per pass, one output pair completed per pass ----
; Entered once by fall-through, then once more per taken [B0] branch.
LOOP:
  	ADD	.L1	A7,A1,A9	; c[i+1] = (m * a[i+1]) >> 15 + b[i+1]
||	SHR	.S1	A2,16,A1	;* b[i+1]	
||	ADD	.L2X	B7,A3,B9	;* c[i] = (m * a[i]) >> 15 + b[i]
||	MPYHL	.M1X	B2,A6,A5	;** m * a[i+1]
||[B0]	B	.S2	LOOP		;** branch to loop
||	LDW	.D2	*B4++,A2	;**** b[i] & b[i+1]
||	LDW	.D1	*A4++,B2	;***** a[i] & a[i+1]

  	STH	.D1	A9,*A8++[2]	; store c[i+1]; A8 advances 2 halfwords
||[B0]	SUB	.L2	B0,2,B0		; decrement loop counter
||	SHR	.S1	A5,15,A7	;* (m * a[i+1]) >> 15
||	STH	.D2	B9,*B6++[2]	;* store c[i]
||	SHR	.S2	B5,15,B7	;** (m * a[i]) >> 15
||	AND	.L1	A2,A0,A3	;** b[i]
||	MPY	.M2X	B2,A6,B5	;*** m * a[i]
LOOP_END:
; ---- Epilog: the last even-indexed output was stored in the final loop
; packet.  One odd-indexed output remains, and it exists only when n is
; even, hence the [!B1] predicate on the ADD and the STH.
  [!B1]	ADD	.L1	A7,A1,A9	; c[i+1] = (m * a[i+1]) >> 15 + b[i+1]
||	B	.S2	B3		; branch to the address held in B3

  [!B1]	STH	.D1	A9,*A8		; store c[i+1]

B_END:
***	end benchmarking timing	 ***

; The STH packet above plus these four NOP cycles fill the delay slots
; of the B B3 issued in the epilog.
	NOP	4

********* END ASSEMBLY CODE ******************	
