*===============================================================================
*
*	TEXAS INSTRUMENTS, INC.		
*
*	MATRIX VECTOR MULTIPLY  (floating point, LDW version)
*
*	Revision Date:	06/03/98
*	
*	USAGE	This routine is C Callable and can be called as:
*		
*			mvmult(a, b, c, rows, columns);
*
*		If the routine is not to be used as a C callable function,
*		then you need to initialize values for all of the parameters
*		passed to the function since these are assumed to be in
*		registers as defined by the calling convention of the
*		compiler, (refer to the TMS320C6x Optimizing C Compiler
*		User's Guide).
*
*	C CODE
*		This is the C equivalent for the assembly code.  Note that
*		the assembly code is hand optimized and restrictions may
*		apply.
*
*		void mvmult(float a[], float b[], float c[], short rows, short columns)
*		{
*			short i,j;
*			int cntr=0;	
*			float temp = 0;
*
*			for (i=0; i<rows; i++) {
*	    		for (j=0; j<columns; j++) {
*				temp += a[cntr] * b[j] ;
*				cntr++;
*				}
*			c[i] = temp;
*			temp = 0;
*			}
*		}
*
*	DESCRIPTION
*
*		This routine calculates the product of a matrix vector multiplication
*		A[][] * B[] = C[]
*					
*		A has dimensions m by n (rows by columns)
*		B has dimensions n by 1
*		C has dimensions m by 1
*
*	TECHNIQUES
*
*		1.  The inner loop is unrolled 6 times and software pipelined.
*		2.  A load counter is used to prevent extraneous loads in the
*			prologue and kernel.  The use of a load counter allows for 
*			the disposal of the epilog.  The load counter is the number 
*			of columns minus one (columns - 1).  Only the first LDW pair 
*			of each row is not gated by the load counter.
*		3.  The resetting of the inner counter has been moved from the top of the
*			outer loop to the bottom of the outer loop.  This saves one cycle
*			per outer loop iteration as an MV command replaces a NOP during 
*			the adding of the running sum.  For the first pass the inner loop 
*			counter is set outside both loops (in parallel with the outer loop
*			counter setting).
*		4.  LDW is used to allow for processing of smaller matrix sizes.  An LDDW
*			version of this program, which has stricter restrictions, is
*			available.
*
*	ASSUMPTIONS
*
*		1.  Little Endian is assumed for LDW instruction.
*		2.  The entries of matrix A should be organized into an array, float a[], 
*			where the values are placed by rows;  i.e.  a[] = [row1, row2, ...]
*			The array a[] has (n*m) entries.
*		3.  The column dimension must be greater than 1 (i.e. n>1).
*		4.  There is no restriction on the row dimension.  However, if the row
*			dimension is 1 the program essentially performs a dot product.
*			A more efficient realization of the dot product algorithm is 
*			possible and available.
*		
*	MEMORY NOTE
*
*		The a and b arrays should be placed on opposite word boundaries
*		to prevent internal data memory bank hits.
*
*       ARGUMENTS PASSED
*
*		a[]	 ->  A4
*		b[]	 ->  B4
*		c[]	 ->  A6
*		rows	 ->  B6
*		columns  ->  A8
*
*	CYCLES
*
*	(n + 20)*m + 1          (m= # of rows,	n= # of columns)
*
*===============================================================================
	.global _mvmult
	.text

*** Symbolic register names used by _mvmult ***

* -- incoming arguments (registers as listed under ARGUMENTS PASSED) --
aptr	.set	A4	; a[]     : walking pointer into the matrix
cptr	.set	A6	; c[]     : walking pointer into the result vector
colms	.set	A8	; columns : read-only after entry
bptr	.set	B4	; b[]     : base of the vector, never advanced
rows	.set	B6	; rows    : read once to form the outer counter

* -- loop control (all three are used as predicate registers) --
icntr	.set	A1	; inner-loop branch counter
ocntr	.set	A2	; outer-loop (row) counter
lcntr	.set	B1	; load counter gating the conditional LDW pairs

* -- walking copy of the b pointer --
btmp	.set	B2	; advanced by the loads, reset from bptr per row

* -- operands arriving from memory --
aa0	.set	A7	; element of a[] delivered by LDW
bb0	.set	B7	; element of b[] delivered by LDW

* -- multiply / accumulate data path --
mult0	.set	A5	; MPYSP result
sum0	.set	A3	; ADDSP accumulator and final row result

* -- scratch for folding the partial sums after the kernel --
*    temp2 shares A7 with aa0; temp1 and temp3 share A9.
temp1	.set	A9	; copy of a partial sum taken from sum0
temp3	.set	A9	; second pairwise sum (same register as temp1)
temp2	.set	A7	; first pairwise sum (same register as aa0)

_mvmult:

*-------------------------------------------------------------------------------
* _mvmult -- c[i] = SUM(j) a[i*columns + j] * b[j]	for i = 0 .. rows-1
*
* In:	A4 = a[] (matrix stored by rows)	B4 = b[]
*	A6 = c[]	B6 = rows	A8 = columns
* Out:	c[0 .. rows-1] stored through cptr; no value is placed in a
*	return register.
* Uses:	A1, A2, A3, A5, A7, A9, B1, B2, B7 as scratch.  A4 (aptr) and
*	A6 (cptr) are post-incremented and do not hold their entry values
*	on return.  B3 is only read (return address).
*
* Limits: the three counters are tested for non-zero only, so rows must be
*	at least 1 and columns at least 2; with smaller values rows-1 or
*	columns-2 starts out negative and the loops do not stop where
*	intended.
*
* Schedule: one LDW pair, one MPYSP and one ADDSP are issued per cycle once
*	the pipe is full.  lcntr gates the loads so exactly `columns` pairs
*	are fetched per row, icntr gates the branches so the kernel packet
*	executes exactly `columns` times, and the multiplies that run past
*	the last column are never accumulated.
*-------------------------------------------------------------------------------

*** BEGIN Benchmark Timing ***

*** begin pipelining inner loop	

		SUB	.L1X	rows,1,ocntr	;   outer cntr = rows - 1
||		ADD 	.L2	bptr,4,btmp 	;   btmp -> b[1], b[0] is loaded below
||	   	LDW	.D1T1	*aptr++(4),aa0	;1  load a[i] from memory
||		LDW	.D2T2	*bptr,bb0  	;1  load b[i] from memory
||		SUB	.S2X	colms,1,lcntr	;   load cntr = columns - 1

*** top of the outer loop: one pass per row of A.  The first LDW pair of the
*** row has already been issued (above for row 0, in the branch delay slots
*** at the bottom for every later row).

oloop:

   [lcntr]	LDW	.D1T1	*aptr++(4),aa0	;2  if(lcntr) load a[i] from memory
|| [lcntr]	LDW	.D2T2	*btmp++(4),bb0  ;2  if(lcntr) load b[i] from memory
|| [lcntr]	SUB	.L2	lcntr,1,lcntr	;2  if(lcntr) lcntr -= 1
||		SUB	.S1	colms,2,icntr	;   inner cntr = columns - 2 (first pass
						;   needs it; later passes repeat the
						;   reset done at the bottom)
||		ZERO	.L1	sum0		;   zero the running sum

   [lcntr]	LDW	.D1T1	*aptr++(4),aa0	;3  if(lcntr) load a[i] from memory
|| [lcntr]	LDW	.D2T2	*btmp++(4),bb0  ;3  if(lcntr) load b[i] from memory	
|| [lcntr]	SUB	.L2	lcntr,1,lcntr	;3  if(lcntr) lcntr -= 1

   [lcntr]	LDW	.D1T1	*aptr++(4),aa0	;4  if(lcntr) load a[i] from memory
|| [lcntr]	LDW	.D2T2	*btmp++(4),bb0  ;4  if(lcntr) load b[i] from memory
|| [lcntr]	SUB	.L2	lcntr,1,lcntr	;4  if(lcntr) lcntr -= 1

*** The first branch is unconditional: together with the fall-through entry
*** it accounts for two kernel passes, which is why icntr starts at
*** columns - 2.

   [lcntr]	LDW	.D1T1	*aptr++(4),aa0	;5  if(lcntr) load a[i] from memory
|| [lcntr]	LDW	.D2T2	*btmp++(4),bb0 	;5  if(lcntr) load b[i] from memory
|| [lcntr]	SUB	.L2	lcntr,1,lcntr	;5  if(lcntr) lcntr -= 1
||		B	.S2	iloop		;1  branch to iloop

*** Data from the first LDW pair is now in aa0/bb0, so the multiplies start.

   [lcntr]	LDW	.D1T1	*aptr++(4),aa0	;6  if(lcntr) load a[i] from memory
|| [lcntr]	LDW	.D2T2	*btmp++(4),bb0 	;6  if(lcntr) load b[i] from memory
|| [lcntr]	SUB	.L2	lcntr,1,lcntr	;6  if(lcntr) lcntr -= 1
|| [icntr]	SUB	.L1	icntr,1,icntr	;6  if(icntr) icntr -= 1
||		MPYSP	.M1X	aa0,bb0,mult0	;1  mult0 = a[i]*b[i]
|| [icntr]	B	.S2	iloop		;2  if(icntr) branch to iloop

   [lcntr]	LDW	.D1T1	*aptr++(4),aa0	;7  if(lcntr) load a[i] from memory
|| [lcntr]	LDW	.D2T2	*btmp++(4),bb0 	;7  if(lcntr) load b[i] from memory
|| [lcntr]	SUB	.L2	lcntr,1,lcntr	;7  if(lcntr) lcntr -= 1
|| [icntr]	SUB	.L1	icntr,1,icntr	;7  if(icntr) icntr -= 1
||		MPYSP	.M1X	aa0,bb0,mult0	;2  mult0 = a[i]*b[i]
|| [icntr]	B	.S2	iloop		;3  if(icntr) branch to iloop

   [lcntr]	LDW	.D1T1	*aptr++(4),aa0	;8  if(lcntr) load a[i] from memory
|| [lcntr]	LDW	.D2T2	*btmp++(4),bb0 	;8  if(lcntr) load b[i] from memory
|| [lcntr]	SUB	.L2	lcntr,1,lcntr	;8  if(lcntr) lcntr -= 1
|| [icntr]	SUB	.L1	icntr,1,icntr	;8  if(icntr) icntr -= 1
||		MPYSP	.M1X	aa0,bb0,mult0	;3  mult0 = a[i]*b[i]
|| [icntr]	B	.S2	iloop		;4  if(icntr) branch to iloop

   [lcntr]	LDW	.D1T1	*aptr++(4),aa0	;9  if(lcntr) load a[i] from memory
|| [lcntr]	LDW	.D2T2	*btmp++(4),bb0 	;9  if(lcntr) load b[i] from memory
|| [lcntr]	SUB	.L2	lcntr,1,lcntr	;9  if(lcntr) lcntr -= 1
|| [icntr]	SUB	.L1	icntr,1,icntr	;9  if(icntr) icntr -= 1
||		MPYSP	.M1X	aa0,bb0,mult0	;4  mult0 = a[i]*b[i]
|| [icntr]	B	.S2	iloop		;5  if(icntr) branch to iloop

**************
*** single-packet kernel: executes once per column.  Each ADDSP consumes
*** the product of the MPYSP issued four packets earlier.

iloop:

   [lcntr]	LDW	.D1T1	*aptr++(4),aa0	;10  if(lcntr) load a[i] from memory
|| [lcntr]	LDW	.D2T2	*btmp++(4),bb0	;10  if(lcntr) load b[i] from memory
|| [lcntr]	SUB	.L2	lcntr,1,lcntr	;10  if(lcntr) lcntr -= 1
|| [icntr]	SUB	.S1	icntr,1,icntr	;10  if(icntr) icntr -= 1
|| 		MPYSP	.M1X	aa0,bb0,mult0	;5   mult0 = a[i]*b[i]
||		ADDSP	.L1	mult0,sum0,sum0	;1   sum0 = sum0+mult0
|| [icntr]	B	.S2	iloop		;6   if(icntr) branch to iloop

**************
*** add up the running sums ***
*** The last four ADDSP results reach sum0 on consecutive cycles; each is a
*** separate partial sum.  They are picked up one per cycle and combined
*** pairwise.

		MV	.D1	sum0,temp1	;    temp1 = sum0
		ADDSP	.L1	sum0,temp1,temp2;    temp2 = temp1 + sum0 (2nd sum0)
		MV	.D1	sum0,temp1	;    temp1 = sum0 (the 3rd sum0)
		ADDSP	.L1	sum0,temp1,temp3;    temp3 = temp1 + sum0 (4th sum0)
		NOP		2		;    wait for temp3
   [ocntr]	B	.S2	oloop		;    if(ocntr) branch to oloop

*** The next five packets are the delay slots of the branch above; they
*** finish this row and, when another row follows, prime the next one.

		ADDSP	.L1	temp2,temp3,sum0;    sum0 = temp2 + temp3	
***
   [ocntr]	MV 	.D2	bptr,btmp	;    reset *b to beginning of b

		SUB	.S1	colms,2,icntr	;    inner cntr = columns - 2
||		SUB	.S2X	colms,1,lcntr	;    load cntr = columns - 1

*** The preload for the next row is predicated on ocntr so that nothing is
*** fetched after the last row (an unconditional load here would read one
*** word beyond the end of a[]).  ocntr is decremented in the following
*** packet, so it still holds the value the branch tested.

   [ocntr]	LDW	.D1T1	*aptr++(4),aa0	;1   if(ocntr) load a[i] from memory
|| [ocntr]	LDW	.D2T2	*btmp++(4),bb0  ;1   if(ocntr) load b[i] from memory

		STW	.D1	sum0,*cptr++(4)	;    c[i] = sum0
|| [ocntr]	SUB	.L1	ocntr,1,ocntr	;    if(ocntr) ocntr -= 1
		
*** END Benchmark Timing ***				
		B	.S2	B3		;   return from function
B_END:
		NOP		5		;   fill the return branch delay slots
