*===============================================================================
*
*   TEXAS INSTRUMENTS, INC.
*
*   LMS
*
*   Revision Date: 07/14/98
*
*   USAGE  This routine is C Callable and can be called as:
*
*    void lms(float *x,float *h,float *y,int NumH,float *d,float ar,short numY)
*		
*		x    =  input floating point array
*		h    =  coefficient floating point array
*		y    =  output floating point array
*		numH =  number of coefficients (MULTIPLE of 4 >= 4)
*		d    =  desired output floating point array
*		ar   =  adaptive rate mu
*		numY =  number of output samples
*
*	    If routine is not to be used as a C callable function
*	    then all instructions relating to stack should be removed.
*	    Refer to comments of individual instructions.  You will also
*	    need to initialize values for all of the values passed as these
*	    are assumed to be in registers as defined by the calling 
*	    convention of the compiler, (refer to the C compiler reference
*	    guide).
*
*    C Code This is the C equivalent of the Assembly Code without 
*	    restrictions.
*
*	    Note that the assembly code is hand optimized and restrictions 
*	    may apply
*
*	    void lms(float *x,float *h,float *y,int numH,float *d,float ar,
*		     short numY)
*	    {
*   		int i,j;
*		float sum;
*		float error = 0.0f;
*        			
*		for (i = 0; i <numY; i++)
*		{
*
*			for (j = 0; j < numH; j++) 
*			{
*			     h[j] = h[j] + (ar*error*x[i+j-1]);
*			}
*	
*			sum = 0.0f;
*			for (j = 0; j < numH; j++) 
*			{
*				 sum += h[j] * x[i+j];
*			}
*		 	y[i] = sum;
*			error = d[i] - sum;			     			
*		}
*	    }
*
*    DESCRIPTION
*		This routine implements the LMS algorithm. The number of
*		coefficients is numH, the number of samples is numY, and the
*		number of input data samples is numH+numY-1.
*
*    TECHNIQUES
*		1. The inner loop is unrolled four times to allow update of
*	    	   four coefficients in the kernel.
*		2. LDDW instruction is used to load in the coefficients.
*		3. Register sharing is used to make optimal use of available
*		   registers.
*		4. The outer loop instructions are scheduled in parallel with
*		   epilog and prolog wherever possible.
*		5. The 'error' term needs to be computed in the outer loop 
*		   before a new iteration of the inner loop can start. As a
*		   result the prolog cannot be placed in parallel with epilog
*		   (after the loop kernel).
*		6. Pushing and popping variables from the stack does not
*		   really add any overhead except increase stack size. This
*		   is because the pops and pushes are done in the delay slots
*		   of the outer loop instructions.
*		   
*
*    ASSUMPTIONS
*		NumH must be a multiple of 4
*		Little endianness is assumed.
*		Extraneous loads are allowed in the program.
*
*    ARGUMENTS PASSED
*
*		x	 ->  A4
*		h	 ->  B4
*		y	 ->  A6 
*		numH	 ->  B6
*		d	 ->  A8
*		ar	 ->  B8
*		numY	 ->  A10
*
*    REGISTER USAGE
*		A0:  ptr_h 		  B0:  ptr_h1
*		A1:  icntr		  B1:  ocntr
*		A2:  ar,aerr		  B2:  ar,aerr
*		A3:  ptr_x		  B3:  ptr_x1
*		A4:  nh(i+1),nh(i+3).	  B4:  nh(i), nh(i+2) etc.
*		A5:  temp for h(i+2)      B5:  xold, temp x register
*		A6:  x(i)		  B6:  x(i+1)		
*		A7:  x(i+2)		  B7:  x(i+3)
*		A8:  x0*aerr,x2*aerr etc. B8:  x1*aerr,x3*aerr etc.
*		  :  x1*h1, x3*h3	    :  x0*h0, x2*h2
*		A9:  temp x(i+1)          B9:  temp for x(i)
*		A10: sum1		  B10: sum0
*		A11: sum3		  B11: sum2
*		A12: old h(i)		  B12: old h(i+2)
*		A13: old h(i+1)		  B13: old h(i+3)
*		A14: temp x(i+3)	  B14: temp x(i+2)
*
*    CYCLES	
*		((5*NumH)/4+27)*NumY + 17  with C overhead
*		((5*NumH)/4+27)*NumY	   without C overhead
*    
*    NOTATIONS
*
*		f = Function Prolog or Epilog
*		o = Outer Loop
*		p = Inner Loop Prolog
*
*===============================================================================
	.def	_lms
_lms:	
	 SUBAW	  .D2	  B15,16,B15	      ; f Make space to stack

	 STW	  .D2	  A15,*+B15[0]	      ; f Push A15
||	 MV	  .S1	  B15,A15	      ; f A15 = B15

	 STW	  .D1	  B15,*+A15[1]	      ; f Push B15
       
	 STW	  .D1	  A14,*+A15[2]	      ; f Push A14
||	 STW	  .D2	  B14,*+B15[3]	      ; f Push B14
     
	 STW	  .D1	  A13,*+A15[4]	      ; f Push A13
||	 STW	  .D2	  B13,*+B15[5]	      ; f Push B13
       
	 STW	  .D1	  A12,*+A15[6]	      ; f Push A12
||	 STW	  .D2	  B12,*+B15[7]	      ; f Push B12
     
	 STW	  .D1	  A11,*+A15[8]	      ; f Push A11
||	 STW	  .D2	  B11,*+B15[9]	      ; f Push B11
||	 MV	  .S1	  A4,A3 	      ; f Initialize ptr_x
     
	 STW	  .D1	  A10,*+A15[10]       ; f Push ocntr
||	 STW	  .D2	  B10,*+B15[11]       ; f Push B10
||	 MV	  .S1x	  B6,A1 	      ; f Initialize icntr
    
	 STW	  .D1	  A8,*+A15[12]	      ; f Push ptr_d
||	 STW	  .D2	  B8,*+B15[13]	      ; f Push ar
||	 MV	  .S1	  B8,A2 	      ; f Initialize ar
||	 MV	  .S2	  B8,B2 	      ; f Initialize ar
||	 ZERO	  .L2	  B5		      ; f Initialize xold=0

	 STW	  .D1	  A6,*+A15[14]	      ; f Push ptr_y
||	 STW	  .D2	  B6,*+B15[15]	      ; f Push icntr
||	 MV	  .S1x	  B4,A0 	      ; f Initialize ptr_h
||	 ADD	  .S2	  B4,8,B0	      ; f ptr_h1=ptr_h+8
||	 ZERO	  .L1	  A11		      ; f Clear error
||	 MPYSP	  .M2x	  A2,B5,B5	      ; f ar*xold

	 STW	  .D2	  B3,*+B15[16]	      ; f Push return address
||	 ADD	  .L2x	  A3,4,B3	      ; f ptr_x1=ptr_x+4
					
**** START Benchmark Timing *****
B_Start:
OLoop:
	 LDW	  .D1	  *A3++[2],A6	      ; load x0
||	 LDW	  .D2	  *B3++[2],B6	      ; load x1

	 LDW	  .D1	  *A3++[2],A7	      ; load x2
||	 LDW	  .D2	  *B3++[2],B7	      ; load x3
||	 MPYSP	  .M1	  A11,A2,A2	      ; o aerr=ar*error
||	 MPYSP	  .M2x	  A11,B2,B2	      ; o aerr=ar*error

	 LDDW	  .D1	  *A0++[2],A13:A12    ; load h1:h0
||	 LDDW	  .D2	  *B0++[2],B13:B12    ; load h3:h2
||	 MPYSP	  .M2x	  A11,B5,B5	      ; o ar*xold*error=xold*aerr

	 ZERO	  .D2	  B11		      ; o Clear sum2
||	 ZERO	  .D1	  A11		      ; o Clear sum3
||  [A1] SUB	  .L1	  A1,4,A1	      ; if(icntr) icntr -= 4

	 ZERO	  .D1	  A10		      ; o Clear sum0
||	 ZERO	  .D2	  B10		      ; o Clear sum1
     
	 LDW	  .D1	  *A3++[2],A6	      ; @ load x0
||	 LDW	  .D2	  *B3++[2],B6	      ; @ load x1
||	 MPYSP	  .M1	  A2,A6,A8	      ; x0*aerr
||	 MPYSP	  .M2	  B2,B6,B8	      ; x1*aerr

	 LDW	  .D1	  *A3++[2],A7	      ; @ load x2
||	 LDW	  .D2	  *B3++[2],B7	      ; @ load x3
||	 MPYSP	  .M1	  A2,A7,A8	      ; x2*aerr
||	 MPYSP	  .M2	  B2,B7,B5	      ; x3*aerr

	 LDDW	  .D1	  *A0++[2],A13:A12    ; @ load h1:h0
||	 LDDW	  .D2	  *B0++[2],B13:B12    ; @ load h3:h2

	 MV	  .S2x	  A6,B9 	      ; move x0->Bside
||	 MV	  .S1x	  B13,A5	      ; move h3->Aside

	 ADDSP	  .L2x	  A12,B5,B4	      ; nh0=h0+xold*aerr
||	 ADDSP	  .L1	  A13,A8,A4	      ; nh1=h1+x0*aerr
||	 MV	  .S1x	  B6,A9 	      ; move x1->Aside
|| [!A1] B	  .S2	  Skip_Kernel	      ; if(!icntr) branch to Skip_Kernel

	 LDW	  .D1	  *A3++[2],A6	      ; @@ load x0
||	 LDW	  .D2	  *B3++[2],B6	      ; @@ load x1
||	 MPYSP	  .M1	  A2,A6,A8	      ; @ x0*aerr
||	 MPYSP	  .M2	  B2,B6,B8	      ; @ x1*aerr
||	 ADDSP	  .L1	  A5,A8,A4	      ; nh3=h3+x2*aerr
||	 ADDSP	  .L2	  B12,B8,B4	      ; nh2=h2+x1*aerr
||	 MV	  .S1x	  B7,A14	      ; move x3->Aside
||	 MV	  .S2x	  A7,B14	      ; move x2->Bside

	 LDW	  .D1	  *A3++[2],A7	      ; @@ load x2
||	 LDW	  .D2	  *B3++[2],B7	      ; @@ load x3
||	 MPYSP	  .M1	  A2,A7,A8	      ; @ x2*aerr
||	 MPYSP	  .M2	  B2,B7,B5	      ; @ x3*aerr

	 LDDW	  .D1	  *A0++[2],A13:A12    ; @@ load h1:h0
||	 LDDW	  .D2	  *B0++[2],B13:B12    ; @@ load h3:h2
||  [A1] SUB	  .S1	  A1,4,A1	      ; if (icntr) icntr -= 4

	 MV	  .S2x	  A6,B9 	      ; @ move x0-> Bside
||	 MV	  .S1x	  B13,A5	      ; @ move h3->Aside
||	 MPYSP	  .M1	  A9,A4,A8	      ; x1*nh1
||	 MPYSP	  .M2	  B9,B4,B8	      ; x0*nh0
||	 STW	  .D1	  A4,*-A0[11]	      ; save nh1
||	 STW	  .D2	  B4,*-B0[14]	      ; save nh0

	 ADDSP	  .L2x	  A12,B5,B4	      ; @ nh0
||	 ADDSP	  .L1	  A13,A8,A4	      ; @ nh1
||	 MV	  .S1x	  B6,A9 	      ; @ move x1->Aside
||	 MPYSP	  .M1	  A14,A4,A8	      ; x3*nh3
||	 MPYSP	  .M2	  B14,B4,B8	      ; x2*nh2
||	 STW	  .D1	  A4,*-A0[9]	      ; save nh3
||	 STW	  .D2	  B4,*-B0[12]	      ; save nh2
||  [A1] B	  .S2	  loop		      ; if(icntr), branch to loop

loop:          
	 LDW	  .D1	  *A3++[2],A6	      ; @@@ load x0
||	 LDW	  .D2	  *B3++[2],B6	      ; @@@ load x1
||	 MPYSP	  .M1	  A2,A6,A8	      ; @@ x0*aerr
||	 MPYSP	  .M2	  B2,B6,B8	      ; @@ x1*aerr
||	 ADDSP	  .L1	  A5,A8,A4	      ; @ nh3
||	 ADDSP	  .L2	  B12,B8,B4	      ; @ nh2
||	 MV	  .S1x	  B7,A14	      ; @ move x3->Aside
||	 MV	  .S2x	  A7,B14	      ; @ move x2->Bside

	 LDW	  .D1	  *A3++[2],A7	      ; @@@ load x2
||	 LDW	  .D2	  *B3++[2],B7	      ; @@@ load x3
||	 MPYSP	  .M1	  A2,A7,A8	      ; @@ x2*aerr
||	 MPYSP	  .M2	  B2,B7,B5	      ; @@ xold*aerr

	 LDDW	  .D1	  *A0++[2],A13:A12    ; @@@ load h1:h0
||	 LDDW	  .D2	  *B0++[2],B13:B12    ; @@@ load h3:h2
||  [A1] SUB	  .S1	  A1,4,A1	      ; @ if(icntr) icntr -= 4
||	 ADDSP	  .L1	  A10,A8,A10	      ; sum1
||	 ADDSP	  .L2	  B10,B8,B10	      ; sum0

	 MV	  .S2x	  A6,B9 	      ; @@ move x0-> Bside
||	 MV	  .S1x	  B13,A5	      ; @@ move h3->Aside
||	 MPYSP	  .M1	  A9,A4,A8	      ; @ x1*nh1
||	 MPYSP	  .M2	  B9,B4,B8	      ; @ x0*nh0
||	 STW	  .D1	  A4,*-A0[11]	      ; @ save nh1
||	 STW	  .D2	  B4,*-B0[14]	      ; @ save nh0
||	 ADDSP	  .L1	  A11,A8,A11	      ; sum3
||	 ADDSP	  .L2	  B11,B8,B11	      ; sum2

	 ADDSP	  .L2x	  A12,B5,B4	      ; @@ nh0
||	 ADDSP	  .L1	  A13,A8,A4	      ; @@ nh1
||	 MV	  .S1x	  B6,A9 	      ; @@ move x1->Aside
||	 MPYSP	  .M1	  A14,A4,A8	      ; @ x3*nh3
||	 MPYSP	  .M2	  B14,B4,B8	      ; @ x2*nh2
||	 STW	  .D1	  A4,*-A0[9]	      ; @ save nh3
||	 STW	  .D2	  B4,*-B0[12]	      ; @ save nh2
||  [A1] B	  .S2	  loop		      ; @ if(icntr) branch to loop

Skip_Kernel:
	 LDW	  .D1	  *+A15[12],A6	      ; o Pop ptr_d
||	 MV	  .S2	  A15,B15	      ; o A15 = B15

	 LDW	  .D1	  *+A15[15],A1	      ; o Pop icntr
||	 LDW	  .D2	  *+B15[10],B1	      ; o Pop ocntr

	 ADDSP	  .L1	  A10,A8,A10	      ; e sum0
||	 ADDSP	  .L2	  B10,B8,B10	      ; e sum0
||	 LDW	  .D1	  *+A15[13],A2	      ; o Pop ar

	 ADDSP	  .L1	  A11,A8,A11	      ; e sum3
||	 ADDSP	  .L2	  B11,B8,B11	      ; e sum2

	 LDW	  .D1	  *+A15[14],A8	      ; o Pop ptr_y
       
	 LDW	  .D1	  *A6++,A7	      ; o load d
       
	 SHL	  .S2x	  A1,2,B6	      ; o ptr_reset=icntr*4
||	 ADDSP	  .L1X	  A10,B10,A10	      ; o sum0+sum1
||	 STW	  .D1	  A6,*+A15[12]	      ; o Push ptr_d
||  [B1] SUB	  .D2	  B1,1,B1	      ; o if(ocntr) ocntr-=1
       
	 ADDSP	  .L1x	  A11,B11,A11	      ; o sum2+sum3
||  [B1] ADDAW	  .D2	  B6,8,B6	      ; o h_reset=x_reset=ptr_reset+8
||  [B1] MV	  .S2	  A2,B2 	      ; o ar
 
    [B1] SUB	  .S1x	  A3,B6,A3	      ; o ptr_x-=x_reset

    [B1] LDW	  .D1	  *A3++,B5	      ; o load xold
||  [B1] SUB	  .S1x	  A0,B6,A0	      ; o ptr_h-=h_reset

	 SUBSP	  .L1	  A7,A10,A7	      ; o d-(sum0+sum1)
||	 ADD		  A8,4,A8	      ; o ptr_y+=8
||  [B1] ADD	  .L2	  A3,4,B3	      ; o ptr_x1+=8
||  [B1] STW	  .D2	  B1,*+B15[10]	      ; o Push ocntr

	 ADDSP	  .L1	  A10,A11,A10	      ; o y=sum0+sum1+sum2+sum3
||  [B1] ADD	  .L2x	  A0,8,B0	      ; o ptr_h1=ptr_h+8
||  [B1] B	  .S2	  OLoop 	      ; o if(ocntr),branch to OLoop
||  [B1] STW	  .D1	  A8,*+A15[14]	      ; o Push ptr_y
|| [!B1] LDW	  .D2	  *+B15[16],B3	      ; f if(!ocntr) pop return add

   [!B1] LDW	  .D2	  *+B15[11],B10       ; f if(!ocntr) pop B10
|| [!B1] LDW	  .D1	  *+A15[10],A10       ; f if(!ocntr) pop A10
				     	
   [!B1] LDW	  .D1	  *+A15[8],A11	      ; f if(!ocntr) pop A11
|| [!B1] LDW	  .D2	  *+B15[9],B11	      ; f if(!ocntr) pop B11
	
    [B1] SUBSP	  .L1	  A7,A11,A11	      ; o err=d-y
||  [B1] MPYSP	  .M2x	  A2,B5,B5	      ; o ar*xold
|| [!B1] LDW	  .D1	  *+A15[6],A12	      ; f if(!ocntr) pop A12
|| [!B1] LDW	  .D2	  *+B15[7],B12	      ; f if(!ocntr) pop B12
     
	 STW	  .D1	  A10,*--A8	      ; o store y

   [!B1] LDW	  .D1	  *+A15[4],A13	      ; f if(!ocntr) pop A13
|| [!B1] LDW	  .D2	  *+B15[5],B13	      ; f if(!ocntr) pop B13
  
B_End:
***** END Benchmark Timing *****
	 B	  .S2	  B3		      ; return
||	 LDW	  .D1	  *+A15[2],A14	      ; pop A14
||	 LDW	  .D2	  *+B15[3],B14	      ; pop A10

	 LDW	  .D1	  *+A15[1],B15	      ; pop A15

	 LDW	  .D2	  *+B15[0],A15	      ; pop B15

	 ADDAW	  .D2	  B15,16,B15	      ; remove space from stack
	 NOP		  2		      ; return delay slots
