*******************************************************************************
*
*	TEXAS INSTRUMENTS, INC.
*
*	GOURAUD SHADING
*
*	Revision Date: 03/25/97
*
*	USAGE This routine is C callable and can be called as
* 
*	      void gouraud(unsigned int n, unsigned int rd, unsigned int r, 
*                          unsigned int gd, unsigned int g, unsigned int bd, 
*		           unsigned int b, int p[])
*
*	      n   --- half of the pixels on a line		(input)
*	      rd  --- increment/decrement of the red color	(input)
*	      r   --- red color intensity 			(input)
*	      gd  --- increment/decrement of the green color	(input)
*	      g   --- green color intensity 			(input)
*	      bd  --- increment/decrement of the blue color	(input)
*	      b   --- blue color intensity			(input)
*	      p[] --- array of pixel's color intensity		(output)
*
*	      If the routine is not to be used as a C callable function,
*	      then all instructions relating to stack should be removed.
*	      Refer to comments of individual instructions. You will also
*	      need to initialize values for all the values passed as these
*	      are assumed to be in registers as defined by the calling
*	      convention of the compiler, (refer to the C compiler reference
*	      guide.)
*
*	C CODE
*	      This is the C equivalent of the Assembly Code without the 
*	      assumptions listed below. Note that the assembly code is hand
*             optimized and assumptions apply.
*
*	      void gouraud(unsigned int n, unsigned int rd, unsigned int r, 
*                          unsigned int gd, unsigned int g, unsigned int bd, 
*		     	   unsigned int b, int p[])
*	      {
*		  unsigned int    mask = 0xF800F800, i;
*		  for (i = 0; i < n; i++) {
*		    r += rd;
*		    g += gd;
*		    b += bd;
*		    p[i] = (r & mask) + ((g & mask) >> 5) + ((b & mask) >> 10);
*		  }
*	      }
*
*	DESCRIPTION
*	      This routine is used to obtain the intermediate pixels'
*	      intensity given the intensity values of pixels at -2 and 2*n.
*	      The initial values of r, g, and b are the color intensities
*	      of pixel at -2 of colors red, green, and blue, respectively.
*	      Let r', g', and b' be the color intensities of pixel at 2n.
*	      Then the increment/decrement of the colors are 
*			rd' = (r'-r)/(2*n+2)
*			gd' = (g'-g)/(2*n+2)
*			bd' = (b'-b)/(2*n+2).
*	***   HERE ALL THE VALUES ARE 16-BIT NUMBERS. 
*	      The gouraud shading algorithm obtains the intermediate
*	      pixels' intensities with linear interpolation method.
*	      That is, at k, the pixel color intensities are:
*			r" = r+(k+2)*rd' 
*		        g" = g+(k+2)*gd'
*			b" = b+(k+2)*bd'
*	      and the pixel's mixed color intensity is
*		        p[i]=(r"&0xF800)+((g"&0xF800)>>5)+((b"&0xF800)>>10).
*
*	TECHNIQUES
*	      1. We convert all the numbers into 32-bit, or word so that
*		 every iteration could compute 2 pixels.
*		 To do so, we use the following parameters defined as:
*		 	rd = (2*rd')<<16 + (2*rd')
*			gd = (2*gd')<<16 + (2*gd')
*			bd = (2*bd')<<16 + (2*bd')
*	         and p[i] actually represents two pixels, the upper half
*		 represent pixel p[2k] and the lower half represent pixel
*		 p[2k+1]. (The C code is shown in C CODE section above.)
*	       
*	     2. The code is actually implemented as
*
*	       {
*		 unsigned int    mask = 0xF800F800, i, j;
*		 for (i = 0; i < n; i+=2) {
*		     r += rd;
*		     g += gd;
*		     b += bd;
*		     p[i]=(r & mask) + ((g & mask) >> 5) + ((b & mask) >> 10);
*		     r += rd;
*		     g += gd;
*		     b += bd;
*		     p[i+1]=(r & mask) + ((g & mask) >> 5) + ((b & mask) >> 10);
*		 }
*	       }
*
*	       That is FOUR pixels are computed per one iteration.
*
*	ASSUMPTIONS:
*	        n >= 2, even. 
*	
*	MEMORY NOTE:
*		No memory bank hit under any conditions.
*
*	CYCLES		2*N + 7
*
*	PERFORMANCE COMMENTS:
*		Limited by 6 ALUs/cycle.
*******************************************************************************
	.global _gouraud
	.text

_gouraud:
*  Entry point of the C-callable Gouraud shading routine.
*
*  Inputs as read by the code below:
*	A4 = n, B4 = rd, A6 = r, B6 = gd, A8 = g, B8 = bd, A10 = b, B10 = p.
*  The routine returns by branching to B3.
*
*  A10, A11, A12, B10 and B11 are overwritten, so the prologue pushes them
*  on the stack (B15) and the epilogue reloads them.
*
*  Register roles once the prologue has finished:
*	A side (word i)   : A3 = r, A4 = g, A5 = b, A6 = rm, A7 = gm/gs,
*			    A12 = bm, A8 = bs -> p0 -> p[i]
*	B side (word i+1) : B1 = r, B4 = g, B5 = b, B6 = rm, B7 = gm/gs,
*			    B8 = bm -> bs -> p0 -> p[i+1]
*	A10/B0 = 2*rd, A11/B11 = 2*gd, A2/B2 = 2*bd, A9/B9 = mask
*	A0 = store pointer for p[i], B10 = store pointer for p[i+1]
*	A1 = loop counter
*
*  Each side advances by two steps per pass (hence the doubled increments),
*  so one pass of LOOP produces the two output words p[i] and p[i+1].
****	begin benchmark timing   ***
B_START:
*  Prologue: compute the first value of every channel for words i and i+1,
*  build the mask, double the increments, and save the preserved registers.
	ADD	.L1X	B8,A10,A5	; b  += bd,		i
||	MVK	.S1	0F800h,A9	; mask, low half (upper half set by MVKLH below)
||	STW	.D2	A10,*B15--	; push A10 on stack (old A10 = b is stored)

	ADD	.L1X	B4,A6,A3	; r  += rd,		i
||	ADD	.L2X	B8,A5,B5	; b  += bd,		i+1
||	ADD	.S2	B8,B8,B2	; bd *= 2
||	MVKLH	.S1	0F800h,A9	; mask = 0xF800F800
||	STW	.D2	B11,*B15--	; push B11 on stack

	ADD	.L1X	B6,A8,A4	; g  += gd,		i  (overwrites n in A4)
||	ADD	.L2X	B4,A3,B1	; r  += rd,		i+1
||	ADD	.S2	B4,B4,B0	; rd *= 2
||	AND	.S1	A5,A9,A12	; bm = b & mask,	i
||	SUB	.D1	A4,2,A1		; loop counter = n - 2 (reads n before A4 is overwritten)
||	STW	.D2	A12,*B15--	; push A12 on stack (old A12 is stored)

	MV	.L2X	A9,B9		; copy mask
||	MV	.L1X	B2,A2		; copy bd
||	AND	.S1	A3,A9,A6	; rm = r & mask,	i
||	STW	.D2	B10,*B15--	; push B10 on stack

	MV	.L1X	B0,A10		; copy rd
||	ADD	.L2X	B6,A4,B4	; g  += gd,		i+1
||	ADD	.D2	B6,B6,B11	; gd *= 2
||	AND	.S1	A4,A9,A7	; gm = g & mask,	i
||	AND	.S2	B5,B9,B8	; bm = b & mask,	i+1

*  The branch issued here takes effect only after the next packet and the
*  four LOOP packets have executed, so the first pass of LOOP always runs;
*  when A1 is non-zero the branch then re-enters LOOP for a second pass.
*  The last push does not decrement B15: B15 stays pointing at the saved A11.
*  NOTE(review): this appears to rely on no interrupt being serviced between
*  here and the epilogue (a branch is issued in this packet and in every loop
*  pass) -- confirm before reusing this stack sequence elsewhere.
	MV	.L1X	B11,A11		; copy gd
||	AND	.L2	B1,B9,B6	; rm = r & mask,	i+1
||[A1]	SUB	.S1	A1,2,A1		; decrement loop counter
||[A1]	B	.S2	LOOP		; branch to the loop
||	STW	.D2	A11,*B15	; push A11 on stack (old A11 is stored)

	SHRU	.S1	A12,10,A8	; bs = bm >> 10,	i
||	MV	.L1X	B10,A0		; copy p
||	ADD	.L2	4,B10,B10	; offset p (B10 = &p[1])

*  Loop kernel: four execute packets per pass.  Lines marked ";*" compute
*  values that are consumed by the following pass.  Each pass stores
*  p[i] through A0 and p[i+1] through B10, both pointers stepping by two words.
LOOP:
	ADD	.D1	A5,A2,A5	;* b += bd,		i
||	ADD	.L1	A6,A8,A8	; p0 = rm + bs,		i
||	ADD	.D2	B5,B2,B5	;* b += bd,		i+1
||	AND	.L2	B4,B9,B7	; gm = g & mask,	i+1
||	SHRU	.S2	B8,10,B8	; bs = bm >> 10,	i+1
||	SHRU	.S1	A7,5,A7		; gs = gm >> 5,		i

	ADD	.D1	A3,A10,A3	;* r += rd,		i
||	ADD	.L1	A4,A11,A4	;* g += gd,		i
||	ADD	.D2	B1,B0,B1	;* r += rd,		i+1
||	SHRU	.S2	B7,5,B7		; gs = gm >> 5,		i+1
||	ADD	.L2	B6,B8,B8	; p0 = rm + bs,		i+1
||	ADD	.S1	A8,A7,A8	; p[i] = p0 + gs,	i

	AND	.S1	A5,A9,A12	;* bm	 = b & mask,	i
||	ADD	.D2	B4,B11,B4	;* g += gd,		i+1
||	STW	.D1	A8,*A0++[2]	; store p[i], advance A0 by two words
||	ADD	.L2	B8,B7,B8	; p[i+1] = p0 + gs,	i+1
||[A1]	B	.S2	LOOP		; branch to the LOOP (takes effect after the following pass)
||	AND	.L1	A4,A9,A7	;* gm = g & mask,	i

*  In this packet B8 is both stored and overwritten: the store writes the
*  p[i+1] value produced by the previous packet, while the AND loads B8 with
*  the next bm.
	AND	.L1	A3,A9,A6	;* rm	 = r & mask,	i
||	SHRU	.S1	A12,10,A8	;* bs = bm >> 10,	i
||	AND	.L2	B5,B9,B8	;* bm	 = b & mask,	i+1
||	AND	.S2	B1,B9,B6	;* rm	 = r & mask,	i+1
||	STW	.D2	B8,*B10++[2]	; store p[i+1], advance B10 by two words
||[A1]	SUB	.D1	A1,2,A1		; decrement loop count
LOOP_END:

B_END:
***	end benchmarking timing	 ***

*  Epilogue: reload the saved registers and return.  Stack layout relative
*  to B15 on entry to the epilogue (in bytes):
*	+0 = A11, +4 = B10, +8 = A12, +12 = B11, +16 = A10.
*  A0 walks the A12/A10 slots while B15 walks the A11/B10/B11 slots, so two
*  loads can be issued per packet.  B15 ends up 16 bytes higher, i.e. at the
*  value it had on entry to the routine.
	LDW	.D2	*B15++,A11	; pop A11 off stack
||	ADD	.L1X	8,B15,A0	; copy stack pointer (pre-increment B15 + 8 = slot of A12)

	LDW	.D1	*A0++[2],A12	; pop A12 off stack, A0 -> slot of A10
||	LDW	.D2	*B15++[2],B10	; pop B10 off stack, B15 -> slot of B11
||	B	.S2	B3		; return

	LDW	.D1	*A0,A10		; pop A10 off stack
||	LDW	.D2	*B15++,B11	; pop B11 off stack

	NOP	4			; fill the remaining branch/load delay slots
