*===================================================================

*	TEXAS INSTRUMENTS, INC. 

*

*	3D GEOMETRY TRANSFORMATION

*				

*     	Revision Date: 11/5/97

*-------------------------------------------------------------------*     

*

*	USAGE	This routine is C callable and can be called as:

*

*		

*		void transf(float* tmt, float* inp, float* out, int siz)

*						      

*		tmt -	Transformation Matrix 		Xx Yx Zx Wx Sx Tx     

*			and Viewport Control    	Xy Yy Zy Wy Sy Ty     

*			(Scale and Translation) 	Xz Yz Zz Wz Sz Tz     

*			(# indicates not used)		Xw Yw Zw Ww  #  #     

*							     	   	      

*		inp - 	An array of polygon vertices	  

*			in object coordinates		    

*			(number of vertices = siz)	x  y  z  w	      

*			(each vertex has 4 values)

*								     	      

*		out - 	An array of polygon vertices 	          

*			and clipping parameters		w  w_ xh xc0 xc1 xv   

*			in screen coordinates	  	w  w_ yh yc0 yc1 yv   

*			(number of vertices = siz)    	w  w_ zh zc0 zc1 zv   

*			(each vertex has 18 values)

*			

*		siz -	number of vertices (input or output). The vertex before 

*			processing consists of 4 values and after processing it

*			consists of 18 values. Thus the total size of the input 

*			array is 4*siz words and the total size of the output 

*			array is 18*siz words.

*

*			If this routine is not used as a C callable function, then

*			you need to initialize the values for all of the parameters 

*			passed to the function in C. Those parameters are assumed to

*			be in the registers as defined by the calling convention of

*			the compiler (refer to the TMS320C6x Optimizing C Compiler 

*			User's Guide). In addition,if this routine is not used as C 

*			callable function, the code to push and pop A10, A11, A12, 

*			A13, A14, A15, B10, B11, B12, B13, B14 and B15 registers is

*			not needed and can be removed. This routine does not have a

*			return parameter - it only fills the "out" array.

*								       

*	PERFORMANCE:	approx 16 cycles per vertex				      

*				or					      

*			12.5 Million vertices per second (5ns cycles)	      

*									      

*			Included in the algorithm:	geometry transformation   

*						    	clipping preprocessing    

*						    	perspective projection    

*						    	viewport mapping  

*

*	C CODE:		This is the C equivalent of the assembly code. Note that the

*			assembly code is hand optimized and restrictions may apply. 

*

*			void transf(float *TMT, float *INP, float *OUT, int SIZ)

*			{

*			float	*inp;

*			float 	w, w_, h;

*			int	s, c,  r;

*

*			TMT += 18;				/*ptr to Xw Yw Zw Ww*/

*			for (s=0 ; s<SIZ ; s++)	

*				{				/* next vertex */

*				inp = INP;

*				w = 0.0;

*				for (r=0 ; r<4 ; r++)

*					{

*					w += *inp++ * *TMT++;

*					}

*				TMT -= 22;			/*ptr to Xx Yx Zx Wx*/

*				w_= 1.0/w;				

*				for (c=0 ; c<3 ; c++)

*					{

*					inp  = INP;

*					*OUT++ = w;		/* w ,  w ,  w   */

*					*OUT++ = w_;		/* w_,  w_,  w_  */

*					*OUT = 0.0;

*					for (r=0 ; r<4 ; r++)

*						{

*						*OUT += *inp++ * *TMT++; 

*						} 

*					h = *OUT++;		/* xh,  yh,  zh  */

*					*OUT++ = w - h;		/* xc0, yc0, zc0 */

*					*OUT++ = w + h;		/* xc1, yc1, zc1 */

*					h = h * w_;

*					h = h * *TMT++;

*					*OUT++ = h + *TMT++;	/* xv,  yv,  zv  */

*					}

*				INP = inp;

*				}

*			return;

*			}

*

*-----------------------This is the main function that was used to call the transf

*

*			# define 	SIZ	3

*			extern void transf(float*, float*, float*, int);

*

*			float tmt[24] = {1, 0, 0, 0, 1, 2,

*					 0, 1, 0, 0, 3, 4,

*					 0, 0, 1, 0, 5, 6,

*					 0, 0, 0, 1, 0, 0 };

*

*			float inp[4*SIZ] = {7, 7, 7, 7,

*		   			    8, 8, 8, 8,

*		    			    9, 9, 9, 9};

*

*			float out[18*(SIZ+4)] = {0.0}; /*reserve 72 words for epilog*/

*			int siz = SIZ;

*			int ret;

*

*			long main (void)

*				{

*				transf (tmt, inp, out, siz); 

*				return(ret);

*				} 

*

*	DESCRIPTION:	This routine represents the "front end" of the 3D graphics 

*			transformation pipeline. Before the 3D geometry is displayed

*			on the screen, the vertex of each polygon has to be 

*			transformed to the screen coordinate system. This routine 

*			performs the geometry transformation, clipping

*			preprocessing, perspective projection and viewport mapping.

*			This routine does not perform clipping, however it provides

*			the transformed vertices in a convenient format for the 

*			clipping function which is the next step in the 3D pipeline.

*			This routine applies a 4x6 transformation matrix to 

*			every vertex of the 4xSIZE input array. Each polygon vertex 

*			consists of x,y,z and w coordinates. The transformation 

*			matrix includes linear distance, direction cosines and scale.

*			The results are stored in the 18x(SIZE+4) output array that

*			includes for each vertex: w, 1/w, xyz in homogeneous

*			coordinates, xyz in viewport coordinates, and 6 clipping 

*			planes. The 4 "extra" vertices at the end of the output array

*			contain trash written at the end of each loop in absence of 

*			loop epilogues.

*			

*			

*	TECHNIQUES:	1. Load double word instruction is used to simultaneously

*			   load	two floating point values in a single clock cycle

*			2. Software pipelining is used to schedule instructions 

*			   so that multiple iterations of a loop execute in parallel

*			3. The www loop computes the w and 1/w for each vertex 

*			4. The xxx loop computes the xh, xc0, xc1 and xv parameters

*			5. The yyy loop computes the yh, yc0, yc1 and yv parameters

*			6. The zzz loop computes the zh, zc0, zc1 and zv parameters

*			7. The division mantissa error is < 2^-16 resulting from one

*			   iteration of Newton-Raphson algorithm x[n+1]=x[n]*(2-v*x[n])

*			   with the seed x[0] computed by the RCPSP (reciprocal

*			   estimate) instruction 

*

*	ASSUMPTIONS:	1. Little Endian is assumed for LDDW

*			2. No restrictions on number of vertices

*			3. Pad the output array with 72 extra words to catch the 

*			   loop epilogue trash.

*

*	ARGUMENTS PASSED:	*tmt	->	A4

*				*inp	->	B4

*				*out	->	A6

*				siz	->	B6

*===================================================================*/

	.global	_transf

	.bss	stack,68			; 68/4=17 words (save C env

	.text					; +TMT,INP,OUT,SIZ)

*-----------------------------------------------*
* Entry: save the return address, A10-A15, B10-B15 and the four
* arguments in "stack", a 68-byte statically allocated save area
* (it is not the C run-time stack).  Word layout of the area:
*	[0]	 B3   return address
*	[1..12]	 A10,B10,A11,B11,A12,B12,A13,B13,A14,B14,A15,B15
*	[13]	 A4   TMT pointer
*	[14]	 B4   INP pointer
*	[15]	 A6   OUT pointer
*	[16]	 B6   SIZ
* The same area is rewritten on every call, so a nested or overlapping
* call would overwrite the values saved by the first one.
* A0 and B0 both hold the area's address so that the .D1 and .D2 units
* can each store one register in the same execute packet.
*-----------------------------------------------*

_transf

	MVK	.S1	stack,A0		; A0 = save-area address, low half

	MVKH	.S1	stack,A0		; A0 = save-area address, high half



	MVK	.S2	stack,B0		; B0 = save-area address, low half

	MVKH	.S2	stack,B0		; B0 = save-area address, high half



	STW	.D2	B3,  *B0		; save return addr in word 0

	STW	.D1	A10,*+A0[1]		; save A10 in word 1

||	STW	.D2	B10,*+B0[2]		; save B10 in word 2

	STW	.D1	A11,*+A0[3]		; save A11 in word 3

||	STW	.D2	B11,*+B0[4]		; save B11 in word 4

	STW	.D1	A12,*+A0[5]		; save A12 in word 5

||	STW	.D2	B12,*+B0[6]		; save B12 in word 6

	STW	.D1	A13,*+A0[7]		; save A13 in word 7

||	STW	.D2	B13,*+B0[8]		; save B13 in word 8

	STW	.D1	A14,*+A0[9]		; save A14 in word 9

||	STW	.D2	B14,*+B0[10]		; save B14 in word 10

	STW	.D1	A15,*+A0[11]		; save A15 in word 11

||	STW	.D2	B15,*+B0[12]		; save B15 in word 12

	STW	.D1	A4, *+A0[13]		; TMT pntr (reloaded by xyz loops)

||	STW	.D2	B4, *+B0[14]		; INP pntr (reloaded by xyz loops)

	STW	.D1	A6, *+A0[15]		; OUT pntr (reloaded by xyz loops)

||	STW	.D2	B6, *+B0[16]		; SIZ      (reloaded by xyz loops)























































*---------www:	loop (4 cycles per vertex)------*

*
* Computes W = Xw*x + Yw*y + Zw*z + Ww*w and its reciprocal for each
* vertex.  Every pass stores W three times and 1/W three times, each
* store advancing its pointer by 6 words, i.e. once per x/y/z row of
* the 18-word output record described in the file header.
*
* The reciprocal is a seed refined by one Newton-Raphson step:
*	seed = RCPSP(W);  m1 = W*seed;  d = 2.0 - m1;  1/W = seed*d
*
* Registers prepared by the setup code below:
*	A13:A12 = (Yw,Xw)    B13:B12 = (Ww,Zw)	row w of TMT
*	A14 / B14   INP pointers for the (y,x) / (w,z) halves of a vertex
*	A15 / B15   OUT pointers for the W / 1/W slots (B15 = A15 + 4)
*	B3 = 2.0    A0 = 1.0	(B3 was saved at entry, so it is free)
*	B2 = SIZ+10 branch counter, decremented once per pass
*	B0	    pass counter, counts up from 0
*	A1 = (6 < B0)	enables the W stores
*	B1 = (10 < B0)	enables the 1/W stores
*
* The loop is software pipelined (see TECHNIQUES in the file header),
* so one pass mixes work belonging to several vertices; A1 and B1 keep
* the stores switched off during the first passes.  There is no
* separate epilogue: once B2 has been decremented to zero the branch is
* not taken and execution falls through into the xxx setup.
*
* NOTE(review): B14 and B15 are overwritten here and are only reloaded
* at the end of the routine.  TI's C6x register conventions appear to
* use them as data-page and stack pointers - confirm that nothing (for
* example an interrupt handler) relies on them while this routine runs.
*

	MV	.S1		A4,A0		; A0 = TMT ptr (temporary)	****	

	MV	.S1X		B4,A14		; A14 = INP ptr (x,y)	****

	MV	.S1		A6,A15		; A15 = OUT ptr (w)	****

	ADD	.S2		10,B6,B2	; B2 = SIZ+10: branch cnt/cond	****

	LDDW	.D1	*+A0[9],A13:A12		; load TMTw (Y,X), dword 9 = words 18,19

	LDDW	.D1	*+A0[10],B13:B12	; load TMTw (W,Z), dword 10 = words 20,21

	MVK	.S2	2,B3			; load 2

	INTSP	.L2	B3,B3			; 2 -> 2.0 (Newton-Raphson constant)

	MVK	.S1	1,A0			; load 1 (TMT ptr in A0 no longer needed)

	INTSP	.L1	A0,A0			; 1 -> 1.0

	ADD	.S2X	8,A14,B14		; B14 = INP ptr (z,w), 8 bytes past (x,y)

	ADD	.S2X	4,A15,B15		; B15 = OUT ptr (1/w), 4 bytes past w

||	ZERO	.L2	B0			; init store count-up

||	ZERO	.D1	A1			; init store cond1 (W): stores off

||	B	.S1	www			; prime the first branch

	ZERO	.D2	B1			; init store cond2 (1/W): stores off

*-----------------------------------------------*
* packet 1: products Yy and Ww, final sum W, stores, count a pass

www:	MPYSP	.M1	A13,A11,A9		; Y * y = Yy

||	MPYSP	.M2	B13,B11,B9		; W * w = Ww

||	ADDSP	.L1X	A7,B7,A6		; yx+wz = W

||[A1]	STW	.D1	A5,*A15++[6]		; store W

||[B1]	STW	.D2	B4,*B15++[6]		; store 1/W

||	ADDK	.S2	1,B0			; incr store count

||	NOP

||	NOP

* packet 2: products Xx and Zz, partial sums, stores, count down branches

	MPYSP	.M1	A12,A10,A8		; X * x = Xx

||	MPYSP	.M2	B12,B10,B8		; Z * z = Zz

||	ADDSP	.L1	A9,A8,A7		; Yy+Xx = yx

||	ADDSP	.L2	B9,B8,B7		; Ww+Zz = wz

||[A1]	STW	.D1	A5,*A15++[6]		; store W

||[B1]	STW	.D2	B4,*B15++[6]		; store 1/W

||[B2]	ADDK	.S2	-1,B2			; decr branch count

||	NOP

* packet 3: Newton-Raphson m1 and d, stores, refresh A1, loop branch

	MV	.S1	A6,A5			; W -> W

||[A1]	STW	.D1	A5,*A15++[6]		; store (W)

||	MPYSP	.M1	A5,A4,A2		; W * 1/w = m1

||	SUBSP	.L2X	B3,A2,B6		; 2.0 - m1 = d

||[B1]	STW	.D2	B4,*B15++[6]		; store (1/W)

||	CMPLT	.L1X	6,B0,A1			; store cond1 (W): A1 = (6 < B0)

||[B2]	B	.S2	www			; process next vertex

||	NOP

* packet 4: load next vertex, reciprocal seed, 1/W = seed*d, refresh B1

	LDDW	.D1	*A14++[2],A11:A10	; load INP (y,x)

||	LDDW	.D2	*B14++[2],B11:B10	; load INP (w,z)

||	RCPSP	.S1	A6,A4			; 1/w (seed)      	

||	MPYSP	.M1	A0,A4,A3		; 1/w -> 1/w (multiply seed by 1.0)

||	MV	.S2X	A3,B5			; 1/W -> 1/w

||	MPYSP	.M2	B5,B6,B4		; 1/w * d = 1/W

||	CMPLT	.L2	10,B0,B1		; store cond2 (1/W): B1 = (10 < B0)

||	NOP

*-----------------------------------------------*

*---------xxx:	loop (4 cycles per vertex)------*

*
* Fills the x row of every output vertex: h, c0, c1 and v, using
* TMT row x (Xx Yx Zx Wx Sx Tx) and the W and 1/W values that the www
* loop already stored in OUT.
*	h  = Xx*x + Yx*y + Zx*z + Wx*w
*	c0 = W + h	c1 = W - h
*	v  = h * 1/W * Sx + Tx
*
* SIZ, TMT, INP and OUT are reloaded from the save area because the
* www loop reused A4, B4, A6 and B6 as temporaries.
*
* Registers prepared by the setup code below:
*	A13:A12 = (Yx,Xx)   B13:B12 = (Wx,Zx)   B1:B0 = (Tx,Sx)
*	A14	INP pointer (B14 is recomputed as A14+8 on every pass)
*	A15	OUT pointer, starts 432 bytes (6 vertices of 72) below OUT
*	B15	OUT pointer, starts 576 bytes (8 vertices of 72) below OUT
*	B2 = SIZ+9  branch counter, decremented once per pass
*	A2	counts down from 10; the v store is enabled when A2 == 0
*	A1 = (4 > A2)	enables the h, c0 and c1 stores
* Both OUT pointers advance 72 bytes (one vertex) per pass; the
* constant offsets on the load and the stores select the field.
* Lines tagged ";xyz;" presumably mark what differs between the xxx,
* yyy and zzz copies of this loop.
*
* NOTE(review): the C model in the file header writes w - h to c0 and
* w + h to c1; this kernel stores w + h in the word right after h and
* w - h in the word after that - confirm which order is intended.
* NOTE(review): INP is loaded on every pass and there are more than
* SIZ passes, and the first (1/W,W) loads use addresses below OUT; the
* header only mentions padding at the end of OUT - confirm that these
* out-of-range reads are acceptable.
*

	MVK	.S1	stack,A1		; A1 = save-area address, low half

	MVKH	.S1	stack,A1		; A1 = save-area address, high half

	LDW	.D1	*+A1[16],A2		; load SIZ

	LDW	.D1	*+A1[13],A0		; load TMT pntr

	LDW	.D1	*+A1[14],A14		; load INP pntr

	LDW	.D1	*+A1[15],A15		; load OUT pntr

	ZERO	.L1	A1			; init store condition: stores off

	ADD	.S2	9,A2,B2			; B2 = SIZ+9: branch count/cond

	LDDW	.D1	*+A0[0],A13:A12   	;xyz; load TMTx (Y,X)

	LDDW	.D1	*+A0[1],B13:B12  	;xyz; load TMTx (W,Z)

	LDDW	.D1	*+A0[2],B1:B0   	;xyz; load TMTx (T,S)

||	MV	.S2X	A15,B15			; init OUT ptr (c1,v)

	ADDK	.S2	-576,B15		; init OUT ptr (c1,v): 8 vertices back

||	ADDK	.S1	-432,A15		; init OUT ptr (w,1/w,h,c0): 6 back

	B	.S1	xxx 		   	;xyz; prime the first branch

	MVK	.S1	10,A2			; init store count-down

*-----------------------------------------------*
* packet 1: products Yy and Ww, h, fetch (1/W,W), store v, count down A2

xxx:	MPYSP	.M1	A13,A11,A9	   	;xyz; Y * y = Yy

||	MPYSP	.M2	B13,B11,B9		; W * w = Ww

||	ADDSP	.L2X	A7,B7,B6		; yx+wz = h

||	LDDW	.D1	*+A15[9],A5:A4   	;xyz; load (1/W,W)

||[!A2] STW	.D2	B3,*-B15[31]		;xyz; store (v)

|| [A2]	ADDK	.S1	-1,A2			; store cond (v) and

||	NOP					; decr store count

||	NOP		

* packet 2: products Xx and Zz, partial sums, store h, rebuild B14

	MPYSP	.M1	A12,A10,A8		; X * x = Xx

||	MPYSP	.M2	B12,B10,B8		; Z * z = Zz

||	ADDSP	.L1	A9,A8,A7		; Yy+Xx = yx

||	ADDSP	.L2	B9,B8,B7		; Ww+Zz = wz

||	MV	.S1X	B6,A6			; h -> h

|| [A1] STW	.D1	A6,*-A15[16]  		;xyz; store (h)

||	ADD	.S2X	8,A14,B14	    	;^^^; load INP ptr (z,w)

|| [B2]	ADD	.D2	-1,B2,B2		; decr branch count

* packet 3: clip values c0 and c1, their stores, advance B15, loop branch

	ADDSP	.L1X	A4,B6,A3		; w + h = c0

||	SUBSP	.L2X	A4,B6,B5		; w - h = c1

|| [A1] STW	.D1	A3,*-A15[15]		;xyz; store (c0)

|| [A1] STW	.D2	B5,*+B15[22]		;xyz; store (c1)

||	ADDK	.S2	72,B15			; point to next OUT vertex

|| [B2] B	.S1	xxx		    	;xyz; process next vertex

||	NOP

||	NOP

* packet 4: load next vertex, viewport chain v0/v1/v2, refresh A1, advance A15

	LDDW	.D1	*A14++[2],A11:A10	; load INP (y,x)

||	LDDW	.D2	*B14++[2],B11:B10	; load INP (w,z)

||	MPYSP	.M1X	B6,A5,A0		; h * 1/w = v0

||	MPYSP	.M2X	A0,B0,B4		; v0 * S = v1

||	ADDSP	.L2	B4,B1,B3		; v1 + T = v2

||	CMPGT	.L1	4,A2,A1			; store cond (h,c0,c1): A1 = (4 > A2)

||	ADDK	.S1	72,A15			; point to next OUT vertex

||	NOP

*-----------------------------------------------*

*---------yyy:	loop (4 cycles per vertex)------*

* 
* Fills the y row of every output vertex: h, c0, c1 and v, using
* TMT row y (Xy Yy Zy Wy Sy Ty) and the W and 1/W values already
* stored in OUT.
*	h  = Xy*x + Yy*y + Zy*z + Wy*w
*	c0 = W + h	c1 = W - h
*	v  = h * 1/W * Sy + Ty
*
* Same structure as the xxx loop; only the TMT doubleword indices
* (3,4,5) and the OUT offsets on the load and the stores differ, each
* OUT offset being 6 words further on than in xxx.
*
* Registers prepared by the setup code below:
*	A13:A12 = (Yy,Xy)   B13:B12 = (Wy,Zy)   B1:B0 = (Ty,Sy)
*	A14	INP pointer (B14 is recomputed as A14+8 on every pass)
*	A15	OUT pointer, starts 432 bytes (6 vertices of 72) below OUT
*	B15	OUT pointer, starts 576 bytes (8 vertices of 72) below OUT
*	B2 = SIZ+9  branch counter, decremented once per pass
*	A2	counts down from 10; the v store is enabled when A2 == 0
*	A1 = (4 > A2)	enables the h, c0 and c1 stores
*
* NOTE(review): the C model in the file header writes w - h to c0 and
* w + h to c1; this kernel stores w + h in the word right after h and
* w - h in the word after that - confirm which order is intended.
* NOTE(review): INP is loaded on every pass and there are more than
* SIZ passes - confirm that reading past the input array is acceptable.
*

	MVK	.S1	stack,A1		; A1 = save-area address, low half

	MVKH	.S1	stack,A1		; A1 = save-area address, high half

	LDW	.D1	*+A1[16],A2		; load SIZ

	LDW	.D1	*+A1[13],A0		; load TMT pntr

	LDW	.D1	*+A1[14],A14		; load INP pntr

	LDW	.D1	*+A1[15],A15		; load OUT pntr

	ZERO	.L1	A1			; init store condition: stores off

	ADD	.S2	9,A2,B2			; B2 = SIZ+9: branch count/cond

	LDDW	.D1	*+A0[3],A13:A12   	;xyz; load TMTy (Y,X)

	LDDW	.D1	*+A0[4],B13:B12  	;xyz; load TMTy (W,Z)

	LDDW	.D1	*+A0[5],B1:B0   	;xyz; load TMTy (T,S)

||	MV	.S2X	A15,B15			; init OUT ptr (c1,v)

	ADDK	.S2	-576,B15		; init OUT ptr (c1,v): 8 vertices back

||	ADDK	.S1	-432,A15		; init OUT ptr (w,1/w,h,c0): 6 back

	B	.S1	yyy 		   	;xyz; prime the first branch

	MVK	.S1	10,A2			; init store count-down

*-----------------------------------------------*
* packet 1: products Yy and Ww, h, fetch (1/W,W), store v, count down A2

yyy:	MPYSP	.M1	A13,A11,A9	   	;xyz; Y * y = Yy

||	MPYSP	.M2	B13,B11,B9		; W * w = Ww

||	ADDSP	.L2X	A7,B7,B6		; yx+wz = h

||	LDDW	.D1	*+A15[12],A5:A4   	;xyz; load (1/W,W)

||[!A2] STW	.D2	B3,*-B15[25]		;xyz; store (v)

|| [A2]	ADDK	.S1	-1,A2			; store cond (v) and

||	NOP					; decr store count

||	NOP		

* packet 2: products Xx and Zz, partial sums, store h, rebuild B14

	MPYSP	.M1	A12,A10,A8		; X * x = Xx

||	MPYSP	.M2	B12,B10,B8		; Z * z = Zz

||	ADDSP	.L1	A9,A8,A7		; Yy+Xx = yx

||	ADDSP	.L2	B9,B8,B7		; Ww+Zz = wz

||	MV	.S1X	B6,A6			; h -> h

|| [A1] STW	.D1	A6,*-A15[10]  		;xyz; store (h)

||	ADD	.S2X	8,A14,B14	    	;^^^; load INP ptr (z,w)

|| [B2]	ADD	.D2	-1,B2,B2		; decr branch count

* packet 3: clip values c0 and c1, their stores, advance B15, loop branch

	ADDSP	.L1X	A4,B6,A3		; w + h = c0

||	SUBSP	.L2X	A4,B6,B5		; w - h = c1

|| [A1] STW	.D1	A3,*-A15[9]		;xyz; store (c0)

|| [A1] STW	.D2	B5,*+B15[28]		;xyz; store (c1)

||	ADDK	.S2	72,B15			; point to next OUT vertex

|| [B2] B	.S1	yyy		    	;xyz; process next vertex

||	NOP

||	NOP

* packet 4: load next vertex, viewport chain v0/v1/v2, refresh A1, advance A15

	LDDW	.D1	*A14++[2],A11:A10	; load INP (y,x)

||	LDDW	.D2	*B14++[2],B11:B10	; load INP (w,z)

||	MPYSP	.M1X	B6,A5,A0		; h * 1/w = v0

||	MPYSP	.M2X	A0,B0,B4		; v0 * S = v1

||	ADDSP	.L2	B4,B1,B3		; v1 + T = v2

||	CMPGT	.L1	4,A2,A1			; store cond (h,c0,c1): A1 = (4 > A2)

||	ADDK	.S1	72,A15			; point to next OUT vertex

||	NOP

*-----------------------------------------------*

*----------zzz:	loop (4 cycles per vertex)------*

*
* Fills the z row of every output vertex: h, c0, c1 and v, using
* TMT row z (Xz Yz Zz Wz Sz Tz) and the W and 1/W values already
* stored in OUT.
*	h  = Xz*x + Yz*y + Zz*z + Wz*w
*	c0 = W + h	c1 = W - h
*	v  = h * 1/W * Sz + Tz
*
* Same structure as the xxx and yyy loops; only the TMT doubleword
* indices (6,7,8) and the OUT offsets on the load and the stores
* differ, each OUT offset being 6 words further on than in yyy.
*
* Registers prepared by the setup code below:
*	A13:A12 = (Yz,Xz)   B13:B12 = (Wz,Zz)   B1:B0 = (Tz,Sz)
*	A14	INP pointer (B14 is recomputed as A14+8 on every pass)
*	A15	OUT pointer, starts 432 bytes (6 vertices of 72) below OUT
*	B15	OUT pointer, starts 576 bytes (8 vertices of 72) below OUT
*	B2 = SIZ+9  branch counter, decremented once per pass
*	A2	counts down from 10; the v store is enabled when A2 == 0
*	A1 = (4 > A2)	enables the h, c0 and c1 stores
* When B2 reaches zero the branch is not taken and execution falls
* through into the register-restore code that follows.
*
* NOTE(review): the C model in the file header writes w - h to c0 and
* w + h to c1; this kernel stores w + h in the word right after h and
* w - h in the word after that - confirm which order is intended.
* NOTE(review): INP is loaded on every pass and there are more than
* SIZ passes - confirm that reading past the input array is acceptable.
*

	MVK	.S1	stack,A1		; A1 = save-area address, low half

	MVKH	.S1	stack,A1		; A1 = save-area address, high half

	LDW	.D1	*+A1[16],A2		; load SIZ

	LDW	.D1	*+A1[13],A0		; load TMT pntr

	LDW	.D1	*+A1[14],A14		; load INP pntr

	LDW	.D1	*+A1[15],A15		; load OUT pntr

	ZERO	.L1	A1			; init store condition: stores off

	ADD	.S2	9,A2,B2			; B2 = SIZ+9: branch count/cond

	LDDW	.D1	*+A0[6],A13:A12   	;xyz; load TMTz (Y,X)

	LDDW	.D1	*+A0[7],B13:B12  	;xyz; load TMTz (W,Z)

	LDDW	.D1	*+A0[8],B1:B0   	;xyz; load TMTz (T,S)

||	MV	.S2X	A15,B15			; init OUT ptr (c1,v)

	ADDK	.S2	-576,B15		; init OUT ptr (c1,v): 8 vertices back

||	ADDK	.S1	-432,A15		; init OUT ptr (w,1/w,h,c0): 6 back

	B	.S1	zzz 		   	;xyz; prime the first branch

	MVK	.S1	10,A2			; init store count-down

*-----------------------------------------------*
* packet 1: products Yy and Ww, h, fetch (1/W,W), store v, count down A2

zzz:	MPYSP	.M1	A13,A11,A9	   	;xyz; Y * y = Yy

||	MPYSP	.M2	B13,B11,B9		; W * w = Ww

||	ADDSP	.L2X	A7,B7,B6		; yx+wz = h

||	LDDW	.D1	*+A15[15],A5:A4   	;xyz; load (1/W,W)

||[!A2] STW	.D2	B3,*-B15[19]		;xyz; store (v)

|| [A2]	ADDK	.S1	-1,A2			; store cond (v) and

||	NOP					; decr store count

||	NOP		

* packet 2: products Xx and Zz, partial sums, store h, rebuild B14

	MPYSP	.M1	A12,A10,A8		; X * x = Xx

||	MPYSP	.M2	B12,B10,B8		; Z * z = Zz

||	ADDSP	.L1	A9,A8,A7		; Yy+Xx = yx

||	ADDSP	.L2	B9,B8,B7		; Ww+Zz = wz

||	MV	.S1X	B6,A6			; h -> h

|| [A1] STW	.D1	A6,*-A15[4]  		;xyz; store (h)

||	ADD	.S2X	8,A14,B14	    	;^^^; load INP ptr (z,w)

|| [B2]	ADD	.D2	-1,B2,B2		; decr branch count

* packet 3: clip values c0 and c1, their stores, advance B15, loop branch

	ADDSP	.L1X	A4,B6,A3		; w + h = c0

||	SUBSP	.L2X	A4,B6,B5		; w - h = c1

|| [A1] STW	.D1	A3,*-A15[3]		;xyz; store (c0)

|| [A1] STW	.D2	B5,*+B15[34]		;xyz; store (c1)

||	ADDK	.S2	72,B15			; point to next OUT vertex

|| [B2] B	.S1	zzz		    	;xyz; process next vertex

||	NOP

||	NOP

* packet 4: load next vertex, viewport chain v0/v1/v2, refresh A1, advance A15

	LDDW	.D1	*A14++[2],A11:A10	; load INP (y,x)

||	LDDW	.D2	*B14++[2],B11:B10	; load INP (w,z)

||	MPYSP	.M1X	B6,A5,A0		; h * 1/w = v0

||	MPYSP	.M2X	A0,B0,B4		; v0 * S = v1

||	ADDSP	.L2	B4,B1,B3		; v1 + T = v2

||	CMPGT	.L1	4,A2,A1			; store cond (h,c0,c1): A1 = (4 > A2)

||	ADDK	.S1	72,A15			; point to next OUT vertex

||	NOP

*-----------------------------------------------*
* Exit: reload the return address and A10-A15 / B10-B15 from the
* static save area "stack" (words 0..12, same layout as written at
* entry) and branch back to the caller through B3.  The routine has
* no return value; the saved argument words 13..16 are not reloaded.
*-----------------------------------------------*

	MVK	.S1	stack,A0		; A0 = save-area address, low half

	MVKH	.S1	stack,A0		; A0 = save-area address, high half



	MVK	.S2	stack,B0		; B0 = save-area address, low half

	MVKH	.S2	stack,B0		; B0 = save-area address, high half



	LDW	.D2	*B0,B3 			; reload return addr from word 0

	LDW	.D1	*+A0[1],A10		; reload A10 from word 1

||	LDW	.D2	*+B0[2],B10		; reload B10 from word 2

	LDW	.D1	*+A0[3],A11		; reload A11 from word 3

||	LDW	.D2	*+B0[4],B11		; reload B11 from word 4

	LDW	.D1	*+A0[5],A12		; reload A12 from word 5

||	LDW	.D2	*+B0[6],B12		; reload B12 from word 6

	LDW	.D1	*+A0[7],A13		; reload A13 from word 7

||	LDW	.D2	*+B0[8],B13		; reload B13 from word 8

	LDW	.D1	*+A0[9],A14		; reload A14 from word 9

||	LDW	.D2	*+B0[10],B14		; reload B14 from word 10

	LDW	.D1	*+A0[11],A15		; reload A15 from word 11

||	LDW	.D2	*+B0[12],B15		; reload B15 from word 12

	B	.S2	B3			; return to calling C program

	NOP	6				; wait 6 cycles for the last

						; reload to land before returning

