*===============================================================================
*
*	TEXAS INSTRUMENTS, INC.		
*
*	COLLISION DETECTION
*
*	Revision Date:	02/26/98
*	
*	USAGE	This routine is C Callable and can be called as:
*		
*		int colldet(float* x, float* p, float point,
*			    float distance, short num)
*
*		x -- array of 3-D points {x0, y0, z0, x1, y1, z1, ...}
*		p -- array of translation parameters {p0, p1, p2}
*		       to translate point for a single dimension
*		point -- single 1-D value to compare with the
*			 translated 1-D point
*		distance -- collision occurs if translated 1-D point
*			    is this distance or closer to 1-D "point"
*		num -- number of 3-D points in x[]
*
*		If the routine is not to be used as a C callable function,
*		then you need to initialize values for all of the values
*		passed to the function since these are assumed to be in
*		registers as defined by the calling convention of the
*		compiler, (refer to the TMS320C6x Optimizing C Compiler
*		User's Guide).
*
*	C CODE
*		This is the C equivalent of the assembly code.  Note that
*		the assembly code is hand optimized and restrictions may
*		apply.	In the future, an intrinsic will probably be
*		available ( _apssp() ) to replace fabsf().
*
* #include <math.h>
*
* int colldet(float* x, float* p, float point, float distance, short num)
* {
*   int retval = 0;
*   short i;
*   float sum0, sum1, dist0, dist1;
*
*   for(i=0; i < (num*3); i+=6)
*   {
*      sum0 = x[i+0]*p[0] + x[i+1]*p[1] + x[i+2]*p[2];
*      sum1 = x[i+3]*p[0] + x[i+4]*p[1] + x[i+5]*p[2];
*      dist0 = sum0 - point;
*      dist1 = sum1 - point;
*      dist0 = fabsf(dist0);
*      dist1 = fabsf(dist1);
*      if (dist0 < distance)
*      {
*	  retval = (int)&x[i+0];
*	  break;
*      }
*      if (dist1 < distance)
*      {
*	  retval = (int)&x[i+3];
*	  break;
*      }
*   }
*   return retval;
* }
*
*	DESCRIPTION
*
*		This routine takes a vector of 3-D points and translates
*		them in one dimension.	The 1-D distance from the translated
*		point to "point" is calculated.  If the distance is less
*		than "distance", then a collision is detected and the address
*		of the x dimension value is returned.
*
*		If a collision is not detected, the return value is zero.  If
*		there is more than one collision, only the first is detected
*		and the function exits as soon as possible.
*
*	TECHNIQUES
*
*		1.  LDDW instructions are used to load two SP floating point
*		    values simultaneously for the x and p arrays.
*		2.  The loop is unrolled once and software pipelined.
*		3.  The loop epilog is removed so that the function can exit
*		    as soon as a collision is detected.  Therefore, this
*		    function performs extraneous loads.
*		4.  The variables sub0, sum0, and sum1 share a register.
*		    The variables z0 and prod2 share a register.
*		    The variables x0 and prod1 share a register.
*		    The variables abs0 and prod0 share a register.
*		    The variables sub1, sum2, and sum3 share a register.
*		    The variables prod3 and prod4 share a register.
*		    The variables prod5 and abs1 share a register.
*		    The variables ptr_p and y1 share a register.
*		5.  Partial priming of the loop is performed to reduce
*		    code size without affecting performance.  Full priming
*		    of the loop is not possible due to the CMPLTSP instructions.
*	
*	ASSUMPTIONS
*
*		1.  Little Endian is assumed for LDDW
*		2.  The x and p arrays must be aligned on double-word
*		    boundaries.
*		3.  num (the number of 3-D points) must be even and at
*		    least 4 {4, 6, 8, ...}
*		4.  Since single assignment of registers is not used,
*		    interrupts should be disabled before this function is
*		    called.
*
*	MEMORY NOTE
*
*		There are no memory hits regardless of where the function
*		parameters are placed in memory.  This function performs
*		extraneous loads.  The arrays 'x' and 'p' should both be
*		aligned on double-word boundaries since LDDW instructions
*		are used to load them.
*
*       ARGUMENTS PASSED
*
*		x	 ->  A4 = ptr_x
*		p	 ->  B4 = ptr_p
*		point	 ->  A6 = pnt
*		distance ->  B6
*		num	 ->  A8
*
*	CYCLES
*
*	(N/2)*3 + 32  (worst case)     with C overhead
*	(N/2)*3 + 31  (worst case)     without C overhead
*
*===============================================================================
	.global _colldet
	.text
_colldet:
	STW	.D2	B10,*B15--[1]	; push B10 to stack
||	SUB	.L2X	A8,4,B2 	; B2 = num - 4
||	MV	.L1X	B6,A0		; dA = distance
*** BEGIN Benchmark Timing ***
B_START:
* Prolog Begins ****************************************************************
	LDDW	.D2	*B4++,B9:B8	; load p1:p0 from memory

	LDDW	.D1	*A4++[2],A9:A8	; load y0:x0 from memory

	LDDW	.D1	*A4++[1],B5:B4	; load z1:y1 from memory

	LDW	.D2	*B4,B6		; load p2 from memory
||	MVK	.S2	3,B0		; Prime the loop

	LDDW	.D1	*A4++[2],A9:A8	; @ load y0:x0 from memory
||	B	.S1	PLOOP		; Prime the loop
||	ZERO	.L1	A2		; Prime the loop

	LDDW	.D1	*A4--[4],B5:B4	; @ load z1:y1 from memory
||	ZERO	.L1	A3		; Prime the loop
||	ZERO	.S1	A5		; Prime the loop
||	ZERO	.L2	B1		; Prime the loop
||	ZERO	.S2	B7		; Prime the loop

	LDDW	.D1	*A4++[5],A3:A2	; @ load x1:z0 from memory
||	MPYSP	.M1X	A8,B8,A7	; prod0 = x0 * p0
||	ZERO	.L1	A7		; Prime the loop
||	ZERO	.S1	A8		; Prime the loop
||	ZERO	.L2	B10		; Prime the loop
*** Begin Prolog Loop **********************************************************
PLOOP:
	LDDW	.D1	*A4++[2],A9:A8	; @@@@@@ load y0:x0 from memory
||	SUBSP	.L1	A5,A6,A5	; sub0 = sum1 - pnt
||	ADDSP	.L2	B7,B1,B1	; @ sum3 = prod3 + sum2
||	MPYSP	.M1X	A9,B9,A8	; @@@@ prod1 = y0 * p1
||	MPYSP	.M2	B4,B9,B7	; @@@@ prod4 = y1 * p1
|| [B0] B	.S1	PLOOP

	LDDW	.D1	*A4--[4],B5:B4	; @@@@@@ load z1:y1 from memory
||	ADDSP	.L1	A7,A8,A5	; @@@ sum0 = prod0 + prod1
||	SUBSP	.L2X	B1,A6,B1	; sub1 = sum3 - pnt
||	MPYSP	.M1X	A2,B6,A2	; @@@ prod2 = z0 * p2
||	MPYSP	.M2	B5,B6,B10	; @@@@ prod5 = z1 * p2
|| [B0] SUB	.D2	B0,1,B0

	LDDW	.D1	*A4++[5],A3:A2	; @@@@@@ load x1:z0 from memory
||	ADDSP	.L1	A5,A2,A5	; @@ sum1 = sum0 + prod2
||	ADDSP	.L2	B7,B10,B1	; @@@ sum2 = prod4 + prod5
||	MPYSP	.M1X	A8,B8,A7	; @@@@@ prod0 = x0 * p0
||	MPYSP	.M2X	A3,B8,B7	; @@@ prod3 = x1 * p0
*** End Prolog Loop ************************************************************
	LDDW	.D1	*A4++[2],A9:A8	; @@@@@@@ load y0:x0 from memory
||	SUBSP	.L1	A5,A6,A5	; @ sub0 = sum1 - pnt
||	ADDSP	.L2	B7,B1,B1	; @@ sum3 = prod3 + sum2
||	B	.S1	LOOP		; if(cntr) branch to loop
||	MPYSP	.M1X	A9,B9,A8	; @@@@@ prod1 = y0 * p1
||	MPYSP	.M2	B4,B9,B7	; @@@@@ prod4 = y1 * p1
||	MV	.D2	B2,B0		; f cntr = B2

	LDDW	.D1	*A4--[4],B5:B4	; @@@@@@@ load z1:y1 from memory
||	ADDSP	.L1	A7,A8,A5	; @@@@ sum0 = prod0 + prod1
||	SUBSP	.L2X	B1,A6,B1	; @ sub1 = sum3 - pnt
||	ABSSP	.S1	A5,A7		; abs0 = abs(sub0)
||	MPYSP	.M1X	A2,B6,A2	; @@@@ prod2 = z0 * p2
||	MPYSP	.M2	B5,B6,B10	; @@@@@ prod5 = z1 * p2
||	ZERO	.S2	B2		; f retval = 0

	LDDW	.D1	*A4++[5],A3:A2	; @@@@@@@ load x1:z0 from memory
||	ADDSP	.L1	A5,A2,A5	; @@@ sum1 = sum0 + prod2
||	ADDSP	.L2	B7,B10,B1	; @@@@ sum2 = prod4 + prod5
||	CMPLTSP .S1	A7,A0,A1	; if(abs0 < dA) if0 = 1
||	ABSSP	.S2	B1,B10		; abs1 = abs(sub1)
||	MPYSP	.M1X	A8,B8,A7	; @@@@@@ prod0 = x0 * p0
||	MPYSP	.M2X	A3,B8,B7	; @@@@ prod3 = x1 * p0
********* Loop Begins **********************************************************
LOOP:
  [!B2] LDDW	.D1	*A4++[2],A9:A8	; @@@@@@@@ load y0:x0 from memory
||[B2]	ZERO	.D2	B0		; if(retval) cntr = 0
||	SUBSP	.L1	A5,A6,A5	; @@ sub0 = sum1 - pnt
||	ADDSP	.L2	B7,B1,B1	; @@@ sum3 = prod3 + sum2
||[B0]	B	.S1	LOOP		; if(cntr) branch to loop
||[!B2] CMPLTSP .S2X	B10,A0,B2	; if(abs1 < dA) retval = 1
||	MPYSP	.M1X	A9,B9,A8	; @@@@@@ prod1 = y0 * p1
||	MPYSP	.M2	B4,B9,B7	; @@@@@@ prod4 = y1 * p1

  [!B2] LDDW	.D1	*A4--[4],B5:B4	; @@@@@@@@ load z1:y1 from memory
||[B0]	SUB	.D2	B0,2,B0 	; if(cntr) cntr-=2
||	ADDSP	.L1	A7,A8,A5	; @@@@@ sum0 = prod0 + prod1
||	SUBSP	.L2X	B1,A6,B1	; @@ sub1 = sum3 - pnt
||	ABSSP	.S1	A5,A7		; @ abs0 = abs(sub0)
||[A1]	MVK	.S2	1,B2		; if(if0) retval = 1
||	MPYSP	.M1X	A2,B6,A2	; @@@@@ prod2 = z0 * p2
||	MPYSP	.M2	B5,B6,B10	; @@@@@@ prod5 = z1 * p2

  [!B2] LDDW	.D1	*A4++[5],A3:A2	; @@@@@@@@ load x1:z0 from memory
||	ADDSP	.L1	A5,A2,A5	; @@@@ sum1 = sum0 + prod2
||	ADDSP	.L2	B7,B10,B1	; @@@@@ sum2 = prod4 + prod5
||[!A1] CMPLTSP .S1	A7,A0,A1	; @ if(abs0 < dA) if0 = 1
||	ABSSP	.S2	B1,B10		; @ abs1 = abs(sub1)
||	MPYSP	.M1X	A8,B8,A7	; @@@@@@@ prod0 = x0 * p0
||	MPYSP	.M2X	A3,B8,B7	; @@@@@ prod3 = x1 * p0
********* Loop Ends ************************************************************
	B	.S2	B3		; return from function

	LDW	.D2	*++B15[1],B10	; pop B10

  [A1]	MVK	.S2	176,B0		; if(if0) B0 = 176

  [!A1] MVK	.S2	196,B0		; else	  B0 = 196

  [B2]	SUB	.L1X	A4,B0,A4	; if(retval) return (ptr_x - B0)

  [!B2] ZERO	.S1	A4		; else	     return 0

B_END:
*** END Benchmark Timing ***
