*================================================================================
*
*	TEXAS INSTRUMENTS, INC.		
*
*	32 Bit Unsigned Multiply With 64 Bit Result
*
*	Revision Date:  07/17/97
*	
*	USAGE	This routine is C Callable and can be called as:
*		
*		void mpyu3264(unsigned int a, unsigned int b, int *c)
*
*		a --- first 32 bit input value
*               b --- second 32 bit input value
*               *c --- pointer to resultant 64 bit value
*
*		If routine is not to be used as a C callable function then
*		you need to initialize values for all of the values passed
*		as these are assumed to be in registers as defined by the 
*		calling convention of the compiler, (refer to the C compiler
*		reference guide).
*
*	C CODE
*		This is the C equivalent of the assembly code.  Note that
*		the assembly code is hand optimized and restrictions may
*		apply.
*
*		void mpyu3264(unsigned int a, unsigned int b, int *c)
*		{
*		        unsigned int c0, c1l, c2l;
*		        unsigned int c1h, c2h, c1, c2, c3, ch, ch1;
*		        Ulong40 cl;
*		
*		        c0 = (unsigned short) a * (unsigned short) b;
*		        c1 = ((unsigned int) a>>16) * (unsigned short) b;
*		        c2 = ((unsigned int) b>>16) * (unsigned short) a;
*		        c3 = ((unsigned int) a>>16) * ((unsigned int) b>>16);
*		        c1l = c1 << 16;
*		        c2l = c2 << 16;
*		        c1h = (unsigned int) c1 >> 16;
*		        c2h = (unsigned int) c2 >> 16;
*		        cl = (Ulong40) c1l + (Ulong40) c2l;
*		        cl += (Ulong40) c0;
*		        ch1 = (unsigned int) c1h + (unsigned int) c2h;
*		        ch1 += (unsigned int) c3;
*		        ch = (Ulong40) ch1 + ((Ulong40) cl >> 32);
*		        c[0] = cl;
*		        c[1] = ch;
*		}
*		
*	DESCRIPTION
*
*		This routine takes two 32-bit unsigned integer values and
*		calculates their product.  The inputs are 32-bit unsigned
*		integers, and the result is a 64-bit unsigned integer.
*	
*	ASSUMPTIONS
*
*               1. Only one product is computed with a pair of 32-bit values.
*               2. Multiple 32-bit mpys can yield 2 cycles per 32-bit mpy
*                  on average.
*
*	CYCLES
*
*		7 cycles from B_START to B_END (8 including the final STW packet)
*
*===============================================================================
	.global _mpyu3264
	.text

*-------------------------------------------------------------------------------
* _mpyu3264 -- 32 x 32 -> 64 bit unsigned multiply
*
* Register usage, as read from the code below:
*   In:      A4 = a, B4 = b, A6 = c (address the result is stored to),
*            B3 = return address (branch target at the end)
*   Out:     word at *A6      = c[0] = low  32 bits of a*b
*            word at *(A6+4)  = c[1] = high 32 bits of a*b
*   Written: A0, A2, A3, A4, B0, B1, B2, B4, B8
*            (A4 and B4 are overwritten, so a and b do not survive the call)
*
* Method: the product is built from four 16x16 partial products
*            c0 = aL*bL, c1 = aH*bL, c2 = bH*aL, c3 = aH*bH
*         low  word = (c1<<16) + (c2<<16) + c0, accumulated in the pair A3:A2
*                     so that the carry out of bit 31 is kept in A3
*         high word = (c1>>16) + (c2>>16) + c3 + carry (A3)
*
* Lines starting with || belong to the same execute packet as the line above.
* The packet grouping and instruction order are part of the hand scheduling;
* do not reorder instructions without re-checking result availability.
*-------------------------------------------------------------------------------
_mpyu3264:
*** BEGIN Benchmark Timing
B_START:

* Packet 1: the two "cross" partial products (high half x low half).
	MPYHLU	.M1x	A4,	B4,	A0	; c1=((u int) a>>16)*(u short)b
||	MPYHLU	.M2x	B4,	A4,	B0	; c2=((u int) b>>16)*(u short)a

* Packet 2: high x high and low x low.  A4 and B4 are reused as destinations:
* from here on A4 holds c3 and B4 holds c0; the original a and b are gone.
	MPYHU	.M1x	A4,	B4,	A4	; c3=((u int)a>>16)*((u int)b>>16)
||	MPYU	.M2x	B4,	A4,	B4	; c0 = (u short)a*(u short)b

* Packet 3: parts of c1 and c2 that land in the low result word.
	SHL	.S1	A0, 	16,	A2	; c1L = c1 << 16
||	SHL	.S2	B0, 	16,	B2	; c2L = c2 << 16

* Packet 4: parts of c1 and c2 that land in the high result word, first
* low-word sum (kept wider than 32 bits in A3:A2), and the address of c[1].
	SHRU	.S1	A0, 	16,	A0	; c1H = (u int) c1 >> 16
||	SHRU	.S2	B0, 	16,	B0	; c2H = (u int) c2 >> 16
||	ADDU	.L1x	A2,	B2,	A3:A2	; cL = (Ulong40) c1L + (Ulong40) c2L
||      ADD     .L2x    A6,      4,     B8      ; B8 = A6 + 4 = address of c[1]

* Packet 5: start the high-word sum and finish the low word.  After this
* packet A2 = c[0] and A3 = carry out of the low word.
	ADDU	.L2x	A0,	B0,	B1:B0	; cH1 = (u int) c1H + (u int)c2H
||	ADDU	.L1x	B4,	A3:A2,	A3:A2	; cL += (Ulong40) c0

* Packet 6: add the high x high partial product (now in A4).
	ADDU	.L2x	B0,	A4,	B1:B0	; cH1 += (u int) c3

* Packet 7: add the carry from the low word; B0 = c[1].
	ADDU	.L2x	B0,	A3,	B1:B0	; cH = cH1 + ((Ulong40) cL >> 32)

B_END:
*** END Benchmark Timing
* Store both result words and start the return in a single packet.
        STW     .D1     A2,     *A6             ; c[0] = cL
||      STW     .D2     B0,     *B8             ; c[1] = cH
||      B       .S2     B3                      ; return to calling function

* NOTE(review): presumably this NOP 5 fills the delay slots of the branch
* above so that nothing else executes before control returns to the caller --
* confirm against the CPU reference before changing the count.
STOP:
	NOP	5
