* ========================================================================= *
*                                                                           *
*   TEXAS INSTRUMENTS, INC.                                                 *
*                                                                           *
*   NAME                                                                    *
*       fdct_8x8 -- 8x8 Block FDCT With Rounding, Endian Neutral            *
*                                                                           *
*   REVISION HISTORY                                                        *
*       20-May-1999 Initial handcode version                                *
*                                                                           *
*   USAGE                                                                   *
*       This routine is C callable, and has the following C prototype:      *
*                                                                           *
*           void fdct_8x8_asm(short fdct_data[], unsigned num_fdcts)        *
*                                                                           *
*       The fdct routine accepts a list of 8x8 pixel blocks and performs    *
*       FDCTs on each.  The array should be laid out identically to         *
*       "fdct_data[num_fdcts+1][8][8]".  All operations in this array are   *
*       performed entirely in-place.                                        *
*                                                                           *
*       Input values are stored in shorts, and may be in the range          *
*       [-512,511].  Larger input values may result in overflow,            *
*       although the 12-bit JPEG range [-1024,1023] overflows rarely.       *
*                                                                           *
*       This code requires '48 + 160 * num_fdcts' cycles to process         *
*       'num_fdcts' blocks, including function call overhead.  When         *
*       'num_fdcts' is zero, an early exit is taken and the function        *
*       runs for only 13 cycles (again, including call overhead).           *
*                                                                           *
*   DESCRIPTION                                                             *
*       The fdct_8x8 function implements a Chen FDCT.  Output values are    *
*       rounded, providing improved accuracy.  Input terms are expected     *
*       to be signed 11Q0 values, producing signed 15Q0 results.  (A        *
*       smaller dynamic range may be used on the input, producing a         *
*       correspondingly smaller output range.  Typical applications         *
*       include processing signed 9Q0 and unsigned 8Q0 pixel data,          *
*       producing signed 13Q0 or 12Q0 outputs, respectively.)  No           *
*       saturation is performed.                                            *
*                                                                           *
*       void fdct_8x8(short *dct_data, unsigned num_fdcts)                  *
*       {                                                                   *
*         /* -------------------------------------------------------- */    *
*         /*  Set up the cosine coefficients c0..c7.                  */    *
*         /* -------------------------------------------------------- */    *
*         const unsigned short c1 = 0x2C62, c3 = 0x25A0;                    *
*         const unsigned short c5 = 0x1924, c7 = 0x08D4;                    *
*         const unsigned short c0 = 0xB505, c2 = 0x29CF;                    *
*         const unsigned short c6 = 0x1151;                                 *
*                                                                           *
*         /* -------------------------------------------------------- */    *
*         /*  Intermediate calculations.                              */    *
*         /* -------------------------------------------------------- */    *
*         short f0, f1, f2, f3,                                             *
*               f4, f5, f6, f7;       /* Spatial domain samples.      */    *
*         int   g0, g1, h0, h1,                                             *
*               p0, p1;               /* Even-half intermediate.      */    *
*         short r0, r1;               /* Even-half intermediate.      */    *
*         int   P0, P1, R0, R1;       /* Even-half intermediate.      */    *
*         short g2, g3, h2, h3;       /* Odd-half intermediate.       */    *
*         short q0a,s0a,q0, q1,                                             *
*               s0, s1;               /* Odd-half intermediate.       */    *
*         short Q0, Q1, S0, S1;       /* Odd-half intermediate.       */    *
*         int   F0, F1, F2, F3,                                             *
*               F4, F5, F6, F7;       /* Freq. domain results.        */    *
*         int   F0r,F1r,F2r,F3r,                                            *
*               F4r,F5r,F6r,F7r;      /* Rounded, truncated results.  */    *
*                                                                           *
*         /* -------------------------------------------------------- */    *
*         /*  Input and output pointers, loop control.                */    *
*         /* -------------------------------------------------------- */    *
*         unsigned i, j;                                                    *
*         short    *dct_io_ptr;                                             *
*                                                                           *
*         /* -------------------------------------------------------- */    *
*         /*  Outer vertical loop -- Process each 8x8 block.          */    *
*         /* -------------------------------------------------------- */    *
*         dct_io_ptr = dct_data;                                            *
*         for (i = 0; i < num_fdcts; i++)                                   *
*         {                                                                 *
*             /* ---------------------------------------------------- */    *
*             /*  Perform Vert 1-D FDCT on columns within each block. */    *
*             /* ---------------------------------------------------- */    *
*             for (j = 0; j < 8; j++)                                       *
*             {                                                             *
*                 /* ------------------------------------------------ */    *
*                 /*  Load the spatial-domain samples.                */    *
*                 /* ------------------------------------------------ */    *
*                 f0 = dct_io_ptr[ 0];                                      *
*                 f1 = dct_io_ptr[ 8];                                      *
*                 f2 = dct_io_ptr[16];                                      *
*                 f3 = dct_io_ptr[24];                                      *
*                 f4 = dct_io_ptr[32];                                      *
*                 f5 = dct_io_ptr[40];                                      *
*                 f6 = dct_io_ptr[48];                                      *
*                 f7 = dct_io_ptr[56];                                      *
*                                                                           *
*                 /* ------------------------------------------------ */    *
*                 /*  Stage 1:  Separate into even and odd halves.    */    *
*                 /* ------------------------------------------------ */    *
*                 g0 = f0 + f7;               h2 = f0 - f7;                 *
*                 g1 = f1 + f6;               h3 = f1 - f6;                 *
*                 h1 = f2 + f5;               g3 = f2 - f5;                 *
*                 h0 = f3 + f4;               g2 = f3 - f4;                 *
*                                                                           *
*                 /* ------------------------------------------------ */    *
*                 /*  Stage 2                                         */    *
*                 /* ------------------------------------------------ */    *
*                 p0 = g0 + h0;               r0 = g0 - h0;                 *
*                 p1 = g1 + h1;               r1 = g1 - h1;                 *
*                 q1 = g2;                    s1 = h2;                      *
*                                                                           *
*                 s0a= h3 + g3;               q0a= h3 - g3;                 *
*                 s0 = (s0a * c0 + 0x7FFF) >> 16;                           *
*                 q0 = (q0a * c0 + 0x7FFF) >> 16;                           *
*                                                                           *
*                 /* ------------------------------------------------ */    *
*                 /*  Stage 3                                         */    *
*                 /* ------------------------------------------------ */    *
*                 P0 = p0 + p1;               P1 = p0 - p1;                 *
*                 R1 = c6 * r1 + c2 * r0;     R0 = c6 * r0 - c2 * r1;       *
*                                                                           *
*                 Q1 = q1 + q0;               Q0 = q1 - q0;                 *
*                 S1 = s1 + s0;               S0 = s1 - s0;                 *
*                                                                           *
*                 /* ------------------------------------------------ */    *
*                 /*  Stage 4                                         */    *
*                 /* ------------------------------------------------ */    *
*                 F0 = P0;                    F4 = P1;                      *
*                 F2 = R1;                    F6 = R0;                      *
*                                                                           *
*                 F1 = c7 * Q1 + c1 * S1;     F7 = c7 * S1 - c1 * Q1;       *
*                 F5 = c3 * Q0 + c5 * S0;     F3 = c3 * S0 - c5 * Q0;       *
*                                                                           *
*                 /* ------------------------------------------------ */    *
*                 /*  Store the frequency domain results.             */    *
*                 /* ------------------------------------------------ */    *
*                 dct_io_ptr[ 0] = F0;                                      *
*                 dct_io_ptr[ 8] = F1 >> 13;                                *
*                 dct_io_ptr[16] = F2 >> 13;                                *
*                 dct_io_ptr[24] = F3 >> 13;                                *
*                 dct_io_ptr[32] = F4;                                      *
*                 dct_io_ptr[40] = F5 >> 13;                                *
*                 dct_io_ptr[48] = F6 >> 13;                                *
*                 dct_io_ptr[56] = F7 >> 13;                                *
*                                                                           *
*                 dct_io_ptr++;                                             *
*             }                                                             *
*             /* ---------------------------------------------------- */    *
*             /*  Update pointer to next 8x8 FDCT block.              */    *
*             /* ---------------------------------------------------- */    *
*             dct_io_ptr += 56;                                             *
*         }                                                                 *
*                                                                           *
*         /* -------------------------------------------------------- */    *
*         /*  Perform Horizontal 1-D FDCT on each 8x8 block.          */    *
*         /* -------------------------------------------------------- */    *
*         dct_io_ptr = dct_data;                                            *
*         for (i = 0; i < 8 * num_fdcts; i++)                               *
*         {                                                                 *
*             /* ---------------------------------------------------- */    *
*             /*  Load the spatial-domain samples.                    */    *
*             /* ---------------------------------------------------- */    *
*             f0 = dct_io_ptr[0];                                           *
*             f1 = dct_io_ptr[1];                                           *
*             f2 = dct_io_ptr[2];                                           *
*             f3 = dct_io_ptr[3];                                           *
*             f4 = dct_io_ptr[4];                                           *
*             f5 = dct_io_ptr[5];                                           *
*             f6 = dct_io_ptr[6];                                           *
*             f7 = dct_io_ptr[7];                                           *
*                                                                           *
*             /* ---------------------------------------------------- */    *
*             /*  Stage 1:  Separate into even and odd halves.        */    *
*             /* ---------------------------------------------------- */    *
*             g0 = f0 + f7;               h2 = f0 - f7;                     *
*             g1 = f1 + f6;               h3 = f1 - f6;                     *
*             h1 = f2 + f5;               g3 = f2 - f5;                     *
*             h0 = f3 + f4;               g2 = f3 - f4;                     *
*                                                                           *
*             /* ---------------------------------------------------- */    *
*             /*  Stage 2                                             */    *
*             /* ---------------------------------------------------- */    *
*             p0 = g0 + h0;               r0 = g0 - h0;                     *
*             p1 = g1 + h1;               r1 = g1 - h1;                     *
*             q1 = g2;                    s1 = h2;                          *
*                                                                           *
*             s0a= h3 + g3;               q0a= h3 - g3;                     *
*             q0 = (q0a * c0 + 0x7FFF) >> 16;                               *
*             s0 = (s0a * c0 + 0x7FFF) >> 16;                               *
*                                                                           *
*             /* ---------------------------------------------------- */    *
*             /*  Stage 3                                             */    *
*             /* ---------------------------------------------------- */    *
*             P0 = p0 + p1;               P1 = p0 - p1;                     *
*             R1 = c6 * r1 + c2 * r0;     R0 = c6 * r0 - c2 * r1;           *
*                                                                           *
*             Q1 = q1 + q0;               Q0 = q1 - q0;                     *
*             S1 = s1 + s0;               S0 = s1 - s0;                     *
*                                                                           *
*             /* ---------------------------------------------------- */    *
*             /*  Stage 4                                             */    *
*             /* ---------------------------------------------------- */    *
*             F0 = P0;                    F4 = P1;                          *
*             F2 = R1;                    F6 = R0;                          *
*                                                                           *
*             F1 = c7 * Q1 + c1 * S1;     F7 = c7 * S1 - c1 * Q1;           *
*             F5 = c3 * Q0 + c5 * S0;     F3 = c3 * S0 - c5 * Q0;           *
*                                                                           *
*             /* ---------------------------------------------------- */    *
*             /*  Round and truncate values.                          */    *
*             /*                                                      */    *
*             /*  Note: F0 and F4 have different rounding since no    */    *
*             /*  MPYs have been applied to either term.  Also, F0's  */    *
*             /*  rounding is slightly different to offset the        */    *
*             /*  truncation effects from the vertical pass (which    */    *
*             /*  does not round).                                    */    *
*             /* ---------------------------------------------------- */    *
*             F0r = (F0 + 0x0006) >>  3;                                    *
*             F1r = (F1 + 0x7FFF) >> 16;                                    *
*             F2r = (F2 + 0x7FFF) >> 16;                                    *
*             F3r = (F3 + 0x7FFF) >> 16;                                    *
*             F4r = (F4 + 0x0004) >>  3;                                    *
*             F5r = (F5 + 0x7FFF) >> 16;                                    *
*             F6r = (F6 + 0x7FFF) >> 16;                                    *
*             F7r = (F7 + 0x7FFF) >> 16;                                    *
*                                                                           *
*             /* ---------------------------------------------------- */    *
*             /*  Store the results                                   */    *
*             /* ---------------------------------------------------- */    *
*             dct_io_ptr[0] = F0r;                                          *
*             dct_io_ptr[1] = F1r;                                          *
*             dct_io_ptr[2] = F2r;                                          *
*             dct_io_ptr[3] = F3r;                                          *
*             dct_io_ptr[4] = F4r;                                          *
*             dct_io_ptr[5] = F5r;                                          *
*             dct_io_ptr[6] = F6r;                                          *
*             dct_io_ptr[7] = F7r;                                          *
*                                                                           *
*             /* ---------------------------------------------------- */    *
*             /*  Update pointer to next FDCT row.                    */    *
*             /* ---------------------------------------------------- */    *
*             dct_io_ptr += 8;                                              *
*         }                                                                 *
*                                                                           *
*         return;                                                           *
*       }                                                                   *
*                                                                           *
*                                                                           *
*       Note:  This code guarantees correct operation, even in the case     *
*       that 'num_fdcts == 0'.  In this case, the function runs for only    *
*       13 cycles (counting 6 cycles of function-call overhead), due to     *
*       early-exit code.  The early-exit case performs no accesses to the   *
*       fdct_data[] array and minimal access to the stack.                  *
*                                                                           *
*   TECHNIQUES                                                              *
*       The loop nest in the vertical pass has been collapsed into a        *
*       single-level loop.  Both vertical and horizontal loops have         *
*       been software pipelined.                                            *
*                                                                           *
*       For performance, portions of the code outside the loops have been   *
*       inter-scheduled with the prolog and epilog code of the loops.       *
*       Also, twin stack-pointers are used to accelerate stack accesses.    *
*       Finally, pointer values and cosine term registers are reused        *
*       between the horizontal and vertical loops to reduce the impact of   *
*       pointer and constant reinitialization.                              *
*                                                                           *
*       To save codesize, prolog and epilog collapsing have been performed  *
*       to the extent that it does not impact performance.  Also, code      *
*       outside the loops has been scheduled to pack as tightly into        *
*       fetch packets as possible to avoid alignment padding NOPs.          *
*                                                                           *
*       To reduce register pressure and save some code, the horizontal      *
*       loop uses the same pair of pointer registers for both reading and   *
*       writing.  The pointer increments are on the LDs to permit prolog    *
*       and epilog collapsing, since LDs can be speculated.                 *
*                                                                           *
*       Additional section-specific optimization notes are provided below.  *
*                                                                           *
*   ASSUMPTIONS                                                             *
*       Stack is aligned to a word boundary.                                *
*                                                                           *
*   MEMORY NOTE                                                             *
*       No bank conflicts occur, regardless of fdct_data[]'s alignment.     *
*                                                                           *
*       The code requires 16 words of stack space to save Save-On-Entry     *
*       (SOE) registers, CSR, IRP, and a spill value.                       *
*                                                                           *
*       Bank usage on C6201:  1 of 4 banks for 40% of loop cycles           *
*                             2 of 4 banks for 60% of loop cycles           *
*                                                                           *
*       Nearly every cycle of this function performs at least one           *
*       memory access.                                                      *
*                                                                           *
*   NOTES                                                                   *
*       This code masks interrupts for nearly its entire duration.          *
*       Interrupts are locked out for '40 + 160 * num_fdcts' cycles.  As    *
*       a result, the code is interrupt-tolerant, but not interruptible.    *
*                                                                           *
*       The cosine terms have all been scaled by sqrt(2), so that the       *
*       "c4" term is basically an even power of 2.                          *
*                                                                           *
*       The code is completely endian neutral.                              *
*                                                                           *
*   SOURCE                                                                  *
*       Chen FDCT.                                                          *
*                                                                           *
* ------------------------------------------------------------------------- *
*             Copyright (c) 1999 Texas Instruments, Incorporated.           *
*                            All Rights Reserved.                           *
* ========================================================================= *

                .sect   ".data:copyright_h"
_Copyright:     .string "Copyright (C) 1999 Texas Instruments Incorporated. "
                .string "All Rights Reserved.",0
                .sect   ".text:hand"
                .global _fdct_8x8_asm

_fdct_8x8_asm:
; ========================== SYMBOLIC CONSTANTS =========================== ;
;   Fixed-point cosine terms.  The values are the same as c0..c7 in the C
;   reference model given in the file header.  No c4 symbol is defined; the
;   header NOTES state that the terms are scaled by sqrt(2) so that the c4
;   term is basically a power of 2.
; ------------------------------------------------------------------------- ;
        .asg            0xB505,     cst_c0  ; Cosine term c0
        .asg            0x2C62,     cst_c1  ; Cosine term c1
        .asg            0x29CF,     cst_c2  ; Cosine term c2
        .asg            0x25A0,     cst_c3  ; Cosine term c3
        .asg            0x1924,     cst_c5  ; Cosine term c5
        .asg            0x1151,     cst_c6  ; Cosine term c6
        .asg            0x08D4,     cst_c7  ; Cosine term c7

; =============== SYMBOLIC REGISTER ASSIGNMENTS: VERT LOOP ================ ;
;   Naming: the A_ / B_ prefix matches the register file (A side or B side)
;   of the physical register; the suffix is the variable or flow-graph node
;   of the same name in the C reference model in the file header.
;
;   Several names map onto one physical register (for example A6 is A_f7,
;   A_g0, A_p0 and A_c2r0).  NOTE(review): this presumably relies on those
;   values having non-overlapping lifetimes in the pipelined loop schedule
;   -- confirm against the loop body before reassigning any register.
;
;   B15 is the stack pointer used by the entry code, and it is reused here
;   for B_f6, B_g3, B_s0a, B_s0 and B_r1.  The initialization notes below
;   state that the stack pointer is kept in IRP because the vertical loop
;   uses all 32 registers.
;
;   NOTE(review): "(packed)" presumably means two 16-bit values share one
;   32-bit register -- confirm against the constant-loading code.
; ------------------------------------------------------------------------- ;

;   -- Packed constants (the same terms are named on both the A and B side) --
        .asg            A11,        A_k1c0  ; 1, Cosine term c0     (packed)
        .asg            A12,        A_c1c7  ; Cosine terms c1, c7   (packed)
        .asg            A13,        A_c2c6  ; Cosine terms c2, c6   (packed)
        .asg            B11,        B_k1c0  ; 1, Cosine term c0     (packed)
        .asg            B12,        B_c1c7  ; Cosine terms c1, c7   (packed)
        .asg            B13,        B_c2c6  ; Cosine terms c2, c6   (packed)
        .asg            B14,        B_c3c5  ; Cosine terms c3, c5   (packed)

;   -- Data pointers.  The entry code adjusts A_i_ptr (A4) before any other
;      setup, so A4 is taken to hold the fdct_data argument on entry.
        .asg            A4,         A_i_ptr ; Input pointer
        .asg            B10,        B_o_ptr ; Output pointer

;   -- Spatial-domain inputs f0..f7 (one column of the block) --
        .asg            A9,         A_f0    ; Spatial domain sample f0
        .asg            B8,         B_f1    ; Spatial domain sample f1
        .asg            B6,         B_f2    ; Spatial domain sample f2
        .asg            A5,         A_f3    ; Spatial domain sample f3
        .asg            A7,         A_f4    ; Spatial domain sample f4
        .asg            B7,         B_f5    ; Spatial domain sample f5
        .asg            B15,        B_f6    ; Spatial domain sample f6
        .asg            A6,         A_f7    ; Spatial domain sample f7

;   -- Flow-graph nodes for stages 1 through 3 of the reference model --
        .asg            A6,         A_g0    ; Node g0 in flow graph
        .asg            B8,         B_g1    ; Node g1 in flow graph
        .asg            B6,         B_h1    ; Node h1 in flow graph
        .asg            A7,         A_h0    ; Node h0 in flow graph
        .asg            A0,         A_s1    ; Node s1 (h2) in flow graph
        .asg            B4,         B_h3    ; Node h3 in flow graph
        .asg            B15,        B_g3    ; Node g3 in flow graph
        .asg            A15,        A_q1    ; Node q1 (g2) in flow graph
        .asg            A6,         A_p0    ; Node p0 in flow graph
        .asg            B6,         B_p1    ; Node p1 in flow graph
        .asg            B15,        B_s0a   ; Node s0 intermediate result
        .asg            B5,         B_s0b   ; Node s0 intermediate result
        .asg            B15,        B_s0    ; Node s0 in flow graph
        .asg            A3,         A_r0    ; Node r0 in flow graph
        .asg            B15,        B_r1    ; Node r1 in flow graph
        .asg            B4,         B_q0a   ; Node q0 intermediate result
        .asg            A14,        A_q0b   ; Node q0 intermediate result
        .asg            A3,         A_q0    ; Node q0 in flow graph
        .asg            A10,        A_Q1    ; Node Q1 in flow graph
        .asg            B5,         B_S1    ; Node S1 in flow graph
        .asg            A3,         A_Q0    ; Node Q0 in flow graph
        .asg            B4,         B_S0    ; Node S0 in flow graph

;   -- Products of a cosine term and a node (inputs to the F1..F7 sums) --
        .asg            A14,        A_c1Q1  ; Intermediate value c1 * Q1
        .asg            A6,         A_c2r0  ; Intermediate value c2 * r0
        .asg            A7,         A_c3Q0  ; Intermediate value c3 * Q0
        .asg            A3,         A_c5Q0  ; Intermediate value c5 * Q0
        .asg            A14,        A_c6r0  ; Intermediate value c6 * r0
        .asg            A8,         A_c7Q1  ; Intermediate value c7 * Q1
        .asg            B5,         B_c1S1  ; Intermediate value c1 * S1
        .asg            B0,         B_c2r1  ; Intermediate value c2 * r1
        .asg            B0,         B_c3S0  ; Intermediate value c3 * S0
        .asg            B3,         B_c5S0  ; Intermediate value c5 * S0
        .asg            B6,         B_c6r1  ; Intermediate value c6 * r1
        .asg            B5,         B_c7S1  ; Intermediate value c7 * S1

;   -- Frequency-domain results F0..F7 --
        .asg            B9,         B_F0    ; Frequency domain term F0
        .asg            A8,         A_F1    ; Frequency domain term F1
        .asg            A5,         A_F2    ; Frequency domain term F2
        .asg            B4,         B_F3    ; Frequency domain term F3
        .asg            B3,         B_F4    ; Frequency domain term F4
        .asg            A9,         A_F5    ; Frequency domain term F5
        .asg            A10,        A_F6    ; Frequency domain term F6
        .asg            B4,         B_F7    ; Frequency domain term F7

;   -- Truncated results.  F0 and F4 have no truncated name: the reference
;      model stores them unshifted in the vertical pass.
        .asg            A8,         A_F1t   ; Truncated result for F1
        .asg            A5,         A_F2t   ; Truncated result for F2
        .asg            B7,         B_F3t   ; Truncated result for F3
        .asg            A10,        A_F5t   ; Truncated result for F5
        .asg            A10,        A_F6t   ; Truncated result for F6
        .asg            B5,         B_F7t   ; Truncated result for F7

;   -- Loop control.  The entry code loads B_o (B1) with num_fdcts * 8 and
;      also uses it as the predicate for the early exit.  B2 and A1 are also
;      used by the entry code, under their raw register names, as
;      temporaries while CSR and IRP are being saved.
        .asg            B2,         B_i     ; Inner loop counter #1
        .asg            A1,         A_i     ; Inner loop counter #2
        .asg            B1,         B_o     ; Outer loop counter
        .asg            A2,         A_c     ; Prolog collapse counter
; ========================================================================= ;


* ========================================================================= *
*   Initialization code / Stack Management                                  *
*                                                                           *
*   This code is responsible for saving registers to the stack, disabling   *
*   interrupts, and setting up for the vertical loop.                       *
*                                                                           *
*   This function requires 16 words of stack.  A10...A15, B10...B14, B3,    *
*   CSR, IRP, and 'num_fdcts' are all pushed on the stack.  For speed,      *
*   this code uses twin stack pointers to offload registers onto the        *
*   stack as quickly as possible.                                           *
*                                                                           *
*   The majority of the code in this function is not interruptible.         *
*   Therefore, interrupts are disabled almost immediately after entry       *
*   into the function, and the previous interruptibility state is restored  *
*   on exit.  The previous value of CSR is pushed on the stack and          *
*   restored on exit.                                                       *
*                                                                           *
*   Since all 32 registers are used by the vertical loop, the stack         *
*   pointer is saved in the IRP register.  The previous contents of IRP     *
*   are also pushed on the stack.                                           *
*                                                                           *
*   Initialization for constants (cosine terms, etc.) is overlapped with    *
*   the prolog of the vertical loop to save time.  Pointer setup for the    *
*   output pointer is also hidden in the prolog.                            *
*                                                                           *
*   Early exit code suppresses most of the function's activity (including   *
*   most of the stack accesses) if num_fdcts (in B4) is zero.  It is not    *
*   possible to exit the function faster.                                   *
* ========================================================================= *

;-
; Entry: open the stack frame, save registers, and mask interrupts.
;
; The first STW post-decrements B15 by 16 words; the MV in the same execute
; packet reads B15 before that update, so A15 ends up 16 words above the new
; B15.  Saves through .D1 therefore use negative A15-relative offsets and
; saves through .D2 use positive B15-relative offsets, two per cycle.
;
; Frame slots written in this block, in words from the new B15:
;   [ 4] IRP   [ 5] B3    [ 6] B10   [ 7] A10   [ 8] B11   [ 9] B13
;   [10] B12   [11] A12   [12] A11   [13] A13   [14] B14   [16] A15
; Slots [2] (CSR), [3] (horizontal loop count) and [15] (A14) are filled
; later, inside the vertical loop's prolog.
;
; B_o = num_fdcts * 8, then minus one when nonzero.  Every save is
; predicated on B_o; when B_o is zero the branch to B3 is taken instead and
; A15 is reloaded while the frame is released, so nothing else is touched.
        STW     .D2T1   A15,        * B15--[16]     ; Save A15, get stk frame
||      MV      .L1X    B15,        A15             ; Twin Stack Pointer
||      SHL     .S2     B4,         3,          B_o ; iters == num_fdcts * 8

  [ B_o]STW     .D1T2   B14,        *-A15  [ 2]     ; Save B14 (SP[14])
||[ B_o]ADD     .L2     B_o,        -1,         B_o ; Adj. for parallel iters
||[ B_o]ADDK    .S1     48,         A_i_ptr         ; Point to row 3, col 0
||[!B_o]B       .S2     B3                          ; Abort if num_fdcts == 0
||[!B_o]LDW     .D2T1   *++B15[16], A15             ; Restore A15 on abort
; ===== Interrupts masked by branch delay slots =====
; The five execute packets below are the delay slots of the abort branch.
;-
  [ B_o]STW     .D1T1   A13,        *-A15  [ 3]     ; Save A13 (SP[13])
||[ B_o]STW     .D2T2   B11,        *+B15  [ 8]     ; Save B11
||[ B_o]MVC     .S2     CSR,        B0              ; Snapshot CSR

  [ B_o]STW     .D1T1   A12,        *-A15  [ 5]     ; Save A12 (SP[11])
||[ B_o]STW     .D2T2   B12,        *+B15  [10]     ; Save B12

  [ B_o]STW     .D1T2   B13,        *-A15  [ 7]     ; Save B13 (SP[ 9])
||[ B_o]STW     .D2T1   A11,        *+B15  [12]     ; Save A11 
||[ B_o]MVC     .S2     IRP,        B5              ; Snapshot IRP
;-
  [ B_o]STW     .D1T1   A10,        *-A15  [ 9]     ; Save A10 (SP[ 7])
||[ B_o]STW     .D2T2   B10,        *+B15  [ 6]     ; Save B10
||[ B_o]AND     .L2     B0,         -2,         B2  ; Clear GIE bit in CSR
||[ B_o]MV      .L1X    B5,         A1              ; Partitioning MV

  [ B_o]STW     .D2T2   B3,         *+B15  [ 5]     ; Save return address
||[ B_o]STW     .D1T1   A1,         *-A15  [12]     ; Save IRP (SP[ 4])
||[ B_o]MV      .L1X    B0,         A0              ; Partitioning MV
||[ B_o]MVC     .S2     B2,         CSR             ; Mask interrupts
; ===== Branch Occurs ===== 
;-
; =========================== PIPE LOOP PROLOG ============================ ;
; The first column's loads and first add/sub (tags [1,1]..[10,1]) are issued
; here, interleaved with one-time setup:
;   * B15 is parked in IRP so that B15 can be used as a data register by
;     the kernel; it is read back from IRP after the loop.
;   * Packed 16:16 constants are assembled with MVKL (low half) followed by
;     MVKLH (high half): c1:c7, c2:c6, c3:c5, and k1c0 = 0x0001:c0.
;   * B_o_ptr is derived from A_i_ptr after A_i_ptr's one-halfword
;     post-increment, hence the -2 byte correction.
;   * The remaining frame slots are filled: A14 -> SP[15], the horizontal
;     loop count -> SP[3], and the CSR snapshot (in A0) -> SP[2].
;   * A_i starts at 4 and B_i at 16 (= 4*4).  NOTE(review): B_i appears to
;     be pre-advanced one step because this prolog issues the first
;     column's loads without the kernel's "MPY B_i, 4" -- confirm.
        LDH     .D1T1   *+A_i_ptr  [ 8],        A_f4            ;[ 1,1]
||      MVC     .S2     B15,        IRP             ; Save Stack Pointer

        LDH     .D1T2   *-A_i_ptr  [16],        B_f1            ;[ 2,1]
||      MVK     .S1     4,          A_i             ; Inner loop counter #1

        LDH     .D1T1   * A_i_ptr++[ 1],        A_f3            ;[ 3,1]
||      MVKL    .S1     cst_c7,     A_c1c7          ; Cosine term C7

        LDH     .D1T1   *-A_i_ptr  [25],        A_f0            ;[ 4,1]
||      MVKL    .S1     cst_c0,     A_k1c0          ; Cosine term C0
;-
        LDH     .D1T2   *+A_i_ptr  [15],        B_f5            ;[ 5,1]
||      MVKL    .S1     cst_c6,     A_c2c6          ; Cosine term C6
||      MVKL    .S2     cst_c6,     B_c2c6          ; Cosine term C6
||      MV      .L2X    A_c1c7,     B_c1c7          ; Twin constant register

        LDH     .D1T2   *-A_i_ptr  [ 9],        B_f2            ;[ 6,1]
||      MVKLH   .S1     cst_c2,     A_c2c6          ; Cosine term C2
||      SUB     .L1     A_i,        2,          A_c ; Prolog collapse cnt = 2
||      ADD     .L2X    A_i_ptr,    -2,         B_o_ptr ; Output ptr: row 3
;-
        LDH     .D1T2   *+A_i_ptr  [23],        B_f6            ;[ 7,1]
||      MVKLH   .S2     cst_c1,     B_c1c7          ; Cosine term C1

        LDH     .D1T1   *+A_i_ptr  [31],        A_f7            ;[ 8,1]
||      MVKLH   .S1     cst_c1,     A_c1c7          ; Cosine term C1
||      MVKLH   .S2     cst_c2,     B_c2c6          ; Cosine term C2

        MVKL    .S2     cst_c5,     B_c3c5          ; Cosine term C5
||      MVKLH   .S1     1,          A_k1c0          ; Constant: 0x0001
||      STW     .D2T1   A14,        *+B15  [15]     ; Save A14
;-
        SUB     .L1     A_f3,       A_f4,       A_q1            ;[ 9,1] q1=g2
||      ADD     .S1     A_f3,       A_f4,       A_h0            ;[10,1]
||      MVKLH   .S2     cst_c3,     B_c3c5          ; Cosine term C3
||      STW     .D2T2   B_o,        *+B15  [ 3]     ; Spill horiz loop count 
||      STW     .D1T1   A0,         *-A15  [14]     ; Save CSR (SP[ 2])

        LDH     .D1T1   *+A_i_ptr  [ 8],        A_f4            ;[ 1,2]
||      MV      .L2X    A_k1c0,     B_k1c0          ; Twin constant register
||      MVK     .S2     16,         B_i             ; Inner loop counter #2
;-
; =========================== PIPE LOOP KERNEL ============================ ;
; Vertical (column) pass: ten execute packets per trip, one column of one
; 8x8 block per trip.  Each instruction is tagged [cycle,iter].  Three
; consecutive columns are in flight in every trip:
;   tag 3 -- loads and the first add/sub          (cycles  2..10)
;   tag 2 -- butterflies, rounding and multiplies (cycles 12..21)
;   tag 1 -- final sums, >>13 shifts and stores   (cycles 22..31)
; The last packet also issues cycle 1 of the column after that (tag 4).
;
; Control registers:
;   B_o -- trip counter; the branch in v_loop_4 has its five delay slots
;          in v_loop_5..v_loop_9, and B_o is decremented in v_loop_6.
;   A_c -- prolog-collapse counter, decremented at the top of each trip
;          while nonzero; every store (and the A_i update) is predicated
;          on [!A_c] so nothing is written until A_c reaches zero.
;   A_i, B_i -- column counters for the store side and load side.  Each is
;          multiplied by 4 once per trip and tested for zero; on zero the
;          matching pointer is advanced by 112 bytes and the counter is
;          reset to 4.  NOTE(review): this relies on MPY using only the low
;          16 bits of its operands, so repeated multiplication by 4 wraps
;          to zero -- confirm against the C6000 instruction set reference.
;          The per-trip one-halfword post-increment supplies 16 bytes over
;          eight columns; the extra 112 brings a pointer to the same row
;          of the next 128-byte block.
v_loop:
        SHR     .S1     A_F6,       13,         A_F6t           ;[22,1]
||      MPY     .M2     B_S0,       B_c3c5,     B_c5S0          ;[22,1]
||      MPY     .M1X    A_Q0,       B_c3c5,     A_c5Q0          ;[22,1]
||      ADD     .D2     B_f1,       B_f6,       B_g1            ;[12,2]
||      SUB     .S2     B_f2,       B_f5,       B_g3            ;[12,2]
||      SUB     .L2     B_f1,       B_f6,       B_h3            ;[12,2]
||      LDH     .D1T2   *-A_i_ptr  [16],        B_f1            ;[ 2,3]
||[ A_c]ADD     .L1     A_c,        -1,         A_c             ;pro. collapse
;-
v_loop_1:
  [!A_c]STH     .D2T2   B_F4,       *+B_o_ptr  [ 8]             ;[23,1]
||      MPY     .M2     B_S1,       B_c1c7,     B_c7S1          ;[23,1]
||      MPYLH   .M1X    A_Q0,       B_c3c5,     A_c3Q0          ;[23,1]
||      ADD     .L2     B_h3,       B_g3,       B_s0a           ;[13,2]
||      SUB     .S2     B_h3,       B_g3,       B_q0a           ;[13,2]
||      SUB     .S1     A_f0,       A_f7,       A_s1            ;[13,2] s1=h2
||      ADD     .L1     A_f0,       A_f7,       A_g0            ;[13,2]
||      LDH     .D1T1   * A_i_ptr++[ 1],        A_f3            ;[ 3,3]
;-
v_loop_2:
  [!A_c]STH     .D2T2   B_F0,       *-B_o_ptr  [24]             ;[24,1]
||      SUB     .S2X    B_c3S0,     A_c5Q0,     B_F3            ;[24,1]
||      MPYLH   .M2     B_S1,       B_c1c7,     B_c1S1          ;[24,1]
||      ADD     .L2     B_f2,       B_f5,       B_h1            ;[14,2]
||      SUB     .S1     A_g0,       A_h0,       A_r0            ;[14,2]
||      ADD     .L1     A_g0,       A_h0,       A_p0            ;[14,2]
||      MPYSU   .M1X    B_q0a,      A_k1c0,     A_q0b           ;[14,2]
||      LDH     .D1T1   *-A_i_ptr  [25],        A_f0            ;[ 4,3]
;-
v_loop_3:
  [!A_c]SHR     .S1     A_F2,       13,         A_F2t           ;[25,1]
||[!A_c]MPY     .M1     A_i,        4,          A_i             ;[25,1]
||      SHR     .S2     B_F3,       13,         B_F3t           ;[25,1]
||      SUB     .L2X    B_c7S1,     A_c1Q1,     B_F7            ;[25,1]
||      ADD     .L1X    A_c3Q0,     B_c5S0,     A_F5            ;[25,1]
||      SUB     .D2     B_g1,       B_h1,       B_r1            ;[15,2]
||      MPYSU   .M2     B_s0a,      B_k1c0,     B_s0b           ;[15,2]
||      LDH     .D1T2   *+A_i_ptr  [15],        B_f5            ;[ 5,3]
;-
v_loop_4:
        ADD     .L1X    A_c7Q1,     B_c1S1,     A_F1            ;[26,1]
||[ B_o]B       .S2     v_loop                                  ;[26,1]
||[!A_c]STH     .D2T1   A_F6t,      *+B_o_ptr  [24]             ;[26,1]
||      ADD     .L2     B_g1,       B_h1,       B_p1            ;[16,2]
||      ADDK    .S1     07FFFh,     A_q0b                       ;[16,2]
||      MPY     .M1     A_r0,       A_c2c6,     A_c6r0          ;[16,2]
||      MPY     .M2     B_i,        4,          B_i             ;[ 6,3]
||      LDH     .D1T2   *-A_i_ptr  [ 9],        B_f2            ;[ 6,3]
;-
v_loop_5:
  [!A_c]STH     .D2T1   A_F2t,      *-B_o_ptr  [ 8]             ;[27,1]
||      SHR     .S1     A_F5,       13,         A_F5t           ;[27,1]
||      MPY     .M2     B_r1,       B_c2c6,     B_c6r1          ;[17,2]
||      SUB     .L2X    A_p0,       B_p1,       B_F4            ;[17,2]
||      ADDK    .S2     07FFFh,     B_s0b                       ;[17,2]
||      MPYH    .M1     A_q0b,      A_k1c0,     A_q0            ;[17,2]
||      LDH     .D1T2   *+A_i_ptr  [23],        B_f6            ;[ 7,3]
;-
v_loop_6:
  [!A_c]STH     .D2T2   B_F3t,      * B_o_ptr++[ 1]             ;[28,1]
||      SHR     .S1     A_F1,       13,         A_F1t           ;[28,1]
||      ADD     .L2X    A_p0,       B_p1,       B_F0            ;[18,2]
||      MPYLH   .M1     A_r0,       A_c2c6,     A_c2r0          ;[18,2]
||      MPYH    .M2     B_s0b,      B_k1c0,     B_s0            ;[18,2]
||[ B_o]SUB     .S2     B_o,        1,          B_o             ;[18,2]
||      LDH     .D1T1   *+A_i_ptr  [31],        A_f7            ;[ 8,3]
v_loop_7:
;-
        SHR     .S2     B_F7,       13,         B_F7t           ;[29,1]
||[!A_c]STH     .D2T1   A_F5t,      *+B_o_ptr  [15]             ;[29,1]
||      MPYLH   .M2     B_r1,       B_c2c6,     B_c2r1          ;[19,2]
||      SUB     .L1     A_q1,       A_q0,       A_Q0            ;[19,2]
||      ADD     .D1     A_q1,       A_q0,       A_Q1            ;[19,2]
||      SUB     .S1     A_f3,       A_f4,       A_q1            ;[ 9,3] q1=g2
;-
v_loop_8:
  [!A_c]STH     .D2T1   A_F1t,      *-B_o_ptr  [17]             ;[30,1]
||      ADD     .L1X    B_c6r1,     A_c2r0,     A_F2            ;[20,2]
||      SUB     .L2X    A_s1,       B_s0,       B_S0            ;[20,2]
||      MPYLH   .M1     A_Q1,       A_c1c7,     A_c1Q1          ;[20,2]
||      ADD     .D1     A_f3,       A_f4,       A_h0            ;[10,3]
||[!B_i]ADD     .S2     B_i,        4,          B_i             ;[10,3]
||[!B_i]ADDK    .S1     112,        A_i_ptr                     ;[10,3]
;-
v_loop_9:
  [!A_c]STH     .D2T2   B_F7t,      *+B_o_ptr  [31]             ;[31,1]
||[!A_i]ADDK    .S2     112,        B_o_ptr                     ;[31,1]
||[!A_i]ADD     .S1     A_i,        4,          A_i             ;[31,1]
||      SUB     .L1X    A_c6r0,     B_c2r1,     A_F6            ;[21,2]
||      MPYLH   .M2     B_S0,       B_c3c5,     B_c3S0          ;[21,2]
||      ADD     .L2X    A_s1,       B_s0,       B_S1            ;[21,2]
||      MPY     .M1     A_Q1,       A_c1c7,     A_c7Q1          ;[21,2]
||      LDH     .D1T1   *+A_i_ptr  [ 8],        A_f4            ;[ 1,4]
; =========================== PIPE LOOP EPILOG ============================ ;

* ========================================================================= *
*   Epilog / Inter-loop / Prolog Code                                       *
*                                                                           *
*   The code from the vertical loop's epilog has been interscheduled        *
*   with inter-loop code and prolog code for the horizontal loop.           *
*   This allows hiding some of the overhead as we pipe-down one loop and    *
*   pipe-up the next.                                                       *
*                                                                           *
*   Notably, we restore B15 and IRP here (rather than after the loop)       *
*   and unspill our loop trip count from the stack, all in parallel with    *
*   the prolog and epilog code.  Also, the epilog of the first loop has     *
*   been heavily overlapped with the prolog of the second loop.  Since      *
*   a handful of symbolic names have been assigned to different registers,  *
*   and others have conflicting names between the two loops, we use a set   *
*   of intermediate symbolic names that bridge the transition.              *
*                                                                           *
*   To save a STH/LDH pair, the value of "F7t" from the first loop is       *
*   forwarded directly to the input "f7" of the second loop.  (The last     *
*   FDCT performed by the vertical loop overlaps the first FDCT performed   *
*   by the second loop.)  This is done through a "sign extension", to       *
*   exactly mimic the overflow behavior of the original C code.             *
*                                                                           *
*   For speed, we twin the stack pointer in a spare slot here so that the   *
*   stack restore after the loop can proceed as quickly as possible.        *
* ========================================================================= *

        .asg            A4,         Ah_io_ptr   ; Horiz Input/output pointer
        .asg            A14,        Av_c1Q1     ; Vert: Intermediate c1 * Q1
        .asg            A6,         Av_c3Q0     ; Vert: Intermediate c3 * Q0
        .asg            A8,         Av_c7Q1     ; Vert: Intermediate c7 * Q1
        .asg            B6,         Bv_c1S1     ; Vert: Intermediate c1 * S1
        .asg            B3,         Bv_c5S0     ; Vert: Intermediate c5 * S0
        .asg            B5,         Bv_c7S1     ; Vert: Intermediate c7 * S1
        .asg            A8,         Av_F1       ; Vert: Freq. domain term F1
        .asg            A5,         Av_F2       ; Vert: Freq. domain term F2
        .asg            B4,         Bv_F3       ; Vert: Freq. domain term F3
        .asg            A9,         Av_F5       ; Vert: Freq. domain term F5
        .asg            B4,         Bv_F7       ; Vert: Freq. domain term F7
        .asg            A8,         Av_F1t      ; Vert: Trunc. result for F1
        .asg            A5,         Av_F2t      ; Vert: Trunc. result for F2
        .asg            B7,         Bv_F3t      ; Vert: Trunc. result for F3
        .asg            A9,         Av_F5t      ; Vert: Trunc. result for F5
        .asg            B5,         Bv_F7t      ; Vert: Trunc. result for F7
; The Av_* / Bv_* / Ah_* names above are the bridge names: they pin the last
; vertical column's values to fixed registers so they stay addressable after
; the horizontal loop's .asg table re-binds the plain A_* / B_* names.
;
; The four packets below drain the store stage of the last column.  None of
; the stores is predicated and B_o_ptr is no longer post-incremented, so the
; remaining (v) stores in the horizontal prolog use offsets relative to
; row 3 of that column.  Two pieces of inter-loop work are folded in:
;   * B15 is recovered from IRP, and the caller's IRP value is then loaded
;     from SP[4] into A2 for the later "MVC A2, IRP".
;   * Ah_io_ptr is moved back by 54 bytes (27 halfwords).
;     NOTE(review): this presumably steps from the vertical loop's final,
;     over-advanced load pointer to element 6 of the last row of the last
;     block, which is where the horizontal prolog's first loads expect it
;     (f2 at [-4], f3 at [-3], f6 at [0]) -- confirm.
;-
        SHR     .S1     A_F6,       13,         A_F6t           ;[22,4]
||      MPYLH   .M2     B_S1,       B_c1c7,     Bv_c1S1         ;[24,4]
||      MPY     .M1X    A_Q0,       B_c3c5,     A_c5Q0          ;[22,4]
||      STH     .D2T2   B_F4,       *+B_o_ptr  [ 8]             ;[23,4]

        MPY     .M2     B_S0,       B_c3c5,     Bv_c5S0         ;[22,4]
||      MPYLH   .M1X    A_Q0,       B_c3c5,     Av_c3Q0         ;[23,4]
||      STH     .D2T1   A_F6t,      *+B_o_ptr  [24]             ;[26,4]
||      MVC     .S2     IRP,        B15             ; Recover stack pointer
;-
        STH     .D2T2   B_F0,       *-B_o_ptr  [24]             ;[24,4]
||      MPY     .M2     B_S1,       B_c1c7,     B_c7S1          ;[23,4]
||      SUB     .S2X    B_c3S0,     A_c5Q0,     B_F3            ;[24,4]
||      ADD     .L1X    A_c7Q1,     Bv_c1S1,    A_F1            ;[26,4]
||      ADDK    .S1     -54,        Ah_io_ptr   ; Adjust pointer

        SHR     .S2     B_F3,       13,         B_F3t           ;[25,4]
||      LDW     .D2T1   *B15[4],    A2          ; Load IRP's value
||      SHR     .S1     A_F1,       13,         Av_F1t          ;[28,4]
;-
; ========================================================================= ;

; =============== SYMBOLIC REGISTER ASSIGNMENTS: HORIZ LOOP =============== ;
; From this point on the plain A_* / B_* names refer to the registers listed
; here; names that were also used by the vertical loop are re-bound.
; Many names deliberately share one physical register (for example A0 backs
; A_f4, A_g0, A_r0, A_q0c, A_c1Q1 and A_F5t), so the horizontal schedule has
; to keep the live ranges of such names disjoint.  Moving an instruction or
; changing a binding therefore needs a check of every name mapped to the
; same register.
        .asg            A14,        A_c3c5  ; Cosine terms c3, c5   (packed)
        .asg            B1,         B_k_rnd ; Rounding constant 0x7FFF
        .asg            A12,        A_k_rnd ; Rounding constant 0x7FFF
        .asg            B2,         B_io_ptr; Input/output pointer
        .asg            A4,         A_io_ptr; Input/output pointer
        .asg            A7,         A_f0    ; Spatial domain sample f0
        .asg            B13,        B_f1    ; Spatial domain sample f1
        .asg            B3,         B_f2    ; Spatial domain sample f2
        .asg            A10,        A_f3    ; Spatial domain sample f3
        .asg            A0,         A_f4    ; Spatial domain sample f4
        .asg            B7,         B_f5    ; Spatial domain sample f5
        .asg            B9,         B_f6t   ; Spatial domain sample f6 (tmp)
        .asg            B10,        B_f6    ; Spatial domain sample f6
        .asg            A8,         A_f7    ; Spatial domain sample f7
        .asg            A0,         A_g0    ; Node g0 in flow graph
        .asg            B3,         B_g1    ; Node g1 in flow graph
        .asg            B8,         B_h1    ; Node h1 in flow graph
        .asg            A9,         A_h0    ; Node h0 in flow graph
        .asg            A1,         A_s1    ; Node s1 in flow graph
        .asg            B13,        B_h3    ; Node h3 in flow graph
        .asg            B10,        B_g3    ; Node g3 in flow graph
        .asg            A10,        A_q1    ; Node q1 in flow graph
        .asg            A5,         A_p0    ; Node p0 in flow graph
        .asg            B4,         B_p1    ; Node p1 in flow graph
        .asg            B4,         B_s0a   ; Node s0 intermediate value
        .asg            B5,         B_s0b   ; Node s0 intermediate value
        .asg            B5,         B_s0c   ; Node s0 intermediate value
        .asg            B9,         B_s0    ; Node s0 in flow graph
        .asg            A0,         A_r0    ; Node r0 in flow graph
        .asg            B7,         B_r1    ; Node r1 in flow graph
        .asg            B5,         B_q0a   ; Node q0 intermediate value
        .asg            A3,         A_q0b   ; Node q0 intermediate value
        .asg            A0,         A_q0c   ; Node q0 intermediate value
        .asg            A6,         A_q0    ; Node q0 in flow graph
        .asg            A9,         A_Q1    ; Node Q1 in flow graph
        .asg            B8,         B_S1    ; Node S1 in flow graph
        .asg            A6,         A_Q0    ; Node Q0 in flow graph
        .asg            B5,         B_S0    ; Node S0 in flow graph
        .asg            A0,         A_c1Q1  ; Intermediate value c1 * Q1
        .asg            A5,         A_c2r0  ; Intermediate value c2 * r0
        .asg            A3,         A_c3Q0  ; Intermediate value c3 * Q0
        .asg            A9,         A_c5Q0  ; Intermediate value c5 * Q0
        .asg            A3,         A_c6r0  ; Intermediate value c6 * r0
        .asg            A3,         A_c7Q1  ; Intermediate value c7 * Q1
        .asg            B7,         B_c1S1  ; Intermediate value c1 * S1
        .asg            B3,         B_c2r1  ; Intermediate value c2 * r1
        .asg            B4,         B_c3S0  ; Intermediate value c3 * S0
        .asg            B10,        B_c5S0  ; Intermediate value c5 * S0
        .asg            B4,         B_c6r1  ; Intermediate value c6 * r1
        .asg            B4,         B_c7S1  ; Intermediate value c7 * S1
        .asg            B5,         B_F0    ; Frequency domain term F0
        .asg            A6,         A_F1    ; Frequency domain term F1
        .asg            B9,         B_F2    ; Frequency domain term F2
        .asg            B3,         B_F3    ; Frequency domain term F3
        .asg            A1,         A_F4    ; Frequency domain term F4
        .asg            A9,         A_F5    ; Frequency domain term F5
        .asg            A3,         A_F6    ; Frequency domain term F6
        .asg            B7,         B_F7    ; Frequency domain term F7
        .asg            B5,         B_F0r   ; Rounded value for F0
        .asg            A6,         A_F1r   ; Rounded value for F1
        .asg            B3,         B_F2r   ; Rounded value for F2
        .asg            B3,         B_F3r   ; Rounded value for F3
        .asg            A7,         A_F4r   ; Rounded value for F4
        .asg            A9,         A_F5r   ; Rounded value for F5
        .asg            A3,         A_F6r   ; Rounded value for F6
        .asg            B5,         B_F7r   ; Rounded value for F7
        .asg            B6,         B_F0t   ; Truncated result for F0
        .asg            A8,         A_F1t   ; Truncated result for F1
        .asg            B6,         B_F2t   ; Truncated result for F2
        .asg            B4,         B_F3t   ; Truncated result for F3
        .asg            A7,         A_F4t   ; Truncated result for F4
        .asg            A0,         A_F5t   ; Truncated result for F5
        .asg            A5,         A_F6t   ; Truncated result for F6
        .asg            B13,        B_F7t   ; Truncated result for F7
        .asg            A2,         A_o     ; Outer loop counter
        .asg            B0,         B_c     ; Prolog collapse counter
        .asg            A1,         A_c     ; Prolog collapse counter copy
; ========================================================================= ;

* ========================================================================= *
*   (Instructions marked "(v)" in the prolog below are from the vertical    *
*   loop's epilog.)                                                         *
* ========================================================================= *

; =========================== PIPE LOOP PROLOG ============================ ;
; First-row loads for the horizontal (row) pass, interleaved with the last
; vertical-column stores (marked "(v)") and with loop setup:
;   * A_io_ptr enters pointing at element 6 of the row to process;
;     B_io_ptr = A_io_ptr - 12 bytes points at element 0.  After their
;     post-decrements (7 and 8 halfwords) they sit at element 7 and
;     element 0 of the preceding row, which is what the kernel expects.
;   * f7 of the first row is not loaded.  The vertical F7 is shifted left
;     by 3 and then right by 16, which equals (F7 >> 13) truncated to 16
;     bits and sign-extended, i.e. what an STH/LDH pair would have produced.
;   * B_c = B_k1c0 with its low half cleared = 0x10000, the seed for the
;     kernel's prolog-collapse counters.
;   * A15 is re-twinned to B15 (both now point at the frame base), the
;     caller's IRP is restored from A2, and the trip count is reloaded from
;     SP[3] into A_o (the same register, A2, once IRP has been written).
        LDH     .D1T2   *-A_io_ptr  [ 4],       B_f2            ;[ 1,1]
||      SUB             A_io_ptr,   12,         B_io_ptr
||      STH     .D2T1   Av_F1t,     *-B_o_ptr  [16]             ;[30,4] (v)
||      SHR     .S1     Av_F2,      13,         Av_F2t          ;[25,4] (v)

        LDH     .D1T1   *-A_io_ptr  [ 3],       A_f3            ;[ 2,1]
||      LDH     .D2T2   *+B_io_ptr  [ 5],       B_f5            ;[ 2,1]
||      SUB     .L2X    Bv_c7S1,    Av_c1Q1,    Bv_F7           ;[25,4] (v)
||      ADD     .L1X    Av_c3Q0,    Bv_c5S0,    Av_F5           ;[25,4] (v)
;-
        LDH     .D2T1   *+B_io_ptr  [ 4],       A_f4            ;[ 3,1]
||      LDH     .D1T2   * A_io_ptr--[ 7],       B_f6t           ;[ 3,1]
||      MVK     .S1     0x7FFF,     A_k_rnd         ; Rounding value
||      MVK     .S2     0x7FFF,     B_k_rnd         ; Rounding value

        LDH     .D2T1   * B_io_ptr--[ 8],       A_f0            ;[ 5,1]
||      LDH     .D1T2   *+A_io_ptr  [ 2],       B_f1            ;[ 4,1]
||      SHR     .S1     Av_F5,      13,         Av_F5t          ;[27,4] (v)

;-
        SHL     .S1X    Bv_F7,      3,          A_f7            ;[29,4] (v)
||      STH     .D2T2   Bv_F3t,     * B_o_ptr  [ 0]             ;[28,4] (v)

        CLR     .S2     B_k1c0,     0,15,       B_c ; Prolog collapse: 0x10000
||      MV      .L1X    B_c3c5,     A_c3c5          ; Twin constant register
||      STH     .D2T1   Av_F5t,     *+B_o_ptr  [16]             ;[29,4] (v)

        MV      .L1X    B15,        A15             ; Twin stack pointer
||      MVC     .S2X    A2,         IRP             ; Restore IRP
||      STH     .D2T1   Av_F2t,     *-B_o_ptr  [ 8]             ;[27,4] (v)
;-

        ADD     .L1     A_f3,       A_f4,       A_h0            ;[ 8,1]
||      ADD     .S2     B_f2,       B_f5,       B_h1            ;[ 8,1]
||      SUB     .L2     B_f2,       B_f5,       B_g3            ;[ 9,1]
||      LDW     .D2T1   *B15[3],    A_o             ; Unspill loop count

        SUB     .L1     A_f3,       A_f4,       A_q1            ;[ 9,1] q1=g2
||      ADD     .L2     B_f1,       B_f6t,      B_g1            ;[ 9,1]
||      SUB     .S2     B_f1,       B_f6t,      B_h3            ;[ 9,1]
||      SHR     .S1     A_f7,       16,         A_f7            ; Sign-ext f7
;-
; =========================== PIPE LOOP KERNEL ============================ ;
; Horizontal (row) pass: ten execute packets per trip, one row per trip,
; walking backwards through memory (both pointers are post-decremented by
; eight halfwords per trip).  Tags are [cycle,iter] as in the vertical loop:
;   tag 3 -- loads and first add/sub        (cycles  1.. 9)
;   tag 2 -- butterflies and multiplies     (cycles 10..19)
;   tag 1 -- rounding, truncation, stores   (cycles 20..28)
;
; Rounding/truncation forms used here:
;   * odd terms and F2/F6: add 0x7FFF (A_k_rnd / B_k_rnd), then keep the
;     upper halfword, either with SHR 16 or with MPYH against k1c0, whose
;     upper half is the constant 1.
;   * F0 and F4: add a small constant, then SHR 3.
;     NOTE(review): F0 adds 6 while F4 adds 4; confirm this asymmetry
;     against the reference C code in the file header before changing it.
;
; Control registers:
;   A_o -- trip counter; the branch in h_loop_4 has its five delay slots in
;          h_loop_5..h_loop_9, and A_o is decremented in h_loop_9.
;   B_c -- prolog-collapse flag, seeded with 0x10000.  The MPYUS in
;          h_loop_6 multiplies its (zero) low half by 2, so B_c becomes 0
;          during the first trip and stays 0.
;   A_c -- delayed copy of the flag: MPYHL in h_loop_6 multiplies the upper
;          half of B_c by the low half of A_c2c6, giving a nonzero value on
;          the first trip and 0 afterwards.  It gates the two stores in
;          h_loop_8, which follow the point where B_c has already cleared.
h_loop:
  [!B_c]STH     .D1T2   B_F0t,      *+A_io_ptr[ 9]              ;[20,1]
||      MPY     .M1     A_Q0,       A_c3c5,     A_c5Q0          ;[20,1]
||      MPYLH   .M2     B_S0,       B_c3c5,     B_c3S0          ;[20,1]
||      ADD     .S1X    A_c7Q1,     B_c1S1,     A_F1            ;[20,1]
||      SUB     .S2X    B_c7S1,     A_c1Q1,     B_F7            ;[20,1]
||      ADD     .L1     A_f0,       A_f7,       A_g0            ;[10,2]
||      ADD     .D2     B_h3,       B_g3,       B_s0a           ;[10,2]
||      SUB     .L2     B_h3,       B_g3,       B_q0a           ;[10,2]
;-
h_loop_1:
        ADD     .L2     B_F7,       B_k_rnd,    B_F7r           ;[21,1]
||[!B_c]STH     .D2T1   A_F6t,      *+B_io_ptr[22]              ;[21,1]
||      SUB     .S2     B_g1,       B_h1,       B_r1            ;[11,2]
||      MPYSU   .M2     B_s0a,      B_k1c0,     B_s0b           ;[11,2]
||      MPYSU   .M1X    B_q0a,      A_k1c0,     A_q0b           ;[11,2]
||      ADD     .S1     A_g0,       A_h0,       A_p0            ;[11,2]
||      SUB     .L1     A_g0,       A_h0,       A_r0            ;[11,2]
||      LDH     .D1T2   *-A_io_ptr  [ 2],       B_f5            ;[ 1,3]
;-
h_loop_2:
        SUB     .S2X    B_c3S0,     A_c5Q0,     B_F3            ;[22,1]
||      ADD     .S1X    A_c3Q0,     B_c5S0,     A_F5            ;[22,1]
||      ADD     .L1     A_F1,       A_k_rnd,    A_F1r           ;[22,1]
||      MPYH    .M2     B_F7r,      B_k1c0,     B_F7t           ;[22,1]
||      ADD     .L2     B_g1,       B_h1,       B_p1            ;[12,2]
||      MPY     .M1     A_r0,       A_c2c6,     A_c6r0          ;[12,2]
||      LDH     .D1T1   *-A_io_ptr  [ 4],       A_f3            ;[ 2,3]
||      LDH     .D2T2   *+B_io_ptr  [ 2],       B_f2            ;[ 2,3]
;-
h_loop_3:
        ADD     .S2     B_F3,       B_k_rnd,    B_F3r           ;[23,1]
||      SUB     .S1X    A_p0,       B_p1,       A_F4            ;[13,2]
||      ADD     .L1     A_q0b,      A_k_rnd,    A_q0c           ;[13,2]
||      ADD     .L2     B_s0b,      B_k_rnd,    B_s0c           ;[13,2]
||      MPYLH   .M2X    B_r1,       A_c2c6,     B_c2r1          ;[13,2]
||      MPYLH   .M1     A_r0,       A_c2c6,     A_c2r0          ;[13,2]
||      LDH     .D2T1   *+B_io_ptr  [ 4],       A_f4            ;[ 3,3]
||      LDH     .D1T2   *-A_io_ptr  [ 1],       B_f6            ;[ 3,3]
;-
h_loop_4:
        ADD     .L1     A_F5,       A_k_rnd,    A_F5r           ;[24,1]
||      SHR     .S2     B_F3r,      16,         B_F3t           ;[24,1]
||[ A_o]B       .S1     h_loop                                  ;[24,1]
||      ADD     .L2X    A_p0,       B_p1,       B_F0            ;[14,2]
||      MPYH    .M2     B_s0c,      B_k1c0,     B_s0            ;[14,2]
||      MPYH    .M1     A_q0c,      A_k1c0,     A_q0            ;[14,2]
||      LDH     .D1T1   * A_io_ptr--[ 8],       A_f7            ;[ 4,3]
||      LDH     .D2T2   *+B_io_ptr  [ 1],       B_f1            ;[ 4,3]
;-
h_loop_5:
        ADD     .S2     B_F2,       B_k_rnd,    B_F2r           ;[25,1]
||      MPYH    .M1     A_F1r,      A_k1c0,     A_F1t           ;[25,1]
||      ADD     .D1     A_F4,       4,          A_F4r           ;[15,2]
||      SUB     .S1X    A_c6r0,     B_c2r1,     A_F6            ;[15,2]
||      SUB     .L1     A_f0,       A_f7,       A_s1            ;[15,2] s1=h2
||      ADD     .L2     B_F0,       6,          B_F0r           ;[15,2]
||      MPY     .M2X    B_r1,       A_c2c6,     B_c6r1          ;[15,2]
||      LDH     .D2T1   * B_io_ptr--[ 8],       A_f0            ;[ 5,3]
;-
h_loop_6:
        SHR     .S1     A_F5r,      16,         A_F5t           ;[26,1]
||      SHR     .S2     B_F2r,      16,         B_F2t           ;[26,1]
||[!B_c]STH     .D2T2   B_F3t,      *+B_io_ptr[27]              ;[26,1]
||      SUB     .L1     A_q1,       A_q0,       A_Q0            ;[16,2]
||      ADD     .L2X    A_s1,       B_s0,       B_S1            ;[16,2]
||      ADD     .D1     A_q1,       A_q0,       A_Q1            ;[16,2]
||      MPYUS   .M2     B_c,        2,          B_c             ;pro. collapse
||      MPYHL   .M1X    B_c,        A_c2c6,     A_c             ;pro. collapse
;-
h_loop_7:
  [!B_c]STH     .D1T1   A_F5t,      *+A_io_ptr[22]              ;[27,1]
||[!B_c]STH     .D2T2   B_F2t,      *+B_io_ptr[26]              ;[27,1]
||      ADD     .L1     A_F6,       A_k_rnd,    A_F6r           ;[17,2]
||      SHR     .S2     B_F0r,      3,          B_F0t           ;[17,2]
||      SHR     .S1     A_F4r,      3,          A_F4t           ;[17,2]
||      SUB     .L2X    A_s1,       B_s0,       B_S0            ;[17,2]
||      MPY     .M2     B_S1,       B_c1c7,     B_c7S1          ;[17,2]
||      MPY     .M1X    A_Q1,       B_c1c7,     A_c7Q1          ;[17,2]
;-
h_loop_8:
  [!A_c]STH     .D1T1   A_F1t,      *+A_io_ptr[18]              ;[28,1]
||[!A_c]STH     .D2T2   B_F7t,      *+B_io_ptr[31]              ;[28,1]
||      SHR     .S1     A_F6r,      16,         A_F6t           ;[18,2]
||      ADD     .L2X    B_c6r1,     A_c2r0,     B_F2            ;[18,2]
||      MPYLH   .M2     B_S1,       B_c1c7,     B_c1S1          ;[18,2]
||      MPYLH   .M1X    A_Q1,       B_c1c7,     A_c1Q1          ;[18,2]
||      ADD     .L1     A_f3,       A_f4,       A_h0            ;[ 8,3]
||      ADD     .S2     B_f2,       B_f5,       B_h1            ;[ 8,3]
;-
h_loop_9:
  [ A_o]SUB     .S1     A_o,        1,          A_o             ;[19,2]
||[!B_c]STH     .D1T1   A_F4t,      *+A_io_ptr[13]              ;[19,2]
||      MPYLH   .M1     A_Q0,       A_c3c5,     A_c3Q0          ;[19,2]
||      MPY     .M2     B_S0,       B_c3c5,     B_c5S0          ;[19,2]
||      SUB     .L1     A_f3,       A_f4,       A_q1            ;[ 9,3] q1=g2
||      ADD     .L2     B_f1,       B_f6,       B_g1            ;[ 9,3]
||      SUB     .D2     B_f1,       B_f6,       B_h3            ;[ 9,3]
||      SUB     .S2     B_f2,       B_f5,       B_g3            ;[ 9,3]
; =========================== PIPE LOOP EPILOG ============================ ;
; EPILOG:
;-
* ========================================================================= *
*   Epilog / Final Cleanup Code.                                            *
*                                                                           *
*   This code performs the final stores from the epilog while restoring     *
*   Save-On-Entry values from the stack.  The two processes are heavily     *
*   interwoven in the interest of speed.  For instance, the return addr.    *
*   is loaded immediately and branched to as soon as it lands in the        *
*   register file.  Meanwhile, the final epilog stores complete as the      *
*   return-branch is taken.                                                 *
*                                                                           *
*   Note that a handful of symbolic names have been reassigned in the       *
*   epilog to avoid interfering with the values being loaded from the       *
*   stack.                                                                  *
* ========================================================================= *
        .asg            B5,         B_F7t
        .asg            B9,         B_F2r
        .asg            B8,         B_F3
        .asg            B8,         B_F3r
        .asg            A9,         A_F5t
; Why the five names above are re-bound: their kernel registers are reloaded
; from the stack below while the last row is still being finished.
;   B_F7t      B13 -> B5   (B13 is restored from SP[9])
;   B_F2r      B3  -> B9   (B3 receives the return address)
;   B_F3/F3r   B3  -> B8   (same reason)
;   A_F5t      A0  -> A9   (A0 carries the saved CSR until the final MVC)
;
; A15 equals B15 here (re-twinned in the horizontal prolog), so both .D
; units address the frame with positive offsets.  The pointers are no
; longer post-decremented, so the store offsets below differ from the
; kernel's.  The return branch is issued as soon as B3 has arrived; the
; five execute packets after it are its delay slots, and the CSR write sits
; in the last of them.

        MPY     .M1     A_Q0,       A_c3c5,     A_c5Q0
||      MPYLH   .M2     B_S0,       B_c3c5,     B_c3S0
||      ADD     .S1X    A_c7Q1,     B_c1S1,     A_F1
||      SUB     .S2X    B_c7S1,     A_c1Q1,     B_F7
||      ADD     .L2     B_F2,       B_k_rnd,    B_F2r
||      LDW     .D2T1   *+ B15[ 2], A0              ; Load CSR's value
||      LDW     .D1T2   *+ A15[ 5], B3              ; Load return address
;-
        ADD     .L2     B_F7,       B_k_rnd,    B_F7r
||      ADD     .L1     A_F1,       A_k_rnd,    A_F1r
||      LDW     .D2T2   *+ B15[ 8], B11             ; Restore B11
||      LDW     .D1T1   *+ A15[13], A13             ; Restore A13

        MPYH    .M2     B_F7r,      B_k1c0,     B_F7t
||      MPYH    .M1     A_F1r,      A_k1c0,     A_F1t
||      LDW     .D1T2   *+ A15[ 6], B10             ; Restore B10
||      LDW     .D2T1   *+ B15[ 7], A10             ; Restore A10
;-
        ADD     .S1X    A_c3Q0,     B_c5S0,     A_F5
||      SUB     .S2X    B_c3S0,     A_c5Q0,     B_F3
||      LDW     .D1T2   *+ A15[14], B14             ; Restore B14
||      LDW     .D2T1   *+ B15[15], A14             ; Restore A14

        ADD     .L2     B_F3,       B_k_rnd,    B_F3r
||      ADD     .L1     A_F5,       A_k_rnd,    A_F5r
||      LDW     .D1T2   *+ A15[10], B12             ; Restore B12
||      LDW     .D2T1   *+ B15[11], A12             ; Restore A12
;-
        B       .S2     B3                          ; Return to caller
||      LDW     .D2T1   *+ B15[12], A11             ; Restore A11

        SHR     .S2     B_F3r,      16,         B_F3t
||      LDW     .D2T1   *++B15[16], A15             ; Rst. A15, release stack
||      LDW     .D1T2   *+ A15[ 9], B13             ; Restore B13

        STH     .D1T1   A_F1t,      *+A_io_ptr[10]
||      STH     .D2T2   B_F7t,      *+B_io_ptr[23]
||      SHR     .S1     A_F5r,      16,         A_F5t
;-
        STH     .D2T1   A_F6t,      *+B_io_ptr[22]
||      STH     .D1T2   B_F0t,      *+A_io_ptr[ 9]

        SHR     .S2     B_F2r,      16,         B_F2t
||      STH     .D2T2   B_F3t,      *+B_io_ptr[19]

        STH     .D1T1   A_F5t,      *+A_io_ptr[14]
||      STH     .D2T2   B_F2t,      *+B_io_ptr[18]
||      MVC     .S2X    A0,         CSR             ; Restore CSR
;-
; ===== Interruptibility state restored here =====
; ===== Branch Occurs =====

* ========================================================================= *
*   End of file:  fdct_8x8_h.asm                                            *
* ------------------------------------------------------------------------- *
*             Copyright (c) 1999 Texas Instruments, Incorporated.           *
*                            All Rights Reserved.                           *
* ========================================================================= *
