;------------------------------------------------------------------
; FFT1024.ASM
; Keith Larson
; TMS320 DSP Applications
; (C) Copyright 1995,1996
; Texas Instruments Incorporated
;
; This is unsupported freeware with no implied warranties or
; liabilities.  See the disclaimer document for details
;
; This application is written to run with the DOS executable FFT_1024.EXE
; The FFT code used in this example is written for readability and ease of
; understanding not for size and speed.  It can be easily optimized!
;
;Floating Point Pack Routines
;----------------------------
; FFT1024.ASM packs a complete 1024 point complex radix-2 FFT application
; into the on-chip memory of the C31 without even requiring the removal
; of the communications and debug kernel.  This is accomplished by packing
; the FFT's  real/imaginary data arrays into 16 bit short floats which have
; either a 8:1:7 or 4:1:11 (exponent:sign:mantissa) format depending on an
; assembler switch in the code (default is 4:1:11).
;
; NOTES: Choosing the 8:1:7 format will significantly speed up the FFT
;        at the expense of signal to noise ratio.  However, since most of
;        the time is spent collecting the input data or moving the data
;        to the PC there will be very little actual performance gain.
;
;        A 2048 point Real FFT is possible but will require removing the
;        debugger functions from the communications kernel to fit in the
;        extra code.  For FFT1024.ASM the code and unmodified communications
;        kernel barely fit.
;
;FFT Windowing
;-------------
; By using a post convolution of the window function after the FFT is
; complete a table for the window function is not required. Furthermore
; the frequency domain coefficients for a raised cosine window are very
; simple (-0.5,+1.0,-0.5).
;
;Bit Reversal of Twiddle Table
;-----------------------------
; By bit-reversing the twiddle tables the size of an FFT is not dependent
; on the size of the table loaded.  In this case simply redefining the
; FFT size will result in a correctly coded FFT.
;
;STARTUP STUB
;------------
; The initialization code which is used only on startup is placed inside
; the volatile data memory array to gain back internal memory.
;
;SAMPLING RATES
;--------------
; The AIC sampling rates can be set to any combination which can still
; communicate with the DSP.  Some combinations will work as high as 130 KHz
; with substantial performance degradation.  Experimentation is required to
; find values that work best, or at all, since some will crash the
; communications entirely.
;
;HOST SYNCHRONIZATION
;--------------------
; Since the host should not disturb the ADC while data is being collected
; an interlock is used to keep the host from timing out.  This application
; uses a message box.
;
;SERIAL PORT REFRESH
;-------------------
; If the serial port is not serviced once every frame synch an underrun
; condition will occur and the port will shut down.  This is important for
; the transmit section since putting a new value in the DXR will cause the
; previous value to move to the shift register regardless of the state
; of FSX.  Since the bit patterns are data dependent and in this case shifted
; by an unknown amount the AIC could be reprogrammed if steps are not taken
; to avoid a bad restart.  This is done by placing a zero in the transmitter
; whenever the serial port and AIC are likely to underrun.
;
; To prevent this the communications kernel is written such that the serial
; port transmit channel is updated with zero whenever the kernel enters the
; wait for command loop 'spin0'.  When spin0 is exited (RUN command) the
; serial port is restarted with the original value.
;-------------------------------------------------------------------------
         .include "C3XMMRS.ASM"       ; presumably supplies the S0_*/T0_*/T1_* names used below - confirm
;--------------------------------------------------
; AIC counter values.  TA/RA are packed into A_REG and TB/RB into
; B_REG further down in this file.  They must match the values the
; host side application starts the AIC with.
;--------------------------------------------------
TA       .set   12                    ; Use AIC startup == host side app
TB       .set   14
RA       .set   12
RB       .set   14
;--------------------------------------------------
; FFT geometry and the constants used to build the twiddle table
;--------------------------------------------------
N        .set   1024                  ; FFT length in complex points
Samples  .set   N                     ; not referenced in the visible source
N2       .set   N/2                   ; half the FFT length
N4       .set   N/4                   ; repeat count of each twiddle .loop
PI       .set   3.141592654
PIN      .set   PI/N                  ; angle step applied to the bit-reversed table index
;==================================================
; Create the Twiddle, FFT and I/O buffer arrays
;==================================================
;--------------------------------------------------
; DATA_ARRAY : base of the packed REAL/IMAG data buffer.  The one-time
;              startup stub is also assembled here (see "STUB").
; TWID_ARRAY : base of the packed twiddle table built below.
;
; The table holds 2*N4 = N/2 entries, one (cos,-sin) pair per entry.
; The angle of the entry at offset k from TW is br(k,N)*PIN, so the
; table is stored in bit-reversed order and is walked at run time with
; bit-reversed addressing (*AR2++(IR0)B in the butterfly loop).
; NOTE(review): br() is taken to be the assembler's bit-reverse
; function and .pfloat16/.psfloat to pack two short floats into one
; word - confirm against the DSK3A documentation.
;--------------------------------------------------
DATA_ARRAY .set  0x809800
TWID_ARRAY .set  0x809C00

         .start  "TWIDDLES",TWID_ARRAY
         .sect   "TWIDDLES"
;-------------------------------------------
; Set F16 to 1 for a [8:1: 7] float format
; -or-       0 for a [4:1:11] float format
;-------------------------------------------
F16      .set      0  ;<- Change this value
;-------------------------------------------
  .if F16

TW
TWSCALE  .set      1.0      ; 8:1:7 twiddles are stored unscaled
Cnt      .sdef     0.0      ; angle index of the first entry
         .loop     N4
         .pfloat16 cos(Cnt*PIN),-sin(Cnt*PIN)
Cnt      .sdef     br($-TW,N)   ; bit-reversed index of the next entry
         .endloop

         .loop     N4
         .pfloat16 cos(Cnt*PIN),-sin(Cnt*PIN)
Cnt      .sdef     br($-TW,N)   ; bit-reversed index of the next entry
         .endloop

  .else
;----------------------------------------------------------
; 1) An sdef definition can eliminate some calculations,
;    speeding up DSK3A
; 2) Each code loop is limited to 256x to prevent accidentally
;    overwriting the entire hard drive space
;- - - - - - - - - - - - - - - - - - - - - - - - - - - - -
;         .loop     N4
;         .psfloat  TWSCALE*cos(Cnt),-TWSCALE*sin(Cnt)
;Cnt      .sdef     br($-TW,N)*PIN
;          .endloop
;----------------------------------------------------------
TW
TWSCALE  .set      256.0    ; 4:1:11 twiddles are stored times 256; TSCALE undoes this at run time

Cnt      .sdef     0.0      ; only the commented-out loop above uses Cnt
         .loop     N4       ; Loop is limited to 256 times...
         .psfloat  TWSCALE*cos(br($-TW,N)*PIN),-TWSCALE*sin(br($-TW,N)*PIN)
         .endloop
         .loop     N4       ; loop next 256 times
         .psfloat  TWSCALE*cos(br($-TW,N)*PIN),-TWSCALE*sin(br($-TW,N)*PIN)
         .endloop
  .endif
          .start  "FFTCODE",0x809E00  ; 256 words before comm kernel
          .sect   "FFTCODE"           ;
;==================================================
; Variables at fixed addresses from the start of FFTCODE.  The
; addresses in the comments follow from the .start address above and
; one word per entry.
;==================================================
MSG_BOX   .word  0                 ; 0x809E00  cleared by main, polled at NO_START; non-zero restarts
LOAD      .word  1                 ; 0x809E01  non-zero at restart => AIC_INIT is called again
A_REG     .word  (TA<<9)+(RA<<2)+0 ; 0x809E02  sent to the AIC by AIC_INIT
B_REG     .word  (TB<<9)+(RB<<2)+2 ; 0x809E03  sent to the AIC by AIC_INIT
C_REG     .word  00000011b         ; 0x809E04  +/- 1.5 V
FFTSIZE   .word  N                 ; 0x809E05
FFTSIZE2  .word  N/2               ; 0x809E06
;
TW_ADDR   .word TWID_ARRAY      ; 0x809E07  IMAG/REAL twiddle values
DR_ADDR   .word DATA_ARRAY      ; 0x809E08  REAL/IMAG data
DR_ADDR2  .word DATA_ARRAY+256  ; 0x809E09  swap array used for inplace bit reversal

;--------------------------------------
; pfloat16 pack/unpack
;   pack F6:F7 into F6
;   unpack R4 into F4:F5
;
; Pack   : in  R6,R7 = two floats        out R6 = one packed word
;          (R6 value in the upper 16 bits, R7 value in the lower 16)
; unpack : in  R4    = one packed word   out R4 = upper value, R5 = lower
;
; Both variants move a float's bit image to the integer side (and
; back) through the stack with pushf/pop and push/popf.
;--------------------------------------
  .if F16
;--------------------------------------
; 8:1:7 variant: each half word keeps the 8 bit exponent, the sign
; and 7 mantissa bits.
; NOTE(review): Pack relies on integer shifts/OR changing only the
; low 32 bits of R6 while its exponent is kept - confirm against the
; C3x user's guide.
;--------------------------------------
Min_sf    .float 1.0            ; input scale used by main (no scaling in this mode)
DMASK     .word 0xFF000000      ; chop down to 8 bit mantissa
Pack      lsh   -24,R6          ; keep only the top 8 bits of R6's
          lsh   24,R6           ; low 32 bit field, clear the rest
          pushf R7              ; move to integer field
          pop   R7              ;
          lsh   -8,R7           ; move to correct position
          or    R7,R6           ; concatenate
          pushf R6              ; store as a float image ...
          pop   R6              ; ... and reload as the packed integer
          rets                  ;
;--------------------------------------
unpack    ldi    R4,R5          ; copy so both halves can be extracted
          push   R4             ; Convert to 32 bit float
          popf   R4             ;
          lsh    16,R5          ; lower half -> upper 16 bits
          push   R5             ;
          popf   R5             ;
          and    @DMASK,R4      ; drop the bits belonging to the other half
          and    @DMASK,R5      ;
          rets                  ;
  .else
;--------------------------------------  11111000
; pack F6:F7 into pfloat16 R6
;
; 4:1:11 variant: each half word keeps bits 27..12 of the 32 bit float
; image, i.e. the low 4 exponent bits, the sign and 11 mantissa bits.
; Clip_sf first limits the magnitude so the exponent fits in 4 bits.
;--------------------------------------

TMASK     .word 0xFFF00000      ; chop down to 12 bit mantissa
Max_sf    .word 0x077FF000     ; max magnitude for 4:1:11 format
Min_sf    .word 0xF8000001     ; min magnitude for 4:1:11 format
;
; Clip_sf: in/out R7.  Clamps |R7| to [Min_sf, Max_sf] and puts the
; sign back.  Uses R5 for the sign (+1.0 / -1.0); Pack saves R5.
Clip_sf   cmpf  0,R7            ; remember the sign of R7 ...
          ldfge 1,R5            ; ... as +1.0
          ldflt -1,R5           ; ... or -1.0
          absf  R7,R7           ; work on the magnitude
          cmpf  @Max_sf,R7      ; too big?  clamp to Max_sf
          ldfge @Max_sf,R7
          cmpf  @Min_sf,R7      ; too small (including 0)?  raise to Min_sf
          ldflt @Min_sf,R7
          mpyf  R5,R7           ; restore the sign
          rets
          ;- - - - - - - - - - -
Pack      pushf R5               ; preserve the caller's R5
          pushf R7               ; save second value while R7 is used for the first
          ldf   R6,R7            ; clip the first value (Clip_sf works on R7)
          call  Clip_sf
          ldf   R7,R6
          popf  R7               ; clip the second value
          call  Clip_sf
          pushf R6               ; float image -> integer
          pop   R6               ;
          lsh   -12,R6           ; move to upper 16 bits (lsbs = 0)
          lsh   16,R6            ;
          pushf R7               ; float image -> integer
          pop   R7               ; move to lower 16 bits
          lsh   4,R7             ;
          lsh   -16,R7           ; concatenate
          or    R7,R6            ;
          popf  R5               ; restore the caller's R5
          rets                   ;
          ;- - - - - - - - - - -
unpack    ldi    R4,R5          ; copy so both halves can be extracted
          ash    -4,R4          ; Sign extend the exponent
          push   R4             ; Convert to 32 bit float
          popf   R4             ;
          lsh    16,R5          ; lower half -> upper 16 bits
          ash    -4,R5          ; Sign extend the exponent
          push   R5             ;
          popf   R5             ;
          and    @TMASK,R4      ; drop the bits belonging to the other half
          and    @TMASK,R5
          rets                  ;
  .endif
;--------------------------------------
;--------------------------------------
; main - clear the host flags, then collect FFTSIZE ADC samples.
; Each sample is scaled by Min_sf, paired with a 0.0 imaginary part,
; packed and stored at DR_ADDR onwards.  Falls through into FFT.
;--------------------------------------
TSCALE    .float 1.0/TWSCALE    ; undoes the TWSCALE factor applied to the stored twiddles
main      ldi   0x30,IE         ; enable the two serial port interrupts (see GETADC)
          ldi   0,R0            ; Clear flags,DXR etc...
          sti   R0,@MSG_BOX     ;
          sti   R0,@LOAD        ;
          sti   R0,@S0_xdata    ;
          sti   R0,@S0_xdata    ; NOTE(review): same store repeated - confirm whether intentional
          sti   R0,@FLAGS       ;
          idle                  ; Flush first ADC value (might be trash)
          ;---------------------
          ldi   @DR_ADDR,AR0    ; AR0 -> start of the data array
          ldi   @FFTSIZE,RC     ; Now get samples
          subi  1,RC            ; RC+1 repeats
          rptb  samples         ;
          ;---------------------
          call  GETADC          ; Get ADC value
          float R0,R6           ; convert to float
          mpyf  @Min_sf,R6      ; scale the input by Min_sf (1.0 in 8:1:7 mode)
          ldf   0,R7            ; pack ADC:0.0 and store in R:I data
          call  Pack            ;
samples   sti   R6,*AR0++       ;
          ;---------------------
          ldi   0,R0            ; Put 0 into the DXR when it is not
          sti   R0,@S0_xdata    ; going to be used for awhile
          ldi   0,IE            ; Turn off all (ADC) interrupts
;==========================================================
; Perform FFT
;
; NOTE: This FFT is written with the intent of fitting the
;       maximum size possible on chip.  To do this the FFT relies
;       on data packing and unpacking routines which substantially
;       slow down the FFT.  However since the host cannot typically
;       transfer and display the results as fast as the DSP the
;       impact is not much.
;----------------------------------------------------------
;----------------------------------------------------------
; FFT - in-place radix-2 FFT over the packed data array.
;
; Entered by falling through from main; leaves through FFT_END once
; the butterfly span has been halved down to zero.
;
;   AR0 = packed REAL/IMAG data pointer (top of the current butterfly)
;   AR2 = twiddle pointer, stepped with bit-reversed addressing
;   IR1 = span between the top and bottom element of a butterfly
;   IR0 = bit-reverse step for the twiddle pointer (IR1/2)
;   R1:R3 = top REAL:IMAG     R0:R2 = bottom REAL:IMAG
;   R4:R5 = twiddle pair      R6:R7 = values handed to Pack
;
; Every stage runs FFTSIZE/2 butterflies in one repeat block.  The end
; of a block of butterflies is detected by the twiddle pointer
; wrapping back to TW_ADDR, at which point AR0 skips over the bottom
; half that was just written.
;
; In the 4:1:11 format both outputs are halved in every stage.
;
; The end-of-FFT test uses a standard branch (bz).  The delayed form
; would place rptb in its third delay slot, and the C3x does not allow
; a repeat instruction within the three instructions that follow a
; delayed branch.  The word count is the same either way.
;----------------------------------------------------------
FFT:      ldi   @FFTSIZE2,IR0    ;
          ldi   @FFTSIZE ,IR1    ; IR0 = IR1/2 ... used to BR step twiddles
          ldi   @TW_ADDR,AR2     ; The twiddle base is modified using
                                 ; bit-reversal and wraps around (circular)
New_Stg   ldi   @DR_ADDR,AR0     ;
          lsh   -1,IR1           ; decimate IR0 and IR1 (divide by two)
          lsh   -1,IR0           ;  Note: this does not set flags
          ldi   IR1,R0           ;  Check if index is zero (end of FFT)
          bz    FFT_END          ;  (standard branch: rptb must not sit in a delay slot)
          ldi   @FFTSIZE2,RC     ; Loop for FFTSIZE/2 butterflies
          subi  1,RC             ;
          ;---------------------
Blk_Top   rptb  B_Fly            ; Start by getting all 6 Butterfly inputs
          ;
          ldi    *AR0,R4         ; Top R/I pair
          call   unpack          ; Unpack REAL/IMAG data to R4,R5
          ldf    R4,R1           ; R1 = top REAL
          ldf    R5,R3           ; R3 = top IMAG
          ;
          ldi   *+AR0(IR1),R4    ; Bottom R/I pair
          call   unpack          ; Unpack REAL/IMAG data to R4,R5
          ldf    R4,R0           ; R0 = bottom REAL
          ldf    R5,R2           ; R2 = bottom IMAG
          ;
          ldi   *AR2++(IR0)B,R4  ; Load and unpack twiddles
          call   unpack          ; NOTE: Since R4:R5 pair is used by FFT
          mpyf   @TSCALE,R4      ; remove the TWSCALE factor stored
          mpyf   @TSCALE,R5      ; in the twiddle table

        ; ldf    R4,R4           ;       R4:R5 copy is not required
        ; ldf    R5,R5           ;

          addf3 R0,R1,R6         ; Top sum REAL
          addf3 R2,R3,R7         ; Top sum IMAG
    .if F16
    .else
          mpyf  0.5,R6           ; 4:1:11 only: halve the outputs every stage
          mpyf  0.5,R7
    .endif
          call  Pack             ; Pack R6:R7
          sti   R6,*AR0          ; store Top REAL/IMAG

          subf3 R0,R1,R6         ; R6 = Rt-Rb           (R1 free)
          subf3 R2,R3,R7         ; R7 = It-Ib           (R3 free)
          mpyf3 R6,R4,R1         ; R1 = R*TR
          mpyf3 R7,R5,R3         ; R3 = I*TI
          subf  R3,R1            ; R1 = R1-R3 = R*TR-I*TI
          ldf   R1,R0            ; park the REAL result; R1 is reused below
          ;
          mpyf3 R6,R5,R1         ; R1 = (Rt-Rb)*TI
          mpyf3 R7,R4,R3         ; R3 = (It-Ib)*TR
          addf3 R3,R1,R7         ; R7 = (Rt-Rb)*TI + (It-Ib)*TR
          ;
          ldf   R0,R6            ; R6 = REAL result
        ; ldf   R1,R7            ;
    .if F16
    .else
          mpyf  0.5,R6           ; 4:1:11 only: halve the outputs every stage
          mpyf  0.5,R7
    .endif
          call  Pack             ;
          sti   R6,*+AR0(IR1)    ; Store Bottom REAL/IMAG

          ldi   *++AR0,R0        ; Point to next data address
          ;-----------------------------------------
          ; Identify EOB by twiddle wraparound
          ;-----------------------------------------
          ldi   IR1,R7           ; save the index (test/load will corrupt it)
          cmpi  @TW_ADDR,AR2     ; At end of block the twiddle pointers will
          ldinz 0,IR1            ; have completed the bit-rev circular address
          ldi   *AR0++(IR1),R0   ; Point R/I data pointers to next block
B_Fly     ldi    R7,IR1          ; Restore the index
          b     New_Stg          ; Exit is from top of routine
;======================================================================
; When the FFT is complete a convolution with the response of the desired
; window function is used to filter (clean up) the non-windowed FFT. A
; raised cosine window is used since the coefficients are -0.5,1.0,-0.5.
; After filtering, the R^2 + I^2 magnitude is calculated and packed into
; 4 log scaled bytes for display on the host
;
;    AR0 = output pointer, starts at the base of the data array
;    AR4 = packed REAL/IMAG pointer, started at bin N-1 (unrolled to
;          point to one sample before DC)
;    IR0 = FFTsize/2
;======================================================================
;----------------------------------------------------------------------
; FFT_END - window, convert to log magnitude and hand over to the host.
;
;  Log_Mag : FFTSIZE/8 passes, four bins per pass.  For every bin the
;            three neighbouring packed R/I pairs are combined as
;            mid - 0.5*(previous + next), scaled by VU_scale, squared
;            and summed.  The clipped sum is reduced to one byte and
;            four such bytes are stored as one word (first bin in the
;            lowest byte).
;  BR_DATA : gathers the bit-reverse ordered result words into the
;            scratch area at DR_ADDR2, then copies them back to the
;            start of the data array in linear order.
;  NO_START: waits for the host to write a non-zero MSG_BOX, optionally
;            re-initialises the AIC (LOAD non-zero) and restarts main.
;
;  NOTE(review): the two 'rpts 127' copies and the DATA_ARRAY+256
;  scratch offset are fixed values that match N=1024 (N/8 = 128 result
;  words); they are not derived from N.
;----------------------------------------------------------------------
FFT_END
          ldi   @DR_ADDR,AR0    ; OUTPUT ptr
          ldi   @DR_ADDR,AR4    ; REAL/IMAG ptr
          ldi   @FFTSIZE2,IR0   ; IR0=SIZE/2 for bit-reverse access
          ldi   IR0,R0          ;
          subi  1,R0            ; To unroll the response for DC a bit
          or    IR0,R0          ; reversed add of the twos complement of the
          addi  R0,AR4          ; index is used
Log_Mag   ldi   @FFTSIZE,RC     ;
          lsh   -3,RC           ; Cvrt 1/2 data array at 4 points/loop
          subi  1,RC            ;
          rptb  WINDOW          ;
          ;- - - - - - - - - - -
          ldi   3,AR7           ; AR7 for 4 X loop counter
Loop_LM
          ldi   *AR4++(IR0)B,R4 ; Load three packed R/I pairs into Regs
          call  unpack
          ldf   R4,R0           ; R0:R1 = previous bin
          ldf   R5,R1

          push  AR4             ; Save AR4 of mid value for next loop

          ldi   *AR4++(IR0)B,R4 ;
          call  unpack
          ldf   R4,R2           ; R2:R3 = mid bin
          ldf   R5,R3

          ldi   *AR4++(IR0)B,R4 ; R4:R5 = next bin
          call  unpack
       ;  ldf   R4,R4
       ;  ldf   R5,R5

          pop   AR4             ; back to the mid bin for the next pass

          addf  R0,R4           ; Perform convolution window on REAL data
          mpyf  -0.5,R4         ;
          addf  R2,R4           ; R4 = mid - 0.5*(previous + next)
          mpyf  @VU_scale,R4    ; Scale FFT data growth
          mpyf  R4,R4           ; REAL^2

          addf  R1,R5           ; Perform convolution window on IMAG data
          mpyf  -0.5,R5         ;
          addf  R3,R5           ; R5 = mid - 0.5*(previous + next)
          mpyf  @VU_scale,R5    ;
          mpyf  R5,R5           ; IMAG^2
          ;- - - - - - - - - - -
          addf3 R4,R5,R2        ; REAL^2 + IMAG^2
          cmpf  @MaxL,R2        ; Clip exponent magnitude to 2^16 & 2^-16
          ldfgt @MaxL,R2        ;
          cmpf  @MinL,R2        ;
          ldflt @MinL,R2        ;
          lsh   1,R2            ; Quick log using float equivalency
          pushf R2              ; See Designer Notebook Page DNP-22
          pop   R0              ;
          ;- - - - - - - - - - -
          lsh   -8,R7           ; OR into result
          lsh   -21,R0          ; keep 8 bits of the float image ...
          lsh   24,R0           ; ... and place them in the top byte
          or    R0,R7           ;
          dbu   AR7,Loop_LM     ; Loop until four samples are packed
          lsh   -2,IR0          ; use a quarter step for the output pointer
          sti   R7,*AR0++(IR0)B ; store packed data in free IMAG[] slots
WINDOW    lsh   2,IR0           ; back to the full step for the input pointer
          ;---------------------
          lsh   -2,IR0          ; quarter step again: BR_DATA reads the output slots

BR_DATA   ldi   @DR_ADDR,AR0    ; Convert Bit-Reverse IMAG[] -> Linear REAL[]
          ldi   @DR_ADDR2,AR1    ;
       ;  addi  256,AR1         ;
          ldi   *AR0++(IR0)B,R0 ; prime the first word
          rpts  127             ; repeat the parallel load/store pair 128 times
          ldi   *AR0++(IR0)B,R0 ;
     ||   sti   R0,*AR1++       ;

          ldi   @DR_ADDR2,AR0   ; Move data back to bottom of data array
          ldi   @DR_ADDR,AR1    ;
        ; addi  256,AR0         ;


          ldi   *AR0++,R0       ; prime the first word
          rpts  127             ; repeat the parallel load/store pair 128 times
          ldi   *AR0++,R0       ;
     ||   sti   R0,*AR1++       ;
          ;---------------------
          ldi   0x4,IE          ; Interlock with host only uses INT2
NO_START  ldi   @MSG_BOX,R0     ; Restart when START message is received
          bz    NO_START        ;
          ldi   @LOAD,R2        ; Check to see if the host requested an
          bz    main            ; AIC reinitialization
          call  AIC_INIT        ; Restart with new AIC setup
          b     main            ; Do it all over again!
;***************************************************************************
; Constants used by Log_Mag, and the software flag shared by the ADC
; interrupt routine and GETADC.
MaxL  .float  65535.0           ; Clip to 2^16 and 2^-16
MinL  .float  1/65536.0         ;

  .if  F16
VU_scale  .float  1/(N*128.0)   ; FFT data growth factor and -128 exp offset
  .else
VU_scale  .float  1.0           ; -128 exp offset
  .endif
FLAGS     .word 0               ; set to 1 by the ADC routine, cleared by GETADC and main
;- - - - - - - - - - - - - - - -
;- - - - - - - - - - - - - - - -
; GETADC - wait for the next receive interrupt and return the sample.
;   out: R0 = lower 16 bits of S0_rdata, sign extended
;             (shift left 16, arithmetic shift right 15, so the value
;              comes back multiplied by two)
;   Changes IE and clears FLAGS.
;- - - - - - - - - - - - - - - -
GETADC    ldi   0x30,IE         ; Come here and wait for ADC interrupt
          IDLE                  ; confirmation to save power and code space
          ldi   @FLAGS,R0       ; FLAGS is set by the ADC routine
          bz    GETADC          ; not set: it was another interrupt, keep waiting
          ldi   0,R0            ;
          sti   R0,@FLAGS       ; acknowledge
          ldi   @S0_rdata,R0    ; Return sign extended ADC value
          lsh   16,R0           ;
          ash   -15,R0          ;
          rets                  ;
          ;- - - - - - - - - - -
; Serial port interrupt routines (reached through the two branches in
; the "SP0VECTS" section).  Both save ST and R0 and share the exit at
; DACRET.
ADC       push  ST              ; On interrupt, set a software flag to
          push  R0              ; let the CPU know that the RINT has been
          ldi   @S0_rdata,R0    ; read the receive register (value is discarded)
          ldi   1,R0            ;
          sti   R0,@FLAGS       ; tell GETADC a sample arrived
          b     DACRET
          ;- - - - - - - - - - -
DAC       push  ST              ;
          push  R0              ;
          ldi   0,R0            ;
          sti   R0,@S0_xdata    ; always send 0 to the transmit register
DACRET    pop   R0              ;
          pop   ST              ;
          reti                  ;
;--------------------------------
;--------------------------------
; prog_AIC - send one register value to the AIC.
;   in : R0 = value to send
;   Uses R1 and clears bits 0x30 in IF.  Each 'idle' waits for an
;   interrupt before the next word is written (the caller, AIC_INIT,
;   enables only the transmit interrupt).
;--------------------------------
prog_AIC  andn  0x30,IF         ; drop pending serial port interrupt flags
          ldi   0,R1            ;
          sti   R1,@S0_xdata    ; start from a zero word
          idle                  ;
          ldi   3,R1            ; Request secondary (2ndy) XMIT
          sti   R1,@S0_xdata    ;
          idle                  ;
          sti   R0,@S0_xdata    ; Send register value
          idle                  ;
          andn  3,R1            ; R1 = 0 again
          sti   R1,@S0_xdata    ; Leave with original safe value in DXR
          rets                  ;
;======================================================;
; This section of code is called by the initialization ;
; code as well as by the main program loop.  It is     ;
; therefore assembled into the regular program RAM     ;
;======================================================;
; AIC_INIT - reset the AIC through XF0 and program it.
;   Sends, in order: 0xfffc, 0xfffc|2, C_REG, B_REG, A_REG.
;   Uses R0 and R1 (through prog_AIC); changes IE, IF and IOF.
;   Returns with IE = 0x10; callers set IE again afterwards.
AIC_INIT  LDI   0x10,IE         ; Enable XINT interrupt
          andn  0x34,IF         ; drop pending interrupt flags
AIC_reset
          ldi   0,R0            ;
          sti   R0,@S0_xdata    ; zero in the transmit register during reset
          RPTS  0x040           ; repeat the next instruction to stretch the pulse
          LDI   2,IOF           ; XF0=0 resets AIC
          rpts  0x40            ; same delay again
          LDI   6,IOF           ; XF0=1 runs AIC
          ldi   @S0_rdata,R0    ; read (and discard) the receive register
          ldi   0,R0
          sti   R0,@S0_xdata
          ;-----------------------------
          ldi   0xfffc  ,R0     ; Program the AIC to be real slow
          call  prog_AIC        ;
          ldi   0xfffc|2,R0     ;
          call  prog_AIC        ;
          ldi   @C_REG,R0       ; Setup control register
          call  prog_AIC        ;
          ldi   @B_REG,R0       ; Bump up the Fs to final rate
          call  prog_AIC        ; (smallest divisor should be last)
          ldi   @A_REG,R0       ;
          call  prog_AIC        ;
          ldi   0,R0            ; Put a safe 0 in DXR
          sti   R0,@S0_xdata    ;
          ldi   @S0_rdata,R0    ; Clear receive underrun
          rets                  ;
;*****************************************************;
; Startup stub...                                     ;
;                                                     ;
; The following section of code is used only once for ;
; initialization and can be safely overwritten by     ;
; assembling it into the stack or volatile data       ;
; storage.                                            ;
;*****************************************************;
          .start   "STUB",DATA_ARRAY
          .sect    "STUB"       ; Place this code in the data buffer
          .entry   ST_STUB      ; area as this is the first to go
;-------------------------------------------------------
; ST_STUB - one-time entry point.  Sets up both timers and the
; serial port, initialises the AIC and jumps to main.  main then
; fills the data buffer with samples, overwriting this code.
;-------------------------------------------------------
ST_STUB   ldp   T0_ctrl         ; Use kernel data page and stack
          ldi   1,R0            ; Set periods to 1
          sti   R0,@T0_prd      ;
          sti   R0,@T1_prd      ;
          ldi   0,R0            ; Halt TIM0 & TIM1
          sti   R0,@T0_ctrl     ;
          sti   R0,@T1_ctrl     ;
          sti   R0,@T0_count    ; Set counts to 0
          sti   R0,@T1_count    ;

                                ;                       1  2  3  4  5
          ldi   0x2C1,R0        ; Pulse md clock       12, 6, 4, 3, 1.4 Mhz
      ;   ldi   0x3C1,R0        ; Clock md (bi-quinry) 12, 6, 3, 2, 1   Mhz
                                ;                       0  1  2  3  4

          sti   R0,@T0_ctrl     ; start both timers with the chosen mode
          sti   R0,@T1_ctrl     ;
          ;---------------------
          ldi   @S0_xctrl_val,R0;
          sti   R0,@S0_xctrl    ; transmit control
          ldi   @S0_rctrl_val,R0;
          sti   R0,@S0_rctrl    ; receive control
          ldi   0,R0            ;
          sti   R0,@S0_xdata    ; DXR data value
          ldi   @S0_gctrl_val,R0; Setup serial port
          sti   R0,@S0_gctrl    ; global control
          ;---------------------
          call  AIC_INIT        ; Initialize the AIC
          ldi   0x30,IE         ; Service both RINT/XINT
          ldi   @S0_rdata,R0    ; read (and discard) the receive register
          b     main            ;
; Serial port register values loaded above.  They sit in the stub
; section too and are only read during startup.
;S0_gctrl_val .word  0x0E973300 ; CLKR/X active low
S0_gctrl_val  .word  0x0E970300 ; CLKR/X active high, use for higher speed DSP
S0_xctrl_val  .word  0x00000111 ;
S0_rctrl_val  .word  0x00000111 ;

;****************************************************;
; Install the XINT/RINT ISR handler directly into    ;
; the vector RAM location it will be used in         ;
;****************************************************;
          .start   "SP0VECTS",0x809FC5
          .sect    "SP0VECTS"
          B     DAC           ; XINT0 - transmit interrupt -> DAC routine
          B     ADC           ; RINT0 - receive interrupt  -> ADC routine
