mbed port of FFT routines from STM32 DSP library and Ivan Mellen's implementation. Tested on LPC2368 mbed but should work on 1768 too (original code was written for Cortex-M3)

Dependencies:   mbed

Revision:
0:90ade34a3b71
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cr4_fft_64_stm32.s	Sun Dec 13 07:14:57 2009 +0000
@@ -0,0 +1,329 @@
+;******************** (C) COPYRIGHT 2009  STMicroelectronics ********************
+;* File Name          : cr4_fft_64_stm32.s
+;* Author             : MCD Application Team
+;* Version            : V2.0.0
+;* Date               : 04/27/2009
+;* Description        : Optimized 64-point radix-4 complex FFT for Cortex-M3
+;********************************************************************************
+;* THE PRESENT FIRMWARE WHICH IS FOR GUIDANCE ONLY AIMS AT PROVIDING CUSTOMERS
+;* WITH CODING INFORMATION REGARDING THEIR PRODUCTS IN ORDER FOR THEM TO SAVE TIME.
+;* AS A RESULT, STMICROELECTRONICS SHALL NOT BE HELD LIABLE FOR ANY DIRECT,
+;* INDIRECT OR CONSEQUENTIAL DAMAGES WITH RESPECT TO ANY CLAIMS ARISING FROM THE
+;* CONTENT OF SUCH SOFTWARE AND/OR THE USE MADE BY CUSTOMERS OF THE CODING
+;* INFORMATION CONTAINED HEREIN IN CONNECTION WITH THEIR PRODUCTS.
+;*******************************************************************************/
+
+    ;THUMB
+    REQUIRE8
+    PRESERVE8
+
+    AREA |.text|, CODE, READONLY, ALIGN=4
+                
+    EXPORT cr4_fft_64_stm32      
+    EXTERN TableFFT
+
+
+pssK          RN R0
+pssOUT        RN R0
+pssX          RN R1
+pssIN         RN R1
+butternbr     RN R2
+Nbin          RN R2
+index         RN R3
+Ar            RN R4
+Ai            RN R5
+Br            RN R6
+Bi            RN R7
+Cr            RN R8
+Ci            RN R9
+Dr            RN R10
+Di            RN R11
+cntrbitrev     RN R12
+tmp           RN R12
+pssIN2        RN R14
+tmp2          RN R14
+
+NPT EQU 64
+
+;----------------------------- MACROS ----------------------------------------
+     
+         MACRO
+         DEC  $reg
+         SUB  $reg,$reg,#1
+         MEND
+
+         MACRO
+         INC  $reg
+         ADD  $reg,$reg,#1
+         MEND
+
+
+          MACRO
+         QUAD  $reg
+         MOV  $reg,$reg,LSL#2
+         MEND
+
+;sXi = *(PssX+1); sXr = *PssX; PssX += offset; PssX= R1
+
+          MACRO
+          LDR2Q  $sXr,$sXi, $PssX, $offset
+          LDRSH $sXi, [$PssX, #2]
+          LDRSH $sXr, [$PssX]
+          ADD $PssX, $PssX, $offset
+          MEND
+
+;!! Same macro, to be used when passing negative offset value !!
+        MACRO
+        LDR2Qm  $sXr, $sXi, $PssX, $offset
+        LDRSH $sXi, [$PssX, #2]
+        LDRSH $sXr, [$PssX]
+        SUB $PssX, $PssX, $offset
+        MEND
+
+;(PssX+1)= sXi;  *PssX=sXr; PssX += offset;
+        MACRO
+        STR2Q  $sXr, $sXi, $PssX, $offset
+        STRH  $sXi, [$PssX, #2]
+        STRH  $sXr, [$PssX]
+        ADD $PssX, $PssX, $offset
+        MEND
+
+; YY = Cplx_conjugate_mul(Y,K)
+;  Y = YYr + i*YYi
+; use the following trick
+;  K = (Kr-Ki) + i*Ki
+        MACRO
+        CXMUL_V7  $YYr, $YYi, $Yr, $Yi, $Kr, $Ki,$tmp,$tmp2
+        SUB  $tmp2, $Yi, $Yr         ; sYi-sYr
+        MUL  $tmp, $tmp2, $Ki        ; (sYi-sYr)*sKi
+        ADD  $tmp2, $Kr, $Ki, LSL#1  ; (sKr+sKi)
+        MLA  $YYi, $Yi, $Kr, $tmp     ; lYYi = sYi*sKr-sYr*sKi
+        MLA  $YYr, $Yr, $tmp2, $tmp   ; lYYr = sYr*sKr+sYi*sKi
+        MEND
+
+; Four point complex Fast Fourier Transform        
+        MACRO
+        CXADDA4  $s
+        ; (C,D) = (C+D, C-D)
+        ADD     Cr, Cr, Dr
+        ADD     Ci, Ci, Di
+        SUB     Dr, Cr, Dr, LSL#1
+        SUB     Di, Ci, Di, LSL#1
+        ; (A,B) = (A+(B>>s), A-(B>>s))/4
+        MOV     Ar, Ar, ASR#2
+        MOV     Ai, Ai, ASR#2
+        ADD     Ar, Ar, Br, ASR#(2+$s)
+        ADD     Ai, Ai, Bi, ASR#(2+$s)
+        SUB     Br, Ar, Br, ASR#(1+$s)
+        SUB     Bi, Ai, Bi, ASR#(1+$s)
+        ; (A,C) = (A+(C>>s)/4, A-(C>>s)/4)
+        ADD     Ar, Ar, Cr, ASR#(2+$s)
+        ADD     Ai, Ai, Ci, ASR#(2+$s)
+        SUB     Cr, Ar, Cr, ASR#(1+$s)
+        SUB     Ci, Ai, Ci, ASR#(1+$s)
+        ; (B,D) = (B-i*(D>>s)/4, B+i*(D>>s)/4)
+        ADD     Br, Br, Di, ASR#(2+$s)
+        SUB     Bi, Bi, Dr, ASR#(2+$s)
+        SUB     Di, Br, Di, ASR#(1+$s)
+        ADD     Dr, Bi, Dr, ASR#(1+$s)
+        MEND
+
+        
+        MACRO
+        BUTFLY4ZERO_OPT  $pIN,$offset, $pOUT
+        LDRSH Ai, [$pIN, #2]
+        LDRSH Ar, [$pIN],#NPT
+        LDRSH Ci, [$pIN, #2]
+        LDRSH Cr, [$pIN],#NPT
+        LDRSH Bi, [$pIN, #2]
+        LDRSH Br, [$pIN],#NPT
+        LDRSH Di, [$pIN, #2]
+        LDRSH Dr, [$pIN],#NPT
+        ; (C,D) = (C+D, C-D)
+        ADD     Cr, Cr, Dr
+        ADD     Ci, Ci, Di
+        SUB     Dr, Cr, Dr, LSL#1  ; trick
+        SUB     Di, Ci, Di, LSL#1  ;trick
+        ; (A,B) = (A+B)/4, (A-B)/4
+        MOV     Ar, Ar, ASR#2
+        MOV     Ai, Ai, ASR#2
+        ADD     Ar, Ar, Br, ASR#2
+        ADD     Ai, Ai, Bi, ASR#2
+        SUB     Br, Ar, Br, ASR#1
+        SUB     Bi, Ai, Bi, ASR#1
+        ; (A,C) = (A+C)/4, (A-C)/4
+        ADD     Ar, Ar, Cr, ASR#2
+        ADD     Ai, Ai, Ci, ASR#2
+        SUB     Cr, Ar, Cr, ASR#1
+        SUB     Ci, Ai, Ci, ASR#1
+        ; (B,D) = (B-i*D)/4, (B+i*D)/4
+        ADD     Br, Br, Di, ASR#2
+        SUB     Bi, Bi, Dr, ASR#2
+        SUB     Di, Br, Di, ASR#1
+        ADD     Dr, Bi, Dr, ASR#1
+        ;
+        STRH    Ai, [$pOUT, #2]
+        STRH    Ar, [$pOUT], #4
+        STRH    Bi, [$pOUT, #2]
+        STRH    Br, [$pOUT], #4
+        STRH    Ci, [$pOUT, #2]
+        STRH    Cr, [$pOUT], #4
+        STRH    Dr, [$pOUT, #2]  ; inversion here
+        STRH    Di, [$pOUT], #4
+        MEND
+
+        MACRO
+        BUTFLY4_V7   $pssDin,$offset,$pssDout,$qformat,$pssK
+        LDR2Qm   Ar,Ai,$pssDin, $offset;-$offset
+        LDR2Q    Dr,Di,$pssK, #4
+        ; format CXMUL_V7 YYr, YYi, Yr, Yi, Kr, Ki,tmp,tmp2
+        CXMUL_V7 Dr,Di,Ar,Ai,Dr,Di,tmp,tmp2
+        LDR2Qm   Ar,Ai,$pssDin,$offset;-$offset
+        LDR2Q    Cr,Ci,$pssK,#4
+        CXMUL_V7 Cr,Ci,Ar,Ai,Cr,Ci,tmp,tmp2
+        LDR2Qm    Ar,Ai, $pssDin, $offset;-$offset
+        LDR2Q    Br,Bi, $pssK, #4
+        CXMUL_V7  Br,Bi,Ar,Ai,Br,Bi,tmp,tmp2
+        LDR2Q    Ar,Ai, $pssDin, #0
+        CXADDA4  $qformat
+        STRH    Ai, [$pssDout, #2]
+        STRH    Ar, [$pssDout]
+        ADD     $pssDout, $pssDout, $offset
+        STRH    Bi, [$pssDout, #2]
+        STRH    Br, [$pssDout]
+        ADD     $pssDout, $pssDout, $offset
+        STRH    Ci, [$pssDout, #2]
+        STRH    Cr, [$pssDout]
+        ADD     $pssDout, $pssDout, $offset
+        STRH    Dr, [$pssDout, #2]  ; inversion here
+        STRH    Di, [$pssDout], #4
+        MEND
+
+;-------------------             CODE             --------------------------------
+;===============================================================================
+;*******************************************************************************
+;* Function Name  : cr4_fft_64_stm32
+;* Description    : complex radix-4 64 points FFT
+;* Input          : - R0 = pssOUT: Output array .
+;*                  - R1 = pssIN: Input array 
+;*                  - R2 = Nbin: =64 number of points, this optimized FFT function  
+;*                    can only convert 64 points.
+;* Output         : None 
+;* Return         : None
+;*******************************************************************************
+cr4_fft_64_stm32
+
+        STMFD   SP!, {R4-R11, LR}
+        
+        MOV cntrbitrev, #0
+        MOV index,#0
+preloop_v7
+        ADD     pssIN2, pssIN, cntrbitrev, LSR#26 ;64-pts
+        BUTFLY4ZERO_OPT pssIN2,Nbin,pssOUT
+        INC index
+     IF :DEF:TARGET_LPC1768        
+        RBIT cntrbitrev,index
+     ELSE
+        ; since we need only 16 values, use a lookup table
+        ADR cntrbitrev, BITREV16
+        LDR cntrbitrev, [cntrbitrev, index, LSL #2]
+     ENDIF   
+        CMP index,#16  ;64-pts
+        BNE  preloop_v7
+
+
+        SUB     pssX, pssOUT, Nbin, LSL#2
+        MOV     index, #16
+        MOVS    butternbr, Nbin, LSR#4   ;dual use of register 
+        
+;------------------------------------------------------------------------------
+;   The FFT coefficients table can be stored into Flash or RAM. 
+;   The following two lines of code allow selecting the method for coefficients 
+;   storage. 
+;   In the case of choosing coefficients in RAM, you have to:
+;   1. Include the file table_fft.h, which is a part of the DSP library, 
+;      in your main file.
+;   2. Decomment the line LDR.W pssK, =TableFFT and comment the line 
+;      ADRL    pssK, TableFFT_V7
+;   3. Comment all the TableFFT_V7 data.
+;------------------------------------------------------------------------------
+        ADR    pssK, TableFFT_V7    ; Coeff in Flash 
+        ;LDR.W pssK, =TableFFT      ; Coeff in RAM 
+
+;................................
+passloop_v7
+        STMFD   SP!, {pssX,butternbr}
+        ADD     tmp, index, index, LSL#1
+        ADD     pssX, pssX, tmp
+        SUB     butternbr, butternbr, #1<<16
+;................
+grouploop_v7
+        ADD     butternbr,butternbr,index,LSL#(16-2)
+;.......
+butterloop_v7
+        BUTFLY4_V7  pssX,index,pssX,14,pssK
+        SUBS        butternbr,butternbr, #1<<16
+        BGE     butterloop_v7
+;.......
+        ADD     tmp, index, index, LSL#1
+        ADD     pssX, pssX, tmp
+        DEC     butternbr
+        MOVS    tmp2, butternbr, LSL#16
+        IT      NE
+        SUBNE   pssK, pssK, tmp
+        BNE     grouploop_v7
+;................
+        LDMFD   sp!, {pssX, butternbr}
+        QUAD    index
+        MOVS    butternbr, butternbr, LSR#2    ; loop nbr /= radix 
+        BNE     passloop_v7
+;................................
+       LDMFD   SP!, {R4-R11, PC}
+
+;=============================================================================
+
+TableFFT_V7
+         ;N=16
+        DCW 0x4000,0x0000, 0x4000,0x0000, 0x4000,0x0000
+        DCW 0xdd5d,0x3b21, 0x22a3,0x187e, 0x0000,0x2d41
+        DCW 0xa57e,0x2d41, 0x0000,0x2d41, 0xc000,0x4000
+        DCW 0xdd5d,0xe782, 0xdd5d,0x3b21, 0xa57e,0x2d41
+        ; N=64
+        DCW 0x4000,0x0000, 0x4000,0x0000, 0x4000,0x0000
+        DCW 0x2aaa,0x1294, 0x396b,0x0646, 0x3249,0x0c7c
+        DCW 0x11a8,0x238e, 0x3249,0x0c7c, 0x22a3,0x187e
+        DCW 0xf721,0x3179, 0x2aaa,0x1294, 0x11a8,0x238e
+        DCW 0xdd5d,0x3b21, 0x22a3,0x187e, 0x0000,0x2d41
+        DCW 0xc695,0x3fb1, 0x1a46,0x1e2b, 0xee58,0x3537
+        DCW 0xb4be,0x3ec5, 0x11a8,0x238e, 0xdd5d,0x3b21
+        DCW 0xa963,0x3871, 0x08df,0x289a, 0xcdb7,0x3ec5
+        DCW 0xa57e,0x2d41, 0x0000,0x2d41, 0xc000,0x4000
+        DCW 0xa963,0x1e2b, 0xf721,0x3179, 0xb4be,0x3ec5
+        DCW 0xb4be,0x0c7c, 0xee58,0x3537, 0xac61,0x3b21
+        DCW 0xc695,0xf9ba, 0xe5ba,0x3871, 0xa73b,0x3537
+        DCW 0xdd5d,0xe782, 0xdd5d,0x3b21, 0xa57e,0x2d41
+        DCW 0xf721,0xd766, 0xd556,0x3d3f, 0xa73b,0x238e
+        DCW 0x11a8,0xcac9, 0xcdb7,0x3ec5, 0xac61,0x187e
+        DCW 0x2aaa,0xc2c1, 0xc695,0x3fb1, 0xb4be,0x0c7c
+
+; bit reversal table values from 0 to 16
+BITREV16
+        DCD 2_0000 << 28
+        DCD 2_1000 << 28
+        DCD 2_0100 << 28
+        DCD 2_1100 << 28
+        DCD 2_0010 << 28
+        DCD 2_1010 << 28
+        DCD 2_0110 << 28
+        DCD 2_1110 << 28
+        DCD 2_0001 << 28
+        DCD 2_1001 << 28
+        DCD 2_0101 << 28
+        DCD 2_1101 << 28
+        DCD 2_0011 << 28
+        DCD 2_1011 << 28
+        DCD 2_0111 << 28
+        DCD 2_1111 << 28
+        DCD 2_0000 << 28
+       END
+;******************* (C) COPYRIGHT 2009  STMicroelectronics *****END OF FILE****