CMSIS DSP library

Dependents:   performance_timer Surfboard_ gps2rtty Capstone ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_cfft_radix4_q31.c Source File

arm_cfft_radix4_q31.c

00001 /* ----------------------------------------------------------------------    
00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
00003 *    
00004 * $Date:        19. March 2015 
00005 * $Revision:    V.1.4.5  
00006 *    
00007 * Project:      CMSIS DSP Library    
00008 * Title:        arm_cfft_radix4_q31.c    
00009 *    
00010 * Description:  This file has function definition of Radix-4 FFT & IFFT function and    
00011 *               In-place bit reversal using bit reversal table    
00012 *    
00013 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
00014 *  
00015 * Redistribution and use in source and binary forms, with or without 
00016 * modification, are permitted provided that the following conditions
00017 * are met:
00018 *   - Redistributions of source code must retain the above copyright
00019 *     notice, this list of conditions and the following disclaimer.
00020 *   - Redistributions in binary form must reproduce the above copyright
00021 *     notice, this list of conditions and the following disclaimer in
00022 *     the documentation and/or other materials provided with the 
00023 *     distribution.
00024 *   - Neither the name of ARM LIMITED nor the names of its contributors
00025 *     may be used to endorse or promote products derived from this
00026 *     software without specific prior written permission.
00027 *
00028 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00029 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00030 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00031 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
00032 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00033 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00034 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00035 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00036 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00037 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00038 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00039 * POSSIBILITY OF SUCH DAMAGE.     
00040 * -------------------------------------------------------------------- */
00041 
00042 #include "arm_math.h"
00043 
00044 void arm_radix4_butterfly_inverse_q31(
00045 q31_t * pSrc,
00046 uint32_t fftLen,
00047 q31_t * pCoef,
00048 uint32_t twidCoefModifier);
00049 
00050 void arm_radix4_butterfly_q31(
00051 q31_t * pSrc,
00052 uint32_t fftLen,
00053 q31_t * pCoef,
00054 uint32_t twidCoefModifier);
00055 
00056 void arm_bitreversal_q31(
00057 q31_t * pSrc,
00058 uint32_t fftLen,
00059 uint16_t bitRevFactor,
00060 uint16_t * pBitRevTab);
00061 
00062 /**    
00063  * @ingroup groupTransforms    
00064  */
00065 
00066 /**    
00067  * @addtogroup ComplexFFT    
00068  * @{    
00069  */
00070 
00071 /**    
00072  * @details    
00073  * @brief Processing function for the Q31 CFFT/CIFFT.    
00074  * @deprecated Do not use this function.  It has been superseded by \ref arm_cfft_q31 and will be removed
00075  * @param[in]      *S    points to an instance of the Q31 CFFT/CIFFT structure.   
00076  * @param[in, out] *pSrc points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place.   
00077  * @return none.    
00078  *     
00079  * \par Input and output formats:    
00080  * \par    
00081  * Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.   
00082  * Hence the output format is different for different FFT sizes.    
00083  * The input and output formats for different FFT sizes and number of bits to upscale are mentioned in the tables below for CFFT and CIFFT:   
00084  * \par   
00085  * \image html CFFTQ31.gif "Input and Output Formats for Q31 CFFT"    
00086  * \image html CIFFTQ31.gif "Input and Output Formats for Q31 CIFFT"    
00087  *    
00088  */
00089 
00090 void arm_cfft_radix4_q31(
00091   const arm_cfft_radix4_instance_q31 * S,
00092   q31_t * pSrc)
00093 {
00094   if(S->ifftFlag == 1u)
00095   {
00096     /* Complex IFFT radix-4 */
00097     arm_radix4_butterfly_inverse_q31(pSrc, S->fftLen, S->pTwiddle,
00098                                      S->twidCoefModifier);
00099   }
00100   else
00101   {
00102     /* Complex FFT radix-4 */
00103     arm_radix4_butterfly_q31(pSrc, S->fftLen, S->pTwiddle,
00104                              S->twidCoefModifier);
00105   }
00106 
00107 
00108   if(S->bitReverseFlag == 1u)
00109   {
00110     /*  Bit Reversal */
00111     arm_bitreversal_q31(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
00112   }
00113 
00114 }
00115 
00116 /**    
00117  * @} end of ComplexFFT group    
00118  */
00119 
00120 /*    
00121 * Radix-4 FFT algorithm used is :    
00122 *    
00123 * Input real and imaginary data:    
00124 * x(n) = xa + j * ya    
00125 * x(n+N/4 ) = xb + j * yb    
00126 * x(n+N/2 ) = xc + j * yc    
00127 * x(n+3N 4) = xd + j * yd    
00128 *    
00129 *    
00130 * Output real and imaginary data:    
00131 * x(4r) = xa'+ j * ya'    
00132 * x(4r+1) = xb'+ j * yb'    
00133 * x(4r+2) = xc'+ j * yc'    
00134 * x(4r+3) = xd'+ j * yd'    
00135 *    
00136 *    
00137 * Twiddle factors for radix-4 FFT:    
00138 * Wn = co1 + j * (- si1)    
00139 * W2n = co2 + j * (- si2)    
00140 * W3n = co3 + j * (- si3)    
00141 *    
00142 *  Butterfly implementation:    
00143 * xa' = xa + xb + xc + xd    
00144 * ya' = ya + yb + yc + yd    
00145 * xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1)    
00146 * yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1)    
00147 * xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2)    
00148 * yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2)    
00149 * xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3)    
00150 * yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3)    
00151 *    
00152 */
00153 
00154 /**    
00155  * @brief  Core function for the Q31 CFFT butterfly process.   
00156  * @param[in, out] *pSrc            points to the in-place buffer of Q31 data type.   
00157  * @param[in]      fftLen           length of the FFT.   
00158  * @param[in]      *pCoef           points to twiddle coefficient buffer.   
00159  * @param[in]      twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.   
00160  * @return none.   
00161  */
00162 
00163 void arm_radix4_butterfly_q31(
00164   q31_t * pSrc,
00165   uint32_t fftLen,
00166   q31_t * pCoef,
00167   uint32_t twidCoefModifier)
00168 {
00169 #if defined(ARM_MATH_CM7)
00170   uint32_t n1, n2, ia1, ia2, ia3, i0, i1, i2, i3, j, k;
00171   q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
00172 
00173   q31_t xa, xb, xc, xd;
00174   q31_t ya, yb, yc, yd;
00175   q31_t xa_out, xb_out, xc_out, xd_out;
00176   q31_t ya_out, yb_out, yc_out, yd_out;
00177 
00178   q31_t *ptr1;
00179   q63_t xaya, xbyb, xcyc, xdyd;
00180   /* Total process is divided into three stages */
00181 
00182   /* process first stage, middle stages, & last stage */
00183 
00184 
00185   /* start of first stage process */
00186 
00187   /*  Initializations for the first stage */
00188   n2 = fftLen;
00189   n1 = n2;
00190   /* n2 = fftLen/4 */
00191   n2 >>= 2u;
00192   i0 = 0u;
00193   ia1 = 0u;
00194 
00195   j = n2;
00196 
00197   /*  Calculation of first stage */
00198   do
00199   {
00200     /*  index calculation for the input as, */
00201     /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2u], pSrc[i0 + 3fftLen/4] */
00202     i1 = i0 + n2;
00203     i2 = i1 + n2;
00204     i3 = i2 + n2;
00205 
00206     /* input is in 1.31(q31) format and provide 4 guard bits for the input */
00207 
00208     /*  Butterfly implementation */
00209     /* xa + xc */
00210     r1 = (pSrc[(2u * i0)] >> 4u) + (pSrc[(2u * i2)] >> 4u);
00211     /* xa - xc */
00212     r2 = (pSrc[2u * i0] >> 4u) - (pSrc[2u * i2] >> 4u);
00213 
00214     /* xb + xd */
00215     t1 = (pSrc[2u * i1] >> 4u) + (pSrc[2u * i3] >> 4u);
00216 
00217     /* ya + yc */
00218     s1 = (pSrc[(2u * i0) + 1u] >> 4u) + (pSrc[(2u * i2) + 1u] >> 4u);
00219     /* ya - yc */
00220     s2 = (pSrc[(2u * i0) + 1u] >> 4u) - (pSrc[(2u * i2) + 1u] >> 4u);
00221 
00222     /* xa' = xa + xb + xc + xd */
00223     pSrc[2u * i0] = (r1 + t1);
00224     /* (xa + xc) - (xb + xd) */
00225     r1 = r1 - t1;
00226     /* yb + yd */
00227     t2 = (pSrc[(2u * i1) + 1u] >> 4u) + (pSrc[(2u * i3) + 1u] >> 4u);
00228 
00229     /* ya' = ya + yb + yc + yd */
00230     pSrc[(2u * i0) + 1u] = (s1 + t2);
00231 
00232     /* (ya + yc) - (yb + yd) */
00233     s1 = s1 - t2;
00234 
00235     /* yb - yd */
00236     t1 = (pSrc[(2u * i1) + 1u] >> 4u) - (pSrc[(2u * i3) + 1u] >> 4u);
00237     /* xb - xd */
00238     t2 = (pSrc[2u * i1] >> 4u) - (pSrc[2u * i3] >> 4u);
00239 
00240     /*  index calculation for the coefficients */
00241     ia2 = 2u * ia1;
00242     co2 = pCoef[ia2 * 2u];
00243     si2 = pCoef[(ia2 * 2u) + 1u];
00244 
00245     /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
00246     pSrc[2u * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
00247                      ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1u;
00248 
00249     /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
00250     pSrc[(2u * i1) + 1u] = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
00251                             ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1u;
00252 
00253     /* (xa - xc) + (yb - yd) */
00254     r1 = r2 + t1;
00255     /* (xa - xc) - (yb - yd) */
00256     r2 = r2 - t1;
00257 
00258     /* (ya - yc) - (xb - xd) */
00259     s1 = s2 - t2;
00260     /* (ya - yc) + (xb - xd) */
00261     s2 = s2 + t2;
00262 
00263     co1 = pCoef[ia1 * 2u];
00264     si1 = pCoef[(ia1 * 2u) + 1u];
00265 
00266     /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
00267     pSrc[2u * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
00268                      ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1u;
00269 
00270     /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
00271     pSrc[(2u * i2) + 1u] = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
00272                             ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1u;
00273 
00274     /*  index calculation for the coefficients */
00275     ia3 = 3u * ia1;
00276     co3 = pCoef[ia3 * 2u];
00277     si3 = pCoef[(ia3 * 2u) + 1u];
00278 
00279     /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
00280     pSrc[2u * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
00281                      ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1u;
00282 
00283     /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
00284     pSrc[(2u * i3) + 1u] = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
00285                             ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1u;
00286 
00287     /*  Twiddle coefficients index modifier */
00288     ia1 = ia1 + twidCoefModifier;
00289 
00290     /*  Updating input index */
00291     i0 = i0 + 1u;
00292 
00293   } while(--j);
00294 
00295   /* end of first stage process */
00296 
00297   /* data is in 5.27(q27) format */
00298 
00299 
00300   /* start of Middle stages process */
00301 
00302 
00303   /* each stage in middle stages provides two down scaling of the input */
00304 
00305   twidCoefModifier <<= 2u;
00306 
00307 
00308   for (k = fftLen / 4u; k > 4u; k >>= 2u)
00309   {
00310     /*  Initializations for the first stage */
00311     n1 = n2;
00312     n2 >>= 2u;
00313     ia1 = 0u;
00314 
00315     /*  Calculation of first stage */
00316     for (j = 0u; j <= (n2 - 1u); j++)
00317     {
00318       /*  index calculation for the coefficients */
00319       ia2 = ia1 + ia1;
00320       ia3 = ia2 + ia1;
00321       co1 = pCoef[ia1 * 2u];
00322       si1 = pCoef[(ia1 * 2u) + 1u];
00323       co2 = pCoef[ia2 * 2u];
00324       si2 = pCoef[(ia2 * 2u) + 1u];
00325       co3 = pCoef[ia3 * 2u];
00326       si3 = pCoef[(ia3 * 2u) + 1u];
00327       /*  Twiddle coefficients index modifier */
00328       ia1 = ia1 + twidCoefModifier;
00329 
00330       for (i0 = j; i0 < fftLen; i0 += n1)
00331       {
00332         /*  index calculation for the input as, */
00333         /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2u], pSrc[i0 + 3fftLen/4] */
00334         i1 = i0 + n2;
00335         i2 = i1 + n2;
00336         i3 = i2 + n2;
00337 
00338         /*  Butterfly implementation */
00339         /* xa + xc */
00340         r1 = pSrc[2u * i0] + pSrc[2u * i2];
00341         /* xa - xc */
00342         r2 = pSrc[2u * i0] - pSrc[2u * i2];
00343 
00344         /* ya + yc */
00345         s1 = pSrc[(2u * i0) + 1u] + pSrc[(2u * i2) + 1u];
00346         /* ya - yc */
00347         s2 = pSrc[(2u * i0) + 1u] - pSrc[(2u * i2) + 1u];
00348 
00349         /* xb + xd */
00350         t1 = pSrc[2u * i1] + pSrc[2u * i3];
00351 
00352         /* xa' = xa + xb + xc + xd */
00353         pSrc[2u * i0] = (r1 + t1) >> 2u;
00354         /* xa + xc -(xb + xd) */
00355         r1 = r1 - t1;
00356 
00357         /* yb + yd */
00358         t2 = pSrc[(2u * i1) + 1u] + pSrc[(2u * i3) + 1u];
00359         /* ya' = ya + yb + yc + yd */
00360         pSrc[(2u * i0) + 1u] = (s1 + t2) >> 2u;
00361 
00362         /* (ya + yc) - (yb + yd) */
00363         s1 = s1 - t2;
00364 
00365         /* (yb - yd) */
00366         t1 = pSrc[(2u * i1) + 1u] - pSrc[(2u * i3) + 1u];
00367         /* (xb - xd) */
00368         t2 = pSrc[2u * i1] - pSrc[2u * i3];
00369 
00370         /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
00371         pSrc[2u * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
00372                          ((int32_t) (((q63_t) s1 * si2) >> 32))) >> 1u;
00373 
00374         /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
00375         pSrc[(2u * i1) + 1u] = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
00376                                 ((int32_t) (((q63_t) r1 * si2) >> 32))) >> 1u;
00377 
00378         /* (xa - xc) + (yb - yd) */
00379         r1 = r2 + t1;
00380         /* (xa - xc) - (yb - yd) */
00381         r2 = r2 - t1;
00382 
00383         /* (ya - yc) -  (xb - xd) */
00384         s1 = s2 - t2;
00385         /* (ya - yc) +  (xb - xd) */
00386         s2 = s2 + t2;
00387 
00388         /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
00389         pSrc[2u * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
00390                          ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1u;
00391 
00392         /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
00393         pSrc[(2u * i2) + 1u] = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
00394                                 ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1u;
00395 
00396         /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
00397         pSrc[2u * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
00398                          ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1u;
00399 
00400         /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
00401         pSrc[(2u * i3) + 1u] = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
00402                                 ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1u;
00403       }
00404     }
00405     twidCoefModifier <<= 2u;
00406   }
00407 #else
00408   uint32_t n1, n2, ia1, ia2, ia3, i0, j, k;
00409   q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
00410 
00411   q31_t xa, xb, xc, xd;
00412   q31_t ya, yb, yc, yd;
00413   q31_t xa_out, xb_out, xc_out, xd_out;
00414   q31_t ya_out, yb_out, yc_out, yd_out;
00415 
00416   q31_t *ptr1;
00417   q31_t *pSi0;
00418   q31_t *pSi1;
00419   q31_t *pSi2;
00420   q31_t *pSi3;
00421   q63_t xaya, xbyb, xcyc, xdyd;
00422   /* Total process is divided into three stages */
00423 
00424   /* process first stage, middle stages, & last stage */
00425 
00426 
00427   /* start of first stage process */
00428 
00429   /*  Initializations for the first stage */
00430   n2 = fftLen;
00431   n1 = n2;
00432   /* n2 = fftLen/4 */
00433   n2 >>= 2u;
00434 
00435   ia1 = 0u;
00436 
00437   j = n2;
00438   
00439   pSi0 = pSrc;
00440   pSi1 = pSi0 + 2 * n2;
00441   pSi2 = pSi1 + 2 * n2;
00442   pSi3 = pSi2 + 2 * n2;
00443 
00444   /*  Calculation of first stage */
00445   do
00446   {
00447     /* input is in 1.31(q31) format and provide 4 guard bits for the input */
00448 
00449     /*  Butterfly implementation */
00450     /* xa + xc */
00451     r1 = (pSi0[0] >> 4u) + (pSi2[0] >> 4u);
00452     /* xa - xc */
00453     r2 = (pSi0[0] >> 4u) - (pSi2[0] >> 4u);
00454 
00455     /* xb + xd */
00456     t1 = (pSi1[0] >> 4u) + (pSi3[0] >> 4u);
00457 
00458     /* ya + yc */
00459     s1 = (pSi0[1] >> 4u) + (pSi2[1] >> 4u);
00460     /* ya - yc */
00461     s2 = (pSi0[1] >> 4u) - (pSi2[1] >> 4u);
00462 
00463     /* xa' = xa + xb + xc + xd */
00464     *pSi0++ = (r1 + t1);
00465     /* (xa + xc) - (xb + xd) */
00466     r1 = r1 - t1;
00467     /* yb + yd */
00468     t2 = (pSi1[1] >> 4u) + (pSi3[1] >> 4u);
00469 
00470     /* ya' = ya + yb + yc + yd */
00471     *pSi0++ = (s1 + t2);
00472 
00473     /* (ya + yc) - (yb + yd) */
00474     s1 = s1 - t2;
00475 
00476     /* yb - yd */
00477     t1 = (pSi1[1] >> 4u) - (pSi3[1] >> 4u);
00478     /* xb - xd */
00479     t2 = (pSi1[0] >> 4u) - (pSi3[0] >> 4u);
00480 
00481     /*  index calculation for the coefficients */
00482     ia2 = 2u * ia1;
00483     co2 = pCoef[ia2 * 2u];
00484     si2 = pCoef[(ia2 * 2u) + 1u];
00485 
00486     /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
00487     *pSi1++ = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
00488                      ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1u;
00489 
00490     /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
00491     *pSi1++ = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
00492                             ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1u;
00493 
00494     /* (xa - xc) + (yb - yd) */
00495     r1 = r2 + t1;
00496     /* (xa - xc) - (yb - yd) */
00497     r2 = r2 - t1;
00498 
00499     /* (ya - yc) - (xb - xd) */
00500     s1 = s2 - t2;
00501     /* (ya - yc) + (xb - xd) */
00502     s2 = s2 + t2;
00503 
00504     co1 = pCoef[ia1 * 2u];
00505     si1 = pCoef[(ia1 * 2u) + 1u];
00506 
00507     /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
00508     *pSi2++ = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
00509                      ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1u;
00510 
00511     /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
00512     *pSi2++ = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
00513                             ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1u;
00514 
00515     /*  index calculation for the coefficients */
00516     ia3 = 3u * ia1;
00517     co3 = pCoef[ia3 * 2u];
00518     si3 = pCoef[(ia3 * 2u) + 1u];
00519 
00520     /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
00521     *pSi3++ = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
00522                      ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1u;
00523 
00524     /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
00525     *pSi3++ = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
00526                             ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1u;
00527 
00528     /*  Twiddle coefficients index modifier */
00529     ia1 = ia1 + twidCoefModifier;
00530 
00531   } while(--j);
00532 
00533   /* end of first stage process */
00534 
00535   /* data is in 5.27(q27) format */
00536 
00537 
00538   /* start of Middle stages process */
00539 
00540 
00541   /* each stage in middle stages provides two down scaling of the input */
00542 
00543   twidCoefModifier <<= 2u;
00544 
00545 
00546   for (k = fftLen / 4u; k > 4u; k >>= 2u)
00547   {
00548     /*  Initializations for the first stage */
00549     n1 = n2;
00550     n2 >>= 2u;
00551     ia1 = 0u;
00552 
00553     /*  Calculation of first stage */
00554     for (j = 0u; j <= (n2 - 1u); j++)
00555     {
00556       /*  index calculation for the coefficients */
00557       ia2 = ia1 + ia1;
00558       ia3 = ia2 + ia1;
00559       co1 = pCoef[ia1 * 2u];
00560       si1 = pCoef[(ia1 * 2u) + 1u];
00561       co2 = pCoef[ia2 * 2u];
00562       si2 = pCoef[(ia2 * 2u) + 1u];
00563       co3 = pCoef[ia3 * 2u];
00564       si3 = pCoef[(ia3 * 2u) + 1u];
00565       /*  Twiddle coefficients index modifier */
00566       ia1 = ia1 + twidCoefModifier;
00567       
00568       pSi0 = pSrc + 2 * j;
00569       pSi1 = pSi0 + 2 * n2;
00570       pSi2 = pSi1 + 2 * n2;
00571       pSi3 = pSi2 + 2 * n2;
00572 
00573       for (i0 = j; i0 < fftLen; i0 += n1)
00574       {
00575         /*  Butterfly implementation */
00576         /* xa + xc */
00577         r1 = pSi0[0] + pSi2[0];
00578 
00579         /* xa - xc */
00580         r2 = pSi0[0] - pSi2[0];
00581 
00582 
00583         /* ya + yc */
00584         s1 = pSi0[1] + pSi2[1];
00585 
00586         /* ya - yc */
00587         s2 = pSi0[1] - pSi2[1];
00588 
00589 
00590         /* xb + xd */
00591         t1 = pSi1[0] + pSi3[0];
00592 
00593 
00594         /* xa' = xa + xb + xc + xd */
00595         pSi0[0] = (r1 + t1) >> 2u;
00596         /* xa + xc -(xb + xd) */
00597         r1 = r1 - t1;
00598 
00599         /* yb + yd */
00600         t2 = pSi1[1] + pSi3[1];
00601 
00602         /* ya' = ya + yb + yc + yd */
00603         pSi0[1] = (s1 + t2) >> 2u;
00604         pSi0 += 2 * n1;
00605 
00606         /* (ya + yc) - (yb + yd) */
00607         s1 = s1 - t2;
00608 
00609         /* (yb - yd) */
00610         t1 = pSi1[1] - pSi3[1];
00611 
00612         /* (xb - xd) */
00613         t2 = pSi1[0] - pSi3[0];
00614 
00615 
00616         /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
00617         pSi1[0] = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
00618                          ((int32_t) (((q63_t) s1 * si2) >> 32))) >> 1u;
00619 
00620         /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
00621         pSi1[1] = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
00622                                 ((int32_t) (((q63_t) r1 * si2) >> 32))) >> 1u;
00623         pSi1 += 2 * n1;
00624 
00625         /* (xa - xc) + (yb - yd) */
00626         r1 = r2 + t1;
00627         /* (xa - xc) - (yb - yd) */
00628         r2 = r2 - t1;
00629 
00630         /* (ya - yc) -  (xb - xd) */
00631         s1 = s2 - t2;
00632         /* (ya - yc) +  (xb - xd) */
00633         s2 = s2 + t2;
00634 
00635         /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
00636         pSi2[0] = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
00637                          ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1u;
00638 
00639         /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
00640         pSi2[1] = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
00641                                 ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1u;
00642         pSi2 += 2 * n1;
00643 
00644         /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
00645         pSi3[0] = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
00646                          ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1u;
00647 
00648         /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
00649         pSi3[1] = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
00650                                 ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1u;
00651         pSi3 += 2 * n1;
00652       }
00653     }
00654     twidCoefModifier <<= 2u;
00655   }
00656 #endif
00657 
00658   /* End of Middle stages process */
00659 
00660   /* data is in 11.21(q21) format for the 1024 point as there are 3 middle stages */
00661   /* data is in 9.23(q23) format for the 256 point as there are 2 middle stages */
00662   /* data is in 7.25(q25) format for the 64 point as there are 1 middle stage */
00663   /* data is in 5.27(q27) format for the 16 point as there are no middle stages */
00664 
00665 
00666   /* start of Last stage process */
00667   /*  Initializations for the last stage */
00668   j = fftLen >> 2;
00669   ptr1 = &pSrc[0];
00670 
00671   /*  Calculations of last stage */
00672   do
00673   {
00674 
00675 #ifndef ARM_MATH_BIG_ENDIAN
00676 
00677     /* Read xa (real), ya(imag) input */
00678     xaya = *__SIMD64(ptr1)++;
00679     xa = (q31_t) xaya;
00680     ya = (q31_t) (xaya >> 32);
00681 
00682     /* Read xb (real), yb(imag) input */
00683     xbyb = *__SIMD64(ptr1)++;
00684     xb = (q31_t) xbyb;
00685     yb = (q31_t) (xbyb >> 32);
00686 
00687     /* Read xc (real), yc(imag) input */
00688     xcyc = *__SIMD64(ptr1)++;
00689     xc = (q31_t) xcyc;
00690     yc = (q31_t) (xcyc >> 32);
00691 
00692     /* Read xc (real), yc(imag) input */
00693     xdyd = *__SIMD64(ptr1)++;
00694     xd = (q31_t) xdyd;
00695     yd = (q31_t) (xdyd >> 32);
00696 
00697 #else
00698 
00699     /* Read xa (real), ya(imag) input */
00700     xaya = *__SIMD64(ptr1)++;
00701     ya = (q31_t) xaya;
00702     xa = (q31_t) (xaya >> 32);
00703 
00704     /* Read xb (real), yb(imag) input */
00705     xbyb = *__SIMD64(ptr1)++;
00706     yb = (q31_t) xbyb;
00707     xb = (q31_t) (xbyb >> 32);
00708 
00709     /* Read xc (real), yc(imag) input */
00710     xcyc = *__SIMD64(ptr1)++;
00711     yc = (q31_t) xcyc;
00712     xc = (q31_t) (xcyc >> 32);
00713 
00714     /* Read xc (real), yc(imag) input */
00715     xdyd = *__SIMD64(ptr1)++;
00716     yd = (q31_t) xdyd;
00717     xd = (q31_t) (xdyd >> 32);
00718 
00719 
00720 #endif
00721 
00722     /* xa' = xa + xb + xc + xd */
00723     xa_out = xa + xb + xc + xd;
00724 
00725     /* ya' = ya + yb + yc + yd */
00726     ya_out = ya + yb + yc + yd;
00727 
00728     /* pointer updation for writing */
00729     ptr1 = ptr1 - 8u;
00730 
00731     /* writing xa' and ya' */
00732     *ptr1++ = xa_out;
00733     *ptr1++ = ya_out;
00734 
00735     xc_out = (xa - xb + xc - xd);
00736     yc_out = (ya - yb + yc - yd);
00737 
00738     /* writing xc' and yc' */
00739     *ptr1++ = xc_out;
00740     *ptr1++ = yc_out;
00741 
00742     xb_out = (xa + yb - xc - yd);
00743     yb_out = (ya - xb - yc + xd);
00744 
00745     /* writing xb' and yb' */
00746     *ptr1++ = xb_out;
00747     *ptr1++ = yb_out;
00748 
00749     xd_out = (xa - yb - xc + yd);
00750     yd_out = (ya + xb - yc - xd);
00751 
00752     /* writing xd' and yd' */
00753     *ptr1++ = xd_out;
00754     *ptr1++ = yd_out;
00755 
00756 
00757   } while(--j);
00758 
00759   /* output is in 11.21(q21) format for the 1024 point */
00760   /* output is in 9.23(q23) format for the 256 point */
00761   /* output is in 7.25(q25) format for the 64 point */
00762   /* output is in 5.27(q27) format for the 16 point */
00763 
00764   /* End of last stage process */
00765 
00766 }
00767 
00768 
00769 /**    
00770  * @brief  Core function for the Q31 CIFFT butterfly process.   
00771  * @param[in, out] *pSrc            points to the in-place buffer of Q31 data type.   
00772  * @param[in]      fftLen           length of the FFT.   
00773  * @param[in]      *pCoef           points to twiddle coefficient buffer.   
00774  * @param[in]      twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.   
00775  * @return none.   
00776  */
00777 
00778 
00779 /*    
00780 * Radix-4 IFFT algorithm used is :    
00781 *    
00782 * CIFFT uses same twiddle coefficients as CFFT Function    
00783 *  x[k] = x[n] + (j)k * x[n + fftLen/4] + (-1)k * x[n+fftLen/2] + (-j)k * x[n+3*fftLen/4]    
00784 *    
00785 *    
00786 * IFFT is implemented with following changes in equations from FFT    
00787 *    
00788 * Input real and imaginary data:    
00789 * x(n) = xa + j * ya    
00790 * x(n+N/4 ) = xb + j * yb    
00791 * x(n+N/2 ) = xc + j * yc    
00792 * x(n+3N 4) = xd + j * yd    
00793 *    
00794 *    
00795 * Output real and imaginary data:    
00796 * x(4r) = xa'+ j * ya'    
00797 * x(4r+1) = xb'+ j * yb'    
00798 * x(4r+2) = xc'+ j * yc'    
00799 * x(4r+3) = xd'+ j * yd'    
00800 *    
00801 *    
00802 * Twiddle factors for radix-4 IFFT:    
00803 * Wn = co1 + j * (si1)    
00804 * W2n = co2 + j * (si2)    
00805 * W3n = co3 + j * (si3)    
00806     
00807 * The real and imaginary output values for the radix-4 butterfly are    
00808 * xa' = xa + xb + xc + xd    
00809 * ya' = ya + yb + yc + yd    
00810 * xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)    
00811 * yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)    
00812 * xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)    
00813 * yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)    
00814 * xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)    
00815 * yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)    
00816 *    
00817 */
00818 
00819 void arm_radix4_butterfly_inverse_q31(
00820   q31_t * pSrc,
00821   uint32_t fftLen,
00822   q31_t * pCoef,
00823   uint32_t twidCoefModifier)
00824 {
00825 #if defined(ARM_MATH_CM7)
00826   uint32_t n1, n2, ia1, ia2, ia3, i0, i1, i2, i3, j, k;
00827   q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
00828   q31_t xa, xb, xc, xd;
00829   q31_t ya, yb, yc, yd;
00830   q31_t xa_out, xb_out, xc_out, xd_out;
00831   q31_t ya_out, yb_out, yc_out, yd_out;
00832 
00833   q31_t *ptr1;
00834   q63_t xaya, xbyb, xcyc, xdyd;
00835 
00836   /* input is be 1.31(q31) format for all FFT sizes */
00837   /* Total process is divided into three stages */
00838   /* process first stage, middle stages, & last stage */
00839 
00840   /* Start of first stage process */
00841 
00842   /* Initializations for the first stage */
00843   n2 = fftLen;
00844   n1 = n2;
00845   /* n2 = fftLen/4 */
00846   n2 >>= 2u;
00847   i0 = 0u;
00848   ia1 = 0u;
00849 
00850   j = n2;
00851 
00852   do
00853   {
00854 
00855     /* input is in 1.31(q31) format and provide 4 guard bits for the input */
00856 
00857     /*  index calculation for the input as, */
00858     /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2u], pSrc[i0 + 3fftLen/4] */
00859     i1 = i0 + n2;
00860     i2 = i1 + n2;
00861     i3 = i2 + n2;
00862 
00863     /*  Butterfly implementation */
00864     /* xa + xc */
00865     r1 = (pSrc[2u * i0] >> 4u) + (pSrc[2u * i2] >> 4u);
00866     /* xa - xc */
00867     r2 = (pSrc[2u * i0] >> 4u) - (pSrc[2u * i2] >> 4u);
00868 
00869     /* xb + xd */
00870     t1 = (pSrc[2u * i1] >> 4u) + (pSrc[2u * i3] >> 4u);
00871 
00872     /* ya + yc */
00873     s1 = (pSrc[(2u * i0) + 1u] >> 4u) + (pSrc[(2u * i2) + 1u] >> 4u);
00874     /* ya - yc */
00875     s2 = (pSrc[(2u * i0) + 1u] >> 4u) - (pSrc[(2u * i2) + 1u] >> 4u);
00876 
00877     /* xa' = xa + xb + xc + xd */
00878     pSrc[2u * i0] = (r1 + t1);
00879     /* (xa + xc) - (xb + xd) */
00880     r1 = r1 - t1;
00881     /* yb + yd */
00882     t2 = (pSrc[(2u * i1) + 1u] >> 4u) + (pSrc[(2u * i3) + 1u] >> 4u);
00883     /* ya' = ya + yb + yc + yd */
00884     pSrc[(2u * i0) + 1u] = (s1 + t2);
00885 
00886     /* (ya + yc) - (yb + yd) */
00887     s1 = s1 - t2;
00888 
00889     /* yb - yd */
00890     t1 = (pSrc[(2u * i1) + 1u] >> 4u) - (pSrc[(2u * i3) + 1u] >> 4u);
00891     /* xb - xd */
00892     t2 = (pSrc[2u * i1] >> 4u) - (pSrc[2u * i3] >> 4u);
00893 
00894     /*  index calculation for the coefficients */
00895     ia2 = 2u * ia1;
00896     co2 = pCoef[ia2 * 2u];
00897     si2 = pCoef[(ia2 * 2u) + 1u];
00898 
00899     /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
00900     pSrc[2u * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) -
00901                      ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1u;
00902 
00903     /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
00904     pSrc[2u * i1 + 1u] = (((int32_t) (((q63_t) s1 * co2) >> 32)) +
00905                           ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1u;
00906 
00907     /* (xa - xc) - (yb - yd) */
00908     r1 = r2 - t1;
00909     /* (xa - xc) + (yb - yd) */
00910     r2 = r2 + t1;
00911 
00912     /* (ya - yc) + (xb - xd) */
00913     s1 = s2 + t2;
00914     /* (ya - yc) - (xb - xd) */
00915     s2 = s2 - t2;
00916 
00917     co1 = pCoef[ia1 * 2u];
00918     si1 = pCoef[(ia1 * 2u) + 1u];
00919 
00920     /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
00921     pSrc[2u * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
00922                      ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1u;
00923 
00924     /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
00925     pSrc[(2u * i2) + 1u] = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
00926                             ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1u;
00927 
00928     /*  index calculation for the coefficients */
00929     ia3 = 3u * ia1;
00930     co3 = pCoef[ia3 * 2u];
00931     si3 = pCoef[(ia3 * 2u) + 1u];
00932 
00933     /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
00934     pSrc[2u * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
00935                      ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1u;
00936 
00937     /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
00938     pSrc[(2u * i3) + 1u] = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
00939                             ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1u;
00940 
00941     /*  Twiddle coefficients index modifier */
00942     ia1 = ia1 + twidCoefModifier;
00943 
00944     /*  Updating input index */
00945     i0 = i0 + 1u;
00946 
00947   } while(--j);
00948 
00949   /* data is in 5.27(q27) format */
00950   /* each stage provides two down scaling of the input */
00951 
00952 
00953   /* Start of Middle stages process */
00954 
00955   twidCoefModifier <<= 2u;
00956 
00957   /*  Calculation of second stage to excluding last stage */
00958   for (k = fftLen / 4u; k > 4u; k >>= 2u)
00959   {
00960     /*  Initializations for the first stage */
00961     n1 = n2;
00962     n2 >>= 2u;
00963     ia1 = 0u;
00964 
00965     for (j = 0; j <= (n2 - 1u); j++)
00966     {
00967       /*  index calculation for the coefficients */
00968       ia2 = ia1 + ia1;
00969       ia3 = ia2 + ia1;
00970       co1 = pCoef[ia1 * 2u];
00971       si1 = pCoef[(ia1 * 2u) + 1u];
00972       co2 = pCoef[ia2 * 2u];
00973       si2 = pCoef[(ia2 * 2u) + 1u];
00974       co3 = pCoef[ia3 * 2u];
00975       si3 = pCoef[(ia3 * 2u) + 1u];
00976       /*  Twiddle coefficients index modifier */
00977       ia1 = ia1 + twidCoefModifier;
00978 
00979       for (i0 = j; i0 < fftLen; i0 += n1)
00980       {
00981         /*  index calculation for the input as, */
00982         /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2u], pSrc[i0 + 3fftLen/4] */
00983         i1 = i0 + n2;
00984         i2 = i1 + n2;
00985         i3 = i2 + n2;
00986 
00987         /*  Butterfly implementation */
00988         /* xa + xc */
00989         r1 = pSrc[2u * i0] + pSrc[2u * i2];
00990         /* xa - xc */
00991         r2 = pSrc[2u * i0] - pSrc[2u * i2];
00992 
00993         /* ya + yc */
00994         s1 = pSrc[(2u * i0) + 1u] + pSrc[(2u * i2) + 1u];
00995         /* ya - yc */
00996         s2 = pSrc[(2u * i0) + 1u] - pSrc[(2u * i2) + 1u];
00997 
00998         /* xb + xd */
00999         t1 = pSrc[2u * i1] + pSrc[2u * i3];
01000 
01001         /* xa' = xa + xb + xc + xd */
01002         pSrc[2u * i0] = (r1 + t1) >> 2u;
01003         /* xa + xc -(xb + xd) */
01004         r1 = r1 - t1;
01005         /* yb + yd */
01006         t2 = pSrc[(2u * i1) + 1u] + pSrc[(2u * i3) + 1u];
01007         /* ya' = ya + yb + yc + yd */
01008         pSrc[(2u * i0) + 1u] = (s1 + t2) >> 2u;
01009 
01010         /* (ya + yc) - (yb + yd) */
01011         s1 = s1 - t2;
01012 
01013         /* (yb - yd) */
01014         t1 = pSrc[(2u * i1) + 1u] - pSrc[(2u * i3) + 1u];
01015         /* (xb - xd) */
01016         t2 = pSrc[2u * i1] - pSrc[2u * i3];
01017 
01018         /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
01019         pSrc[2u * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32u)) -
01020                          ((int32_t) (((q63_t) s1 * si2) >> 32u))) >> 1u;
01021 
01022         /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
01023         pSrc[(2u * i1) + 1u] =
01024           (((int32_t) (((q63_t) s1 * co2) >> 32u)) +
01025            ((int32_t) (((q63_t) r1 * si2) >> 32u))) >> 1u;
01026 
01027         /* (xa - xc) - (yb - yd) */
01028         r1 = r2 - t1;
01029         /* (xa - xc) + (yb - yd) */
01030         r2 = r2 + t1;
01031 
01032         /* (ya - yc) +  (xb - xd) */
01033         s1 = s2 + t2;
01034         /* (ya - yc) -  (xb - xd) */
01035         s2 = s2 - t2;
01036 
01037         /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
01038         pSrc[2u * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
01039                          ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1u;
01040 
01041         /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
01042         pSrc[(2u * i2) + 1u] = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
01043                                 ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1u;
01044 
01045         /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
01046         pSrc[(2u * i3)] = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
01047                            ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1u;
01048 
01049         /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
01050         pSrc[(2u * i3) + 1u] = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
01051                                 ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1u;
01052       }
01053     }
01054     twidCoefModifier <<= 2u;
01055   }
01056 #else
01057   uint32_t n1, n2, ia1, ia2, ia3, i0, j, k;
01058   q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
01059   q31_t xa, xb, xc, xd;
01060   q31_t ya, yb, yc, yd;
01061   q31_t xa_out, xb_out, xc_out, xd_out;
01062   q31_t ya_out, yb_out, yc_out, yd_out;
01063 
01064   q31_t *ptr1;
01065   q31_t *pSi0;
01066   q31_t *pSi1;
01067   q31_t *pSi2;
01068   q31_t *pSi3;
01069   q63_t xaya, xbyb, xcyc, xdyd;
01070 
01071   /* input is be 1.31(q31) format for all FFT sizes */
01072   /* Total process is divided into three stages */
01073   /* process first stage, middle stages, & last stage */
01074 
01075   /* Start of first stage process */
01076 
01077   /* Initializations for the first stage */
01078   n2 = fftLen;
01079   n1 = n2;
01080   /* n2 = fftLen/4 */
01081   n2 >>= 2u;
01082 
01083   ia1 = 0u;
01084 
01085   j = n2;
01086   
01087   pSi0 = pSrc;
01088   pSi1 = pSi0 + 2 * n2;
01089   pSi2 = pSi1 + 2 * n2;
01090   pSi3 = pSi2 + 2 * n2;
01091 
01092   do
01093   {
01094     /*  Butterfly implementation */
01095     /* xa + xc */
01096     r1 = (pSi0[0] >> 4u) + (pSi2[0] >> 4u);
01097     /* xa - xc */
01098     r2 = (pSi0[0] >> 4u) - (pSi2[0] >> 4u);
01099 
01100     /* xb + xd */
01101     t1 = (pSi1[0] >> 4u) + (pSi3[0] >> 4u);
01102 
01103     /* ya + yc */
01104     s1 = (pSi0[1] >> 4u) + (pSi2[1] >> 4u);
01105     /* ya - yc */
01106     s2 = (pSi0[1] >> 4u) - (pSi2[1] >> 4u);
01107 
01108     /* xa' = xa + xb + xc + xd */
01109     *pSi0++ = (r1 + t1);
01110     /* (xa + xc) - (xb + xd) */
01111     r1 = r1 - t1;
01112     /* yb + yd */
01113     t2 = (pSi1[1] >> 4u) + (pSi3[1] >> 4u);
01114     /* ya' = ya + yb + yc + yd */
01115     *pSi0++ = (s1 + t2);
01116 
01117     /* (ya + yc) - (yb + yd) */
01118     s1 = s1 - t2;
01119 
01120     /* yb - yd */
01121     t1 = (pSi1[1] >> 4u) - (pSi3[1] >> 4u);
01122     /* xb - xd */
01123     t2 = (pSi1[0] >> 4u) - (pSi3[0] >> 4u);
01124 
01125     /*  index calculation for the coefficients */
01126     ia2 = 2u * ia1;
01127     co2 = pCoef[ia2 * 2u];
01128     si2 = pCoef[(ia2 * 2u) + 1u];
01129 
01130     /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
01131     *pSi1++ = (((int32_t) (((q63_t) r1 * co2) >> 32)) -
01132                      ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1u;
01133 
01134     /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
01135     *pSi1++ = (((int32_t) (((q63_t) s1 * co2) >> 32)) +
01136                           ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1u;
01137 
01138     /* (xa - xc) - (yb - yd) */
01139     r1 = r2 - t1;
01140     /* (xa - xc) + (yb - yd) */
01141     r2 = r2 + t1;
01142 
01143     /* (ya - yc) + (xb - xd) */
01144     s1 = s2 + t2;
01145     /* (ya - yc) - (xb - xd) */
01146     s2 = s2 - t2;
01147 
01148     co1 = pCoef[ia1 * 2u];
01149     si1 = pCoef[(ia1 * 2u) + 1u];
01150 
01151     /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
01152     *pSi2++ = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
01153                      ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1u;
01154 
01155     /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
01156     *pSi2++ = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
01157                             ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1u;
01158 
01159     /*  index calculation for the coefficients */
01160     ia3 = 3u * ia1;
01161     co3 = pCoef[ia3 * 2u];
01162     si3 = pCoef[(ia3 * 2u) + 1u];
01163 
01164     /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
01165     *pSi3++ = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
01166                      ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1u;
01167 
01168     /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
01169     *pSi3++ = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
01170                             ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1u;
01171 
01172     /*  Twiddle coefficients index modifier */
01173     ia1 = ia1 + twidCoefModifier;
01174 
01175   } while(--j);
01176 
01177   /* data is in 5.27(q27) format */
01178   /* each stage provides two down scaling of the input */
01179 
01180 
01181   /* Start of Middle stages process */
01182 
01183   twidCoefModifier <<= 2u;
01184 
01185   /*  Calculation of second stage to excluding last stage */
01186   for (k = fftLen / 4u; k > 4u; k >>= 2u)
01187   {
01188     /*  Initializations for the first stage */
01189     n1 = n2;
01190     n2 >>= 2u;
01191     ia1 = 0u;
01192 
01193     for (j = 0; j <= (n2 - 1u); j++)
01194     {
01195       /*  index calculation for the coefficients */
01196       ia2 = ia1 + ia1;
01197       ia3 = ia2 + ia1;
01198       co1 = pCoef[ia1 * 2u];
01199       si1 = pCoef[(ia1 * 2u) + 1u];
01200       co2 = pCoef[ia2 * 2u];
01201       si2 = pCoef[(ia2 * 2u) + 1u];
01202       co3 = pCoef[ia3 * 2u];
01203       si3 = pCoef[(ia3 * 2u) + 1u];
01204       /*  Twiddle coefficients index modifier */
01205       ia1 = ia1 + twidCoefModifier;
01206       
01207       pSi0 = pSrc + 2 * j;
01208       pSi1 = pSi0 + 2 * n2;
01209       pSi2 = pSi1 + 2 * n2;
01210       pSi3 = pSi2 + 2 * n2;
01211 
01212       for (i0 = j; i0 < fftLen; i0 += n1)
01213       {
01214         /*  Butterfly implementation */
01215         /* xa + xc */
01216         r1 = pSi0[0] + pSi2[0];
01217 
01218         /* xa - xc */
01219         r2 = pSi0[0] - pSi2[0];
01220 
01221 
01222         /* ya + yc */
01223         s1 = pSi0[1] + pSi2[1];
01224 
01225         /* ya - yc */
01226         s2 = pSi0[1] - pSi2[1];
01227 
01228 
01229         /* xb + xd */
01230         t1 = pSi1[0] + pSi3[0];
01231 
01232 
01233         /* xa' = xa + xb + xc + xd */
01234         pSi0[0] = (r1 + t1) >> 2u;
01235         /* xa + xc -(xb + xd) */
01236         r1 = r1 - t1;
01237         /* yb + yd */
01238         t2 = pSi1[1] + pSi3[1];
01239 
01240         /* ya' = ya + yb + yc + yd */
01241         pSi0[1] = (s1 + t2) >> 2u;
01242         pSi0 += 2 * n1;
01243 
01244         /* (ya + yc) - (yb + yd) */
01245         s1 = s1 - t2;
01246 
01247         /* (yb - yd) */
01248         t1 = pSi1[1] - pSi3[1];
01249 
01250         /* (xb - xd) */
01251         t2 = pSi1[0] - pSi3[0];
01252 
01253 
01254         /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
01255         pSi1[0] = (((int32_t) (((q63_t) r1 * co2) >> 32u)) -
01256                          ((int32_t) (((q63_t) s1 * si2) >> 32u))) >> 1u;
01257 
01258         /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
01259         pSi1[1] =
01260 
01261           (((int32_t) (((q63_t) s1 * co2) >> 32u)) +
01262            ((int32_t) (((q63_t) r1 * si2) >> 32u))) >> 1u;
01263         pSi1 += 2 * n1;
01264 
01265         /* (xa - xc) - (yb - yd) */
01266         r1 = r2 - t1;
01267         /* (xa - xc) + (yb - yd) */
01268         r2 = r2 + t1;
01269 
01270         /* (ya - yc) +  (xb - xd) */
01271         s1 = s2 + t2;
01272         /* (ya - yc) -  (xb - xd) */
01273         s2 = s2 - t2;
01274 
01275         /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
01276         pSi2[0] = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
01277                          ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1u;
01278 
01279         /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
01280         pSi2[1] = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
01281                                 ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1u;
01282         pSi2 += 2 * n1;
01283 
01284         /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
01285         pSi3[0] = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
01286                            ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1u;
01287 
01288         /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
01289         pSi3[1] = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
01290                                 ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1u;
01291         pSi3 += 2 * n1;
01292       }
01293     }
01294     twidCoefModifier <<= 2u;
01295   }
01296 #endif
01297 
01298   /* End of Middle stages process */
01299 
01300   /* data is in 11.21(q21) format for the 1024 point as there are 3 middle stages */
01301   /* data is in 9.23(q23) format for the 256 point as there are 2 middle stages */
01302   /* data is in 7.25(q25) format for the 64 point as there are 1 middle stage */
01303   /* data is in 5.27(q27) format for the 16 point as there are no middle stages */
01304 
01305 
01306   /* Start of last stage process */
01307 
01308 
01309   /*  Initializations for the last stage */
01310   j = fftLen >> 2;
01311   ptr1 = &pSrc[0];
01312 
01313   /*  Calculations of last stage */
01314   do
01315   {
01316 #ifndef ARM_MATH_BIG_ENDIAN
01317     /* Read xa (real), ya(imag) input */
01318     xaya = *__SIMD64(ptr1)++;
01319     xa = (q31_t) xaya;
01320     ya = (q31_t) (xaya >> 32);
01321 
01322     /* Read xb (real), yb(imag) input */
01323     xbyb = *__SIMD64(ptr1)++;
01324     xb = (q31_t) xbyb;
01325     yb = (q31_t) (xbyb >> 32);
01326 
01327     /* Read xc (real), yc(imag) input */
01328     xcyc = *__SIMD64(ptr1)++;
01329     xc = (q31_t) xcyc;
01330     yc = (q31_t) (xcyc >> 32);
01331 
01332     /* Read xc (real), yc(imag) input */
01333     xdyd = *__SIMD64(ptr1)++;
01334     xd = (q31_t) xdyd;
01335     yd = (q31_t) (xdyd >> 32);
01336 
01337 #else
01338 
01339     /* Read xa (real), ya(imag) input */
01340     xaya = *__SIMD64(ptr1)++;
01341     ya = (q31_t) xaya;
01342     xa = (q31_t) (xaya >> 32);
01343 
01344     /* Read xb (real), yb(imag) input */
01345     xbyb = *__SIMD64(ptr1)++;
01346     yb = (q31_t) xbyb;
01347     xb = (q31_t) (xbyb >> 32);
01348 
01349     /* Read xc (real), yc(imag) input */
01350     xcyc = *__SIMD64(ptr1)++;
01351     yc = (q31_t) xcyc;
01352     xc = (q31_t) (xcyc >> 32);
01353 
01354     /* Read xc (real), yc(imag) input */
01355     xdyd = *__SIMD64(ptr1)++;
01356     yd = (q31_t) xdyd;
01357     xd = (q31_t) (xdyd >> 32);
01358 
01359 
01360 #endif
01361 
01362     /* xa' = xa + xb + xc + xd */
01363     xa_out = xa + xb + xc + xd;
01364 
01365     /* ya' = ya + yb + yc + yd */
01366     ya_out = ya + yb + yc + yd;
01367 
01368     /* pointer updation for writing */
01369     ptr1 = ptr1 - 8u;
01370 
01371     /* writing xa' and ya' */
01372     *ptr1++ = xa_out;
01373     *ptr1++ = ya_out;
01374 
01375     xc_out = (xa - xb + xc - xd);
01376     yc_out = (ya - yb + yc - yd);
01377 
01378     /* writing xc' and yc' */
01379     *ptr1++ = xc_out;
01380     *ptr1++ = yc_out;
01381 
01382     xb_out = (xa - yb - xc + yd);
01383     yb_out = (ya + xb - yc - xd);
01384 
01385     /* writing xb' and yb' */
01386     *ptr1++ = xb_out;
01387     *ptr1++ = yb_out;
01388 
01389     xd_out = (xa + yb - xc - yd);
01390     yd_out = (ya - xb - yc + xd);
01391 
01392     /* writing xd' and yd' */
01393     *ptr1++ = xd_out;
01394     *ptr1++ = yd_out;
01395 
01396   } while(--j);
01397 
01398   /* output is in 11.21(q21) format for the 1024 point */
01399   /* output is in 9.23(q23) format for the 256 point */
01400   /* output is in 7.25(q25) format for the 64 point */
01401   /* output is in 5.27(q27) format for the 16 point */
01402 
01403   /* End of last stage process */
01404 }