CMSIS DSP library

Dependents:   performance_timer Surfboard_ gps2rtty Capstone ... more

Legacy Warning

This is an mbed 2 library. To learn more about mbed OS 5, visit the docs.

Revision:
5:3762170b6d4d
Parent:
3:7a284390b0ce
--- a/cmsis_dsp/TransformFunctions/arm_cfft_radix4_q15.c	Mon Jun 23 09:30:09 2014 +0100
+++ b/cmsis_dsp/TransformFunctions/arm_cfft_radix4_q15.c	Fri Nov 20 08:45:18 2015 +0000
@@ -1,8 +1,8 @@
 /* ----------------------------------------------------------------------    
-* Copyright (C) 2010-2013 ARM Limited. All rights reserved.    
+* Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
-* $Date:        17. January 2013  
-* $Revision: 	V1.4.1  
+* $Date:        19. March 2015 
+* $Revision: 	V.1.4.5  
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:	    arm_cfft_radix4_q15.c    
@@ -73,6 +73,7 @@
 /**    
  * @details    
  * @brief Processing function for the Q15 CFFT/CIFFT.   
+ * @deprecated Do not use this function.  It has been superseded by \ref arm_cfft_q15 and will be removed
  * @param[in]      *S    points to an instance of the Q15 CFFT/CIFFT structure.   
  * @param[in, out] *pSrc points to the complex data buffer. Processing occurs in-place.   
  * @return none.   
@@ -172,12 +173,13 @@
 
   q31_t R, S, T, U;
   q31_t C1, C2, C3, out1, out2;
-  uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
-  q15_t in;
+  uint32_t n1, n2, ic, i0, j, k;
 
   q15_t *ptr1;
-
-
+  q15_t *pSi0;
+  q15_t *pSi1;
+  q15_t *pSi2;
+  q15_t *pSi3;
 
   q31_t xaya, xbyb, xcyc, xdyd;
 
@@ -196,8 +198,12 @@
   ic = 0u;
 
   /* Index for input read and output write */
-  i0 = 0u;
   j = n2;
+  
+  pSi0 = pSrc16;
+  pSi1 = pSi0 + 2 * n2;
+  pSi2 = pSi1 + 2 * n2;
+  pSi3 = pSi2 + 2 * n2;
 
   /* Input is in 1.15(q15) format */
 
@@ -206,22 +212,18 @@
   {
     /*  Butterfly implementation */
 
-    /*  index calculation for the input as, */
-    /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
-    i1 = i0 + n2;
-    i2 = i1 + n2;
-    i3 = i2 + n2;
-
     /*  Reading i0, i0+fftLen/2 inputs */
     /* Read ya (real), xa(imag) input */
-    T = _SIMD32_OFFSET(pSrc16 + (2u * i0));
-    in = ((int16_t) (T & 0xFFFF)) >> 2;
-    T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF);
+    T = _SIMD32_OFFSET(pSi0);
+    T = __SHADD16(T, 0); // this is just a SIMD arithmetic shift right by 1
+    T = __SHADD16(T, 0); // it turns out doing this twice is 2 cycles, the alternative takes 3 cycles
+    //in = ((int16_t) (T & 0xFFFF)) >> 2;       // alternative code that takes 3 cycles
+    //T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF);
 
     /* Read yc (real), xc(imag) input */
-    S = _SIMD32_OFFSET(pSrc16 + (2u * i2));
-    in = ((int16_t) (S & 0xFFFF)) >> 2;
-    S = ((S >> 2) & 0xFFFF0000) | (in & 0xFFFF);
+    S = _SIMD32_OFFSET(pSi2);
+    S = __SHADD16(S, 0);
+    S = __SHADD16(S, 0);
 
     /* R = packed((ya + yc), (xa + xc) ) */
     R = __QADD16(T, S);
@@ -231,14 +233,14 @@
 
     /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
     /* Read yb (real), xb(imag) input */
-    T = _SIMD32_OFFSET(pSrc16 + (2u * i1));
-    in = ((int16_t) (T & 0xFFFF)) >> 2;
-    T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF);
+    T = _SIMD32_OFFSET(pSi1);
+    T = __SHADD16(T, 0);
+    T = __SHADD16(T, 0);
 
     /* Read yd (real), xd(imag) input */
-    U = _SIMD32_OFFSET(pSrc16 + (2u * i3));
-    in = ((int16_t) (U & 0xFFFF)) >> 2;
-    U = ((U >> 2) & 0xFFFF0000) | (in & 0xFFFF);
+    U = _SIMD32_OFFSET(pSi3);
+    U = __SHADD16(U, 0);
+    U = __SHADD16(U, 0);
 
     /* T = packed((yb + yd), (xb + xd) ) */
     T = __QADD16(T, U);
@@ -246,7 +248,8 @@
     /*  writing the butterfly processed i0 sample */
     /* xa' = xa + xb + xc + xd */
     /* ya' = ya + yb + yc + yd */
-    _SIMD32_OFFSET(pSrc16 + (2u * i0)) = __SHADD16(R, T);
+    _SIMD32_OFFSET(pSi0) = __SHADD16(R, T);
+    pSi0 += 2;
 
     /* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
     R = __QSUB16(R, T);
@@ -272,20 +275,21 @@
 
     /*  Reading i0+fftLen/4 */
     /* T = packed(yb, xb) */
-    T = _SIMD32_OFFSET(pSrc16 + (2u * i1));
-    in = ((int16_t) (T & 0xFFFF)) >> 2;
-    T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF);
+    T = _SIMD32_OFFSET(pSi1);
+    T = __SHADD16(T, 0);
+    T = __SHADD16(T, 0);
 
     /* writing the butterfly processed i0 + fftLen/4 sample */
     /* writing output(xc', yc') in little endian format */
-    _SIMD32_OFFSET(pSrc16 + (2u * i1)) =
+    _SIMD32_OFFSET(pSi1) =
       (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
+    pSi1 += 2;
 
     /*  Butterfly calculations */
     /* U = packed(yd, xd) */
-    U = _SIMD32_OFFSET(pSrc16 + (2u * i3));
-    in = ((int16_t) (U & 0xFFFF)) >> 2;
-    U = ((U >> 2) & 0xFFFF0000) | (in & 0xFFFF);
+    U = _SIMD32_OFFSET(pSi3);
+    U = __SHADD16(U, 0);
+    U = __SHADD16(U, 0);
 
     /* T = packed(yb-yd, xb-xd) */
     T = __QSUB16(T, U);
@@ -327,8 +331,9 @@
 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
 
     /* writing output(xb', yb') in little endian format */
-    _SIMD32_OFFSET(pSrc16 + (2u * i2)) =
+    _SIMD32_OFFSET(pSi2) =
       ((out2) & 0xFFFF0000) | ((out1) & 0x0000FFFF);
+    pSi2 += 2;
 
 
     /* co3 & si3 are read from SIMD Coefficient pointer */
@@ -352,15 +357,13 @@
 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
 
     /* writing output(xd', yd') in little endian format */
-    _SIMD32_OFFSET(pSrc16 + (2u * i3)) =
+    _SIMD32_OFFSET(pSi3) =
       ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
+    pSi3 += 2;
 
     /*  Twiddle coefficients index modifier */
     ic = ic + twidCoefModifier;
 
-    /*  Updating input index */
-    i0 = i0 + 1u;
-
   } while(--j);
   /* data is in 4.11(q11) format */
 
@@ -389,22 +392,21 @@
 
       /*  Twiddle coefficients index modifier */
       ic = ic + twidCoefModifier;
+      
+      pSi0 = pSrc16 + 2 * j;
+      pSi1 = pSi0 + 2 * n2;
+      pSi2 = pSi1 + 2 * n2;
+      pSi3 = pSi2 + 2 * n2;
 
       /*  Butterfly implementation */
       for (i0 = j; i0 < fftLen; i0 += n1)
       {
-        /*  index calculation for the input as, */
-        /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
-        i1 = i0 + n2;
-        i2 = i1 + n2;
-        i3 = i2 + n2;
-
         /*  Reading i0, i0+fftLen/2 inputs */
         /* Read ya (real), xa(imag) input */
-        T = _SIMD32_OFFSET(pSrc16 + (2u * i0));
+        T = _SIMD32_OFFSET(pSi0);
 
         /* Read yc (real), xc(imag) input */
-        S = _SIMD32_OFFSET(pSrc16 + (2u * i2));
+        S = _SIMD32_OFFSET(pSi2);
 
         /* R = packed( (ya + yc), (xa + xc)) */
         R = __QADD16(T, S);
@@ -414,10 +416,10 @@
 
         /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
         /* Read yb (real), xb(imag) input */
-        T = _SIMD32_OFFSET(pSrc16 + (2u * i1));
+        T = _SIMD32_OFFSET(pSi1);
 
         /* Read yd (real), xd(imag) input */
-        U = _SIMD32_OFFSET(pSrc16 + (2u * i3));
+        U = _SIMD32_OFFSET(pSi3);
 
         /* T = packed( (yb + yd), (xb + xd)) */
         T = __QADD16(T, U);
@@ -427,9 +429,9 @@
         /* xa' = xa + xb + xc + xd */
         /* ya' = ya + yb + yc + yd */
         out1 = __SHADD16(R, T);
-        in = ((int16_t) (out1 & 0xFFFF)) >> 1;
-        out1 = ((out1 >> 1) & 0xFFFF0000) | (in & 0xFFFF);
-        _SIMD32_OFFSET(pSrc16 + (2u * i0)) = out1;
+        out1 = __SHADD16(out1, 0);
+        _SIMD32_OFFSET(pSi0) = out1;
+        pSi0 += 2 * n1;
 
         /* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */
         R = __SHSUB16(R, T);
@@ -454,18 +456,19 @@
 
         /*  Reading i0+3fftLen/4 */
         /* Read yb (real), xb(imag) input */
-        T = _SIMD32_OFFSET(pSrc16 + (2u * i1));
+        T = _SIMD32_OFFSET(pSi1);
 
         /*  writing the butterfly processed i0 + fftLen/4 sample */
         /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
         /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
-        _SIMD32_OFFSET(pSrc16 + (2u * i1)) =
+        _SIMD32_OFFSET(pSi1) =
           ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
+        pSi1 += 2 * n1;
 
         /*  Butterfly calculations */
 
         /* Read yd (real), xd(imag) input */
-        U = _SIMD32_OFFSET(pSrc16 + (2u * i3));
+        U = _SIMD32_OFFSET(pSi3);
 
         /* T = packed(yb-yd, xb-xd) */
         T = __QSUB16(T, U);
@@ -500,8 +503,9 @@
 
         /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
         /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
-        _SIMD32_OFFSET(pSrc16 + (2u * i2)) =
+        _SIMD32_OFFSET(pSi2) =
           ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
+        pSi2 += 2 * n1;
 
         /*  Butterfly process for the i0+3fftLen/4 sample */
 
@@ -519,8 +523,9 @@
 
         /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
         /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
-        _SIMD32_OFFSET(pSrc16 + (2u * i3)) =
+        _SIMD32_OFFSET(pSi3) =
           ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
+        pSi3 += 2 * n1;
       }
     }
     /*  Twiddle coefficients index modifier */
@@ -711,9 +716,9 @@
     Si2 = pCoef16[(2u * ic * 2u) + 1];
 
     /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
-    out1 = (short) ((Co2 * R0 + Si2 * R1) >> 16u);
+    out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16u);
     /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
-    out2 = (short) ((-Si2 * R0 + Co2 * R1) >> 16u);
+    out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16u);
 
     /*  Reading i0+fftLen/4 */
     /* input is down scale by 4 to avoid overflow */
@@ -737,21 +742,21 @@
     T1 = __SSAT(T1 - U1, 16);
 
     /* R1 = (ya-yc) + (xb- xd),  R0 = (xa-xc) - (yb-yd)) */
-    R0 = (short) __SSAT((q31_t) (S0 - T1), 16);
-    R1 = (short) __SSAT((q31_t) (S1 + T0), 16);
+    R0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16);
+    R1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16);
 
     /* S1 = (ya-yc) - (xb- xd), S0 = (xa-xc) + (yb-yd)) */
-    S0 = (short) __SSAT(((q31_t) S0 + T1), 16u);
-    S1 = (short) __SSAT(((q31_t) S1 - T0), 16u);
+    S0 = (q15_t) __SSAT(((q31_t) S0 + T1), 16u);
+    S1 = (q15_t) __SSAT(((q31_t) S1 - T0), 16u);
 
     /* co1 & si1 are read from Coefficient pointer */
     Co1 = pCoef16[ic * 2u];
     Si1 = pCoef16[(ic * 2u) + 1];
     /*  Butterfly process for the i0+fftLen/2 sample */
     /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
-    out1 = (short) ((Si1 * S1 + Co1 * S0) >> 16);
+    out1 = (q15_t) ((Si1 * S1 + Co1 * S0) >> 16);
     /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
-    out2 = (short) ((-Si1 * S0 + Co1 * S1) >> 16);
+    out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16);
 
     /* writing output(xb', yb') in little endian format */
     pSrc16[i2 * 2u] = out1;
@@ -762,9 +767,9 @@
     Si3 = pCoef16[(3u * (ic * 2u)) + 1];
     /*  Butterfly process for the i0+3fftLen/4 sample */
     /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
-    out1 = (short) ((Si3 * R1 + Co3 * R0) >> 16u);
+    out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16u);
     /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
-    out2 = (short) ((-Si3 * R0 + Co3 * R1) >> 16u);
+    out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16u);
     /* writing output(xd', yd') in little endian format */
     pSrc16[i3 * 2u] = out1;
     pSrc16[(i3 * 2u) + 1] = out2;
@@ -862,10 +867,10 @@
         R1 = (R1 >> 1u) - (T1 >> 1u);
 
         /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
-        out1 = (short) ((Co2 * R0 + Si2 * R1) >> 16u);
+        out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16u);
 
         /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
-        out2 = (short) ((-Si2 * R0 + Co2 * R1) >> 16u);
+        out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16u);
 
         /*  Reading i0+3fftLen/4 */
         /* Read yb (real), xb(imag) input */
@@ -897,9 +902,9 @@
         S1 = (S1 >> 1u) - (T0 >> 1u);
 
         /*  Butterfly process for the i0+fftLen/2 sample */
-        out1 = (short) ((Co1 * S0 + Si1 * S1) >> 16u);
+        out1 = (q15_t) ((Co1 * S0 + Si1 * S1) >> 16u);
 
-        out2 = (short) ((-Si1 * S0 + Co1 * S1) >> 16u);
+        out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16u);
 
         /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
         /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
@@ -907,9 +912,9 @@
         pSrc16[(i2 * 2u) + 1u] = out2;
 
         /*  Butterfly process for the i0+3fftLen/4 sample */
-        out1 = (short) ((Si3 * R1 + Co3 * R0) >> 16u);
+        out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16u);
 
-        out2 = (short) ((-Si3 * R0 + Co3 * R1) >> 16u);
+        out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16u);
         /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
         /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
         pSrc16[i3 * 2u] = out1;
@@ -1085,12 +1090,13 @@
 
   q31_t R, S, T, U;
   q31_t C1, C2, C3, out1, out2;
-  uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
-  q15_t in;
+  uint32_t n1, n2, ic, i0, j, k;
 
   q15_t *ptr1;
-
-
+  q15_t *pSi0;
+  q15_t *pSi1;
+  q15_t *pSi2;
+  q15_t *pSi3;
 
   q31_t xaya, xbyb, xcyc, xdyd;
 
@@ -1109,8 +1115,12 @@
   ic = 0u;
 
   /* Index for input read and output write */
-  i0 = 0u;
   j = n2;
+  
+  pSi0 = pSrc16;
+  pSi1 = pSi0 + 2 * n2;
+  pSi2 = pSi1 + 2 * n2;
+  pSi3 = pSi2 + 2 * n2;
 
   /* Input is in 1.15(q15) format */
 
@@ -1119,22 +1129,16 @@
   {
     /*  Butterfly implementation */
 
-    /*  index calculation for the input as, */
-    /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
-    i1 = i0 + n2;
-    i2 = i1 + n2;
-    i3 = i2 + n2;
-
     /*  Reading i0, i0+fftLen/2 inputs */
     /* Read ya (real), xa(imag) input */
-    T = _SIMD32_OFFSET(pSrc16 + (2u * i0));
-    in = ((int16_t) (T & 0xFFFF)) >> 2;
-    T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF);
+    T = _SIMD32_OFFSET(pSi0);
+    T = __SHADD16(T, 0);
+    T = __SHADD16(T, 0);
 
     /* Read yc (real), xc(imag) input */
-    S = _SIMD32_OFFSET(pSrc16 + (2u * i2));
-    in = ((int16_t) (S & 0xFFFF)) >> 2;
-    S = ((S >> 2) & 0xFFFF0000) | (in & 0xFFFF);
+    S = _SIMD32_OFFSET(pSi2);
+    S = __SHADD16(S, 0);
+    S = __SHADD16(S, 0);
 
     /* R = packed((ya + yc), (xa + xc) ) */
     R = __QADD16(T, S);
@@ -1144,14 +1148,14 @@
 
     /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
     /* Read yb (real), xb(imag) input */
-    T = _SIMD32_OFFSET(pSrc16 + (2u * i1));
-    in = ((int16_t) (T & 0xFFFF)) >> 2;
-    T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF);
+    T = _SIMD32_OFFSET(pSi1);
+    T = __SHADD16(T, 0);
+    T = __SHADD16(T, 0);
 
     /* Read yd (real), xd(imag) input */
-    U = _SIMD32_OFFSET(pSrc16 + (2u * i3));
-    in = ((int16_t) (U & 0xFFFF)) >> 2;
-    U = ((U >> 2) & 0xFFFF0000) | (in & 0xFFFF);
+    U = _SIMD32_OFFSET(pSi3);
+    U = __SHADD16(U, 0);
+    U = __SHADD16(U, 0);
 
     /* T = packed((yb + yd), (xb + xd) ) */
     T = __QADD16(T, U);
@@ -1159,7 +1163,8 @@
     /*  writing the butterfly processed i0 sample */
     /* xa' = xa + xb + xc + xd */
     /* ya' = ya + yb + yc + yd */
-    _SIMD32_OFFSET(pSrc16 + (2u * i0)) = __SHADD16(R, T);
+    _SIMD32_OFFSET(pSi0) = __SHADD16(R, T);
+    pSi0 += 2;
 
     /* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
     R = __QSUB16(R, T);
@@ -1185,20 +1190,21 @@
 
     /*  Reading i0+fftLen/4 */
     /* T = packed(yb, xb) */
-    T = _SIMD32_OFFSET(pSrc16 + (2u * i1));
-    in = ((int16_t) (T & 0xFFFF)) >> 2;
-    T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF);
+    T = _SIMD32_OFFSET(pSi1);
+    T = __SHADD16(T, 0);
+    T = __SHADD16(T, 0);
 
     /* writing the butterfly processed i0 + fftLen/4 sample */
     /* writing output(xc', yc') in little endian format */
-    _SIMD32_OFFSET(pSrc16 + (2u * i1)) =
+    _SIMD32_OFFSET(pSi1) =
       (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
+    pSi1 += 2;
 
     /*  Butterfly calculations */
     /* U = packed(yd, xd) */
-    U = _SIMD32_OFFSET(pSrc16 + (2u * i3));
-    in = ((int16_t) (U & 0xFFFF)) >> 2;
-    U = ((U >> 2) & 0xFFFF0000) | (in & 0xFFFF);
+    U = _SIMD32_OFFSET(pSi3);
+    U = __SHADD16(U, 0);
+    U = __SHADD16(U, 0);
 
     /* T = packed(yb-yd, xb-xd) */
     T = __QSUB16(T, U);
@@ -1240,8 +1246,9 @@
 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
 
     /* writing output(xb', yb') in little endian format */
-    _SIMD32_OFFSET(pSrc16 + (2u * i2)) =
+    _SIMD32_OFFSET(pSi2) =
       ((out2) & 0xFFFF0000) | ((out1) & 0x0000FFFF);
+    pSi2 += 2;
 
 
     /* co3 & si3 are read from SIMD Coefficient pointer */
@@ -1265,15 +1272,13 @@
 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
 
     /* writing output(xd', yd') in little endian format */
-    _SIMD32_OFFSET(pSrc16 + (2u * i3)) =
+    _SIMD32_OFFSET(pSi3) =
       ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
+    pSi3 += 2;
 
     /*  Twiddle coefficients index modifier */
     ic = ic + twidCoefModifier;
 
-    /*  Updating input index */
-    i0 = i0 + 1u;
-
   } while(--j);
   /* data is in 4.11(q11) format */
 
@@ -1302,22 +1307,21 @@
 
       /*  Twiddle coefficients index modifier */
       ic = ic + twidCoefModifier;
+      
+      pSi0 = pSrc16 + 2 * j;
+      pSi1 = pSi0 + 2 * n2;
+      pSi2 = pSi1 + 2 * n2;
+      pSi3 = pSi2 + 2 * n2;
 
       /*  Butterfly implementation */
       for (i0 = j; i0 < fftLen; i0 += n1)
       {
-        /*  index calculation for the input as, */
-        /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
-        i1 = i0 + n2;
-        i2 = i1 + n2;
-        i3 = i2 + n2;
-
         /*  Reading i0, i0+fftLen/2 inputs */
         /* Read ya (real), xa(imag) input */
-        T = _SIMD32_OFFSET(pSrc16 + (2u * i0));
+        T = _SIMD32_OFFSET(pSi0);
 
         /* Read yc (real), xc(imag) input */
-        S = _SIMD32_OFFSET(pSrc16 + (2u * i2));
+        S = _SIMD32_OFFSET(pSi2);
 
         /* R = packed( (ya + yc), (xa + xc)) */
         R = __QADD16(T, S);
@@ -1327,10 +1331,10 @@
 
         /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
         /* Read yb (real), xb(imag) input */
-        T = _SIMD32_OFFSET(pSrc16 + (2u * i1));
+        T = _SIMD32_OFFSET(pSi1);
 
         /* Read yd (real), xd(imag) input */
-        U = _SIMD32_OFFSET(pSrc16 + (2u * i3));
+        U = _SIMD32_OFFSET(pSi3);
 
         /* T = packed( (yb + yd), (xb + xd)) */
         T = __QADD16(T, U);
@@ -1340,9 +1344,9 @@
         /* xa' = xa + xb + xc + xd */
         /* ya' = ya + yb + yc + yd */
         out1 = __SHADD16(R, T);
-        in = ((int16_t) (out1 & 0xFFFF)) >> 1;
-        out1 = ((out1 >> 1) & 0xFFFF0000) | (in & 0xFFFF);
-        _SIMD32_OFFSET(pSrc16 + (2u * i0)) = out1;
+        out1 = __SHADD16(out1, 0);
+        _SIMD32_OFFSET(pSi0) = out1;
+        pSi0 += 2 * n1;
 
         /* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */
         R = __SHSUB16(R, T);
@@ -1367,18 +1371,19 @@
 
         /*  Reading i0+3fftLen/4 */
         /* Read yb (real), xb(imag) input */
-        T = _SIMD32_OFFSET(pSrc16 + (2u * i1));
+        T = _SIMD32_OFFSET(pSi1);
 
         /*  writing the butterfly processed i0 + fftLen/4 sample */
         /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
         /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
-        _SIMD32_OFFSET(pSrc16 + (2u * i1)) =
+        _SIMD32_OFFSET(pSi1) =
           ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
+        pSi1 += 2 * n1;
 
         /*  Butterfly calculations */
 
         /* Read yd (real), xd(imag) input */
-        U = _SIMD32_OFFSET(pSrc16 + (2u * i3));
+        U = _SIMD32_OFFSET(pSi3);
 
         /* T = packed(yb-yd, xb-xd) */
         T = __QSUB16(T, U);
@@ -1413,8 +1418,9 @@
 
         /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
         /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
-        _SIMD32_OFFSET(pSrc16 + (2u * i2)) =
+        _SIMD32_OFFSET(pSi2) =
           ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
+        pSi2 += 2 * n1;
 
         /*  Butterfly process for the i0+3fftLen/4 sample */
 
@@ -1432,8 +1438,9 @@
 
         /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
         /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
-        _SIMD32_OFFSET(pSrc16 + (2u * i3)) =
+        _SIMD32_OFFSET(pSi3) =
           ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
+        pSi3 += 2 * n1;
       }
     }
     /*  Twiddle coefficients index modifier */
@@ -1615,9 +1622,9 @@
     Co2 = pCoef16[2u * ic * 2u];
     Si2 = pCoef16[(2u * ic * 2u) + 1u];
     /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
-    out1 = (short) ((Co2 * R0 - Si2 * R1) >> 16u);
+    out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16u);
     /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
-    out2 = (short) ((Si2 * R0 + Co2 * R1) >> 16u);
+    out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16u);
 
     /*  Reading i0+fftLen/4 */
     /* input is down scale by 4 to avoid overflow */
@@ -1640,20 +1647,20 @@
     T0 = __SSAT(T0 - U0, 16u);
     T1 = __SSAT(T1 - U1, 16u);
     /* R0 = (ya-yc) - (xb- xd) , R1 = (xa-xc) + (yb-yd) */
-    R0 = (short) __SSAT((q31_t) (S0 + T1), 16);
-    R1 = (short) __SSAT((q31_t) (S1 - T0), 16);
+    R0 = (q15_t) __SSAT((q31_t) (S0 + T1), 16);
+    R1 = (q15_t) __SSAT((q31_t) (S1 - T0), 16);
     /* S = (ya-yc) + (xb- xd), S1 = (xa-xc) - (yb-yd) */
-    S0 = (short) __SSAT((q31_t) (S0 - T1), 16);
-    S1 = (short) __SSAT((q31_t) (S1 + T0), 16);
+    S0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16);
+    S1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16);
 
     /* co1 & si1 are read from Coefficient pointer */
     Co1 = pCoef16[ic * 2u];
     Si1 = pCoef16[(ic * 2u) + 1u];
     /*  Butterfly process for the i0+fftLen/2 sample */
     /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
-    out1 = (short) ((Co1 * S0 - Si1 * S1) >> 16u);
+    out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16u);
     /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
-    out2 = (short) ((Si1 * S0 + Co1 * S1) >> 16u);
+    out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16u);
     /* writing output(xb', yb') in little endian format */
     pSrc16[i2 * 2u] = out1;
     pSrc16[(i2 * 2u) + 1u] = out2;
@@ -1663,9 +1670,9 @@
     Si3 = pCoef16[(3u * ic * 2u) + 1u];
     /*  Butterfly process for the i0+3fftLen/4 sample */
     /* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
-    out1 = (short) ((Co3 * R0 - Si3 * R1) >> 16u);
+    out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16u);
     /* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
-    out2 = (short) ((Si3 * R0 + Co3 * R1) >> 16u);
+    out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16u);
     /* writing output(xd', yd') in little endian format */
     pSrc16[i3 * 2u] = out1;
     pSrc16[(i3 * 2u) + 1u] = out2;
@@ -1759,9 +1766,9 @@
         R1 = (R1 >> 1u) - (T1 >> 1u);
 
         /* (ya-yb+yc-yd)* (si2) - (xa-xb+xc-xd)* co2 */
-        out1 = (short) ((Co2 * R0 - Si2 * R1) >> 16);
+        out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16);
         /* (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
-        out2 = (short) ((Si2 * R0 + Co2 * R1) >> 16);
+        out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16);
 
         /*  Reading i0+3fftLen/4 */
         /* Read yb (real), xb(imag) input */
@@ -1792,17 +1799,17 @@
         S1 = (S1 >> 1u) + (T0 >> 1u);
 
         /*  Butterfly process for the i0+fftLen/2 sample */
-        out1 = (short) ((Co1 * S0 - Si1 * S1) >> 16u);
-        out2 = (short) ((Si1 * S0 + Co1 * S1) >> 16u);
+        out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16u);
+        out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16u);
         /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
         /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
         pSrc16[i2 * 2u] = out1;
         pSrc16[(i2 * 2u) + 1u] = out2;
 
         /*  Butterfly process for the i0+3fftLen/4 sample */
-        out1 = (short) ((Co3 * R0 - Si3 * R1) >> 16u);
+        out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16u);
 
-        out2 = (short) ((Si3 * R0 + Co3 * R1) >> 16u);
+        out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16u);
         /* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
         /* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
         pSrc16[i3 * 2u] = out1;