CMSIS DSP library
Dependents: performance_timer, Surfboard_, gps2rtty, Capstone, ... more
Legacy Warning
This is an mbed 2 library. To learn more about mbed OS 5, visit the docs.
Diff: cmsis_dsp/TransformFunctions/arm_cfft_radix4_q15.c
- Revision: 5:3762170b6d4d
- Parent: 3:7a284390b0ce
--- a/cmsis_dsp/TransformFunctions/arm_cfft_radix4_q15.c Mon Jun 23 09:30:09 2014 +0100 +++ b/cmsis_dsp/TransformFunctions/arm_cfft_radix4_q15.c Fri Nov 20 08:45:18 2015 +0000 @@ -1,8 +1,8 @@ /* ---------------------------------------------------------------------- -* Copyright (C) 2010-2013 ARM Limited. All rights reserved. +* Copyright (C) 2010-2014 ARM Limited. All rights reserved. * -* $Date: 17. January 2013 -* $Revision: V1.4.1 +* $Date: 19. March 2015 +* $Revision: V.1.4.5 * * Project: CMSIS DSP Library * Title: arm_cfft_radix4_q15.c @@ -73,6 +73,7 @@ /** * @details * @brief Processing function for the Q15 CFFT/CIFFT. + * @deprecated Do not use this function. It has been superseded by \ref arm_cfft_q15 and will be removed * @param[in] *S points to an instance of the Q15 CFFT/CIFFT structure. * @param[in, out] *pSrc points to the complex data buffer. Processing occurs in-place. * @return none. @@ -172,12 +173,13 @@ q31_t R, S, T, U; q31_t C1, C2, C3, out1, out2; - uint32_t n1, n2, ic, i0, i1, i2, i3, j, k; - q15_t in; + uint32_t n1, n2, ic, i0, j, k; q15_t *ptr1; - - + q15_t *pSi0; + q15_t *pSi1; + q15_t *pSi2; + q15_t *pSi3; q31_t xaya, xbyb, xcyc, xdyd; @@ -196,8 +198,12 @@ ic = 0u; /* Index for input read and output write */ - i0 = 0u; j = n2; + + pSi0 = pSrc16; + pSi1 = pSi0 + 2 * n2; + pSi2 = pSi1 + 2 * n2; + pSi3 = pSi2 + 2 * n2; /* Input is in 1.15(q15) format */ @@ -206,22 +212,18 @@ { /* Butterfly implementation */ - /* index calculation for the input as, */ - /* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */ - i1 = i0 + n2; - i2 = i1 + n2; - i3 = i2 + n2; - /* Reading i0, i0+fftLen/2 inputs */ /* Read ya (real), xa(imag) input */ - T = _SIMD32_OFFSET(pSrc16 + (2u * i0)); - in = ((int16_t) (T & 0xFFFF)) >> 2; - T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF); + T = _SIMD32_OFFSET(pSi0); + T = __SHADD16(T, 0); // this is just a SIMD arithmetic shift right by 1 + T = __SHADD16(T, 0); // it turns out doing this twice 
is 2 cycles, the alternative takes 3 cycles + //in = ((int16_t) (T & 0xFFFF)) >> 2; // alternative code that takes 3 cycles + //T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF); /* Read yc (real), xc(imag) input */ - S = _SIMD32_OFFSET(pSrc16 + (2u * i2)); - in = ((int16_t) (S & 0xFFFF)) >> 2; - S = ((S >> 2) & 0xFFFF0000) | (in & 0xFFFF); + S = _SIMD32_OFFSET(pSi2); + S = __SHADD16(S, 0); + S = __SHADD16(S, 0); /* R = packed((ya + yc), (xa + xc) ) */ R = __QADD16(T, S); @@ -231,14 +233,14 @@ /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */ /* Read yb (real), xb(imag) input */ - T = _SIMD32_OFFSET(pSrc16 + (2u * i1)); - in = ((int16_t) (T & 0xFFFF)) >> 2; - T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF); + T = _SIMD32_OFFSET(pSi1); + T = __SHADD16(T, 0); + T = __SHADD16(T, 0); /* Read yd (real), xd(imag) input */ - U = _SIMD32_OFFSET(pSrc16 + (2u * i3)); - in = ((int16_t) (U & 0xFFFF)) >> 2; - U = ((U >> 2) & 0xFFFF0000) | (in & 0xFFFF); + U = _SIMD32_OFFSET(pSi3); + U = __SHADD16(U, 0); + U = __SHADD16(U, 0); /* T = packed((yb + yd), (xb + xd) ) */ T = __QADD16(T, U); @@ -246,7 +248,8 @@ /* writing the butterfly processed i0 sample */ /* xa' = xa + xb + xc + xd */ /* ya' = ya + yb + yc + yd */ - _SIMD32_OFFSET(pSrc16 + (2u * i0)) = __SHADD16(R, T); + _SIMD32_OFFSET(pSi0) = __SHADD16(R, T); + pSi0 += 2; /* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */ R = __QSUB16(R, T); @@ -272,20 +275,21 @@ /* Reading i0+fftLen/4 */ /* T = packed(yb, xb) */ - T = _SIMD32_OFFSET(pSrc16 + (2u * i1)); - in = ((int16_t) (T & 0xFFFF)) >> 2; - T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF); + T = _SIMD32_OFFSET(pSi1); + T = __SHADD16(T, 0); + T = __SHADD16(T, 0); /* writing the butterfly processed i0 + fftLen/4 sample */ /* writing output(xc', yc') in little endian format */ - _SIMD32_OFFSET(pSrc16 + (2u * i1)) = + _SIMD32_OFFSET(pSi1) = (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF); + pSi1 += 2; /* Butterfly calculations */ /* U = packed(yd, xd) */ - U = _SIMD32_OFFSET(pSrc16 + 
(2u * i3)); - in = ((int16_t) (U & 0xFFFF)) >> 2; - U = ((U >> 2) & 0xFFFF0000) | (in & 0xFFFF); + U = _SIMD32_OFFSET(pSi3); + U = __SHADD16(U, 0); + U = __SHADD16(U, 0); /* T = packed(yb-yd, xb-xd) */ T = __QSUB16(T, U); @@ -327,8 +331,9 @@ #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ /* writing output(xb', yb') in little endian format */ - _SIMD32_OFFSET(pSrc16 + (2u * i2)) = + _SIMD32_OFFSET(pSi2) = ((out2) & 0xFFFF0000) | ((out1) & 0x0000FFFF); + pSi2 += 2; /* co3 & si3 are read from SIMD Coefficient pointer */ @@ -352,15 +357,13 @@ #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ /* writing output(xd', yd') in little endian format */ - _SIMD32_OFFSET(pSrc16 + (2u * i3)) = + _SIMD32_OFFSET(pSi3) = ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF); + pSi3 += 2; /* Twiddle coefficients index modifier */ ic = ic + twidCoefModifier; - /* Updating input index */ - i0 = i0 + 1u; - } while(--j); /* data is in 4.11(q11) format */ @@ -389,22 +392,21 @@ /* Twiddle coefficients index modifier */ ic = ic + twidCoefModifier; + + pSi0 = pSrc16 + 2 * j; + pSi1 = pSi0 + 2 * n2; + pSi2 = pSi1 + 2 * n2; + pSi3 = pSi2 + 2 * n2; /* Butterfly implementation */ for (i0 = j; i0 < fftLen; i0 += n1) { - /* index calculation for the input as, */ - /* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */ - i1 = i0 + n2; - i2 = i1 + n2; - i3 = i2 + n2; - /* Reading i0, i0+fftLen/2 inputs */ /* Read ya (real), xa(imag) input */ - T = _SIMD32_OFFSET(pSrc16 + (2u * i0)); + T = _SIMD32_OFFSET(pSi0); /* Read yc (real), xc(imag) input */ - S = _SIMD32_OFFSET(pSrc16 + (2u * i2)); + S = _SIMD32_OFFSET(pSi2); /* R = packed( (ya + yc), (xa + xc)) */ R = __QADD16(T, S); @@ -414,10 +416,10 @@ /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */ /* Read yb (real), xb(imag) input */ - T = _SIMD32_OFFSET(pSrc16 + (2u * i1)); + T = _SIMD32_OFFSET(pSi1); /* Read yd (real), xd(imag) input */ - U = _SIMD32_OFFSET(pSrc16 + (2u * i3)); + U = _SIMD32_OFFSET(pSi3); /* T = packed( (yb + yd), (xb + 
xd)) */ T = __QADD16(T, U); @@ -427,9 +429,9 @@ /* xa' = xa + xb + xc + xd */ /* ya' = ya + yb + yc + yd */ out1 = __SHADD16(R, T); - in = ((int16_t) (out1 & 0xFFFF)) >> 1; - out1 = ((out1 >> 1) & 0xFFFF0000) | (in & 0xFFFF); - _SIMD32_OFFSET(pSrc16 + (2u * i0)) = out1; + out1 = __SHADD16(out1, 0); + _SIMD32_OFFSET(pSi0) = out1; + pSi0 += 2 * n1; /* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */ R = __SHSUB16(R, T); @@ -454,18 +456,19 @@ /* Reading i0+3fftLen/4 */ /* Read yb (real), xb(imag) input */ - T = _SIMD32_OFFSET(pSrc16 + (2u * i1)); + T = _SIMD32_OFFSET(pSi1); /* writing the butterfly processed i0 + fftLen/4 sample */ /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */ /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */ - _SIMD32_OFFSET(pSrc16 + (2u * i1)) = + _SIMD32_OFFSET(pSi1) = ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF); + pSi1 += 2 * n1; /* Butterfly calculations */ /* Read yd (real), xd(imag) input */ - U = _SIMD32_OFFSET(pSrc16 + (2u * i3)); + U = _SIMD32_OFFSET(pSi3); /* T = packed(yb-yd, xb-xd) */ T = __QSUB16(T, U); @@ -500,8 +503,9 @@ /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */ /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */ - _SIMD32_OFFSET(pSrc16 + (2u * i2)) = + _SIMD32_OFFSET(pSi2) = ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF); + pSi2 += 2 * n1; /* Butterfly process for the i0+3fftLen/4 sample */ @@ -519,8 +523,9 @@ /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */ /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */ - _SIMD32_OFFSET(pSrc16 + (2u * i3)) = + _SIMD32_OFFSET(pSi3) = ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF); + pSi3 += 2 * n1; } } /* Twiddle coefficients index modifier */ @@ -711,9 +716,9 @@ Si2 = pCoef16[(2u * ic * 2u) + 1]; /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */ - out1 = (short) ((Co2 * R0 + Si2 * R1) >> 16u); + out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16u); /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */ - out2 = (short) ((-Si2 * R0 + Co2 * R1) >> 16u); + out2 = 
(q15_t) ((-Si2 * R0 + Co2 * R1) >> 16u); /* Reading i0+fftLen/4 */ /* input is down scale by 4 to avoid overflow */ @@ -737,21 +742,21 @@ T1 = __SSAT(T1 - U1, 16); /* R1 = (ya-yc) + (xb- xd), R0 = (xa-xc) - (yb-yd)) */ - R0 = (short) __SSAT((q31_t) (S0 - T1), 16); - R1 = (short) __SSAT((q31_t) (S1 + T0), 16); + R0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16); + R1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16); /* S1 = (ya-yc) - (xb- xd), S0 = (xa-xc) + (yb-yd)) */ - S0 = (short) __SSAT(((q31_t) S0 + T1), 16u); - S1 = (short) __SSAT(((q31_t) S1 - T0), 16u); + S0 = (q15_t) __SSAT(((q31_t) S0 + T1), 16u); + S1 = (q15_t) __SSAT(((q31_t) S1 - T0), 16u); /* co1 & si1 are read from Coefficient pointer */ Co1 = pCoef16[ic * 2u]; Si1 = pCoef16[(ic * 2u) + 1]; /* Butterfly process for the i0+fftLen/2 sample */ /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */ - out1 = (short) ((Si1 * S1 + Co1 * S0) >> 16); + out1 = (q15_t) ((Si1 * S1 + Co1 * S0) >> 16); /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */ - out2 = (short) ((-Si1 * S0 + Co1 * S1) >> 16); + out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16); /* writing output(xb', yb') in little endian format */ pSrc16[i2 * 2u] = out1; @@ -762,9 +767,9 @@ Si3 = pCoef16[(3u * (ic * 2u)) + 1]; /* Butterfly process for the i0+3fftLen/4 sample */ /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */ - out1 = (short) ((Si3 * R1 + Co3 * R0) >> 16u); + out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16u); /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */ - out2 = (short) ((-Si3 * R0 + Co3 * R1) >> 16u); + out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16u); /* writing output(xd', yd') in little endian format */ pSrc16[i3 * 2u] = out1; pSrc16[(i3 * 2u) + 1] = out2; @@ -862,10 +867,10 @@ R1 = (R1 >> 1u) - (T1 >> 1u); /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */ - out1 = (short) ((Co2 * R0 + Si2 * R1) >> 16u); + out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16u); /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */ - out2 = (short) ((-Si2 * R0 + Co2 * R1) >> 
16u); + out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16u); /* Reading i0+3fftLen/4 */ /* Read yb (real), xb(imag) input */ @@ -897,9 +902,9 @@ S1 = (S1 >> 1u) - (T0 >> 1u); /* Butterfly process for the i0+fftLen/2 sample */ - out1 = (short) ((Co1 * S0 + Si1 * S1) >> 16u); + out1 = (q15_t) ((Co1 * S0 + Si1 * S1) >> 16u); - out2 = (short) ((-Si1 * S0 + Co1 * S1) >> 16u); + out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16u); /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */ /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */ @@ -907,9 +912,9 @@ pSrc16[(i2 * 2u) + 1u] = out2; /* Butterfly process for the i0+3fftLen/4 sample */ - out1 = (short) ((Si3 * R1 + Co3 * R0) >> 16u); + out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16u); - out2 = (short) ((-Si3 * R0 + Co3 * R1) >> 16u); + out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16u); /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */ /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */ pSrc16[i3 * 2u] = out1; @@ -1085,12 +1090,13 @@ q31_t R, S, T, U; q31_t C1, C2, C3, out1, out2; - uint32_t n1, n2, ic, i0, i1, i2, i3, j, k; - q15_t in; + uint32_t n1, n2, ic, i0, j, k; q15_t *ptr1; - - + q15_t *pSi0; + q15_t *pSi1; + q15_t *pSi2; + q15_t *pSi3; q31_t xaya, xbyb, xcyc, xdyd; @@ -1109,8 +1115,12 @@ ic = 0u; /* Index for input read and output write */ - i0 = 0u; j = n2; + + pSi0 = pSrc16; + pSi1 = pSi0 + 2 * n2; + pSi2 = pSi1 + 2 * n2; + pSi3 = pSi2 + 2 * n2; /* Input is in 1.15(q15) format */ @@ -1119,22 +1129,16 @@ { /* Butterfly implementation */ - /* index calculation for the input as, */ - /* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */ - i1 = i0 + n2; - i2 = i1 + n2; - i3 = i2 + n2; - /* Reading i0, i0+fftLen/2 inputs */ /* Read ya (real), xa(imag) input */ - T = _SIMD32_OFFSET(pSrc16 + (2u * i0)); - in = ((int16_t) (T & 0xFFFF)) >> 2; - T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF); + T = _SIMD32_OFFSET(pSi0); + T = __SHADD16(T, 0); + T = __SHADD16(T, 0); /* Read yc (real), xc(imag) 
input */ - S = _SIMD32_OFFSET(pSrc16 + (2u * i2)); - in = ((int16_t) (S & 0xFFFF)) >> 2; - S = ((S >> 2) & 0xFFFF0000) | (in & 0xFFFF); + S = _SIMD32_OFFSET(pSi2); + S = __SHADD16(S, 0); + S = __SHADD16(S, 0); /* R = packed((ya + yc), (xa + xc) ) */ R = __QADD16(T, S); @@ -1144,14 +1148,14 @@ /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */ /* Read yb (real), xb(imag) input */ - T = _SIMD32_OFFSET(pSrc16 + (2u * i1)); - in = ((int16_t) (T & 0xFFFF)) >> 2; - T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF); + T = _SIMD32_OFFSET(pSi1); + T = __SHADD16(T, 0); + T = __SHADD16(T, 0); /* Read yd (real), xd(imag) input */ - U = _SIMD32_OFFSET(pSrc16 + (2u * i3)); - in = ((int16_t) (U & 0xFFFF)) >> 2; - U = ((U >> 2) & 0xFFFF0000) | (in & 0xFFFF); + U = _SIMD32_OFFSET(pSi3); + U = __SHADD16(U, 0); + U = __SHADD16(U, 0); /* T = packed((yb + yd), (xb + xd) ) */ T = __QADD16(T, U); @@ -1159,7 +1163,8 @@ /* writing the butterfly processed i0 sample */ /* xa' = xa + xb + xc + xd */ /* ya' = ya + yb + yc + yd */ - _SIMD32_OFFSET(pSrc16 + (2u * i0)) = __SHADD16(R, T); + _SIMD32_OFFSET(pSi0) = __SHADD16(R, T); + pSi0 += 2; /* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */ R = __QSUB16(R, T); @@ -1185,20 +1190,21 @@ /* Reading i0+fftLen/4 */ /* T = packed(yb, xb) */ - T = _SIMD32_OFFSET(pSrc16 + (2u * i1)); - in = ((int16_t) (T & 0xFFFF)) >> 2; - T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF); + T = _SIMD32_OFFSET(pSi1); + T = __SHADD16(T, 0); + T = __SHADD16(T, 0); /* writing the butterfly processed i0 + fftLen/4 sample */ /* writing output(xc', yc') in little endian format */ - _SIMD32_OFFSET(pSrc16 + (2u * i1)) = + _SIMD32_OFFSET(pSi1) = (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF); + pSi1 += 2; /* Butterfly calculations */ /* U = packed(yd, xd) */ - U = _SIMD32_OFFSET(pSrc16 + (2u * i3)); - in = ((int16_t) (U & 0xFFFF)) >> 2; - U = ((U >> 2) & 0xFFFF0000) | (in & 0xFFFF); + U = _SIMD32_OFFSET(pSi3); + U = __SHADD16(U, 0); + U = __SHADD16(U, 0); /* T = packed(yb-yd, 
xb-xd) */ T = __QSUB16(T, U); @@ -1240,8 +1246,9 @@ #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ /* writing output(xb', yb') in little endian format */ - _SIMD32_OFFSET(pSrc16 + (2u * i2)) = + _SIMD32_OFFSET(pSi2) = ((out2) & 0xFFFF0000) | ((out1) & 0x0000FFFF); + pSi2 += 2; /* co3 & si3 are read from SIMD Coefficient pointer */ @@ -1265,15 +1272,13 @@ #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ /* writing output(xd', yd') in little endian format */ - _SIMD32_OFFSET(pSrc16 + (2u * i3)) = + _SIMD32_OFFSET(pSi3) = ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF); + pSi3 += 2; /* Twiddle coefficients index modifier */ ic = ic + twidCoefModifier; - /* Updating input index */ - i0 = i0 + 1u; - } while(--j); /* data is in 4.11(q11) format */ @@ -1302,22 +1307,21 @@ /* Twiddle coefficients index modifier */ ic = ic + twidCoefModifier; + + pSi0 = pSrc16 + 2 * j; + pSi1 = pSi0 + 2 * n2; + pSi2 = pSi1 + 2 * n2; + pSi3 = pSi2 + 2 * n2; /* Butterfly implementation */ for (i0 = j; i0 < fftLen; i0 += n1) { - /* index calculation for the input as, */ - /* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */ - i1 = i0 + n2; - i2 = i1 + n2; - i3 = i2 + n2; - /* Reading i0, i0+fftLen/2 inputs */ /* Read ya (real), xa(imag) input */ - T = _SIMD32_OFFSET(pSrc16 + (2u * i0)); + T = _SIMD32_OFFSET(pSi0); /* Read yc (real), xc(imag) input */ - S = _SIMD32_OFFSET(pSrc16 + (2u * i2)); + S = _SIMD32_OFFSET(pSi2); /* R = packed( (ya + yc), (xa + xc)) */ R = __QADD16(T, S); @@ -1327,10 +1331,10 @@ /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */ /* Read yb (real), xb(imag) input */ - T = _SIMD32_OFFSET(pSrc16 + (2u * i1)); + T = _SIMD32_OFFSET(pSi1); /* Read yd (real), xd(imag) input */ - U = _SIMD32_OFFSET(pSrc16 + (2u * i3)); + U = _SIMD32_OFFSET(pSi3); /* T = packed( (yb + yd), (xb + xd)) */ T = __QADD16(T, U); @@ -1340,9 +1344,9 @@ /* xa' = xa + xb + xc + xd */ /* ya' = ya + yb + yc + yd */ out1 = __SHADD16(R, T); - in = ((int16_t) (out1 & 0xFFFF)) >> 1; - out1 = 
((out1 >> 1) & 0xFFFF0000) | (in & 0xFFFF); - _SIMD32_OFFSET(pSrc16 + (2u * i0)) = out1; + out1 = __SHADD16(out1, 0); + _SIMD32_OFFSET(pSi0) = out1; + pSi0 += 2 * n1; /* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */ R = __SHSUB16(R, T); @@ -1367,18 +1371,19 @@ /* Reading i0+3fftLen/4 */ /* Read yb (real), xb(imag) input */ - T = _SIMD32_OFFSET(pSrc16 + (2u * i1)); + T = _SIMD32_OFFSET(pSi1); /* writing the butterfly processed i0 + fftLen/4 sample */ /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */ /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */ - _SIMD32_OFFSET(pSrc16 + (2u * i1)) = + _SIMD32_OFFSET(pSi1) = ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF); + pSi1 += 2 * n1; /* Butterfly calculations */ /* Read yd (real), xd(imag) input */ - U = _SIMD32_OFFSET(pSrc16 + (2u * i3)); + U = _SIMD32_OFFSET(pSi3); /* T = packed(yb-yd, xb-xd) */ T = __QSUB16(T, U); @@ -1413,8 +1418,9 @@ /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */ /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */ - _SIMD32_OFFSET(pSrc16 + (2u * i2)) = + _SIMD32_OFFSET(pSi2) = ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF); + pSi2 += 2 * n1; /* Butterfly process for the i0+3fftLen/4 sample */ @@ -1432,8 +1438,9 @@ /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */ /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */ - _SIMD32_OFFSET(pSrc16 + (2u * i3)) = + _SIMD32_OFFSET(pSi3) = ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF); + pSi3 += 2 * n1; } } /* Twiddle coefficients index modifier */ @@ -1615,9 +1622,9 @@ Co2 = pCoef16[2u * ic * 2u]; Si2 = pCoef16[(2u * ic * 2u) + 1u]; /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */ - out1 = (short) ((Co2 * R0 - Si2 * R1) >> 16u); + out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16u); /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */ - out2 = (short) ((Si2 * R0 + Co2 * R1) >> 16u); + out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16u); /* Reading i0+fftLen/4 */ /* input is down scale by 4 to avoid overflow */ @@ -1640,20 +1647,20 @@ T0 = 
__SSAT(T0 - U0, 16u); T1 = __SSAT(T1 - U1, 16u); /* R0 = (ya-yc) - (xb- xd) , R1 = (xa-xc) + (yb-yd) */ - R0 = (short) __SSAT((q31_t) (S0 + T1), 16); - R1 = (short) __SSAT((q31_t) (S1 - T0), 16); + R0 = (q15_t) __SSAT((q31_t) (S0 + T1), 16); + R1 = (q15_t) __SSAT((q31_t) (S1 - T0), 16); /* S = (ya-yc) + (xb- xd), S1 = (xa-xc) - (yb-yd) */ - S0 = (short) __SSAT((q31_t) (S0 - T1), 16); - S1 = (short) __SSAT((q31_t) (S1 + T0), 16); + S0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16); + S1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16); /* co1 & si1 are read from Coefficient pointer */ Co1 = pCoef16[ic * 2u]; Si1 = pCoef16[(ic * 2u) + 1u]; /* Butterfly process for the i0+fftLen/2 sample */ /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */ - out1 = (short) ((Co1 * S0 - Si1 * S1) >> 16u); + out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16u); /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */ - out2 = (short) ((Si1 * S0 + Co1 * S1) >> 16u); + out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16u); /* writing output(xb', yb') in little endian format */ pSrc16[i2 * 2u] = out1; pSrc16[(i2 * 2u) + 1u] = out2; @@ -1663,9 +1670,9 @@ Si3 = pCoef16[(3u * ic * 2u) + 1u]; /* Butterfly process for the i0+3fftLen/4 sample */ /* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */ - out1 = (short) ((Co3 * R0 - Si3 * R1) >> 16u); + out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16u); /* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */ - out2 = (short) ((Si3 * R0 + Co3 * R1) >> 16u); + out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16u); /* writing output(xd', yd') in little endian format */ pSrc16[i3 * 2u] = out1; pSrc16[(i3 * 2u) + 1u] = out2; @@ -1759,9 +1766,9 @@ R1 = (R1 >> 1u) - (T1 >> 1u); /* (ya-yb+yc-yd)* (si2) - (xa-xb+xc-xd)* co2 */ - out1 = (short) ((Co2 * R0 - Si2 * R1) >> 16); + out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16); /* (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */ - out2 = (short) ((Si2 * R0 + Co2 * R1) >> 16); + out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16); /* Reading i0+3fftLen/4 */ /* Read yb 
(real), xb(imag) input */ @@ -1792,17 +1799,17 @@ S1 = (S1 >> 1u) + (T0 >> 1u); /* Butterfly process for the i0+fftLen/2 sample */ - out1 = (short) ((Co1 * S0 - Si1 * S1) >> 16u); - out2 = (short) ((Si1 * S0 + Co1 * S1) >> 16u); + out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16u); + out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16u); /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */ /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */ pSrc16[i2 * 2u] = out1; pSrc16[(i2 * 2u) + 1u] = out2; /* Butterfly process for the i0+3fftLen/4 sample */ - out1 = (short) ((Co3 * R0 - Si3 * R1) >> 16u); + out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16u); - out2 = (short) ((Si3 * R0 + Co3 * R1) >> 16u); + out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16u); /* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */ /* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */ pSrc16[i3 * 2u] = out1;