CMSIS DSP library
Dependents: performance_timer Surfboard_ gps2rtty Capstone ... more
arm_dct4_q31.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 19. March 2015 00005 * $Revision: V.1.4.5 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_dct4_q31.c 00009 * 00010 * Description: Processing function of DCT4 & IDCT4 Q31. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @addtogroup DCT4_IDCT4 00045 * @{ 00046 */ 00047 00048 /** 00049 * @brief Processing function for the Q31 DCT4/IDCT4. 00050 * @param[in] *S points to an instance of the Q31 DCT4 structure. 00051 * @param[in] *pState points to state buffer. 00052 * @param[in,out] *pInlineBuffer points to the in-place input and output buffer. 00053 * @return none. 00054 * \par Input an output formats: 00055 * Input samples need to be downscaled by 1 bit to avoid saturations in the Q31 DCT process, 00056 * as the conversion from DCT2 to DCT4 involves one subtraction. 00057 * Internally inputs are downscaled in the RFFT process function to avoid overflows. 00058 * Number of bits downscaled, depends on the size of the transform. 00059 * The input and output formats for different DCT sizes and number of bits to upscale are mentioned in the table below: 00060 * 00061 * \image html dct4FormatsQ31Table.gif 00062 */ 00063 00064 void arm_dct4_q31( 00065 const arm_dct4_instance_q31 * S, 00066 q31_t * pState, 00067 q31_t * pInlineBuffer) 00068 { 00069 uint16_t i; /* Loop counter */ 00070 q31_t *weights = S->pTwiddle; /* Pointer to the Weights table */ 00071 q31_t *cosFact = S->pCosFactor; /* Pointer to the cos factors table */ 00072 q31_t *pS1, *pS2, *pbuff; /* Temporary pointers for input buffer and pState buffer */ 00073 q31_t in; /* Temporary variable */ 00074 00075 00076 /* DCT4 computation involves DCT2 (which is calculated using RFFT) 00077 * along with some pre-processing and post-processing. 00078 * Computational procedure is explained as follows: 00079 * (a) Pre-processing involves multiplying input with cos factor, 00080 * r(n) = 2 * u(n) * cos(pi*(2*n+1)/(4*n)) 00081 * where, 00082 * r(n) -- output of preprocessing 00083 * u(n) -- input to preprocessing(actual Source buffer) 00084 * (b) Calculation of DCT2 using FFT is divided into three steps: 00085 * Step1: Re-ordering of even and odd elements of input. 00086 * Step2: Calculating FFT of the re-ordered input. 00087 * Step3: Taking the real part of the product of FFT output and weights. 00088 * (c) Post-processing - DCT4 can be obtained from DCT2 output using the following equation: 00089 * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0) 00090 * where, 00091 * Y4 -- DCT4 output, Y2 -- DCT2 output 00092 * (d) Multiplying the output with the normalizing factor sqrt(2/N). 00093 */ 00094 00095 /*-------- Pre-processing ------------*/ 00096 /* Multiplying input with cos factor i.e. r(n) = 2 * x(n) * cos(pi*(2*n+1)/(4*n)) */ 00097 arm_mult_q31(pInlineBuffer, cosFact, pInlineBuffer, S->N); 00098 arm_shift_q31(pInlineBuffer, 1, pInlineBuffer, S->N); 00099 00100 /* ---------------------------------------------------------------- 00101 * Step1: Re-ordering of even and odd elements as 00102 * pState[i] = pInlineBuffer[2*i] and 00103 * pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2 00104 ---------------------------------------------------------------------*/ 00105 00106 /* pS1 initialized to pState */ 00107 pS1 = pState; 00108 00109 /* pS2 initialized to pState+N-1, so that it points to the end of the state buffer */ 00110 pS2 = pState + (S->N - 1u); 00111 00112 /* pbuff initialized to input buffer */ 00113 pbuff = pInlineBuffer; 00114 00115 #ifndef ARM_MATH_CM0_FAMILY 00116 00117 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00118 00119 /* Initializing the loop counter to N/2 >> 2 for loop unrolling by 4 */ 00120 i = S->Nby2 >> 2u; 00121 00122 /* First part of the processing with loop unrolling. Compute 4 outputs at a time. 00123 ** a second loop below computes the remaining 1 to 3 samples. */ 00124 do 00125 { 00126 /* Re-ordering of even and odd elements */ 00127 /* pState[i] = pInlineBuffer[2*i] */ 00128 *pS1++ = *pbuff++; 00129 /* pState[N-i-1] = pInlineBuffer[2*i+1] */ 00130 *pS2-- = *pbuff++; 00131 00132 *pS1++ = *pbuff++; 00133 *pS2-- = *pbuff++; 00134 00135 *pS1++ = *pbuff++; 00136 *pS2-- = *pbuff++; 00137 00138 *pS1++ = *pbuff++; 00139 *pS2-- = *pbuff++; 00140 00141 /* Decrement the loop counter */ 00142 i--; 00143 } while(i > 0u); 00144 00145 /* pbuff initialized to input buffer */ 00146 pbuff = pInlineBuffer; 00147 00148 /* pS1 initialized to pState */ 00149 pS1 = pState; 00150 00151 /* Initializing the loop counter to N/4 instead of N for loop unrolling */ 00152 i = S->N >> 2u; 00153 00154 /* Processing with loop unrolling 4 times as N is always multiple of 4. 00155 * Compute 4 outputs at a time */ 00156 do 00157 { 00158 /* Writing the re-ordered output back to inplace input buffer */ 00159 *pbuff++ = *pS1++; 00160 *pbuff++ = *pS1++; 00161 *pbuff++ = *pS1++; 00162 *pbuff++ = *pS1++; 00163 00164 /* Decrement the loop counter */ 00165 i--; 00166 } while(i > 0u); 00167 00168 00169 /* --------------------------------------------------------- 00170 * Step2: Calculate RFFT for N-point input 00171 * ---------------------------------------------------------- */ 00172 /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */ 00173 arm_rfft_q31(S->pRfft, pInlineBuffer, pState); 00174 00175 /*---------------------------------------------------------------------- 00176 * Step3: Multiply the FFT output with the weights. 00177 *----------------------------------------------------------------------*/ 00178 arm_cmplx_mult_cmplx_q31(pState, weights, pState, S->N); 00179 00180 /* The output of complex multiplication is in 3.29 format. 00181 * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.31 format by shifting left by 2 bits. */ 00182 arm_shift_q31(pState, 2, pState, S->N * 2); 00183 00184 /* ----------- Post-processing ---------- */ 00185 /* DCT-IV can be obtained from DCT-II by the equation, 00186 * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0) 00187 * Hence, Y4(0) = Y2(0)/2 */ 00188 /* Getting only real part from the output and Converting to DCT-IV */ 00189 00190 /* Initializing the loop counter to N >> 2 for loop unrolling by 4 */ 00191 i = (S->N - 1u) >> 2u; 00192 00193 /* pbuff initialized to input buffer. */ 00194 pbuff = pInlineBuffer; 00195 00196 /* pS1 initialized to pState */ 00197 pS1 = pState; 00198 00199 /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */ 00200 in = *pS1++ >> 1u; 00201 /* input buffer acts as inplace, so output values are stored in the input itself. */ 00202 *pbuff++ = in; 00203 00204 /* pState pointer is incremented twice as the real values are located alternatively in the array */ 00205 pS1++; 00206 00207 /* First part of the processing with loop unrolling. Compute 4 outputs at a time. 00208 ** a second loop below computes the remaining 1 to 3 samples. */ 00209 do 00210 { 00211 /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */ 00212 /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */ 00213 in = *pS1++ - in; 00214 *pbuff++ = in; 00215 /* points to the next real value */ 00216 pS1++; 00217 00218 in = *pS1++ - in; 00219 *pbuff++ = in; 00220 pS1++; 00221 00222 in = *pS1++ - in; 00223 *pbuff++ = in; 00224 pS1++; 00225 00226 in = *pS1++ - in; 00227 *pbuff++ = in; 00228 pS1++; 00229 00230 /* Decrement the loop counter */ 00231 i--; 00232 } while(i > 0u); 00233 00234 /* If the blockSize is not a multiple of 4, compute any remaining output samples here. 00235 ** No loop unrolling is used. */ 00236 i = (S->N - 1u) % 0x4u; 00237 00238 while(i > 0u) 00239 { 00240 /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */ 00241 /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */ 00242 in = *pS1++ - in; 00243 *pbuff++ = in; 00244 /* points to the next real value */ 00245 pS1++; 00246 00247 /* Decrement the loop counter */ 00248 i--; 00249 } 00250 00251 00252 /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/ 00253 00254 /* Initializing the loop counter to N/4 instead of N for loop unrolling */ 00255 i = S->N >> 2u; 00256 00257 /* pbuff initialized to the pInlineBuffer(now contains the output values) */ 00258 pbuff = pInlineBuffer; 00259 00260 /* Processing with loop unrolling 4 times as N is always multiple of 4. Compute 4 outputs at a time */ 00261 do 00262 { 00263 /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */ 00264 in = *pbuff; 00265 *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31)); 00266 00267 in = *pbuff; 00268 *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31)); 00269 00270 in = *pbuff; 00271 *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31)); 00272 00273 in = *pbuff; 00274 *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31)); 00275 00276 /* Decrement the loop counter */ 00277 i--; 00278 } while(i > 0u); 00279 00280 00281 #else 00282 00283 /* Run the below code for Cortex-M0 */ 00284 00285 /* Initializing the loop counter to N/2 */ 00286 i = S->Nby2; 00287 00288 do 00289 { 00290 /* Re-ordering of even and odd elements */ 00291 /* pState[i] = pInlineBuffer[2*i] */ 00292 *pS1++ = *pbuff++; 00293 /* pState[N-i-1] = pInlineBuffer[2*i+1] */ 00294 *pS2-- = *pbuff++; 00295 00296 /* Decrement the loop counter */ 00297 i--; 00298 } while(i > 0u); 00299 00300 /* pbuff initialized to input buffer */ 00301 pbuff = pInlineBuffer; 00302 00303 /* pS1 initialized to pState */ 00304 pS1 = pState; 00305 00306 /* Initializing the loop counter */ 00307 i = S->N; 00308 00309 do 00310 { 00311 /* Writing the re-ordered output back to inplace input buffer */ 00312 *pbuff++ = *pS1++; 00313 00314 /* Decrement the loop counter */ 00315 i--; 00316 } while(i > 0u); 00317 00318 00319 /* --------------------------------------------------------- 00320 * Step2: Calculate RFFT for N-point input 00321 * ---------------------------------------------------------- */ 00322 /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */ 00323 arm_rfft_q31(S->pRfft, pInlineBuffer, pState); 00324 00325 /*---------------------------------------------------------------------- 00326 * Step3: Multiply the FFT output with the weights. 00327 *----------------------------------------------------------------------*/ 00328 arm_cmplx_mult_cmplx_q31(pState, weights, pState, S->N); 00329 00330 /* The output of complex multiplication is in 3.29 format. 00331 * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.31 format by shifting left by 2 bits. */ 00332 arm_shift_q31(pState, 2, pState, S->N * 2); 00333 00334 /* ----------- Post-processing ---------- */ 00335 /* DCT-IV can be obtained from DCT-II by the equation, 00336 * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0) 00337 * Hence, Y4(0) = Y2(0)/2 */ 00338 /* Getting only real part from the output and Converting to DCT-IV */ 00339 00340 /* pbuff initialized to input buffer. */ 00341 pbuff = pInlineBuffer; 00342 00343 /* pS1 initialized to pState */ 00344 pS1 = pState; 00345 00346 /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */ 00347 in = *pS1++ >> 1u; 00348 /* input buffer acts as inplace, so output values are stored in the input itself. */ 00349 *pbuff++ = in; 00350 00351 /* pState pointer is incremented twice as the real values are located alternatively in the array */ 00352 pS1++; 00353 00354 /* Initializing the loop counter */ 00355 i = (S->N - 1u); 00356 00357 while(i > 0u) 00358 { 00359 /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */ 00360 /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */ 00361 in = *pS1++ - in; 00362 *pbuff++ = in; 00363 /* points to the next real value */ 00364 pS1++; 00365 00366 /* Decrement the loop counter */ 00367 i--; 00368 } 00369 00370 00371 /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/ 00372 00373 /* Initializing the loop counter */ 00374 i = S->N; 00375 00376 /* pbuff initialized to the pInlineBuffer(now contains the output values) */ 00377 pbuff = pInlineBuffer; 00378 00379 do 00380 { 00381 /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */ 00382 in = *pbuff; 00383 *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31)); 00384 00385 /* Decrement the loop counter */ 00386 i--; 00387 } while(i > 0u); 00388 00389 #endif /* #ifndef ARM_MATH_CM0_FAMILY */ 00390 00391 } 00392 00393 /** 00394 * @} end of DCT4_IDCT4 group 00395 */
Generated on Tue Jul 12 2022 11:59:17 by 1.7.2