CMSIS DSP library
Dependents: performance_timer Surfboard_ gps2rtty Capstone ... more
arm_dct4_f32.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 19. March 2015 00005 * $Revision: V.1.4.5 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_dct4_f32.c 00009 * 00010 * Description: Processing function of DCT4 & IDCT4 F32. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupTransforms 00045 */ 00046 00047 /** 00048 * @defgroup DCT4_IDCT4 DCT Type IV Functions 00049 * Representation of signals by minimum number of values is important for storage and transmission. 00050 * The possibility of large discontinuity between the beginning and end of a period of a signal 00051 * in DFT can be avoided by extending the signal so that it is even-symmetric. 00052 * Discrete Cosine Transform (DCT) is constructed such that its energy is heavily concentrated in the lower part of the 00053 * spectrum and is very widely used in signal and image coding applications. 00054 * The family of DCTs (DCT type- 1,2,3,4) is the outcome of different combinations of homogeneous boundary conditions. 00055 * DCT has an excellent energy-packing capability, hence has many applications and in data compression in particular. 00056 * 00057 * DCT is essentially the Discrete Fourier Transform(DFT) of an even-extended real signal. 00058 * Reordering of the input data makes the computation of DCT just a problem of 00059 * computing the DFT of a real signal with a few additional operations. 00060 * This approach provides regular, simple, and very efficient DCT algorithms for practical hardware and software implementations. 00061 * 00062 * DCT type-II can be implemented using Fast fourier transform (FFT) internally, as the transform is applied on real values, Real FFT can be used. 00063 * DCT4 is implemented using DCT2 as their implementations are similar except with some added pre-processing and post-processing. 00064 * DCT2 implementation can be described in the following steps: 00065 * - Re-ordering input 00066 * - Calculating Real FFT 00067 * - Multiplication of weights and Real FFT output and getting real part from the product. 00068 * 00069 * This process is explained by the block diagram below: 00070 * \image html DCT4.gif "Discrete Cosine Transform - type-IV" 00071 * 00072 * \par Algorithm: 00073 * The N-point type-IV DCT is defined as a real, linear transformation by the formula: 00074 * \image html DCT4Equation.gif 00075 * where <code>k = 0,1,2,.....N-1</code> 00076 *\par 00077 * Its inverse is defined as follows: 00078 * \image html IDCT4Equation.gif 00079 * where <code>n = 0,1,2,.....N-1</code> 00080 *\par 00081 * The DCT4 matrices become involutory (i.e. they are self-inverse) by multiplying with an overall scale factor of sqrt(2/N). 00082 * The symmetry of the transform matrix indicates that the fast algorithms for the forward 00083 * and inverse transform computation are identical. 00084 * Note that the implementation of Inverse DCT4 and DCT4 is same, hence same process function can be used for both. 00085 * 00086 * \par Lengths supported by the transform: 00087 * As DCT4 internally uses Real FFT, it supports all the lengths supported by arm_rfft_f32(). 00088 * The library provides separate functions for Q15, Q31, and floating-point data types. 00089 * \par Instance Structure 00090 * The instances for Real FFT and FFT, cosine values table and twiddle factor table are stored in an instance data structure. 00091 * A separate instance structure must be defined for each transform. 00092 * There are separate instance structure declarations for each of the 3 supported data types. 00093 * 00094 * \par Initialization Functions 00095 * There is also an associated initialization function for each data type. 00096 * The initialization function performs the following operations: 00097 * - Sets the values of the internal structure fields. 00098 * - Initializes Real FFT as its process function is used internally in DCT4, by calling arm_rfft_init_f32(). 00099 * \par 00100 * Use of the initialization function is optional. 00101 * However, if the initialization function is used, then the instance structure cannot be placed into a const data section. 00102 * To place an instance structure into a const data section, the instance structure must be manually initialized. 00103 * Manually initialize the instance structure as follows: 00104 * <pre> 00105 *arm_dct4_instance_f32 S = {N, Nby2, normalize, pTwiddle, pCosFactor, pRfft, pCfft}; 00106 *arm_dct4_instance_q31 S = {N, Nby2, normalize, pTwiddle, pCosFactor, pRfft, pCfft}; 00107 *arm_dct4_instance_q15 S = {N, Nby2, normalize, pTwiddle, pCosFactor, pRfft, pCfft}; 00108 * </pre> 00109 * where \c N is the length of the DCT4; \c Nby2 is half of the length of the DCT4; 00110 * \c normalize is normalizing factor used and is equal to <code>sqrt(2/N)</code>; 00111 * \c pTwiddle points to the twiddle factor table; 00112 * \c pCosFactor points to the cosFactor table; 00113 * \c pRfft points to the real FFT instance; 00114 * \c pCfft points to the complex FFT instance; 00115 * The CFFT and RFFT structures also needs to be initialized, refer to arm_cfft_radix4_f32() 00116 * and arm_rfft_f32() respectively for details regarding static initialization. 00117 * 00118 * \par Fixed-Point Behavior 00119 * Care must be taken when using the fixed-point versions of the DCT4 transform functions. 00120 * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered. 00121 * Refer to the function specific documentation below for usage guidelines. 00122 */ 00123 00124 /** 00125 * @addtogroup DCT4_IDCT4 00126 * @{ 00127 */ 00128 00129 /** 00130 * @brief Processing function for the floating-point DCT4/IDCT4. 00131 * @param[in] *S points to an instance of the floating-point DCT4/IDCT4 structure. 00132 * @param[in] *pState points to state buffer. 00133 * @param[in,out] *pInlineBuffer points to the in-place input and output buffer. 00134 * @return none. 00135 */ 00136 00137 void arm_dct4_f32( 00138 const arm_dct4_instance_f32 * S, 00139 float32_t * pState, 00140 float32_t * pInlineBuffer) 00141 { 00142 uint32_t i; /* Loop counter */ 00143 float32_t *weights = S->pTwiddle; /* Pointer to the Weights table */ 00144 float32_t *cosFact = S->pCosFactor; /* Pointer to the cos factors table */ 00145 float32_t *pS1, *pS2, *pbuff; /* Temporary pointers for input buffer and pState buffer */ 00146 float32_t in; /* Temporary variable */ 00147 00148 00149 /* DCT4 computation involves DCT2 (which is calculated using RFFT) 00150 * along with some pre-processing and post-processing. 00151 * Computational procedure is explained as follows: 00152 * (a) Pre-processing involves multiplying input with cos factor, 00153 * r(n) = 2 * u(n) * cos(pi*(2*n+1)/(4*n)) 00154 * where, 00155 * r(n) -- output of preprocessing 00156 * u(n) -- input to preprocessing(actual Source buffer) 00157 * (b) Calculation of DCT2 using FFT is divided into three steps: 00158 * Step1: Re-ordering of even and odd elements of input. 00159 * Step2: Calculating FFT of the re-ordered input. 00160 * Step3: Taking the real part of the product of FFT output and weights. 00161 * (c) Post-processing - DCT4 can be obtained from DCT2 output using the following equation: 00162 * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0) 00163 * where, 00164 * Y4 -- DCT4 output, Y2 -- DCT2 output 00165 * (d) Multiplying the output with the normalizing factor sqrt(2/N). 00166 */ 00167 00168 /*-------- Pre-processing ------------*/ 00169 /* Multiplying input with cos factor i.e. r(n) = 2 * x(n) * cos(pi*(2*n+1)/(4*n)) */ 00170 arm_scale_f32(pInlineBuffer, 2.0f, pInlineBuffer, S->N); 00171 arm_mult_f32(pInlineBuffer, cosFact, pInlineBuffer, S->N); 00172 00173 /* ---------------------------------------------------------------- 00174 * Step1: Re-ordering of even and odd elements as, 00175 * pState[i] = pInlineBuffer[2*i] and 00176 * pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2 00177 ---------------------------------------------------------------------*/ 00178 00179 /* pS1 initialized to pState */ 00180 pS1 = pState; 00181 00182 /* pS2 initialized to pState+N-1, so that it points to the end of the state buffer */ 00183 pS2 = pState + (S->N - 1u); 00184 00185 /* pbuff initialized to input buffer */ 00186 pbuff = pInlineBuffer; 00187 00188 #ifndef ARM_MATH_CM0_FAMILY 00189 00190 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00191 00192 /* Initializing the loop counter to N/2 >> 2 for loop unrolling by 4 */ 00193 i = (uint32_t) S->Nby2 >> 2u; 00194 00195 /* First part of the processing with loop unrolling. Compute 4 outputs at a time. 00196 ** a second loop below computes the remaining 1 to 3 samples. */ 00197 do 00198 { 00199 /* Re-ordering of even and odd elements */ 00200 /* pState[i] = pInlineBuffer[2*i] */ 00201 *pS1++ = *pbuff++; 00202 /* pState[N-i-1] = pInlineBuffer[2*i+1] */ 00203 *pS2-- = *pbuff++; 00204 00205 *pS1++ = *pbuff++; 00206 *pS2-- = *pbuff++; 00207 00208 *pS1++ = *pbuff++; 00209 *pS2-- = *pbuff++; 00210 00211 *pS1++ = *pbuff++; 00212 *pS2-- = *pbuff++; 00213 00214 /* Decrement the loop counter */ 00215 i--; 00216 } while(i > 0u); 00217 00218 /* pbuff initialized to input buffer */ 00219 pbuff = pInlineBuffer; 00220 00221 /* pS1 initialized to pState */ 00222 pS1 = pState; 00223 00224 /* Initializing the loop counter to N/4 instead of N for loop unrolling */ 00225 i = (uint32_t) S->N >> 2u; 00226 00227 /* Processing with loop unrolling 4 times as N is always multiple of 4. 00228 * Compute 4 outputs at a time */ 00229 do 00230 { 00231 /* Writing the re-ordered output back to inplace input buffer */ 00232 *pbuff++ = *pS1++; 00233 *pbuff++ = *pS1++; 00234 *pbuff++ = *pS1++; 00235 *pbuff++ = *pS1++; 00236 00237 /* Decrement the loop counter */ 00238 i--; 00239 } while(i > 0u); 00240 00241 00242 /* --------------------------------------------------------- 00243 * Step2: Calculate RFFT for N-point input 00244 * ---------------------------------------------------------- */ 00245 /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */ 00246 arm_rfft_f32(S->pRfft, pInlineBuffer, pState); 00247 00248 /*---------------------------------------------------------------------- 00249 * Step3: Multiply the FFT output with the weights. 00250 *----------------------------------------------------------------------*/ 00251 arm_cmplx_mult_cmplx_f32(pState, weights, pState, S->N); 00252 00253 /* ----------- Post-processing ---------- */ 00254 /* DCT-IV can be obtained from DCT-II by the equation, 00255 * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0) 00256 * Hence, Y4(0) = Y2(0)/2 */ 00257 /* Getting only real part from the output and Converting to DCT-IV */ 00258 00259 /* Initializing the loop counter to N >> 2 for loop unrolling by 4 */ 00260 i = ((uint32_t) S->N - 1u) >> 2u; 00261 00262 /* pbuff initialized to input buffer. */ 00263 pbuff = pInlineBuffer; 00264 00265 /* pS1 initialized to pState */ 00266 pS1 = pState; 00267 00268 /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */ 00269 in = *pS1++ * (float32_t) 0.5; 00270 /* input buffer acts as inplace, so output values are stored in the input itself. */ 00271 *pbuff++ = in; 00272 00273 /* pState pointer is incremented twice as the real values are located alternatively in the array */ 00274 pS1++; 00275 00276 /* First part of the processing with loop unrolling. Compute 4 outputs at a time. 00277 ** a second loop below computes the remaining 1 to 3 samples. */ 00278 do 00279 { 00280 /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */ 00281 /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */ 00282 in = *pS1++ - in; 00283 *pbuff++ = in; 00284 /* points to the next real value */ 00285 pS1++; 00286 00287 in = *pS1++ - in; 00288 *pbuff++ = in; 00289 pS1++; 00290 00291 in = *pS1++ - in; 00292 *pbuff++ = in; 00293 pS1++; 00294 00295 in = *pS1++ - in; 00296 *pbuff++ = in; 00297 pS1++; 00298 00299 /* Decrement the loop counter */ 00300 i--; 00301 } while(i > 0u); 00302 00303 /* If the blockSize is not a multiple of 4, compute any remaining output samples here. 00304 ** No loop unrolling is used. */ 00305 i = ((uint32_t) S->N - 1u) % 0x4u; 00306 00307 while(i > 0u) 00308 { 00309 /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */ 00310 /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */ 00311 in = *pS1++ - in; 00312 *pbuff++ = in; 00313 /* points to the next real value */ 00314 pS1++; 00315 00316 /* Decrement the loop counter */ 00317 i--; 00318 } 00319 00320 00321 /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/ 00322 00323 /* Initializing the loop counter to N/4 instead of N for loop unrolling */ 00324 i = (uint32_t) S->N >> 2u; 00325 00326 /* pbuff initialized to the pInlineBuffer(now contains the output values) */ 00327 pbuff = pInlineBuffer; 00328 00329 /* Processing with loop unrolling 4 times as N is always multiple of 4. Compute 4 outputs at a time */ 00330 do 00331 { 00332 /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */ 00333 in = *pbuff; 00334 *pbuff++ = in * S->normalize; 00335 00336 in = *pbuff; 00337 *pbuff++ = in * S->normalize; 00338 00339 in = *pbuff; 00340 *pbuff++ = in * S->normalize; 00341 00342 in = *pbuff; 00343 *pbuff++ = in * S->normalize; 00344 00345 /* Decrement the loop counter */ 00346 i--; 00347 } while(i > 0u); 00348 00349 00350 #else 00351 00352 /* Run the below code for Cortex-M0 */ 00353 00354 /* Initializing the loop counter to N/2 */ 00355 i = (uint32_t) S->Nby2; 00356 00357 do 00358 { 00359 /* Re-ordering of even and odd elements */ 00360 /* pState[i] = pInlineBuffer[2*i] */ 00361 *pS1++ = *pbuff++; 00362 /* pState[N-i-1] = pInlineBuffer[2*i+1] */ 00363 *pS2-- = *pbuff++; 00364 00365 /* Decrement the loop counter */ 00366 i--; 00367 } while(i > 0u); 00368 00369 /* pbuff initialized to input buffer */ 00370 pbuff = pInlineBuffer; 00371 00372 /* pS1 initialized to pState */ 00373 pS1 = pState; 00374 00375 /* Initializing the loop counter */ 00376 i = (uint32_t) S->N; 00377 00378 do 00379 { 00380 /* Writing the re-ordered output back to inplace input buffer */ 00381 *pbuff++ = *pS1++; 00382 00383 /* Decrement the loop counter */ 00384 i--; 00385 } while(i > 0u); 00386 00387 00388 /* --------------------------------------------------------- 00389 * Step2: Calculate RFFT for N-point input 00390 * ---------------------------------------------------------- */ 00391 /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */ 00392 arm_rfft_f32(S->pRfft, pInlineBuffer, pState); 00393 00394 /*---------------------------------------------------------------------- 00395 * Step3: Multiply the FFT output with the weights. 00396 *----------------------------------------------------------------------*/ 00397 arm_cmplx_mult_cmplx_f32(pState, weights, pState, S->N); 00398 00399 /* ----------- Post-processing ---------- */ 00400 /* DCT-IV can be obtained from DCT-II by the equation, 00401 * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0) 00402 * Hence, Y4(0) = Y2(0)/2 */ 00403 /* Getting only real part from the output and Converting to DCT-IV */ 00404 00405 /* pbuff initialized to input buffer. */ 00406 pbuff = pInlineBuffer; 00407 00408 /* pS1 initialized to pState */ 00409 pS1 = pState; 00410 00411 /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */ 00412 in = *pS1++ * (float32_t) 0.5; 00413 /* input buffer acts as inplace, so output values are stored in the input itself. */ 00414 *pbuff++ = in; 00415 00416 /* pState pointer is incremented twice as the real values are located alternatively in the array */ 00417 pS1++; 00418 00419 /* Initializing the loop counter */ 00420 i = ((uint32_t) S->N - 1u); 00421 00422 do 00423 { 00424 /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */ 00425 /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */ 00426 in = *pS1++ - in; 00427 *pbuff++ = in; 00428 /* points to the next real value */ 00429 pS1++; 00430 00431 00432 /* Decrement the loop counter */ 00433 i--; 00434 } while(i > 0u); 00435 00436 00437 /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/ 00438 00439 /* Initializing the loop counter */ 00440 i = (uint32_t) S->N; 00441 00442 /* pbuff initialized to the pInlineBuffer(now contains the output values) */ 00443 pbuff = pInlineBuffer; 00444 00445 do 00446 { 00447 /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */ 00448 in = *pbuff; 00449 *pbuff++ = in * S->normalize; 00450 00451 /* Decrement the loop counter */ 00452 i--; 00453 } while(i > 0u); 00454 00455 #endif /* #ifndef ARM_MATH_CM0_FAMILY */ 00456 00457 } 00458 00459 /** 00460 * @} end of DCT4_IDCT4 group 00461 */
Generated on Tue Jul 12 2022 11:59:16 by 1.7.2