Important changes to repositories hosted on mbed.com
Mbed-hosted Mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software, download the repository ZIP archive or clone it locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
arm_dct4_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Project: CMSIS DSP Library 00003 * Title: arm_dct4_q15.c 00004 * Description: Processing function of DCT4 & IDCT4 Q15 00005 * 00006 * $Date: 27. January 2017 00007 * $Revision: V.1.5.1 00008 * 00009 * Target Processor: Cortex-M cores 00010 * -------------------------------------------------------------------- */ 00011 /* 00012 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. 00013 * 00014 * SPDX-License-Identifier: Apache-2.0 00015 * 00016 * Licensed under the Apache License, Version 2.0 (the License); you may 00017 * not use this file except in compliance with the License. 00018 * You may obtain a copy of the License at 00019 * 00020 * www.apache.org/licenses/LICENSE-2.0 00021 * 00022 * Unless required by applicable law or agreed to in writing, software 00023 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 00024 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00025 * See the License for the specific language governing permissions and 00026 * limitations under the License. 00027 */ 00028 00029 #include "arm_math.h" 00030 00031 /** 00032 * @addtogroup DCT4_IDCT4 00033 * @{ 00034 */ 00035 00036 /** 00037 * @brief Processing function for the Q15 DCT4/IDCT4. 00038 * @param[in] *S points to an instance of the Q15 DCT4 structure. 00039 * @param[in] *pState points to state buffer. 00040 * @param[in,out] *pInlineBuffer points to the in-place input and output buffer. 00041 * @return none. 00042 * 00043 * \par Input an output formats: 00044 * Internally inputs are downscaled in the RFFT process function to avoid overflows. 00045 * Number of bits downscaled, depends on the size of the transform. 
00046 * The input and output formats for different DCT sizes and number of bits to upscale are mentioned in the table below: 00047 * 00048 * \image html dct4FormatsQ15Table.gif 00049 */ 00050 00051 void arm_dct4_q15( 00052 const arm_dct4_instance_q15 * S, 00053 q15_t * pState, 00054 q15_t * pInlineBuffer) 00055 { 00056 uint32_t i; /* Loop counter */ 00057 q15_t *weights = S->pTwiddle; /* Pointer to the Weights table */ 00058 q15_t *cosFact = S->pCosFactor; /* Pointer to the cos factors table */ 00059 q15_t *pS1, *pS2, *pbuff; /* Temporary pointers for input buffer and pState buffer */ 00060 q15_t in; /* Temporary variable */ 00061 00062 00063 /* DCT4 computation involves DCT2 (which is calculated using RFFT) 00064 * along with some pre-processing and post-processing. 00065 * Computational procedure is explained as follows: 00066 * (a) Pre-processing involves multiplying input with cos factor, 00067 * r(n) = 2 * u(n) * cos(pi*(2*n+1)/(4*n)) 00068 * where, 00069 * r(n) -- output of preprocessing 00070 * u(n) -- input to preprocessing(actual Source buffer) 00071 * (b) Calculation of DCT2 using FFT is divided into three steps: 00072 * Step1: Re-ordering of even and odd elements of input. 00073 * Step2: Calculating FFT of the re-ordered input. 00074 * Step3: Taking the real part of the product of FFT output and weights. 00075 * (c) Post-processing - DCT4 can be obtained from DCT2 output using the following equation: 00076 * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0) 00077 * where, 00078 * Y4 -- DCT4 output, Y2 -- DCT2 output 00079 * (d) Multiplying the output with the normalizing factor sqrt(2/N). 00080 */ 00081 00082 /*-------- Pre-processing ------------*/ 00083 /* Multiplying input with cos factor i.e. 
r(n) = 2 * x(n) * cos(pi*(2*n+1)/(4*n)) */ 00084 arm_mult_q15(pInlineBuffer, cosFact, pInlineBuffer, S->N); 00085 arm_shift_q15(pInlineBuffer, 1, pInlineBuffer, S->N); 00086 00087 /* ---------------------------------------------------------------- 00088 * Step1: Re-ordering of even and odd elements as 00089 * pState[i] = pInlineBuffer[2*i] and 00090 * pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2 00091 ---------------------------------------------------------------------*/ 00092 00093 /* pS1 initialized to pState */ 00094 pS1 = pState; 00095 00096 /* pS2 initialized to pState+N-1, so that it points to the end of the state buffer */ 00097 pS2 = pState + (S->N - 1U); 00098 00099 /* pbuff initialized to input buffer */ 00100 pbuff = pInlineBuffer; 00101 00102 00103 #if defined (ARM_MATH_DSP) 00104 00105 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00106 00107 /* Initializing the loop counter to N/2 >> 2 for loop unrolling by 4 */ 00108 i = (uint32_t) S->Nby2 >> 2U; 00109 00110 /* First part of the processing with loop unrolling. Compute 4 outputs at a time. 00111 ** a second loop below computes the remaining 1 to 3 samples. */ 00112 do 00113 { 00114 /* Re-ordering of even and odd elements */ 00115 /* pState[i] = pInlineBuffer[2*i] */ 00116 *pS1++ = *pbuff++; 00117 /* pState[N-i-1] = pInlineBuffer[2*i+1] */ 00118 *pS2-- = *pbuff++; 00119 00120 *pS1++ = *pbuff++; 00121 *pS2-- = *pbuff++; 00122 00123 *pS1++ = *pbuff++; 00124 *pS2-- = *pbuff++; 00125 00126 *pS1++ = *pbuff++; 00127 *pS2-- = *pbuff++; 00128 00129 /* Decrement the loop counter */ 00130 i--; 00131 } while (i > 0U); 00132 00133 /* pbuff initialized to input buffer */ 00134 pbuff = pInlineBuffer; 00135 00136 /* pS1 initialized to pState */ 00137 pS1 = pState; 00138 00139 /* Initializing the loop counter to N/4 instead of N for loop unrolling */ 00140 i = (uint32_t) S->N >> 2U; 00141 00142 /* Processing with loop unrolling 4 times as N is always multiple of 4. 
00143 * Compute 4 outputs at a time */ 00144 do 00145 { 00146 /* Writing the re-ordered output back to inplace input buffer */ 00147 *pbuff++ = *pS1++; 00148 *pbuff++ = *pS1++; 00149 *pbuff++ = *pS1++; 00150 *pbuff++ = *pS1++; 00151 00152 /* Decrement the loop counter */ 00153 i--; 00154 } while (i > 0U); 00155 00156 00157 /* --------------------------------------------------------- 00158 * Step2: Calculate RFFT for N-point input 00159 * ---------------------------------------------------------- */ 00160 /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */ 00161 arm_rfft_q15(S->pRfft, pInlineBuffer, pState); 00162 00163 /*---------------------------------------------------------------------- 00164 * Step3: Multiply the FFT output with the weights. 00165 *----------------------------------------------------------------------*/ 00166 arm_cmplx_mult_cmplx_q15(pState, weights, pState, S->N); 00167 00168 /* The output of complex multiplication is in 3.13 format. 00169 * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.15 format by shifting left by 2 bits. */ 00170 arm_shift_q15(pState, 2, pState, S->N * 2); 00171 00172 /* ----------- Post-processing ---------- */ 00173 /* DCT-IV can be obtained from DCT-II by the equation, 00174 * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0) 00175 * Hence, Y4(0) = Y2(0)/2 */ 00176 /* Getting only real part from the output and Converting to DCT-IV */ 00177 00178 /* Initializing the loop counter to N >> 2 for loop unrolling by 4 */ 00179 i = ((uint32_t) S->N - 1U) >> 2U; 00180 00181 /* pbuff initialized to input buffer. */ 00182 pbuff = pInlineBuffer; 00183 00184 /* pS1 initialized to pState */ 00185 pS1 = pState; 00186 00187 /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */ 00188 in = *pS1++ >> 1U; 00189 /* input buffer acts as inplace, so output values are stored in the input itself. 
*/ 00190 *pbuff++ = in; 00191 00192 /* pState pointer is incremented twice as the real values are located alternatively in the array */ 00193 pS1++; 00194 00195 /* First part of the processing with loop unrolling. Compute 4 outputs at a time. 00196 ** a second loop below computes the remaining 1 to 3 samples. */ 00197 do 00198 { 00199 /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */ 00200 /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */ 00201 in = *pS1++ - in; 00202 *pbuff++ = in; 00203 /* points to the next real value */ 00204 pS1++; 00205 00206 in = *pS1++ - in; 00207 *pbuff++ = in; 00208 pS1++; 00209 00210 in = *pS1++ - in; 00211 *pbuff++ = in; 00212 pS1++; 00213 00214 in = *pS1++ - in; 00215 *pbuff++ = in; 00216 pS1++; 00217 00218 /* Decrement the loop counter */ 00219 i--; 00220 } while (i > 0U); 00221 00222 /* If the blockSize is not a multiple of 4, compute any remaining output samples here. 00223 ** No loop unrolling is used. */ 00224 i = ((uint32_t) S->N - 1U) % 0x4U; 00225 00226 while (i > 0U) 00227 { 00228 /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */ 00229 /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */ 00230 in = *pS1++ - in; 00231 *pbuff++ = in; 00232 /* points to the next real value */ 00233 pS1++; 00234 00235 /* Decrement the loop counter */ 00236 i--; 00237 } 00238 00239 00240 /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/ 00241 00242 /* Initializing the loop counter to N/4 instead of N for loop unrolling */ 00243 i = (uint32_t) S->N >> 2U; 00244 00245 /* pbuff initialized to the pInlineBuffer(now contains the output values) */ 00246 pbuff = pInlineBuffer; 00247 00248 /* Processing with loop unrolling 4 times as N is always multiple of 4. 
Compute 4 outputs at a time */ 00249 do 00250 { 00251 /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */ 00252 in = *pbuff; 00253 *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15)); 00254 00255 in = *pbuff; 00256 *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15)); 00257 00258 in = *pbuff; 00259 *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15)); 00260 00261 in = *pbuff; 00262 *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15)); 00263 00264 /* Decrement the loop counter */ 00265 i--; 00266 } while (i > 0U); 00267 00268 00269 #else 00270 00271 /* Run the below code for Cortex-M0 */ 00272 00273 /* Initializing the loop counter to N/2 */ 00274 i = (uint32_t) S->Nby2; 00275 00276 do 00277 { 00278 /* Re-ordering of even and odd elements */ 00279 /* pState[i] = pInlineBuffer[2*i] */ 00280 *pS1++ = *pbuff++; 00281 /* pState[N-i-1] = pInlineBuffer[2*i+1] */ 00282 *pS2-- = *pbuff++; 00283 00284 /* Decrement the loop counter */ 00285 i--; 00286 } while (i > 0U); 00287 00288 /* pbuff initialized to input buffer */ 00289 pbuff = pInlineBuffer; 00290 00291 /* pS1 initialized to pState */ 00292 pS1 = pState; 00293 00294 /* Initializing the loop counter */ 00295 i = (uint32_t) S->N; 00296 00297 do 00298 { 00299 /* Writing the re-ordered output back to inplace input buffer */ 00300 *pbuff++ = *pS1++; 00301 00302 /* Decrement the loop counter */ 00303 i--; 00304 } while (i > 0U); 00305 00306 00307 /* --------------------------------------------------------- 00308 * Step2: Calculate RFFT for N-point input 00309 * ---------------------------------------------------------- */ 00310 /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */ 00311 arm_rfft_q15(S->pRfft, pInlineBuffer, pState); 00312 00313 /*---------------------------------------------------------------------- 00314 * Step3: Multiply the FFT output with the weights. 
00315 *----------------------------------------------------------------------*/ 00316 arm_cmplx_mult_cmplx_q15(pState, weights, pState, S->N); 00317 00318 /* The output of complex multiplication is in 3.13 format. 00319 * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.15 format by shifting left by 2 bits. */ 00320 arm_shift_q15(pState, 2, pState, S->N * 2); 00321 00322 /* ----------- Post-processing ---------- */ 00323 /* DCT-IV can be obtained from DCT-II by the equation, 00324 * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0) 00325 * Hence, Y4(0) = Y2(0)/2 */ 00326 /* Getting only real part from the output and Converting to DCT-IV */ 00327 00328 /* Initializing the loop counter */ 00329 i = ((uint32_t) S->N - 1U); 00330 00331 /* pbuff initialized to input buffer. */ 00332 pbuff = pInlineBuffer; 00333 00334 /* pS1 initialized to pState */ 00335 pS1 = pState; 00336 00337 /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */ 00338 in = *pS1++ >> 1U; 00339 /* input buffer acts as inplace, so output values are stored in the input itself. 
*/ 00340 *pbuff++ = in; 00341 00342 /* pState pointer is incremented twice as the real values are located alternatively in the array */ 00343 pS1++; 00344 00345 do 00346 { 00347 /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */ 00348 /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */ 00349 in = *pS1++ - in; 00350 *pbuff++ = in; 00351 /* points to the next real value */ 00352 pS1++; 00353 00354 /* Decrement the loop counter */ 00355 i--; 00356 } while (i > 0U); 00357 00358 /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/ 00359 00360 /* Initializing the loop counter */ 00361 i = (uint32_t) S->N; 00362 00363 /* pbuff initialized to the pInlineBuffer(now contains the output values) */ 00364 pbuff = pInlineBuffer; 00365 00366 do 00367 { 00368 /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */ 00369 in = *pbuff; 00370 *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15)); 00371 00372 /* Decrement the loop counter */ 00373 i--; 00374 } while (i > 0U); 00375 00376 #endif /* #if defined (ARM_MATH_DSP) */ 00377 00378 } 00379 00380 /** 00381 * @} end of DCT4_IDCT4 group 00382 */ 00383
Generated on Tue Jul 12 2022 16:47:27 by Doxygen 1.7.2