Robert Lopez / CMSIS5
Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_dct4_q15.c Source File

arm_dct4_q15.c

00001 /* ----------------------------------------------------------------------
00002  * Project:      CMSIS DSP Library
00003  * Title:        arm_dct4_q15.c
00004  * Description:  Processing function of DCT4 & IDCT4 Q15
00005  *
00006  * $Date:        27. January 2017
00007  * $Revision:    V.1.5.1
00008  *
00009  * Target Processor: Cortex-M cores
00010  * -------------------------------------------------------------------- */
00011 /*
00012  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
00013  *
00014  * SPDX-License-Identifier: Apache-2.0
00015  *
00016  * Licensed under the Apache License, Version 2.0 (the License); you may
00017  * not use this file except in compliance with the License.
00018  * You may obtain a copy of the License at
00019  *
00020  * www.apache.org/licenses/LICENSE-2.0
00021  *
00022  * Unless required by applicable law or agreed to in writing, software
00023  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
00024  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00025  * See the License for the specific language governing permissions and
00026  * limitations under the License.
00027  */
00028 
00029 #include "arm_math.h"
00030 
00031 /**
00032  * @addtogroup DCT4_IDCT4
00033  * @{
00034  */
00035 
00036 /**
00037  * @brief Processing function for the Q15 DCT4/IDCT4.
00038  * @param[in]       *S             points to an instance of the Q15 DCT4 structure.
00039  * @param[in]       *pState        points to state buffer.
00040  * @param[in,out]   *pInlineBuffer points to the in-place input and output buffer.
00041  * @return none.
00042  *
00043  * \par Input an output formats:
00044  * Internally inputs are downscaled in the RFFT process function to avoid overflows.
00045  * Number of bits downscaled, depends on the size of the transform.
00046  * The input and output formats for different DCT sizes and number of bits to upscale are mentioned in the table below:
00047  *
00048  * \image html dct4FormatsQ15Table.gif
00049  */
00050 
00051 void arm_dct4_q15(
00052   const arm_dct4_instance_q15 * S,
00053   q15_t * pState,
00054   q15_t * pInlineBuffer)
00055 {
00056   uint32_t i;                                    /* Loop counter */
00057   q15_t *weights = S->pTwiddle;                  /* Pointer to the Weights table */
00058   q15_t *cosFact = S->pCosFactor;                /* Pointer to the cos factors table */
00059   q15_t *pS1, *pS2, *pbuff;                      /* Temporary pointers for input buffer and pState buffer */
00060   q15_t in;                                      /* Temporary variable */
00061 
00062 
00063   /* DCT4 computation involves DCT2 (which is calculated using RFFT)
00064    * along with some pre-processing and post-processing.
00065    * Computational procedure is explained as follows:
00066    * (a) Pre-processing involves multiplying input with cos factor,
00067    *     r(n) = 2 * u(n) * cos(pi*(2*n+1)/(4*n))
00068    *              where,
00069    *                 r(n) -- output of preprocessing
00070    *                 u(n) -- input to preprocessing(actual Source buffer)
00071    * (b) Calculation of DCT2 using FFT is divided into three steps:
00072    *                  Step1: Re-ordering of even and odd elements of input.
00073    *                  Step2: Calculating FFT of the re-ordered input.
00074    *                  Step3: Taking the real part of the product of FFT output and weights.
00075    * (c) Post-processing - DCT4 can be obtained from DCT2 output using the following equation:
00076    *                   Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
00077    *                        where,
00078    *                           Y4 -- DCT4 output,   Y2 -- DCT2 output
00079    * (d) Multiplying the output with the normalizing factor sqrt(2/N).
00080    */
00081 
00082         /*-------- Pre-processing ------------*/
00083   /* Multiplying input with cos factor i.e. r(n) = 2 * x(n) * cos(pi*(2*n+1)/(4*n)) */
00084   arm_mult_q15(pInlineBuffer, cosFact, pInlineBuffer, S->N);
00085   arm_shift_q15(pInlineBuffer, 1, pInlineBuffer, S->N);
00086 
00087   /* ----------------------------------------------------------------
00088    * Step1: Re-ordering of even and odd elements as
00089    *             pState[i] =  pInlineBuffer[2*i] and
00090    *             pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2
00091    ---------------------------------------------------------------------*/
00092 
00093   /* pS1 initialized to pState */
00094   pS1 = pState;
00095 
00096   /* pS2 initialized to pState+N-1, so that it points to the end of the state buffer */
00097   pS2 = pState + (S->N - 1U);
00098 
00099   /* pbuff initialized to input buffer */
00100   pbuff = pInlineBuffer;
00101 
00102 
00103 #if defined (ARM_MATH_DSP)
00104 
00105   /* Run the below code for Cortex-M4 and Cortex-M3 */
00106 
00107   /* Initializing the loop counter to N/2 >> 2 for loop unrolling by 4 */
00108   i = (uint32_t) S->Nby2 >> 2U;
00109 
00110   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
00111    ** a second loop below computes the remaining 1 to 3 samples. */
00112   do
00113   {
00114     /* Re-ordering of even and odd elements */
00115     /* pState[i] =  pInlineBuffer[2*i] */
00116     *pS1++ = *pbuff++;
00117     /* pState[N-i-1] = pInlineBuffer[2*i+1] */
00118     *pS2-- = *pbuff++;
00119 
00120     *pS1++ = *pbuff++;
00121     *pS2-- = *pbuff++;
00122 
00123     *pS1++ = *pbuff++;
00124     *pS2-- = *pbuff++;
00125 
00126     *pS1++ = *pbuff++;
00127     *pS2-- = *pbuff++;
00128 
00129     /* Decrement the loop counter */
00130     i--;
00131   } while (i > 0U);
00132 
00133   /* pbuff initialized to input buffer */
00134   pbuff = pInlineBuffer;
00135 
00136   /* pS1 initialized to pState */
00137   pS1 = pState;
00138 
00139   /* Initializing the loop counter to N/4 instead of N for loop unrolling */
00140   i = (uint32_t) S->N >> 2U;
00141 
00142   /* Processing with loop unrolling 4 times as N is always multiple of 4.
00143    * Compute 4 outputs at a time */
00144   do
00145   {
00146     /* Writing the re-ordered output back to inplace input buffer */
00147     *pbuff++ = *pS1++;
00148     *pbuff++ = *pS1++;
00149     *pbuff++ = *pS1++;
00150     *pbuff++ = *pS1++;
00151 
00152     /* Decrement the loop counter */
00153     i--;
00154   } while (i > 0U);
00155 
00156 
00157   /* ---------------------------------------------------------
00158    *     Step2: Calculate RFFT for N-point input
00159    * ---------------------------------------------------------- */
00160   /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
00161   arm_rfft_q15(S->pRfft, pInlineBuffer, pState);
00162 
00163  /*----------------------------------------------------------------------
00164   *  Step3: Multiply the FFT output with the weights.
00165   *----------------------------------------------------------------------*/
00166   arm_cmplx_mult_cmplx_q15(pState, weights, pState, S->N);
00167 
00168   /* The output of complex multiplication is in 3.13 format.
00169    * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.15 format by shifting left by 2 bits. */
00170   arm_shift_q15(pState, 2, pState, S->N * 2);
00171 
00172   /* ----------- Post-processing ---------- */
00173   /* DCT-IV can be obtained from DCT-II by the equation,
00174    *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
00175    *       Hence, Y4(0) = Y2(0)/2  */
00176   /* Getting only real part from the output and Converting to DCT-IV */
00177 
00178   /* Initializing the loop counter to N >> 2 for loop unrolling by 4 */
00179   i = ((uint32_t) S->N - 1U) >> 2U;
00180 
00181   /* pbuff initialized to input buffer. */
00182   pbuff = pInlineBuffer;
00183 
00184   /* pS1 initialized to pState */
00185   pS1 = pState;
00186 
00187   /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
00188   in = *pS1++ >> 1U;
00189   /* input buffer acts as inplace, so output values are stored in the input itself. */
00190   *pbuff++ = in;
00191 
00192   /* pState pointer is incremented twice as the real values are located alternatively in the array */
00193   pS1++;
00194 
00195   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
00196    ** a second loop below computes the remaining 1 to 3 samples. */
00197   do
00198   {
00199     /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
00200     /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
00201     in = *pS1++ - in;
00202     *pbuff++ = in;
00203     /* points to the next real value */
00204     pS1++;
00205 
00206     in = *pS1++ - in;
00207     *pbuff++ = in;
00208     pS1++;
00209 
00210     in = *pS1++ - in;
00211     *pbuff++ = in;
00212     pS1++;
00213 
00214     in = *pS1++ - in;
00215     *pbuff++ = in;
00216     pS1++;
00217 
00218     /* Decrement the loop counter */
00219     i--;
00220   } while (i > 0U);
00221 
00222   /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
00223    ** No loop unrolling is used. */
00224   i = ((uint32_t) S->N - 1U) % 0x4U;
00225 
00226   while (i > 0U)
00227   {
00228     /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
00229     /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
00230     in = *pS1++ - in;
00231     *pbuff++ = in;
00232     /* points to the next real value */
00233     pS1++;
00234 
00235     /* Decrement the loop counter */
00236     i--;
00237   }
00238 
00239 
00240    /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
00241 
00242   /* Initializing the loop counter to N/4 instead of N for loop unrolling */
00243   i = (uint32_t) S->N >> 2U;
00244 
00245   /* pbuff initialized to the pInlineBuffer(now contains the output values) */
00246   pbuff = pInlineBuffer;
00247 
00248   /* Processing with loop unrolling 4 times as N is always multiple of 4.  Compute 4 outputs at a time */
00249   do
00250   {
00251     /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
00252     in = *pbuff;
00253     *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
00254 
00255     in = *pbuff;
00256     *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
00257 
00258     in = *pbuff;
00259     *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
00260 
00261     in = *pbuff;
00262     *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
00263 
00264     /* Decrement the loop counter */
00265     i--;
00266   } while (i > 0U);
00267 
00268 
00269 #else
00270 
00271   /* Run the below code for Cortex-M0 */
00272 
00273   /* Initializing the loop counter to N/2 */
00274   i = (uint32_t) S->Nby2;
00275 
00276   do
00277   {
00278     /* Re-ordering of even and odd elements */
00279     /* pState[i] =  pInlineBuffer[2*i] */
00280     *pS1++ = *pbuff++;
00281     /* pState[N-i-1] = pInlineBuffer[2*i+1] */
00282     *pS2-- = *pbuff++;
00283 
00284     /* Decrement the loop counter */
00285     i--;
00286   } while (i > 0U);
00287 
00288   /* pbuff initialized to input buffer */
00289   pbuff = pInlineBuffer;
00290 
00291   /* pS1 initialized to pState */
00292   pS1 = pState;
00293 
00294   /* Initializing the loop counter */
00295   i = (uint32_t) S->N;
00296 
00297   do
00298   {
00299     /* Writing the re-ordered output back to inplace input buffer */
00300     *pbuff++ = *pS1++;
00301 
00302     /* Decrement the loop counter */
00303     i--;
00304   } while (i > 0U);
00305 
00306 
00307   /* ---------------------------------------------------------
00308    *     Step2: Calculate RFFT for N-point input
00309    * ---------------------------------------------------------- */
00310   /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
00311   arm_rfft_q15(S->pRfft, pInlineBuffer, pState);
00312 
00313  /*----------------------------------------------------------------------
00314   *  Step3: Multiply the FFT output with the weights.
00315   *----------------------------------------------------------------------*/
00316   arm_cmplx_mult_cmplx_q15(pState, weights, pState, S->N);
00317 
00318   /* The output of complex multiplication is in 3.13 format.
00319    * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.15 format by shifting left by 2 bits. */
00320   arm_shift_q15(pState, 2, pState, S->N * 2);
00321 
00322   /* ----------- Post-processing ---------- */
00323   /* DCT-IV can be obtained from DCT-II by the equation,
00324    *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
00325    *       Hence, Y4(0) = Y2(0)/2  */
00326   /* Getting only real part from the output and Converting to DCT-IV */
00327 
00328   /* Initializing the loop counter */
00329   i = ((uint32_t) S->N - 1U);
00330 
00331   /* pbuff initialized to input buffer. */
00332   pbuff = pInlineBuffer;
00333 
00334   /* pS1 initialized to pState */
00335   pS1 = pState;
00336 
00337   /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
00338   in = *pS1++ >> 1U;
00339   /* input buffer acts as inplace, so output values are stored in the input itself. */
00340   *pbuff++ = in;
00341 
00342   /* pState pointer is incremented twice as the real values are located alternatively in the array */
00343   pS1++;
00344 
00345   do
00346   {
00347     /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
00348     /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
00349     in = *pS1++ - in;
00350     *pbuff++ = in;
00351     /* points to the next real value */
00352     pS1++;
00353 
00354     /* Decrement the loop counter */
00355     i--;
00356   } while (i > 0U);
00357 
00358    /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
00359 
00360   /* Initializing the loop counter */
00361   i = (uint32_t) S->N;
00362 
00363   /* pbuff initialized to the pInlineBuffer(now contains the output values) */
00364   pbuff = pInlineBuffer;
00365 
00366   do
00367   {
00368     /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
00369     in = *pbuff;
00370     *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
00371 
00372     /* Decrement the loop counter */
00373     i--;
00374   } while (i > 0U);
00375 
00376 #endif /* #if defined (ARM_MATH_DSP) */
00377 
00378 }
00379 
00380 /**
00381    * @} end of DCT4_IDCT4 group
00382    */
00383