CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details

Dependents:   K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_dct4_q31.c Source File

arm_dct4_q31.c

00001 /* ----------------------------------------------------------------------  
00002 * Copyright (C) 2010 ARM Limited. All rights reserved.  
00003 *  
00004 * $Date:        29. November 2010  
00005 * $Revision:    V1.0.3  
00006 *  
00007 * Project:      CMSIS DSP Library  
00008 * Title:        arm_dct4_q31.c  
00009 *  
00010 * Description:  Processing function of DCT4 & IDCT4 Q31.  
00011 *  
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Version 1.0.3 2010/11/29 
00015 *    Re-organized the CMSIS folders and updated documentation.  
00016 *   
00017 * Version 1.0.2 2010/11/11  
00018 *    Documentation updated.   
00019 *  
00020 * Version 1.0.1 2010/10/05   
00021 *    Production release and review comments incorporated.  
00022 *  
00023 * Version 1.0.0 2010/09/20   
00024 *    Production release and review comments incorporated.  
00025 * -------------------------------------------------------------------- */ 
00026  
00027 #include "arm_math.h" 
00028  
00029 /**  
00030  * @addtogroup DCT4_IDCT4  
00031  * @{  
00032  */ 
00033  
00034 /**  
00035  * @brief Processing function for the Q31 DCT4/IDCT4. 
00036  * @param[in]       *S             points to an instance of the Q31 DCT4 structure. 
00037  * @param[in]       *pState        points to state buffer. 
00038  * @param[in,out]   *pInlineBuffer points to the in-place input and output buffer. 
00039  * @return none. 
00040  * \par Input an output formats:  
00041  * Input samples need to be downscaled by 1 bit to avoid saturations in the Q31 DCT process,  
00042  * as the conversion from DCT2 to DCT4 involves one subtraction.  
00043  * Internally inputs are downscaled in the RFFT process function to avoid overflows.  
00044  * Number of bits downscaled, depends on the size of the transform.  
00045  * The input and output formats for different DCT sizes and number of bits to upscale are mentioned in the table below:   
00046  *  
00047  * \image html dct4FormatsQ31Table.gif  
00048  */ 
00049  
00050 void arm_dct4_q31( 
00051   const arm_dct4_instance_q31 * S, 
00052   q31_t * pState, 
00053   q31_t * pInlineBuffer) 
00054 { 
00055   uint16_t i;                                    /* Loop counter */ 
00056   q31_t *weights = S->pTwiddle;                  /* Pointer to the Weights table */ 
00057   q31_t *cosFact = S->pCosFactor;                /* Pointer to the cos factors table */ 
00058   q31_t *pS1, *pS2, *pbuff;                      /* Temporary pointers for input buffer and pState buffer */ 
00059   q31_t in;                                      /* Temporary variable */ 
00060  
00061  
00062   /* DCT4 computation involves DCT2 (which is calculated using RFFT)  
00063    * along with some pre-processing and post-processing.  
00064    * Computational procedure is explained as follows:  
00065    * (a) Pre-processing involves multiplying input with cos factor,  
00066    *     r(n) = 2 * u(n) * cos(pi*(2*n+1)/(4*n))  
00067    *              where,  
00068    *                 r(n) -- output of preprocessing  
00069    *                 u(n) -- input to preprocessing(actual Source buffer)  
00070    * (b) Calculation of DCT2 using FFT is divided into three steps:  
00071    *                  Step1: Re-ordering of even and odd elements of input.  
00072    *                  Step2: Calculating FFT of the re-ordered input.  
00073    *                  Step3: Taking the real part of the product of FFT output and weights.  
00074    * (c) Post-processing - DCT4 can be obtained from DCT2 output using the following equation:  
00075    *                   Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)  
00076    *                        where,  
00077    *                           Y4 -- DCT4 output,   Y2 -- DCT2 output  
00078    * (d) Multiplying the output with the normalizing factor sqrt(2/N).  
00079    */ 
00080  
00081         /*-------- Pre-processing ------------*/ 
00082   /* Multiplying input with cos factor i.e. r(n) = 2 * x(n) * cos(pi*(2*n+1)/(4*n)) */ 
00083   arm_mult_q31(pInlineBuffer, cosFact, pInlineBuffer, S->N); 
00084   arm_shift_q31(pInlineBuffer, 1, pInlineBuffer, S->N); 
00085  
00086   /* ----------------------------------------------------------------  
00087    * Step1: Re-ordering of even and odd elements as  
00088    *             pState[i] =  pInlineBuffer[2*i] and  
00089    *             pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2  
00090    ---------------------------------------------------------------------*/ 
00091  
00092   /* pS1 initialized to pState */ 
00093   pS1 = pState; 
00094  
00095   /* pS2 initialized to pState+N-1, so that it points to the end of the state buffer */ 
00096   pS2 = pState + (S->N - 1u); 
00097  
00098   /* pbuff initialized to input buffer */ 
00099   pbuff = pInlineBuffer; 
00100  
00101   /* Initializing the loop counter to N/2 >> 2 for loop unrolling by 4 */ 
00102   i = S->Nby2 >> 2u; 
00103  
00104   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.  
00105    ** a second loop below computes the remaining 1 to 3 samples. */ 
00106   do 
00107   { 
00108     /* Re-ordering of even and odd elements */ 
00109     /* pState[i] =  pInlineBuffer[2*i] */ 
00110     *pS1++ = *pbuff++; 
00111     /* pState[N-i-1] = pInlineBuffer[2*i+1] */ 
00112     *pS2-- = *pbuff++; 
00113  
00114     *pS1++ = *pbuff++; 
00115     *pS2-- = *pbuff++; 
00116  
00117     *pS1++ = *pbuff++; 
00118     *pS2-- = *pbuff++; 
00119  
00120     *pS1++ = *pbuff++; 
00121     *pS2-- = *pbuff++; 
00122  
00123     /* Decrement the loop counter */ 
00124     i--; 
00125   } while(i > 0u); 
00126  
00127   /* pbuff initialized to input buffer */ 
00128   pbuff = pInlineBuffer; 
00129  
00130   /* pS1 initialized to pState */ 
00131   pS1 = pState; 
00132  
00133   /* Initializing the loop counter to N/4 instead of N for loop unrolling */ 
00134   i = S->N >> 2u; 
00135  
00136   /* Processing with loop unrolling 4 times as N is always multiple of 4.  
00137    * Compute 4 outputs at a time */ 
00138   do 
00139   { 
00140     /* Writing the re-ordered output back to inplace input buffer */ 
00141     *pbuff++ = *pS1++; 
00142     *pbuff++ = *pS1++; 
00143     *pbuff++ = *pS1++; 
00144     *pbuff++ = *pS1++; 
00145  
00146     /* Decrement the loop counter */ 
00147     i--; 
00148   } while(i > 0u); 
00149  
00150  
00151   /* ---------------------------------------------------------  
00152    *     Step2: Calculate RFFT for N-point input  
00153    * ---------------------------------------------------------- */ 
00154   /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */ 
00155   arm_rfft_q31(S->pRfft, pInlineBuffer, pState); 
00156  
00157   /*----------------------------------------------------------------------  
00158    *  Step3: Multiply the FFT output with the weights.  
00159    *----------------------------------------------------------------------*/ 
00160   arm_cmplx_mult_cmplx_q31(pState, weights, pState, S->N); 
00161  
00162   /* The output of complex multiplication is in 3.29 format.  
00163    * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.31 format by shifting left by 2 bits. */ 
00164   arm_shift_q31(pState, 2, pState, S->N * 2); 
00165  
00166   /* ----------- Post-processing ---------- */ 
00167   /* DCT-IV can be obtained from DCT-II by the equation,  
00168    *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)  
00169    *       Hence, Y4(0) = Y2(0)/2  */ 
00170   /* Getting only real part from the output and Converting to DCT-IV */ 
00171  
00172   /* Initializing the loop counter to N >> 2 for loop unrolling by 4 */ 
00173   i = (S->N - 1u) >> 2u; 
00174  
00175   /* pbuff initialized to input buffer. */ 
00176   pbuff = pInlineBuffer; 
00177  
00178   /* pS1 initialized to pState */ 
00179   pS1 = pState; 
00180  
00181   /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */ 
00182   in = *pS1++ >> 1u; 
00183   /* input buffer acts as inplace, so output values are stored in the input itself. */ 
00184   *pbuff++ = in; 
00185  
00186   /* pState pointer is incremented twice as the real values are located alternatively in the array */ 
00187   pS1++; 
00188  
00189   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.  
00190    ** a second loop below computes the remaining 1 to 3 samples. */ 
00191   do 
00192   { 
00193     /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */ 
00194     /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */ 
00195     in = *pS1++ - in; 
00196     *pbuff++ = in; 
00197     /* points to the next real value */ 
00198     pS1++; 
00199  
00200     in = *pS1++ - in; 
00201     *pbuff++ = in; 
00202     pS1++; 
00203  
00204     in = *pS1++ - in; 
00205     *pbuff++ = in; 
00206     pS1++; 
00207  
00208     in = *pS1++ - in; 
00209     *pbuff++ = in; 
00210     pS1++; 
00211  
00212     /* Decrement the loop counter */ 
00213     i--; 
00214   } while(i > 0u); 
00215  
00216   /* If the blockSize is not a multiple of 4, compute any remaining output samples here.  
00217    ** No loop unrolling is used. */ 
00218   i = (S->N - 1u) % 0x4u; 
00219  
00220   while(i > 0u) 
00221   { 
00222     /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */ 
00223     /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */ 
00224     in = *pS1++ - in; 
00225     *pbuff++ = in; 
00226     /* points to the next real value */ 
00227     pS1++; 
00228  
00229     /* Decrement the loop counter */ 
00230     i--; 
00231   } 
00232  
00233  
00234         /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/ 
00235  
00236   /* Initializing the loop counter to N/4 instead of N for loop unrolling */ 
00237   i = S->N >> 2u; 
00238  
00239   /* pbuff initialized to the pInlineBuffer(now contains the output values) */ 
00240   pbuff = pInlineBuffer; 
00241  
00242   /* Processing with loop unrolling 4 times as N is always multiple of 4.  Compute 4 outputs at a time */ 
00243   do 
00244   { 
00245     /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */ 
00246     in = *pbuff; 
00247     *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31)); 
00248  
00249     in = *pbuff; 
00250     *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31)); 
00251  
00252     in = *pbuff; 
00253     *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31)); 
00254  
00255     in = *pbuff; 
00256     *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31)); 
00257  
00258     /* Decrement the loop counter */ 
00259     i--; 
00260   } while(i > 0u); 
00261  
00262 } 
00263  
00264 /**  
00265    * @} end of DCT4_IDCT4 group  
00266    */