CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details

Dependents:   K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_dct4_q15.c Source File

arm_dct4_q15.c

00001 /* ----------------------------------------------------------------------  
00002 * Copyright (C) 2010 ARM Limited. All rights reserved.  
00003 *  
00004 * $Date:        29. November 2010  
00005 * $Revision:    V1.0.3  
00006 *  
00007 * Project:      CMSIS DSP Library  
00008 * Title:        arm_dct4_q15.c  
00009 *  
00010 * Description:  Processing function of DCT4 & IDCT4 Q15.  
00011 *  
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Version 1.0.3 2010/11/29 
00015 *    Re-organized the CMSIS folders and updated documentation.  
00016 *   
00017 * Version 1.0.2 2010/11/11  
00018 *    Documentation updated.   
00019 *  
00020 * Version 1.0.1 2010/10/05   
00021 *    Production release and review comments incorporated.  
00022 *  
00023 * Version 1.0.0 2010/09/20   
00024 *    Production release and review comments incorporated.  
00025 * -------------------------------------------------------------------- */ 
00026  
00027 #include "arm_math.h" 
00028  
00029 /**  
00030  * @addtogroup DCT4_IDCT4  
00031  * @{  
00032  */ 
00033  
00034 /**  
00035  * @brief Processing function for the Q15 DCT4/IDCT4. 
00036  * @param[in]       *S             points to an instance of the Q15 DCT4 structure. 
00037  * @param[in]       *pState        points to state buffer. 
00038  * @param[in,out]   *pInlineBuffer points to the in-place input and output buffer. 
00039  * @return none. 
00040  *   
00041  * \par Input and output formats:  
00042  * Internally inputs are downscaled in the RFFT process function to avoid overflows.  
00043  * Number of bits downscaled, depends on the size of the transform.  
00044  * The input and output formats for different DCT sizes and number of bits to upscale are mentioned in the table below:   
00045  *  
00046  * \image html dct4FormatsQ15Table.gif  
00047  */ 
00048  
00049 void arm_dct4_q15( 
00050   const arm_dct4_instance_q15 * S, 
00051   q15_t * pState, 
00052   q15_t * pInlineBuffer) 
00053 { 
00054   uint32_t i;                                    /* Down-counter reused by every unrolled loop below */ 
00055   q15_t *weights = S->pTwiddle;                  /* Weights applied to the RFFT output (S->pTwiddle) */ 
00056   q15_t *cosFact = S->pCosFactor;                /* Pre-processing cos factor table (S->pCosFactor) */ 
00057   q15_t *pS1, *pS2, *pbuff;                      /* Cursors: pS1 and pS2 walk pState, pbuff walks pInlineBuffer */ 
00058   q15_t in;                                      /* Current sample; holds Y4(k-1) during the post-processing recurrence */ 
00059   /* The result is produced in place in pInlineBuffer; pState is used as scratch storage. */ 
00060   /* NOTE(review): every do/while below runs at least once and decrements an unsigned counter, so an initial count of 0 would wrap; assumes N >= 8 and N % 4 == 0 - confirm against the init function. */ 
00061   /* DCT4 computation involves DCT2 (which is calculated using RFFT)  
00062    * along with some pre-processing and post-processing.  
00063    * Computational procedure is explained as follows:  
00064    * (a) Pre-processing involves multiplying input with cos factor,  
00065    *     r(n) = 2 * u(n) * cosFact(n)  
00066    *              where cosFact(n) is presumably cos(pi*(2*n+1)/(4*N)) (table contents are not visible here - confirm),  
00067    *                 r(n) -- output of preprocessing  
00068    *                 u(n) -- input to preprocessing(actual Source buffer)  
00069    * (b) Calculation of DCT2 using FFT is divided into three steps:  
00070    *                  Step1: Re-ordering of even and odd elements of input.  
00071    *                  Step2: Calculating FFT of the re-ordered input.  
00072    *                  Step3: Taking the real part of the product of FFT output and weights.  
00073    * (c) Post-processing - DCT4 can be obtained from DCT2 output using the following equation:  
00074    *                   Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)  
00075    *                        where,  
00076    *                           Y4 -- DCT4 output,   Y2 -- DCT2 output  
00077    * (d) Multiplying the output with the normalizing factor S->normalize (described here as sqrt(2/N)).  
00078    */ 
00079  
00080         /*-------- Pre-processing ------------*/ 
00081   /* Multiply the input by the cos factor table, then shift left by 1 to apply the factor of 2 */ 
00082   arm_mult_q15(pInlineBuffer, cosFact, pInlineBuffer, S->N); 
00083   arm_shift_q15(pInlineBuffer, 1, pInlineBuffer, S->N); 
00084  
00085   /* ----------------------------------------------------------------  
00086    * Step1: Re-ordering of even and odd elements as  
00087    *             pState[i] =  pInlineBuffer[2*i] and  
00088    *             pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2 - 1  
00089    ---------------------------------------------------------------------*/ 
00090  
00091   /* pS1 initialized to pState */ 
00092   pS1 = pState; 
00093  
00094   /* pS2 initialized to pState+N-1, so that it points to the last element of the N-sample region */ 
00095   pS2 = pState + (S->N - 1u); 
00096  
00097   /* pbuff initialized to input buffer */ 
00098   pbuff = pInlineBuffer; 
00099  
00100   /* Loop count is Nby2 >> 2 because each iteration handles 4 even/odd pairs */ 
00101   i = (uint32_t) S->Nby2 >> 2u; 
00102  
00103   /* Unrolled by 4: each iteration moves 4 even/odd pairs (8 input samples).  
00104    ** There is no tail loop for this stage, so pairs beyond a multiple of 4 would not be copied. */ 
00105   do 
00106   { 
00107     /* Re-ordering of even and odd elements */ 
00108     /* pState[i] =  pInlineBuffer[2*i] */ 
00109     *pS1++ = *pbuff++; 
00110     /* pState[N-i-1] = pInlineBuffer[2*i+1] */ 
00111     *pS2-- = *pbuff++; 
00112  
00113     *pS1++ = *pbuff++; 
00114     *pS2-- = *pbuff++; 
00115  
00116     *pS1++ = *pbuff++; 
00117     *pS2-- = *pbuff++; 
00118  
00119     *pS1++ = *pbuff++; 
00120     *pS2-- = *pbuff++; 
00121  
00122     /* Decrement the loop counter */ 
00123     i--; 
00124   } while(i > 0u); 
00125  
00126   /* pbuff rewound to the start of the input buffer */ 
00127   pbuff = pInlineBuffer; 
00128  
00129   /* pS1 rewound to the start of pState */ 
00130   pS1 = pState; 
00131  
00132   /* Initializing the loop counter to N/4 instead of N for loop unrolling */ 
00133   i = (uint32_t) S->N >> 2u; 
00134  
00135   /* Processing with loop unrolling 4 times; N is assumed to be a multiple of 4 (no tail loop).  
00136    * Copies 4 samples per iteration */ 
00137   do 
00138   { 
00139     /* Writing the re-ordered output back to inplace input buffer */ 
00140     *pbuff++ = *pS1++; 
00141     *pbuff++ = *pS1++; 
00142     *pbuff++ = *pS1++; 
00143     *pbuff++ = *pS1++; 
00144  
00145     /* Decrement the loop counter */ 
00146     i--; 
00147   } while(i > 0u); 
00148  
00149  
00150   /* ---------------------------------------------------------  
00151    *     Step2: Calculate RFFT for N-point input  
00152    * ---------------------------------------------------------- */ 
00153   /* pInlineBuffer is the real input; pState receives the complex output (treated below as N complex values, i.e. 2*N q15 elements) */ 
00154   arm_rfft_q15(S->pRfft, pInlineBuffer, pState); 
00155  
00156  /*----------------------------------------------------------------------  
00157   *  Step3: Multiply the FFT output with the weights.  
00158   *----------------------------------------------------------------------*/ 
00159   arm_cmplx_mult_cmplx_q15(pState, weights, pState, S->N); 
00160  
00161   /* The output of complex multiplication is in 3.13 format.  
00162    * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.15 format by shifting left by 2 bits. */ 
00163   arm_shift_q15(pState, 2, pState, S->N * 2); 
00164  
00165   /* ----------- Post-processing ---------- */ 
00166   /* DCT-IV can be obtained from DCT-II by the equation,  
00167    *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)  
00168    *       Hence, Y4(0) = Y2(0)/2  */ 
00169   /* Getting only real part from the output and Converting to DCT-IV */ 
00170  
00171   /* Loop count is (N - 1) >> 2: Y4(0) is produced before the loop, leaving N - 1 outputs */ 
00172   i = ((uint32_t) S->N - 1u) >> 2u; 
00173  
00174   /* pbuff initialized to input buffer. */ 
00175   pbuff = pInlineBuffer; 
00176  
00177   /* pS1 initialized to pState */ 
00178   pS1 = pState; 
00179  
00180   /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */ 
00181   in = *pS1++ >> 1u; 
00182   /* input buffer acts as inplace, so output values are stored in the input itself. */ 
00183   *pbuff++ = in; 
00184  
00185   /* Skip the imaginary part: the real values sit at every other element of pState */ 
00186   pS1++; 
00187  
00188   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.  
00189    ** The second loop below computes the remaining (N - 1) % 4 samples. */ 
00190   do 
00191   { 
00192     /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */ 
00193     /* NOTE(review): the difference is stored straight back into the q15_t `in`; no saturation step is applied here */ 
00194     in = *pS1++ - in; 
00195     *pbuff++ = in; 
00196     /* skip the imaginary part to reach the next real value */ 
00197     pS1++; 
00198  
00199     in = *pS1++ - in; 
00200     *pbuff++ = in; 
00201     pS1++; 
00202  
00203     in = *pS1++ - in; 
00204     *pbuff++ = in; 
00205     pS1++; 
00206  
00207     in = *pS1++ - in; 
00208     *pbuff++ = in; 
00209     pS1++; 
00210  
00211     /* Decrement the loop counter */ 
00212     i--; 
00213   } while(i > 0u); 
00214  
00215   /* Compute the remaining (N - 1) % 4 outputs that the unrolled loop above did not cover.  
00216    ** No loop unrolling is used. */ 
00217   i = ((uint32_t) S->N - 1u) % 0x4u; 
00218  
00219   while(i > 0u) 
00220   { 
00221     /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */ 
00222     /* Same recurrence as the unrolled loop: one real value consumed, one output written */ 
00223     in = *pS1++ - in; 
00224     *pbuff++ = in; 
00225     /* skip the imaginary part to reach the next real value */ 
00226     pS1++; 
00227  
00228     /* Decrement the loop counter */ 
00229     i--; 
00230   } 
00231  
00232  
00233    /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/ 
00234  
00235   /* Initializing the loop counter to N/4 instead of N for loop unrolling */ 
00236   i = (uint32_t) S->N >> 2u; 
00237  
00238   /* pbuff initialized to the pInlineBuffer(now contains the output values) */ 
00239   pbuff = pInlineBuffer; 
00240  
00241   /* Processing with loop unrolling 4 times; N is assumed to be a multiple of 4 (no tail loop).  Compute 4 outputs at a time */ 
00242   do 
00243   { 
00244     /* Scale by S->normalize: widen to q31_t, multiply, shift right by 15, narrow back to q15_t */ 
00245     in = *pbuff; 
00246     *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15)); 
00247  
00248     in = *pbuff; 
00249     *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15)); 
00250  
00251     in = *pbuff; 
00252     *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15)); 
00253  
00254     in = *pbuff; 
00255     *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15)); 
00256  
00257     /* Decrement the loop counter */ 
00258     i--; 
00259   } while(i > 0u); 
00260  
00261 } 
00262  
00263 /**  
00264    * @} end of DCT4_IDCT4 group  
00265    */