CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details

Dependents:   K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_fast_q31.c Source File

arm_conv_fast_q31.c

00001 /* ----------------------------------------------------------------------  
00002 * Copyright (C) 2010 ARM Limited. All rights reserved.  
00003 *  
00004 * $Date:        29. November 2010  
00005 * $Revision:    V1.0.3  
00006 *  
00007 * Project:      CMSIS DSP Library  
00008 * Title:        arm_conv_fast_q31.c  
00009 *  
00010 * Description:  Q31 Convolution (fast version).  
00011 *  
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Version 1.0.3 2010/11/29 
00015 *    Re-organized the CMSIS folders and updated documentation.  
00016 *   
00017 * Version 1.0.2 2010/11/11  
00018 *    Documentation updated.   
00019 *  
00020 * Version 1.0.1 2010/10/05   
00021 *    Production release and review comments incorporated.  
00022 *  
00023 * Version 1.0.0 2010/09/20   
00024 *    Production release and review comments incorporated.  
00025 * -------------------------------------------------------------------- */ 
00026  
#include "arm_math.h"
#include <stdint.h>
00028  
00029 /**  
00030  * @ingroup groupFilters  
00031  */ 
00032  
00033 /**  
00034  * @addtogroup Conv  
00035  * @{  
00036  */ 
00037  
/*
 * One fast Q31 multiply-accumulate step.
 *
 * Computes the upper 32 bits of ((acc << 32) + x * y), i.e. the 64-bit product
 * is added to the accumulator and its low 32 bits are discarded.  The addition
 * is carried out on unsigned 64-bit values so that a negative accumulator or a
 * sum that exceeds the signed range wraps around instead of invoking undefined
 * behaviour; the resulting bit pattern is the one a two's-complement target
 * produces for the signed expression.
 */
static q31_t arm_conv_fast_q31_mac(
  q31_t acc,
  q31_t x,
  q31_t y)
{
  uint64_t wide = ((uint64_t) (uint32_t) acc << 32) + (uint64_t) ((q63_t) x * y);

  return (q31_t) (uint32_t) (wide >> 32);
}

/*
 * Converts a 2.30 accumulator to the 1.31 output format by shifting it left
 * by one bit.  No saturation is applied: the bit shifted out is lost.  The
 * shift is done on the unsigned representation so negative accumulators are
 * well defined.
 */
static q31_t arm_conv_fast_q31_to_q31(
  q31_t acc)
{
  return (q31_t) ((uint32_t) acc << 1);
}

/*
 * Computes a single (unscaled) convolution sum of numMacs products:
 *
 *   px[0] * pTapLow[numMacs - 1] + px[1] * pTapLow[numMacs - 2] + ...
 *                                + px[numMacs - 1] * pTapLow[0]
 *
 * px walks upwards through the long sequence while the tap pointer walks
 * downwards from one past the highest tap, so it never moves below pTapLow.
 * The loop is unrolled by four; a second loop handles the remaining 0..3 MACs.
 */
static q31_t arm_conv_fast_q31_dot(
  const q31_t * px,
  const q31_t * pTapLow,
  uint32_t numMacs)
{
  const q31_t *pTap = pTapLow + numMacs;         /* one past the highest tap used */
  q31_t sum = 0;                                 /* 2.30 accumulator */
  uint32_t k;                                    /* loop counter */

  /* Four MACs per iteration */
  for(k = numMacs >> 2u; k > 0u; k--)
  {
    sum = arm_conv_fast_q31_mac(sum, *px++, *--pTap);
    sum = arm_conv_fast_q31_mac(sum, *px++, *--pTap);
    sum = arm_conv_fast_q31_mac(sum, *px++, *--pTap);
    sum = arm_conv_fast_q31_mac(sum, *px++, *--pTap);
  }

  /* Remaining MACs when numMacs is not a multiple of 4 */
  for(k = numMacs & 0x3u; k > 0u; k--)
  {
    sum = arm_conv_fast_q31_mac(sum, *px++, *--pTap);
  }

  return sum;
}

/*
 * Computes four consecutive full-overlap outputs at once and stores them,
 * already converted to 1.31, in pOut[0..3].
 *
 * Output j (j = 0..3) is the sum over t of px[j + t] * pTapLow[numTaps - 1 - t].
 * Each tap is loaded once and applied to four neighbouring input samples that
 * are kept in x0..x3.  In total numTaps + 3 samples are read starting at px.
 */
static void arm_conv_fast_q31_quad(
  const q31_t * px,
  const q31_t * pTapLow,
  uint32_t numTaps,
  q31_t * pOut)
{
  const q31_t *pTap = pTapLow + numTaps;         /* one past the highest tap */
  q31_t acc0 = 0, acc1 = 0, acc2 = 0, acc3 = 0;  /* 2.30 accumulators */
  q31_t x0, x1, x2, x3;                          /* sliding window of input samples */
  q31_t c0;                                      /* current tap */
  uint32_t k;                                    /* loop counter */

  /* Preload the first three samples of the window */
  x0 = *px++;
  x1 = *px++;
  x2 = *px++;

  /* Four taps per iteration; the roles of x0..x3 rotate so that no sample
   * has to be copied between registers. */
  for(k = numTaps >> 2u; k > 0u; k--)
  {
    c0 = *--pTap;
    x3 = *px++;
    acc0 = arm_conv_fast_q31_mac(acc0, x0, c0);
    acc1 = arm_conv_fast_q31_mac(acc1, x1, c0);
    acc2 = arm_conv_fast_q31_mac(acc2, x2, c0);
    acc3 = arm_conv_fast_q31_mac(acc3, x3, c0);

    c0 = *--pTap;
    x0 = *px++;
    acc0 = arm_conv_fast_q31_mac(acc0, x1, c0);
    acc1 = arm_conv_fast_q31_mac(acc1, x2, c0);
    acc2 = arm_conv_fast_q31_mac(acc2, x3, c0);
    acc3 = arm_conv_fast_q31_mac(acc3, x0, c0);

    c0 = *--pTap;
    x1 = *px++;
    acc0 = arm_conv_fast_q31_mac(acc0, x2, c0);
    acc1 = arm_conv_fast_q31_mac(acc1, x3, c0);
    acc2 = arm_conv_fast_q31_mac(acc2, x0, c0);
    acc3 = arm_conv_fast_q31_mac(acc3, x1, c0);

    c0 = *--pTap;
    x2 = *px++;
    acc0 = arm_conv_fast_q31_mac(acc0, x3, c0);
    acc1 = arm_conv_fast_q31_mac(acc1, x0, c0);
    acc2 = arm_conv_fast_q31_mac(acc2, x1, c0);
    acc3 = arm_conv_fast_q31_mac(acc3, x2, c0);
  }

  /* Remaining taps when numTaps is not a multiple of 4 */
  for(k = numTaps & 0x3u; k > 0u; k--)
  {
    c0 = *--pTap;
    x3 = *px++;
    acc0 = arm_conv_fast_q31_mac(acc0, x0, c0);
    acc1 = arm_conv_fast_q31_mac(acc1, x1, c0);
    acc2 = arm_conv_fast_q31_mac(acc2, x2, c0);
    acc3 = arm_conv_fast_q31_mac(acc3, x3, c0);

    /* Slide the window by one sample */
    x0 = x1;
    x1 = x2;
    x2 = x3;
  }

  pOut[0] = arm_conv_fast_q31_to_q31(acc0);
  pOut[1] = arm_conv_fast_q31_to_q31(acc1);
  pOut[2] = arm_conv_fast_q31_to_q31(acc2);
  pOut[3] = arm_conv_fast_q31_to_q31(acc3);
}

/**
 * @brief Convolution of Q31 sequences (fast version).
 * @param[in] *pSrcA points to the first input sequence.
 * @param[in] srcALen length of the first input sequence.
 * @param[in] *pSrcB points to the second input sequence.
 * @param[in] srcBLen length of the second input sequence.
 * @param[out] *pDst points to the location where the output result is written.  Length srcALen+srcBLen-1.
 * @return none.
 *
 * @details
 * If either length is zero the function returns immediately and writes nothing to <code>pDst</code>.
 *
 * <b>Scaling and Overflow Behavior:</b>
 *
 * \par
 * This function is optimized for speed at the expense of fixed-point precision and overflow protection.
 * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format
 * (the low 32 bits of the 64-bit product are discarded).
 * These intermediate results are accumulated in a 32-bit register in 2.30 format.
 * Finally, the accumulator is shifted left by one bit to produce the 1.31 result.
 * This implementation does not saturate: an accumulator that leaves the 2.30 range,
 * or a result that does not fit in 1.31, wraps around.
 *
 * \par
 * In order to avoid overflows completely the input signals must be scaled down.
 * Scale down the inputs by log2(min(srcALen, srcBLen)) bits (log2 is read as log to the base 2),
 * as a maximum of min(srcALen, srcBLen) additions are carried out internally for one output sample.
 *
 * \par
 * See <code>arm_conv_q31()</code> for a slower implementation of this function which uses 64-bit accumulation to provide higher precision.
 */

void arm_conv_fast_q31(
  q31_t * pSrcA,
  uint32_t srcALen,
  q31_t * pSrcB,
  uint32_t srcBLen,
  q31_t * pDst)
{
  const q31_t *pLong;                            /* longer (or equal length) sequence */
  const q31_t *pShort;                           /* shorter sequence, slid across pLong */
  q31_t *pOut = pDst;                            /* output pointer */
  uint32_t longLen, shortLen;                    /* lengths of pLong and pShort */
  uint32_t fullCount;                            /* number of stage 2 outputs */
  uint32_t quadCount;                            /* stage 2 outputs computed four at a time */
  uint32_t offset;                               /* start of the stage 2 window in pLong */
  uint32_t macs;                                 /* MACs per output in stages 1 and 3 */

  /* Convolution is commutative, so the shorter sequence is always the one
   * that slides across the longer one. */
  if(srcALen >= srcBLen)
  {
    pLong = pSrcA;
    longLen = srcALen;
    pShort = pSrcB;
    shortLen = srcBLen;
  }
  else
  {
    pLong = pSrcB;
    longLen = srcBLen;
    pShort = pSrcA;
    shortLen = srcALen;
  }

  /* Nothing to compute for an empty sequence.  Without this check
   * (shortLen - 1u) below would wrap around to 0xFFFFFFFF. */
  if(shortLen == 0u)
  {
    return;
  }

  /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + ... + x[n-N+1] * y[N-1]
   *
   * The work is split into three stages according to how many products
   * contribute to an output sample:
   *   stage 1: the overlap grows from 1 to shortLen - 1 products,
   *   stage 2: the sequences overlap fully, shortLen products per output,
   *   stage 3: the overlap shrinks from shortLen - 1 down to 1 product. */

  /* ------------------------
   * Stage 1
   * ----------------------*/

  /* out[n] = x[0] * y[n] + x[1] * y[n-1] + ... + x[n] * y[0],  n = 0 .. shortLen-2 */
  for(macs = 1u; macs < shortLen; macs++)
  {
    *pOut++ = arm_conv_fast_q31_to_q31(arm_conv_fast_q31_dot(pLong, pShort, macs));
  }

  /* ------------------------
   * Stage 2
   * ----------------------*/

  /* out = x[i] * y[shortLen-1] + x[i+1] * y[shortLen-2] + ... + x[i+shortLen-1] * y[0],
   * i = 0 .. longLen-shortLen */
  fullCount = longLen - (shortLen - 1u);

  /* Four outputs at a time are only computed when there are at least 4 taps. */
  quadCount = (shortLen >= 4u) ? (fullCount >> 2u) : 0u;

  /* offset is the index of the first input sample of the current window.  It
   * advances by 4 per unrolled group and by 1 per single output, so the
   * single-output loop below continues exactly where the groups stopped. */
  offset = 0u;

  while(quadCount > 0u)
  {
    arm_conv_fast_q31_quad(pLong + offset, pShort, shortLen, pOut);

    pOut += 4;
    offset += 4u;
    quadCount--;
  }

  /* Remaining full-overlap outputs, one at a time */
  for(; offset < fullCount; offset++)
  {
    *pOut++ = arm_conv_fast_q31_to_q31(arm_conv_fast_q31_dot(pLong + offset, pShort, shortLen));
  }

  /* ------------------------
   * Stage 3
   * ----------------------*/

  /* out = x[longLen-macs] * y[shortLen-1] + ... + x[longLen-1] * y[shortLen-macs],
   * macs = shortLen-1 .. 1 */
  for(macs = shortLen - 1u; macs > 0u; macs--)
  {
    *pOut++ = arm_conv_fast_q31_to_q31(
                arm_conv_fast_q31_dot(pLong + (longLen - macs),
                                      pShort + (shortLen - macs),
                                      macs));
  }
}
00561  
00562 /**  
00563  * @} end of Conv group  
00564  */