CMSIS DSP library

Dependents:   KL25Z_FFT_Demo Hat_Board_v5_1 KL25Z_FFT_Demo_tony KL25Z_FFT_Demo_tony ... more

Fork of mbed-dsp by mbed official

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_fast_q31.c Source File

arm_conv_fast_q31.c

00001 /* ----------------------------------------------------------------------    
00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.    
00003 *    
00004 * $Date:        17. January 2013
00005 * $Revision:    V1.4.1
00006 *    
00007 * Project:      CMSIS DSP Library    
00008 * Title:        arm_conv_fast_q31.c    
00009 *    
00010 * Description:  Q31 Convolution (fast version).    
00011 *    
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Redistribution and use in source and binary forms, with or without 
00015 * modification, are permitted provided that the following conditions
00016 * are met:
00017 *   - Redistributions of source code must retain the above copyright
00018 *     notice, this list of conditions and the following disclaimer.
00019 *   - Redistributions in binary form must reproduce the above copyright
00020 *     notice, this list of conditions and the following disclaimer in
00021 *     the documentation and/or other materials provided with the 
00022 *     distribution.
00023 *   - Neither the name of ARM LIMITED nor the names of its contributors
00024 *     may be used to endorse or promote products derived from this
00025 *     software without specific prior written permission.
00026 *
00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00038 * POSSIBILITY OF SUCH DAMAGE.  
00039 * -------------------------------------------------------------------- */
00040 
00041 #include "arm_math.h"
00042 
00043 /**    
00044  * @ingroup groupFilters    
00045  */
00046 
00047 /**    
00048  * @addtogroup Conv    
00049  * @{    
00050  */
00051 
00052 /**    
00053  * @param[in] *pSrcA points to the first input sequence.    
00054  * @param[in] srcALen length of the first input sequence.    
00055  * @param[in] *pSrcB points to the second input sequence.    
00056  * @param[in] srcBLen length of the second input sequence.    
00057  * @param[out] *pDst points to the location where the output result is written.  Length srcALen+srcBLen-1.    
00058  * @return none.    
00059  *    
00060  * @details    
00061  * <b>Scaling and Overflow Behavior:</b>    
00062  *    
00063  * \par    
00064  * This function is optimized for speed at the expense of fixed-point precision and overflow protection.    
00065  * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.    
00066  * These intermediate results are accumulated in a 32-bit register in 2.30 format.    
00067  * Finally, the accumulator is saturated and converted to a 1.31 result.    
00068  *    
00069  * \par    
00070  * The fast version has the same overflow behavior as the standard version but provides less precision since it discards the low 32 bits of each multiplication result.    
00071  * In order to avoid overflows completely the input signals must be scaled down.    
00072  * Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows,    
00073  * as maximum of min(srcALen, srcBLen) number of additions are carried internally.    
00074  *    
00075  * \par    
00076  * See <code>arm_conv_q31()</code> for a slower implementation of this function which uses 64-bit accumulation to provide higher precision.    
00077  */
00078 
00079 void arm_conv_fast_q31(
00080   q31_t * pSrcA,
00081   uint32_t srcALen,
00082   q31_t * pSrcB,
00083   uint32_t srcBLen,
00084   q31_t * pDst)
00085 {
00086   q31_t *pIn1;                                   /* inputA pointer */
00087   q31_t *pIn2;                                   /* inputB pointer */
00088   q31_t *pOut = pDst;                            /* output pointer */
00089   q31_t *px;                                     /* Intermediate inputA pointer  */
00090   q31_t *py;                                     /* Intermediate inputB pointer  */
00091   q31_t *pSrc1, *pSrc2;                          /* Intermediate pointers */
00092   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulator */
00093   q31_t x0, x1, x2, x3, c0;                      /* Temporary variables to hold state and coefficient values */
00094   uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3;     /* loop counter */
00095 
00096   /* The algorithm implementation is based on the lengths of the inputs. */
00097   /* srcB is always made to slide across srcA. */
00098   /* So srcBLen is always considered as shorter or equal to srcALen */
00099   if(srcALen >= srcBLen)
00100   {
00101     /* Initialization of inputA pointer */
00102     pIn1 = pSrcA;
00103 
00104     /* Initialization of inputB pointer */
00105     pIn2 = pSrcB;
00106   }
00107   else
00108   {
00109     /* Initialization of inputA pointer */
00110     pIn1 = pSrcB;
00111 
00112     /* Initialization of inputB pointer */
00113     pIn2 = pSrcA;
00114 
00115     /* srcBLen is always considered as shorter or equal to srcALen */
00116     j = srcBLen;
00117     srcBLen = srcALen;
00118     srcALen = j;
00119   }
00120 
00121   /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
00122   /* The function is internally    
00123    * divided into three stages according to the number of multiplications that has to be    
00124    * taken place between inputA samples and inputB samples. In the first stage of the    
00125    * algorithm, the multiplications increase by one for every iteration.    
00126    * In the second stage of the algorithm, srcBLen number of multiplications are done.    
00127    * In the third stage of the algorithm, the multiplications decrease by one    
00128    * for every iteration. */
00129 
00130   /* The algorithm is implemented in three stages.    
00131      The loop counters of each stage is initiated here. */
00132   blockSize1 = srcBLen - 1u;
00133   blockSize2 = srcALen - (srcBLen - 1u);
00134   blockSize3 = blockSize1;
00135 
00136   /* --------------------------    
00137    * Initializations of stage1    
00138    * -------------------------*/
00139 
00140   /* sum = x[0] * y[0]    
00141    * sum = x[0] * y[1] + x[1] * y[0]    
00142    * ....    
00143    * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]    
00144    */
00145 
00146   /* In this stage the MAC operations are increased by 1 for every iteration.    
00147      The count variable holds the number of MAC operations performed */
00148   count = 1u;
00149 
00150   /* Working pointer of inputA */
00151   px = pIn1;
00152 
00153   /* Working pointer of inputB */
00154   py = pIn2;
00155 
00156 
00157   /* ------------------------    
00158    * Stage1 process    
00159    * ----------------------*/
00160 
00161   /* The first stage starts here */
00162   while(blockSize1 > 0u)
00163   {
00164     /* Accumulator is made zero for every iteration */
00165     sum = 0;
00166 
00167     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00168     k = count >> 2u;
00169 
00170     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.    
00171      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00172     while(k > 0u)
00173     {
00174       /* x[0] * y[srcBLen - 1] */
00175       sum = (q31_t) ((((q63_t) sum << 32) +
00176                       ((q63_t) * px++ * (*py--))) >> 32);
00177 
00178       /* x[1] * y[srcBLen - 2] */
00179       sum = (q31_t) ((((q63_t) sum << 32) +
00180                       ((q63_t) * px++ * (*py--))) >> 32);
00181 
00182       /* x[2] * y[srcBLen - 3] */
00183       sum = (q31_t) ((((q63_t) sum << 32) +
00184                       ((q63_t) * px++ * (*py--))) >> 32);
00185 
00186       /* x[3] * y[srcBLen - 4] */
00187       sum = (q31_t) ((((q63_t) sum << 32) +
00188                       ((q63_t) * px++ * (*py--))) >> 32);
00189 
00190       /* Decrement the loop counter */
00191       k--;
00192     }
00193 
00194     /* If the count is not a multiple of 4, compute any remaining MACs here.    
00195      ** No loop unrolling is used. */
00196     k = count % 0x4u;
00197 
00198     while(k > 0u)
00199     {
00200       /* Perform the multiply-accumulate */
00201       sum = (q31_t) ((((q63_t) sum << 32) +
00202                       ((q63_t) * px++ * (*py--))) >> 32);
00203 
00204       /* Decrement the loop counter */
00205       k--;
00206     }
00207 
00208     /* Store the result in the accumulator in the destination buffer. */
00209     *pOut++ = sum << 1;
00210 
00211     /* Update the inputA and inputB pointers for next MAC calculation */
00212     py = pIn2 + count;
00213     px = pIn1;
00214 
00215     /* Increment the MAC count */
00216     count++;
00217 
00218     /* Decrement the loop counter */
00219     blockSize1--;
00220   }
00221 
00222   /* --------------------------    
00223    * Initializations of stage2    
00224    * ------------------------*/
00225 
00226   /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]    
00227    * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]    
00228    * ....    
00229    * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]    
00230    */
00231 
00232   /* Working pointer of inputA */
00233   px = pIn1;
00234 
00235   /* Working pointer of inputB */
00236   pSrc2 = pIn2 + (srcBLen - 1u);
00237   py = pSrc2;
00238 
00239   /* count is index by which the pointer pIn1 to be incremented */
00240   count = 0u;
00241 
00242   /* -------------------    
00243    * Stage2 process    
00244    * ------------------*/
00245 
00246   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.    
00247    * So, to loop unroll over blockSize2,    
00248    * srcBLen should be greater than or equal to 4 */
00249   if(srcBLen >= 4u)
00250   {
00251     /* Loop unroll over blockSize2, by 4 */
00252     blkCnt = blockSize2 >> 2u;
00253 
00254     while(blkCnt > 0u)
00255     {
00256       /* Set all accumulators to zero */
00257       acc0 = 0;
00258       acc1 = 0;
00259       acc2 = 0;
00260       acc3 = 0;
00261 
00262       /* read x[0], x[1], x[2] samples */
00263       x0 = *(px++);
00264       x1 = *(px++);
00265       x2 = *(px++);
00266 
00267       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00268       k = srcBLen >> 2u;
00269 
00270       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.    
00271        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00272       do
00273       {
00274         /* Read y[srcBLen - 1] sample */
00275         c0 = *(py--);
00276 
00277         /* Read x[3] sample */
00278         x3 = *(px++);
00279 
00280         /* Perform the multiply-accumulates */
00281         /* acc0 +=  x[0] * y[srcBLen - 1] */
00282         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
00283 
00284         /* acc1 +=  x[1] * y[srcBLen - 1] */
00285         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
00286 
00287         /* acc2 +=  x[2] * y[srcBLen - 1] */
00288         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
00289 
00290         /* acc3 +=  x[3] * y[srcBLen - 1] */
00291         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
00292 
00293         /* Read y[srcBLen - 2] sample */
00294         c0 = *(py--);
00295 
00296         /* Read x[4] sample */
00297         x0 = *(px++);
00298 
00299         /* Perform the multiply-accumulate */
00300         /* acc0 +=  x[1] * y[srcBLen - 2] */
00301         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32);
00302         /* acc1 +=  x[2] * y[srcBLen - 2] */
00303         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32);
00304         /* acc2 +=  x[3] * y[srcBLen - 2] */
00305         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32);
00306         /* acc3 +=  x[4] * y[srcBLen - 2] */
00307         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32);
00308 
00309         /* Read y[srcBLen - 3] sample */
00310         c0 = *(py--);
00311 
00312         /* Read x[5] sample */
00313         x1 = *(px++);
00314 
00315         /* Perform the multiply-accumulates */
00316         /* acc0 +=  x[2] * y[srcBLen - 3] */
00317         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32);
00318         /* acc1 +=  x[3] * y[srcBLen - 3] */
00319         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32);
00320         /* acc2 +=  x[4] * y[srcBLen - 3] */
00321         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32);
00322         /* acc3 +=  x[5] * y[srcBLen - 3] */
00323         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32);
00324 
00325         /* Read y[srcBLen - 4] sample */
00326         c0 = *(py--);
00327 
00328         /* Read x[6] sample */
00329         x2 = *(px++);
00330 
00331         /* Perform the multiply-accumulates */
00332         /* acc0 +=  x[3] * y[srcBLen - 4] */
00333         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32);
00334         /* acc1 +=  x[4] * y[srcBLen - 4] */
00335         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32);
00336         /* acc2 +=  x[5] * y[srcBLen - 4] */
00337         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32);
00338         /* acc3 +=  x[6] * y[srcBLen - 4] */
00339         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32);
00340 
00341 
00342       } while(--k);
00343 
00344       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.    
00345        ** No loop unrolling is used. */
00346       k = srcBLen % 0x4u;
00347 
00348       while(k > 0u)
00349       {
00350         /* Read y[srcBLen - 5] sample */
00351         c0 = *(py--);
00352 
00353         /* Read x[7] sample */
00354         x3 = *(px++);
00355 
00356         /* Perform the multiply-accumulates */
00357         /* acc0 +=  x[4] * y[srcBLen - 5] */
00358         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
00359         /* acc1 +=  x[5] * y[srcBLen - 5] */
00360         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
00361         /* acc2 +=  x[6] * y[srcBLen - 5] */
00362         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
00363         /* acc3 +=  x[7] * y[srcBLen - 5] */
00364         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
00365 
00366         /* Reuse the present samples for the next MAC */
00367         x0 = x1;
00368         x1 = x2;
00369         x2 = x3;
00370 
00371         /* Decrement the loop counter */
00372         k--;
00373       }
00374 
00375       /* Store the results in the accumulators in the destination buffer. */
00376       *pOut++ = (q31_t) (acc0 << 1);
00377       *pOut++ = (q31_t) (acc1 << 1);
00378       *pOut++ = (q31_t) (acc2 << 1);
00379       *pOut++ = (q31_t) (acc3 << 1);
00380 
00381       /* Increment the pointer pIn1 index, count by 4 */
00382       count += 4u;
00383 
00384       /* Update the inputA and inputB pointers for next MAC calculation */
00385       px = pIn1 + count;
00386       py = pSrc2;
00387 
00388       /* Decrement the loop counter */
00389       blkCnt--;
00390     }
00391 
00392     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.    
00393      ** No loop unrolling is used. */
00394     blkCnt = blockSize2 % 0x4u;
00395 
00396     while(blkCnt > 0u)
00397     {
00398       /* Accumulator is made zero for every iteration */
00399       sum = 0;
00400 
00401       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00402       k = srcBLen >> 2u;
00403 
00404       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.    
00405        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00406       while(k > 0u)
00407       {
00408         /* Perform the multiply-accumulates */
00409         sum = (q31_t) ((((q63_t) sum << 32) +
00410                         ((q63_t) * px++ * (*py--))) >> 32);
00411         sum = (q31_t) ((((q63_t) sum << 32) +
00412                         ((q63_t) * px++ * (*py--))) >> 32);
00413         sum = (q31_t) ((((q63_t) sum << 32) +
00414                         ((q63_t) * px++ * (*py--))) >> 32);
00415         sum = (q31_t) ((((q63_t) sum << 32) +
00416                         ((q63_t) * px++ * (*py--))) >> 32);
00417 
00418         /* Decrement the loop counter */
00419         k--;
00420       }
00421 
00422       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.    
00423        ** No loop unrolling is used. */
00424       k = srcBLen % 0x4u;
00425 
00426       while(k > 0u)
00427       {
00428         /* Perform the multiply-accumulate */
00429         sum = (q31_t) ((((q63_t) sum << 32) +
00430                         ((q63_t) * px++ * (*py--))) >> 32);
00431 
00432         /* Decrement the loop counter */
00433         k--;
00434       }
00435 
00436       /* Store the result in the accumulator in the destination buffer. */
00437       *pOut++ = sum << 1;
00438 
00439       /* Increment the MAC count */
00440       count++;
00441 
00442       /* Update the inputA and inputB pointers for next MAC calculation */
00443       px = pIn1 + count;
00444       py = pSrc2;
00445 
00446       /* Decrement the loop counter */
00447       blkCnt--;
00448     }
00449   }
00450   else
00451   {
00452     /* If the srcBLen is not a multiple of 4,    
00453      * the blockSize2 loop cannot be unrolled by 4 */
00454     blkCnt = blockSize2;
00455 
00456     while(blkCnt > 0u)
00457     {
00458       /* Accumulator is made zero for every iteration */
00459       sum = 0;
00460 
00461       /* srcBLen number of MACS should be performed */
00462       k = srcBLen;
00463 
00464       while(k > 0u)
00465       {
00466         /* Perform the multiply-accumulate */
00467         sum = (q31_t) ((((q63_t) sum << 32) +
00468                         ((q63_t) * px++ * (*py--))) >> 32);
00469 
00470         /* Decrement the loop counter */
00471         k--;
00472       }
00473 
00474       /* Store the result in the accumulator in the destination buffer. */
00475       *pOut++ = sum << 1;
00476 
00477       /* Increment the MAC count */
00478       count++;
00479 
00480       /* Update the inputA and inputB pointers for next MAC calculation */
00481       px = pIn1 + count;
00482       py = pSrc2;
00483 
00484       /* Decrement the loop counter */
00485       blkCnt--;
00486     }
00487   }
00488 
00489 
00490   /* --------------------------    
00491    * Initializations of stage3    
00492    * -------------------------*/
00493 
00494   /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]    
00495    * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]    
00496    * ....    
00497    * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]    
00498    * sum +=  x[srcALen-1] * y[srcBLen-1]    
00499    */
00500 
00501   /* In this stage the MAC operations are decreased by 1 for every iteration.    
00502      The blockSize3 variable holds the number of MAC operations performed */
00503 
00504   /* Working pointer of inputA */
00505   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
00506   px = pSrc1;
00507 
00508   /* Working pointer of inputB */
00509   pSrc2 = pIn2 + (srcBLen - 1u);
00510   py = pSrc2;
00511 
00512   /* -------------------    
00513    * Stage3 process    
00514    * ------------------*/
00515 
00516   while(blockSize3 > 0u)
00517   {
00518     /* Accumulator is made zero for every iteration */
00519     sum = 0;
00520 
00521     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00522     k = blockSize3 >> 2u;
00523 
00524     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.    
00525      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00526     while(k > 0u)
00527     {
00528       /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
00529       sum = (q31_t) ((((q63_t) sum << 32) +
00530                       ((q63_t) * px++ * (*py--))) >> 32);
00531 
00532       /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
00533       sum = (q31_t) ((((q63_t) sum << 32) +
00534                       ((q63_t) * px++ * (*py--))) >> 32);
00535 
00536       /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
00537       sum = (q31_t) ((((q63_t) sum << 32) +
00538                       ((q63_t) * px++ * (*py--))) >> 32);
00539 
00540       /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
00541       sum = (q31_t) ((((q63_t) sum << 32) +
00542                       ((q63_t) * px++ * (*py--))) >> 32);
00543 
00544       /* Decrement the loop counter */
00545       k--;
00546     }
00547 
00548     /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.    
00549      ** No loop unrolling is used. */
00550     k = blockSize3 % 0x4u;
00551 
00552     while(k > 0u)
00553     {
00554       /* Perform the multiply-accumulate */
00555       sum = (q31_t) ((((q63_t) sum << 32) +
00556                       ((q63_t) * px++ * (*py--))) >> 32);
00557 
00558       /* Decrement the loop counter */
00559       k--;
00560     }
00561 
00562     /* Store the result in the accumulator in the destination buffer. */
00563     *pOut++ = sum << 1;
00564 
00565     /* Update the inputA and inputB pointers for next MAC calculation */
00566     px = ++pSrc1;
00567     py = pSrc2;
00568 
00569     /* Decrement the loop counter */
00570     blockSize3--;
00571   }
00572 
00573 }
00574 
00575 /**    
00576  * @} end of Conv group    
00577  */