CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details

Dependents:   K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_fast_q15.c Source File

arm_conv_fast_q15.c

00001 /* ----------------------------------------------------------------------  
00002 * Copyright (C) 2010 ARM Limited. All rights reserved.  
00003 *  
00004 * $Date:        29. November 2010  
00005 * $Revision:    V1.0.3  
00006 *  
00007 * Project:      CMSIS DSP Library  
00008 * Title:        arm_conv_fast_q15.c  
00009 *  
00010 * Description:  Fast Q15 Convolution.  
00011 *  
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Version 1.0.3 2010/11/29 
00015 *    Re-organized the CMSIS folders and updated documentation.  
00016 *   
00017 * Version 1.0.2 2010/11/11  
00018 *    Documentation updated.   
00019 *  
00020 * Version 1.0.1 2010/10/05   
00021 *    Production release and review comments incorporated.  
00022 *  
00023 * Version 1.0.0 2010/09/20   
00024 *    Production release and review comments incorporated.  
00025 * -------------------------------------------------------------------- */ 
00026  
00027 #include "arm_math.h" 
00028  
00029 /**  
00030  * @ingroup groupFilters  
00031  */ 
00032  
00033 /**  
00034  * @addtogroup Conv  
00035  * @{  
00036  */ 
00037  
00038 /**  
00039  * @brief Convolution of Q15 sequences (fast version).  
00040  * @param[in] *pSrcA points to the first input sequence.  
00041  * @param[in] srcALen length of the first input sequence.  
00042  * @param[in] *pSrcB points to the second input sequence.  
00043  * @param[in] srcBLen length of the second input sequence.  
00044  * @param[out] *pDst points to the location where the output result is written.  Length srcALen+srcBLen-1.  
00045  * @return none.  
00046  *  
00047  * <b>Scaling and Overflow Behavior:</b>  
00048  *  
00049  * \par  
00050  * This fast version uses a 32-bit accumulator with 2.30 format.  
00051  * The accumulator maintains full precision of the intermediate multiplication results  
00052  * but provides only a single guard bit. There is no saturation on intermediate additions.  
00053  * Thus, if the accumulator overflows it wraps around and distorts the result.  
00054  * The input signals should be scaled down to avoid intermediate overflows.  
00055  * Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows,  
00056  * as maximum of min(srcALen, srcBLen) number of additions are carried internally.  
00057  * The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.  
00058  *  
00059  * \par  
00060  * See <code>arm_conv_q15()</code> for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.  
00061  */ 
00062  
00063 void arm_conv_fast_q15( 
00064   q15_t * pSrcA, 
00065   uint32_t srcALen, 
00066   q15_t * pSrcB, 
00067   uint32_t srcBLen, 
00068   q15_t * pDst) 
00069 { 
00070   q15_t *pIn1;                                   /* inputA pointer */ 
00071   q15_t *pIn2;                                   /* inputB pointer */ 
00072   q15_t *pOut = pDst;                            /* output pointer */ 
00073   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulator */ 
00074   q15_t *px;                                     /* Intermediate inputA pointer  */ 
00075   q15_t *py;                                     /* Intermediate inputB pointer  */ 
00076   q15_t *pSrc1, *pSrc2;                          /* Intermediate pointers */ 
00077   q31_t x0, x1, x2, x3, c0;                      /* Temporary variables to hold state and coefficient values */ 
00078   uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt;     /* loop counter */ 
00079   q31_t *pb;                                     /* 32 bit pointer for inputB buffer */ 
00080  
00081  
00082   /* The algorithm implementation is based on the lengths of the inputs. */ 
00083   /* srcB is always made to slide across srcA. */ 
00084   /* So srcBLen is always considered as shorter or equal to srcALen */ 
00085   if(srcALen >= srcBLen) 
00086   { 
00087     /* Initialization of inputA pointer */ 
00088     pIn1 = pSrcA; 
00089  
00090     /* Initialization of inputB pointer */ 
00091     pIn2 = pSrcB; 
00092   } 
00093   else 
00094   { 
00095     /* Initialization of inputA pointer */ 
00096     pIn1 = pSrcB; 
00097  
00098     /* Initialization of inputB pointer */ 
00099     pIn2 = pSrcA; 
00100  
00101     /* srcBLen is always considered as shorter or equal to srcALen */ 
00102     j = srcBLen; 
00103     srcBLen = srcALen; 
00104     srcALen = j; 
00105   } 
00106  
00107   /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 
00108   /* The function is internally  
00109    * divided into three stages according to the number of multiplications that has to be  
00110    * taken place between inputA samples and inputB samples. In the first stage of the  
00111    * algorithm, the multiplications increase by one for every iteration.  
00112    * In the second stage of the algorithm, srcBLen number of multiplications are done.  
00113    * In the third stage of the algorithm, the multiplications decrease by one  
00114    * for every iteration. */ 
00115  
00116   /* The algorithm is implemented in three stages.  
00117      The loop counters of each stage is initiated here. */ 
00118   blockSize1 = srcBLen - 1u; 
00119   blockSize2 = srcALen - (srcBLen - 1u); 
00120   blockSize3 = blockSize1; 
00121  
00122   /* --------------------------  
00123    * Initializations of stage1  
00124    * -------------------------*/ 
00125  
00126   /* sum = x[0] * y[0]  
00127    * sum = x[0] * y[1] + x[1] * y[0]  
00128    * ....  
00129    * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]  
00130    */ 
00131  
00132   /* In this stage the MAC operations are increased by 1 for every iteration.  
00133      The count variable holds the number of MAC operations performed */ 
00134   count = 1u; 
00135  
00136   /* Working pointer of inputA */ 
00137   px = pIn1; 
00138  
00139   /* Working pointer of inputB */ 
00140   py = pIn2; 
00141  
00142  
00143   /* ------------------------  
00144    * Stage1 process  
00145    * ----------------------*/ 
00146  
00147   /* For loop unrolling by 4, this stage is divided into two. */ 
00148   /* First part of this stage computes the MAC operations less than 4 */ 
00149   /* Second part of this stage computes the MAC operations greater than or equal to 4 */ 
00150  
00151   /* The first part of the stage starts here */ 
00152   while((count < 4u) && (blockSize1 > 0u)) 
00153   { 
00154     /* Accumulator is made zero for every iteration */ 
00155     sum = 0; 
00156  
00157     /* Loop over number of MAC operations between  
00158      * inputA samples and inputB samples */ 
00159     k = count; 
00160  
00161     while(k > 0u) 
00162     { 
00163       /* Perform the multiply-accumulates */ 
00164       sum = __SMLAD(*px++, *py--, sum); 
00165  
00166       /* Decrement the loop counter */ 
00167       k--; 
00168     } 
00169  
00170     /* Store the result in the accumulator in the destination buffer. */ 
00171     *pOut++ = (q15_t) (sum >> 15); 
00172  
00173     /* Update the inputA and inputB pointers for next MAC calculation */ 
00174     py = pIn2 + count; 
00175     px = pIn1; 
00176  
00177     /* Increment the MAC count */ 
00178     count++; 
00179  
00180     /* Decrement the loop counter */ 
00181     blockSize1--; 
00182   } 
00183  
00184   /* The second part of the stage starts here */ 
00185   /* The internal loop, over count, is unrolled by 4 */ 
00186   /* To, read the last two inputB samples using SIMD:  
00187    * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ 
00188   py = py - 1; 
00189  
00190   while(blockSize1 > 0u) 
00191   { 
00192     /* Accumulator is made zero for every iteration */ 
00193     sum = 0; 
00194  
00195     /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00196     k = count >> 2u; 
00197  
00198     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00199      ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00200     while(k > 0u) 
00201     { 
00202       /* Perform the multiply-accumulates */ 
00203       /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */ 
00204       sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 
00205       /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */ 
00206       sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 
00207  
00208       /* Decrement the loop counter */ 
00209       k--; 
00210     } 
00211  
00212     /* For the next MAC operations, the pointer py is used without SIMD  
00213      * So, py is incremented by 1 */ 
00214     py = py + 1u; 
00215  
00216     /* If the count is not a multiple of 4, compute any remaining MACs here.  
00217      ** No loop unrolling is used. */ 
00218     k = count % 0x4u; 
00219  
00220     while(k > 0u) 
00221     { 
00222       /* Perform the multiply-accumulates */ 
00223       sum = __SMLAD(*px++, *py--, sum); 
00224  
00225       /* Decrement the loop counter */ 
00226       k--; 
00227     } 
00228  
00229     /* Store the result in the accumulator in the destination buffer. */ 
00230     *pOut++ = (q15_t) (sum >> 15); 
00231  
00232     /* Update the inputA and inputB pointers for next MAC calculation */ 
00233     py = pIn2 + (count - 1u); 
00234     px = pIn1; 
00235  
00236     /* Increment the MAC count */ 
00237     count++; 
00238  
00239     /* Decrement the loop counter */ 
00240     blockSize1--; 
00241   } 
00242  
00243   /* --------------------------  
00244    * Initializations of stage2  
00245    * ------------------------*/ 
00246  
00247   /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]  
00248    * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]  
00249    * ....  
00250    * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]  
00251    */ 
00252  
00253   /* Working pointer of inputA */ 
00254   px = pIn1; 
00255  
00256   /* Working pointer of inputB */ 
00257   pSrc2 = pIn2 + (srcBLen - 1u); 
00258   py = pSrc2; 
00259  
00260   /* Initialize inputB pointer of type q31 */ 
00261   pb = (q31_t *) (py - 1u); 
00262  
00263   /* count is the index by which the pointer pIn1 to be incremented */ 
00264   count = 1u; 
00265  
00266  
00267   /* --------------------  
00268    * Stage2 process  
00269    * -------------------*/ 
00270  
00271   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.  
00272    * So, to loop unroll over blockSize2,  
00273    * srcBLen should be greater than or equal to 4 */ 
00274   if(srcBLen >= 4u) 
00275   { 
00276     /* Loop unroll over blockSize2, by 4 */ 
00277     blkCnt = blockSize2 >> 2u; 
00278  
00279     while(blkCnt > 0u) 
00280     { 
00281       /* Set all accumulators to zero */ 
00282       acc0 = 0; 
00283       acc1 = 0; 
00284       acc2 = 0; 
00285       acc3 = 0; 
00286  
00287  
00288       /* read x[0], x[1] samples */ 
00289       x0 = *(q31_t *) (px++); 
00290       /* read x[1], x[2] samples */ 
00291       x1 = *(q31_t *) (px++); 
00292  
00293  
00294       /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00295       k = srcBLen >> 2u; 
00296  
00297       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00298        ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00299       do 
00300       { 
00301         /* Read the last two inputB samples using SIMD:  
00302          * y[srcBLen - 1] and y[srcBLen - 2] */ 
00303         c0 = *(pb--); 
00304  
00305         /* acc0 +=  x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ 
00306         acc0 = __SMLADX(x0, c0, acc0); 
00307  
00308         /* acc1 +=  x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ 
00309         acc1 = __SMLADX(x1, c0, acc1); 
00310  
00311         /* Read x[2], x[3] */ 
00312         x2 = *(q31_t *) (px++); 
00313  
00314         /* Read x[3], x[4] */ 
00315         x3 = *(q31_t *) (px++); 
00316  
00317         /* acc2 +=  x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ 
00318         acc2 = __SMLADX(x2, c0, acc2); 
00319  
00320         /* acc3 +=  x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ 
00321         acc3 = __SMLADX(x3, c0, acc3); 
00322  
00323         /* Read y[srcBLen - 3] and y[srcBLen - 4] */ 
00324         c0 = *(pb--); 
00325  
00326         /* acc0 +=  x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ 
00327         acc0 = __SMLADX(x2, c0, acc0); 
00328  
00329         /* acc1 +=  x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ 
00330         acc1 = __SMLADX(x3, c0, acc1); 
00331  
00332         /* Read x[4], x[5] */ 
00333         x0 = *(q31_t *) (px++); 
00334  
00335         /* Read x[5], x[6] */ 
00336         x1 = *(q31_t *) (px++); 
00337  
00338         /* acc2 +=  x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ 
00339         acc2 = __SMLADX(x0, c0, acc2); 
00340  
00341         /* acc3 +=  x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ 
00342         acc3 = __SMLADX(x1, c0, acc3); 
00343  
00344       } while(--k); 
00345  
00346       /* For the next MAC operations, SIMD is not used  
00347        * So, the 16 bit pointer if inputB, py is updated */ 
00348       py = (q15_t *) pb; 
00349       py = py + 1; 
00350  
00351       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.  
00352        ** No loop unrolling is used. */ 
00353       k = srcBLen % 0x4u; 
00354  
00355       if(k == 1u) 
00356       { 
00357         /* Read y[srcBLen - 5] */ 
00358         c0 = *(py); 
00359  
00360         /* Read x[7] */ 
00361         x3 = *(q31_t *) px++; 
00362  
00363         /* Perform the multiply-accumulates */ 
00364         acc0 = __SMLAD(x0, c0, acc0); 
00365         acc1 = __SMLAD(x1, c0, acc1); 
00366         acc2 = __SMLADX(x1, c0, acc2); 
00367         acc3 = __SMLADX(x3, c0, acc3); 
00368       } 
00369  
00370       if(k == 2u) 
00371       { 
00372         /* Read y[srcBLen - 5], y[srcBLen - 6] */ 
00373         c0 = *(pb); 
00374  
00375         /* Read x[7], x[8] */ 
00376         x3 = *(q31_t *) px++; 
00377  
00378         /* Read x[9] */ 
00379         x2 = *(q31_t *) px++; 
00380  
00381         /* Perform the multiply-accumulates */ 
00382         acc0 = __SMLADX(x0, c0, acc0); 
00383         acc1 = __SMLADX(x1, c0, acc1); 
00384         acc2 = __SMLADX(x3, c0, acc2); 
00385         acc3 = __SMLADX(x2, c0, acc3); 
00386       } 
00387  
00388       if(k == 3u) 
00389       { 
00390         /* Read y[srcBLen - 5], y[srcBLen - 6] */ 
00391         c0 = *pb--; 
00392  
00393         /* Read x[7], x[8] */ 
00394         x3 = *(q31_t *) px++; 
00395  
00396         /* Read x[9] */ 
00397         x2 = *(q31_t *) px++; 
00398  
00399         /* Perform the multiply-accumulates */ 
00400         acc0 = __SMLADX(x0, c0, acc0); 
00401         acc1 = __SMLADX(x1, c0, acc1); 
00402         acc2 = __SMLADX(x3, c0, acc2); 
00403         acc3 = __SMLADX(x2, c0, acc3); 
00404  
00405         /* Read y[srcBLen - 7] */ 
00406         c0 = (q15_t) (*pb >> 16); 
00407  
00408         /* Read x[10] */ 
00409         x3 = *(q31_t *) px++; 
00410  
00411         /* Perform the multiply-accumulates */ 
00412         acc0 = __SMLADX(x1, c0, acc0); 
00413         acc1 = __SMLAD(x2, c0, acc1); 
00414         acc2 = __SMLADX(x2, c0, acc2); 
00415         acc3 = __SMLADX(x3, c0, acc3); 
00416       } 
00417  
00418       /* Store the results in the accumulators in the destination buffer. */ 
00419       *__SIMD32(pOut)++ = __PKHBT((acc0 >> 15), (acc1 >> 15), 16); 
00420       *__SIMD32(pOut)++ = __PKHBT((acc2 >> 15), (acc3 >> 15), 16); 
00421  
00422       /* Update the inputA and inputB pointers for next MAC calculation */ 
00423       px = pIn1 + (count * 4u); 
00424       py = pSrc2; 
00425       pb = (q31_t *) (py - 1); 
00426  
00427       /* Increment the pointer pIn1 index, count by 1 */ 
00428       count++; 
00429  
00430       /* Decrement the loop counter */ 
00431       blkCnt--; 
00432     } 
00433  
00434     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.  
00435      ** No loop unrolling is used. */ 
00436     blkCnt = blockSize2 % 0x4u; 
00437  
00438     while(blkCnt > 0u) 
00439     { 
00440       /* Accumulator is made zero for every iteration */ 
00441       sum = 0; 
00442  
00443       /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00444       k = srcBLen >> 2u; 
00445  
00446       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00447        ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00448       while(k > 0u) 
00449       { 
00450         /* Perform the multiply-accumulates */ 
00451         sum += ((q31_t) * px++ * *py--); 
00452         sum += ((q31_t) * px++ * *py--); 
00453         sum += ((q31_t) * px++ * *py--); 
00454         sum += ((q31_t) * px++ * *py--); 
00455  
00456         /* Decrement the loop counter */ 
00457         k--; 
00458       } 
00459  
00460       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.  
00461        ** No loop unrolling is used. */ 
00462       k = srcBLen % 0x4u; 
00463  
00464       while(k > 0u) 
00465       { 
00466         /* Perform the multiply-accumulates */ 
00467         sum += ((q31_t) * px++ * *py--); 
00468  
00469         /* Decrement the loop counter */ 
00470         k--; 
00471       } 
00472  
00473       /* Store the result in the accumulator in the destination buffer. */ 
00474       *pOut++ = (q15_t) (sum >> 15); 
00475  
00476       /* Update the inputA and inputB pointers for next MAC calculation */ 
00477       px = pIn1 + count; 
00478       py = pSrc2; 
00479  
00480       /* Increment the pointer pIn1 index, count by 1 */ 
00481       count++; 
00482  
00483       /* Decrement the loop counter */ 
00484       blkCnt--; 
00485     } 
00486   } 
00487   else 
00488   { 
00489     /* If the srcBLen is not a multiple of 4,  
00490      * the blockSize2 loop cannot be unrolled by 4 */ 
00491     blkCnt = blockSize2; 
00492  
00493     while(blkCnt > 0u) 
00494     { 
00495       /* Accumulator is made zero for every iteration */ 
00496       sum = 0; 
00497  
00498       /* srcBLen number of MACS should be performed */ 
00499       k = srcBLen; 
00500  
00501       while(k > 0u) 
00502       { 
00503         /* Perform the multiply-accumulate */ 
00504         sum += ((q31_t) * px++ * *py--); 
00505  
00506         /* Decrement the loop counter */ 
00507         k--; 
00508       } 
00509  
00510       /* Store the result in the accumulator in the destination buffer. */ 
00511       *pOut++ = (q15_t) (sum >> 15); 
00512  
00513       /* Update the inputA and inputB pointers for next MAC calculation */ 
00514       px = pIn1 + count; 
00515       py = pSrc2; 
00516  
00517       /* Increment the MAC count */ 
00518       count++; 
00519  
00520       /* Decrement the loop counter */ 
00521       blkCnt--; 
00522     } 
00523   } 
00524  
00525  
00526   /* --------------------------  
00527    * Initializations of stage3  
00528    * -------------------------*/ 
00529  
00530   /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]  
00531    * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]  
00532    * ....  
00533    * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]  
00534    * sum +=  x[srcALen-1] * y[srcBLen-1]  
00535    */ 
00536  
00537   /* In this stage the MAC operations are decreased by 1 for every iteration.  
00538      The blockSize3 variable holds the number of MAC operations performed */ 
00539  
00540   /* Working pointer of inputA */ 
00541   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 
00542   px = pSrc1; 
00543  
00544   /* Working pointer of inputB */ 
00545   pSrc2 = pIn2 + (srcBLen - 1u); 
00546   pIn2 = pSrc2 - 1u; 
00547   py = pIn2; 
00548  
00549   /* -------------------  
00550    * Stage3 process  
00551    * ------------------*/ 
00552  
00553   /* For loop unrolling by 4, this stage is divided into two. */ 
00554   /* First part of this stage computes the MAC operations greater than 4 */ 
00555   /* Second part of this stage computes the MAC operations less than or equal to 4 */ 
00556  
00557   /* The first part of the stage starts here */ 
00558   j = blockSize3 >> 2u; 
00559  
00560   while((j > 0u) && (blockSize3 > 0u)) 
00561   { 
00562     /* Accumulator is made zero for every iteration */ 
00563     sum = 0; 
00564  
00565     /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00566     k = blockSize3 >> 2u; 
00567  
00568     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00569      ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00570     while(k > 0u) 
00571     { 
00572       /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied  
00573        * with y[srcBLen - 1], y[srcBLen - 2] respectively */ 
00574       sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 
00575       /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied  
00576        * with y[srcBLen - 3], y[srcBLen - 4] respectively */ 
00577       sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 
00578  
00579       /* Decrement the loop counter */ 
00580       k--; 
00581     } 
00582  
00583     /* For the next MAC operations, the pointer py is used without SIMD  
00584      * So, py is incremented by 1 */ 
00585     py = py + 1u; 
00586  
00587     /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.  
00588      ** No loop unrolling is used. */ 
00589     k = blockSize3 % 0x4u; 
00590  
00591     while(k > 0u) 
00592     { 
00593       /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */ 
00594       sum = __SMLAD(*px++, *py--, sum); 
00595  
00596       /* Decrement the loop counter */ 
00597       k--; 
00598     } 
00599  
00600     /* Store the result in the accumulator in the destination buffer. */ 
00601     *pOut++ = (q15_t) (sum >> 15); 
00602  
00603     /* Update the inputA and inputB pointers for next MAC calculation */ 
00604     px = ++pSrc1; 
00605     py = pIn2; 
00606  
00607     /* Decrement the loop counter */ 
00608     blockSize3--; 
00609  
00610     j--; 
00611   } 
00612  
00613   /* The second part of the stage starts here */ 
00614   /* SIMD is not used for the next MAC operations,  
00615    * so pointer py is updated to read only one sample at a time */ 
00616   py = py + 1u; 
00617  
00618   while(blockSize3 > 0u) 
00619   { 
00620     /* Accumulator is made zero for every iteration */ 
00621     sum = 0; 
00622  
00623     /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00624     k = blockSize3; 
00625  
00626     while(k > 0u) 
00627     { 
00628       /* Perform the multiply-accumulates */ 
00629       /* sum +=  x[srcALen-1] * y[srcBLen-1] */ 
00630       sum = __SMLAD(*px++, *py--, sum); 
00631  
00632       /* Decrement the loop counter */ 
00633       k--; 
00634     } 
00635  
00636     /* Store the result in the accumulator in the destination buffer. */ 
00637     *pOut++ = (q15_t) (sum >> 15); 
00638  
00639     /* Update the inputA and inputB pointers for next MAC calculation */ 
00640     px = ++pSrc1; 
00641     py = pSrc2; 
00642  
00643     /* Decrement the loop counter */ 
00644     blockSize3--; 
00645   } 
00646  
00647 } 
00648  
00649 /**  
00650  * @} end of Conv group  
00651  */