CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details

Dependents:   K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_q15.c Source File

arm_conv_q15.c

00001 /* ----------------------------------------------------------------------  
00002 * Copyright (C) 2010 ARM Limited. All rights reserved.  
00003 *  
00004 * $Date:        29. November 2010  
00005 * $Revision:    V1.0.3  
00006 *  
00007 * Project:      CMSIS DSP Library  
00008 * Title:        arm_conv_q15.c  
00009 *  
00010 * Description:  Q15 Convolution.  
00011 *  
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Version 1.0.3 2010/11/29 
00015 *    Re-organized the CMSIS folders and updated documentation.  
00016 *   
00017 * Version 1.0.2 2010/11/11  
00018 *    Documentation updated.   
00019 *  
00020 * Version 1.0.1 2010/10/05   
00021 *    Production release and review comments incorporated.  
00022 *  
00023 * Version 1.0.0 2010/09/20   
00024 *    Production release and review comments incorporated  
00025 *  
00026 * Version 0.0.7  2010/06/10   
00027 *    Misra-C changes done  
00028 *  
00029 * -------------------------------------------------------------------- */ 
00030  
00031 #include "arm_math.h" 
00032  
00033 /**  
00034  * @ingroup groupFilters  
00035  */ 
00036  
00037 /**  
00038  * @addtogroup Conv  
00039  * @{  
00040  */ 
00041  
00042 /**  
00043  * @brief Convolution of Q15 sequences.  
00044  * @param[in] *pSrcA points to the first input sequence.  
00045  * @param[in] srcALen length of the first input sequence.  
00046  * @param[in] *pSrcB points to the second input sequence.  
00047  * @param[in] srcBLen length of the second input sequence.  
00048  * @param[out] *pDst points to the location where the output result is written.  Length srcALen+srcBLen-1.  
00049  * @return none.  
00050  *  
00051  * @details  
00052  * <b>Scaling and Overflow Behavior:</b>  
00053  *  
00054  * \par  
00055  * The function is implemented using a 64-bit internal accumulator.  
00056  * Both inputs are in 1.15 format and multiplications yield a 2.30 result.  
00057  * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.  
00058  * This approach provides 33 guard bits and there is no risk of overflow.  
00059  * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.  
00060  *  
00061  * \par  
00062  * Refer to <code>arm_conv_fast_q15()</code> for a faster but less precise version of this function.  
00063  */ 
00064  
00065 void arm_conv_q15( 
00066   q15_t * pSrcA, 
00067   uint32_t srcALen, 
00068   q15_t * pSrcB, 
00069   uint32_t srcBLen, 
00070   q15_t * pDst) 
00071 { 
00072   q15_t *pIn1;                                   /* inputA pointer */ 
00073   q15_t *pIn2;                                   /* inputB pointer */ 
00074   q15_t *pOut = pDst;                            /* output pointer */ 
00075   q63_t sum, acc0, acc1, acc2, acc3;             /* Accumulator */ 
00076   q15_t *px;                                     /* Intermediate inputA pointer  */ 
00077   q15_t *py;                                     /* Intermediate inputB pointer  */ 
00078   q15_t *pSrc1, *pSrc2;                          /* Intermediate pointers */ 
00079   q31_t x0, x1, x2, x3, c0;                      /* Temporary variables to hold state and coefficient values */ 
00080   uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt;     /* loop counter */ 
00081   q31_t *pb;                                     /* 32 bit pointer for inputB buffer */ 
00082  
00083  
00084   /* The algorithm implementation is based on the lengths of the inputs. */ 
00085   /* srcB is always made to slide across srcA. */ 
00086   /* So srcBLen is always considered as shorter or equal to srcALen */ 
00087   if(srcALen >= srcBLen) 
00088   { 
00089     /* Initialization of inputA pointer */ 
00090     pIn1 = pSrcA; 
00091  
00092     /* Initialization of inputB pointer */ 
00093     pIn2 = pSrcB; 
00094   } 
00095   else 
00096   { 
00097     /* Initialization of inputA pointer */ 
00098     pIn1 = pSrcB; 
00099  
00100     /* Initialization of inputB pointer */ 
00101     pIn2 = pSrcA; 
00102  
00103     /* srcBLen is always considered as shorter or equal to srcALen */ 
00104     j = srcBLen; 
00105     srcBLen = srcALen; 
00106     srcALen = j; 
00107   } 
00108  
00109   /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 
00110   /* The function is internally  
00111    * divided into three stages according to the number of multiplications that has to be  
00112    * taken place between inputA samples and inputB samples. In the first stage of the  
00113    * algorithm, the multiplications increase by one for every iteration.  
00114    * In the second stage of the algorithm, srcBLen number of multiplications are done.  
00115    * In the third stage of the algorithm, the multiplications decrease by one  
00116    * for every iteration. */ 
00117  
00118   /* The algorithm is implemented in three stages.  
00119      The loop counters of each stage is initiated here. */ 
00120   blockSize1 = srcBLen - 1u; 
00121   blockSize2 = srcALen - (srcBLen - 1u); 
00122  
00123   /* --------------------------  
00124    * Initializations of stage1  
00125    * -------------------------*/ 
00126  
00127   /* sum = x[0] * y[0]  
00128    * sum = x[0] * y[1] + x[1] * y[0]  
00129    * ....  
00130    * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]  
00131    */ 
00132  
00133   /* In this stage the MAC operations are increased by 1 for every iteration.  
00134      The count variable holds the number of MAC operations performed */ 
00135   count = 1u; 
00136  
00137   /* Working pointer of inputA */ 
00138   px = pIn1; 
00139  
00140   /* Working pointer of inputB */ 
00141   py = pIn2; 
00142  
00143  
00144   /* ------------------------  
00145    * Stage1 process  
00146    * ----------------------*/ 
00147  
00148   /* For loop unrolling by 4, this stage is divided into two. */ 
00149   /* First part of this stage computes the MAC operations less than 4 */ 
00150   /* Second part of this stage computes the MAC operations greater than or equal to 4 */ 
00151  
00152   /* The first part of the stage starts here */ 
00153   while((count < 4u) && (blockSize1 > 0u)) 
00154   { 
00155     /* Accumulator is made zero for every iteration */ 
00156     sum = 0; 
00157  
00158     /* Loop over number of MAC operations between  
00159      * inputA samples and inputB samples */ 
00160     k = count; 
00161  
00162     while(k > 0u) 
00163     { 
00164       /* Perform the multiply-accumulates */ 
00165       sum = __SMLALD(*px++, *py--, sum); 
00166  
00167       /* Decrement the loop counter */ 
00168       k--; 
00169     } 
00170  
00171     /* Store the result in the accumulator in the destination buffer. */ 
00172     *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 
00173  
00174     /* Update the inputA and inputB pointers for next MAC calculation */ 
00175     py = pIn2 + count; 
00176     px = pIn1; 
00177  
00178     /* Increment the MAC count */ 
00179     count++; 
00180  
00181     /* Decrement the loop counter */ 
00182     blockSize1--; 
00183   } 
00184  
00185   /* The second part of the stage starts here */ 
00186   /* The internal loop, over count, is unrolled by 4 */ 
00187   /* To, read the last two inputB samples using SIMD:  
00188    * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ 
00189   py = py - 1; 
00190  
00191   while(blockSize1 > 0u) 
00192   { 
00193     /* Accumulator is made zero for every iteration */ 
00194     sum = 0; 
00195  
00196     /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00197     k = count >> 2u; 
00198  
00199     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00200      ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00201     while(k > 0u) 
00202     { 
00203       /* Perform the multiply-accumulates */ 
00204       /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */ 
00205       sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 
00206       /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */ 
00207       sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 
00208  
00209       /* Decrement the loop counter */ 
00210       k--; 
00211     } 
00212  
00213     /* For the next MAC operations, the pointer py is used without SIMD  
00214      * So, py is incremented by 1 */ 
00215     py = py + 1u; 
00216  
00217     /* If the count is not a multiple of 4, compute any remaining MACs here.  
00218      ** No loop unrolling is used. */ 
00219     k = count % 0x4u; 
00220  
00221     while(k > 0u) 
00222     { 
00223       /* Perform the multiply-accumulates */ 
00224       sum = __SMLALD(*px++, *py--, sum); 
00225  
00226       /* Decrement the loop counter */ 
00227       k--; 
00228     } 
00229  
00230     /* Store the result in the accumulator in the destination buffer. */ 
00231     *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 
00232  
00233     /* Update the inputA and inputB pointers for next MAC calculation */ 
00234     py = pIn2 + (count - 1u); 
00235     px = pIn1; 
00236  
00237     /* Increment the MAC count */ 
00238     count++; 
00239  
00240     /* Decrement the loop counter */ 
00241     blockSize1--; 
00242   } 
00243  
00244   /* --------------------------  
00245    * Initializations of stage2  
00246    * ------------------------*/ 
00247  
00248   /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]  
00249    * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]  
00250    * ....  
00251    * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]  
00252    */ 
00253  
00254   /* Working pointer of inputA */ 
00255   px = pIn1; 
00256  
00257   /* Working pointer of inputB */ 
00258   pSrc2 = pIn2 + (srcBLen - 1u); 
00259   py = pSrc2; 
00260  
00261   /* Initialize inputB pointer of type q31 */ 
00262   pb = (q31_t *) (py - 1u); 
00263  
00264   /* count is the index by which the pointer pIn1 to be incremented */ 
00265   count = 1u; 
00266  
00267  
00268   /* --------------------  
00269    * Stage2 process  
00270    * -------------------*/ 
00271  
00272   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.  
00273    * So, to loop unroll over blockSize2,  
00274    * srcBLen should be greater than or equal to 4 */ 
00275   if(srcBLen >= 4u) 
00276   { 
00277     /* Loop unroll over blockSize2, by 4 */ 
00278     blkCnt = blockSize2 >> 2u; 
00279  
00280     while(blkCnt > 0u) 
00281     { 
00282       /* Set all accumulators to zero */ 
00283       acc0 = 0; 
00284       acc1 = 0; 
00285       acc2 = 0; 
00286       acc3 = 0; 
00287  
00288  
00289       /* read x[0], x[1] samples */ 
00290       x0 = *(q31_t *) (px++); 
00291       /* read x[1], x[2] samples */ 
00292       x1 = *(q31_t *) (px++); 
00293  
00294  
00295       /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00296       k = srcBLen >> 2u; 
00297  
00298       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00299        ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00300       do 
00301       { 
00302         /* Read the last two inputB samples using SIMD:  
00303          * y[srcBLen - 1] and y[srcBLen - 2] */ 
00304         c0 = *(pb--); 
00305  
00306         /* acc0 +=  x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ 
00307         acc0 = __SMLALDX(x0, c0, acc0); 
00308  
00309         /* acc1 +=  x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ 
00310         acc1 = __SMLALDX(x1, c0, acc1); 
00311  
00312         /* Read x[2], x[3] */ 
00313         x2 = *(q31_t *) (px++); 
00314  
00315         /* Read x[3], x[4] */ 
00316         x3 = *(q31_t *) (px++); 
00317  
00318         /* acc2 +=  x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ 
00319         acc2 = __SMLALDX(x2, c0, acc2); 
00320  
00321         /* acc3 +=  x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ 
00322         acc3 = __SMLALDX(x3, c0, acc3); 
00323  
00324         /* Read y[srcBLen - 3] and y[srcBLen - 4] */ 
00325         c0 = *(pb--); 
00326  
00327         /* acc0 +=  x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ 
00328         acc0 = __SMLALDX(x2, c0, acc0); 
00329  
00330         /* acc1 +=  x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ 
00331         acc1 = __SMLALDX(x3, c0, acc1); 
00332  
00333         /* Read x[4], x[5] */ 
00334         x0 = *(q31_t *) (px++); 
00335  
00336         /* Read x[5], x[6] */ 
00337         x1 = *(q31_t *) (px++); 
00338  
00339         /* acc2 +=  x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ 
00340         acc2 = __SMLALDX(x0, c0, acc2); 
00341  
00342         /* acc3 +=  x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ 
00343         acc3 = __SMLALDX(x1, c0, acc3); 
00344  
00345       } while(--k); 
00346  
00347       /* For the next MAC operations, SIMD is not used  
00348        * So, the 16 bit pointer if inputB, py is updated */ 
00349       py = (q15_t *) pb; 
00350       py = py + 1; 
00351  
00352       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.  
00353        ** No loop unrolling is used. */ 
00354       k = srcBLen % 0x4u; 
00355  
00356       if(k == 1u) 
00357       { 
00358         /* Read y[srcBLen - 5] */ 
00359         c0 = *(py); 
00360  
00361         /* Read x[7] */ 
00362         x3 = *(q31_t *) px++; 
00363  
00364         /* Perform the multiply-accumulates */ 
00365         acc0 = __SMLALD(x0, c0, acc0); 
00366         acc1 = __SMLALD(x1, c0, acc1); 
00367         acc2 = __SMLALDX(x1, c0, acc2); 
00368         acc3 = __SMLALDX(x3, c0, acc3); 
00369       } 
00370  
00371       if(k == 2u) 
00372       { 
00373         /* Read y[srcBLen - 5], y[srcBLen - 6] */ 
00374         c0 = *(pb); 
00375  
00376         /* Read x[7], x[8] */ 
00377         x3 = *(q31_t *) px++; 
00378  
00379         /* Read x[9] */ 
00380         x2 = *(q31_t *) px++; 
00381  
00382         /* Perform the multiply-accumulates */ 
00383         acc0 = __SMLALDX(x0, c0, acc0); 
00384         acc1 = __SMLALDX(x1, c0, acc1); 
00385         acc2 = __SMLALDX(x3, c0, acc2); 
00386         acc3 = __SMLALDX(x2, c0, acc3); 
00387       } 
00388  
00389       if(k == 3u) 
00390       { 
00391         /* Read y[srcBLen - 5], y[srcBLen - 6] */ 
00392         c0 = *pb--; 
00393  
00394         /* Read x[7], x[8] */ 
00395         x3 = *(q31_t *) px++; 
00396  
00397         /* Read x[9] */ 
00398         x2 = *(q31_t *) px++; 
00399  
00400         /* Perform the multiply-accumulates */ 
00401         acc0 = __SMLALDX(x0, c0, acc0); 
00402         acc1 = __SMLALDX(x1, c0, acc1); 
00403         acc2 = __SMLALDX(x3, c0, acc2); 
00404         acc3 = __SMLALDX(x2, c0, acc3); 
00405  
00406         /* Read y[srcBLen - 7] */ 
00407         c0 = (q15_t) (*pb >> 16); 
00408  
00409         /* Read x[10] */ 
00410         x3 = *(q31_t *) px++; 
00411  
00412         /* Perform the multiply-accumulates */ 
00413         acc0 = __SMLALDX(x1, c0, acc0); 
00414         acc1 = __SMLALD(x2, c0, acc1); 
00415         acc2 = __SMLALDX(x2, c0, acc2); 
00416         acc3 = __SMLALDX(x3, c0, acc3); 
00417       } 
00418  
00419       /* Store the results in the accumulators in the destination buffer. */ 
00420       *__SIMD32(pOut)++ = 
00421         __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); 
00422       *__SIMD32(pOut)++ = 
00423         __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); 
00424  
00425       /* Update the inputA and inputB pointers for next MAC calculation */ 
00426       px = pIn1 + (count * 4u); 
00427       py = pSrc2; 
00428       pb = (q31_t *) (py - 1); 
00429  
00430       /* Increment the pointer pIn1 index, count by 1 */ 
00431       count++; 
00432  
00433       /* Decrement the loop counter */ 
00434       blkCnt--; 
00435     } 
00436  
00437     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.  
00438      ** No loop unrolling is used. */ 
00439     blkCnt = blockSize2 % 0x4u; 
00440  
00441     while(blkCnt > 0u) 
00442     { 
00443       /* Accumulator is made zero for every iteration */ 
00444       sum = 0; 
00445  
00446       /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00447       k = srcBLen >> 2u; 
00448  
00449       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00450        ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00451       while(k > 0u) 
00452       { 
00453         /* Perform the multiply-accumulates */ 
00454         sum += (q63_t) ((q31_t) * px++ * *py--); 
00455         sum += (q63_t) ((q31_t) * px++ * *py--); 
00456         sum += (q63_t) ((q31_t) * px++ * *py--); 
00457         sum += (q63_t) ((q31_t) * px++ * *py--); 
00458  
00459         /* Decrement the loop counter */ 
00460         k--; 
00461       } 
00462  
00463       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.  
00464        ** No loop unrolling is used. */ 
00465       k = srcBLen % 0x4u; 
00466  
00467       while(k > 0u) 
00468       { 
00469         /* Perform the multiply-accumulates */ 
00470         sum += (q63_t) ((q31_t) * px++ * *py--); 
00471  
00472         /* Decrement the loop counter */ 
00473         k--; 
00474       } 
00475  
00476       /* Store the result in the accumulator in the destination buffer. */ 
00477       *pOut++ = (q15_t) (__SSAT(sum >> 15, 16)); 
00478  
00479       /* Update the inputA and inputB pointers for next MAC calculation */ 
00480       px = pIn1 + count; 
00481       py = pSrc2; 
00482  
00483       /* Increment the pointer pIn1 index, count by 1 */ 
00484       count++; 
00485  
00486       /* Decrement the loop counter */ 
00487       blkCnt--; 
00488     } 
00489   } 
00490   else 
00491   { 
00492     /* If the srcBLen is not a multiple of 4,  
00493      * the blockSize2 loop cannot be unrolled by 4 */ 
00494     blkCnt = blockSize2; 
00495  
00496     while(blkCnt > 0u) 
00497     { 
00498       /* Accumulator is made zero for every iteration */ 
00499       sum = 0; 
00500  
00501       /* srcBLen number of MACS should be performed */ 
00502       k = srcBLen; 
00503  
00504       while(k > 0u) 
00505       { 
00506         /* Perform the multiply-accumulate */ 
00507         sum += (q63_t) ((q31_t) * px++ * *py--); 
00508  
00509         /* Decrement the loop counter */ 
00510         k--; 
00511       } 
00512  
00513       /* Store the result in the accumulator in the destination buffer. */ 
00514       *pOut++ = (q15_t) (__SSAT(sum >> 15, 16)); 
00515  
00516       /* Update the inputA and inputB pointers for next MAC calculation */ 
00517       px = pIn1 + count; 
00518       py = pSrc2; 
00519  
00520       /* Increment the MAC count */ 
00521       count++; 
00522  
00523       /* Decrement the loop counter */ 
00524       blkCnt--; 
00525     } 
00526   } 
00527  
00528  
00529   /* --------------------------  
00530    * Initializations of stage3  
00531    * -------------------------*/ 
00532  
00533   /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]  
00534    * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]  
00535    * ....  
00536    * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]  
00537    * sum +=  x[srcALen-1] * y[srcBLen-1]  
00538    */ 
00539  
00540   /* In this stage the MAC operations are decreased by 1 for every iteration.  
00541      The blockSize3 variable holds the number of MAC operations performed */ 
00542  
00543   blockSize3 = srcBLen - 1u; 
00544  
00545   /* Working pointer of inputA */ 
00546   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 
00547   px = pSrc1; 
00548  
00549   /* Working pointer of inputB */ 
00550   pSrc2 = pIn2 + (srcBLen - 1u); 
00551   pIn2 = pSrc2 - 1u; 
00552   py = pIn2; 
00553  
00554   /* -------------------  
00555    * Stage3 process  
00556    * ------------------*/ 
00557  
00558   /* For loop unrolling by 4, this stage is divided into two. */ 
00559   /* First part of this stage computes the MAC operations greater than 4 */ 
00560   /* Second part of this stage computes the MAC operations less than or equal to 4 */ 
00561  
00562   /* The first part of the stage starts here */ 
00563   j = blockSize3 >> 2u; 
00564  
00565   while((j > 0u) && (blockSize3 > 0u)) 
00566   { 
00567     /* Accumulator is made zero for every iteration */ 
00568     sum = 0; 
00569  
00570     /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00571     k = blockSize3 >> 2u; 
00572  
00573     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00574      ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00575     while(k > 0u) 
00576     { 
00577       /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied  
00578        * with y[srcBLen - 1], y[srcBLen - 2] respectively */ 
00579       sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 
00580       /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied  
00581        * with y[srcBLen - 3], y[srcBLen - 4] respectively */ 
00582       sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 
00583  
00584       /* Decrement the loop counter */ 
00585       k--; 
00586     } 
00587  
00588     /* For the next MAC operations, the pointer py is used without SIMD  
00589      * So, py is incremented by 1 */ 
00590     py = py + 1u; 
00591  
00592     /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.  
00593      ** No loop unrolling is used. */ 
00594     k = blockSize3 % 0x4u; 
00595  
00596     while(k > 0u) 
00597     { 
00598       /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */ 
00599       sum = __SMLALD(*px++, *py--, sum); 
00600  
00601       /* Decrement the loop counter */ 
00602       k--; 
00603     } 
00604  
00605     /* Store the result in the accumulator in the destination buffer. */ 
00606     *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 
00607  
00608     /* Update the inputA and inputB pointers for next MAC calculation */ 
00609     px = ++pSrc1; 
00610     py = pIn2; 
00611  
00612     /* Decrement the loop counter */ 
00613     blockSize3--; 
00614  
00615     j--; 
00616   } 
00617  
00618   /* The second part of the stage starts here */ 
00619   /* SIMD is not used for the next MAC operations,  
00620    * so pointer py is updated to read only one sample at a time */ 
00621   py = py + 1u; 
00622  
00623   while(blockSize3 > 0u) 
00624   { 
00625     /* Accumulator is made zero for every iteration */ 
00626     sum = 0; 
00627  
00628     /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00629     k = blockSize3; 
00630  
00631     while(k > 0u) 
00632     { 
00633       /* Perform the multiply-accumulates */ 
00634       /* sum +=  x[srcALen-1] * y[srcBLen-1] */ 
00635       sum = __SMLALD(*px++, *py--, sum); 
00636  
00637       /* Decrement the loop counter */ 
00638       k--; 
00639     } 
00640  
00641     /* Store the result in the accumulator in the destination buffer. */ 
00642     *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 
00643  
00644     /* Update the inputA and inputB pointers for next MAC calculation */ 
00645     px = ++pSrc1; 
00646     py = pSrc2; 
00647  
00648     /* Decrement the loop counter */ 
00649     blockSize3--; 
00650   } 
00651  
00652 } 
00653  
00654 /**  
00655  * @} end of Conv group  
00656  */