CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details

Dependents:   K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_q7.c Source File

arm_conv_q7.c

00001 /* ----------------------------------------------------------------------  
00002 * Copyright (C) 2010 ARM Limited. All rights reserved.  
00003 *  
00004 * $Date:        29. November 2010  
00005 * $Revision:    V1.0.3  
00006 *  
00007 * Project:      CMSIS DSP Library  
00008 * Title:        arm_conv_q7.c  
00009 *  
00010 * Description:  Q7 Convolution.  
00011 *  
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Version 1.0.3 2010/11/29 
00015 *    Re-organized the CMSIS folders and updated documentation.  
00016 *   
00017 * Version 1.0.2 2010/11/11  
00018 *    Documentation updated.   
00019 *  
00020 * Version 1.0.1 2010/10/05   
00021 *    Production release and review comments incorporated.  
00022 *  
00023 * Version 1.0.0 2010/09/20   
00024 *    Production release and review comments incorporated  
00025 *  
00026 * Version 0.0.7  2010/06/10   
00027 *    Misra-C changes done  
00028 *  
00029 * -------------------------------------------------------------------- */ 
00030  
00031 #include "arm_math.h" 
00032  
00033 /**  
00034  * @ingroup groupFilters  
00035  */ 
00036  
00037 /**  
00038  * @addtogroup Conv  
00039  * @{  
00040  */ 
00041  
00042 /**  
00043  * @brief Convolution of Q7 sequences.  
00044  * @param[in] *pSrcA points to the first input sequence.  
00045  * @param[in] srcALen length of the first input sequence.  
00046  * @param[in] *pSrcB points to the second input sequence.  
00047  * @param[in] srcBLen length of the second input sequence.  
00048  * @param[out] *pDst points to the location where the output result is written.  Length srcALen+srcBLen-1.  
00049  * @return none.  
00050  *  
00051  * @details  
00052  * <b>Scaling and Overflow Behavior:</b>  
00053  *  
00054  * \par  
00055  * The function is implemented using a 32-bit internal accumulator.  
00056  * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.  
00057  * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.  
00058  * This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>.  
00059  * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format.  
00060  */ 
00061  
00062 void arm_conv_q7( 
00063   q7_t * pSrcA, 
00064   uint32_t srcALen, 
00065   q7_t * pSrcB, 
00066   uint32_t srcBLen, 
00067   q7_t * pDst) 
00068 { 
00069   q7_t *pIn1;                                    /* inputA pointer */ 
00070   q7_t *pIn2;                                    /* inputB pointer */ 
00071   q7_t *pOut = pDst;                             /* output pointer */ 
00072   q7_t *px;                                      /* Intermediate inputA pointer */ 
00073   q7_t *py;                                      /* Intermediate inputB pointer */ 
00074   q7_t *pSrc1, *pSrc2;                           /* Intermediate pointers */ 
00075   q7_t x0, x1, x2, x3, c0, c1;                   /* Temporary variables to hold state and coefficient values */ 
00076   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulator */ 
00077   q31_t input1, input2;                          /* Temporary input variables */ 
00078   q15_t in1, in2;                                /* Temporary input variables */ 
00079   uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3;     /* loop counter */ 
00080  
00081  
00082   /* The algorithm implementation is based on the lengths of the inputs. */ 
00083   /* srcB is always made to slide across srcA. */ 
00084   /* So srcBLen is always considered as shorter or equal to srcALen */ 
00085   if(srcALen >= srcBLen) 
00086   { 
00087     /* Initialization of inputA pointer */ 
00088     pIn1 = pSrcA; 
00089  
00090     /* Initialization of inputB pointer */ 
00091     pIn2 = pSrcB; 
00092   } 
00093   else 
00094   { 
00095     /* Initialization of inputA pointer */ 
00096     pIn1 = pSrcB; 
00097  
00098     /* Initialization of inputB pointer */ 
00099     pIn2 = pSrcA; 
00100  
00101     /* srcBLen is always considered as shorter or equal to srcALen */ 
00102     j = srcBLen; 
00103     srcBLen = srcALen; 
00104     srcALen = j; 
00105   } 
00106  
00107   /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 
00108   /* The function is internally  
00109    * divided into three stages according to the number of multiplications that has to be  
00110    * taken place between inputA samples and inputB samples. In the first stage of the  
00111    * algorithm, the multiplications increase by one for every iteration.  
00112    * In the second stage of the algorithm, srcBLen number of multiplications are done.  
00113    * In the third stage of the algorithm, the multiplications decrease by one  
00114    * for every iteration. */ 
00115  
00116   /* The algorithm is implemented in three stages.  
00117      The loop counters of each stage is initiated here. */ 
00118   blockSize1 = srcBLen - 1u; 
00119   blockSize2 = (srcALen - srcBLen) + 1u; 
00120   blockSize3 = blockSize1; 
00121  
00122   /* --------------------------  
00123    * Initializations of stage1  
00124    * -------------------------*/ 
00125  
00126   /* sum = x[0] * y[0]  
00127    * sum = x[0] * y[1] + x[1] * y[0]  
00128    * ....  
00129    * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]  
00130    */ 
00131  
00132   /* In this stage the MAC operations are increased by 1 for every iteration.  
00133      The count variable holds the number of MAC operations performed */ 
00134   count = 1u; 
00135  
00136   /* Working pointer of inputA */ 
00137   px = pIn1; 
00138  
00139   /* Working pointer of inputB */ 
00140   py = pIn2; 
00141  
00142  
00143   /* ------------------------  
00144    * Stage1 process  
00145    * ----------------------*/ 
00146  
00147   /* The first stage starts here */ 
00148   while(blockSize1 > 0u) 
00149   { 
00150     /* Accumulator is made zero for every iteration */ 
00151     sum = 0; 
00152  
00153     /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00154     k = count >> 2u; 
00155  
00156     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00157      ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00158     while(k > 0u) 
00159     { 
00160       /* x[0] , x[1] */ 
00161       in1 = (q15_t) * px++; 
00162       in2 = (q15_t) * px++; 
00163       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 
00164  
00165       /* y[srcBLen - 1] , y[srcBLen - 2] */ 
00166       in1 = (q15_t) * py--; 
00167       in2 = (q15_t) * py--; 
00168       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 
00169  
00170       /* x[0] * y[srcBLen - 1] */ 
00171       /* x[1] * y[srcBLen - 2] */ 
00172       sum = __SMLAD(input1, input2, sum); 
00173  
00174       /* x[2] , x[3] */ 
00175       in1 = (q15_t) * px++; 
00176       in2 = (q15_t) * px++; 
00177       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 
00178  
00179       /* y[srcBLen - 3] , y[srcBLen - 4] */ 
00180       in1 = (q15_t) * py--; 
00181       in2 = (q15_t) * py--; 
00182       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 
00183  
00184       /* x[2] * y[srcBLen - 3] */ 
00185       /* x[3] * y[srcBLen - 4] */ 
00186       sum = __SMLAD(input1, input2, sum); 
00187  
00188       /* Decrement the loop counter */ 
00189       k--; 
00190     } 
00191  
00192     /* If the count is not a multiple of 4, compute any remaining MACs here.  
00193      ** No loop unrolling is used. */ 
00194     k = count % 0x4u; 
00195  
00196     while(k > 0u) 
00197     { 
00198       /* Perform the multiply-accumulates */ 
00199       sum += ((q15_t) * px++ * *py--); 
00200  
00201       /* Decrement the loop counter */ 
00202       k--; 
00203     } 
00204  
00205     /* Store the result in the accumulator in the destination buffer. */ 
00206     *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8)); 
00207  
00208     /* Update the inputA and inputB pointers for next MAC calculation */ 
00209     py = pIn2 + count; 
00210     px = pIn1; 
00211  
00212     /* Increment the MAC count */ 
00213     count++; 
00214  
00215     /* Decrement the loop counter */ 
00216     blockSize1--; 
00217   } 
00218  
00219   /* --------------------------  
00220    * Initializations of stage2  
00221    * ------------------------*/ 
00222  
00223   /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]  
00224    * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]  
00225    * ....  
00226    * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]  
00227    */ 
00228  
00229   /* Working pointer of inputA */ 
00230   px = pIn1; 
00231  
00232   /* Working pointer of inputB */ 
00233   pSrc2 = pIn2 + (srcBLen - 1u); 
00234   py = pSrc2; 
00235  
00236   /* count is index by which the pointer pIn1 to be incremented */ 
00237   count = 1u; 
00238  
00239   /* -------------------  
00240    * Stage2 process  
00241    * ------------------*/ 
00242  
00243   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.  
00244    * So, to loop unroll over blockSize2,  
00245    * srcBLen should be greater than or equal to 4 */ 
00246   if(srcBLen >= 4u) 
00247   { 
00248     /* Loop unroll over blockSize2, by 4 */ 
00249     blkCnt = blockSize2 >> 2u; 
00250  
00251     while(blkCnt > 0u) 
00252     { 
00253       /* Set all accumulators to zero */ 
00254       acc0 = 0; 
00255       acc1 = 0; 
00256       acc2 = 0; 
00257       acc3 = 0; 
00258  
00259       /* read x[0], x[1], x[2] samples */ 
00260       x0 = *(px++); 
00261       x1 = *(px++); 
00262       x2 = *(px++); 
00263  
00264       /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00265       k = srcBLen >> 2u; 
00266  
00267       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00268        ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00269       do 
00270       { 
00271         /* Read y[srcBLen - 1] sample */ 
00272         c0 = *(py--); 
00273         /* Read y[srcBLen - 2] sample */ 
00274         c1 = *(py--); 
00275  
00276         /* Read x[3] sample */ 
00277         x3 = *(px++); 
00278  
00279         /* x[0] and x[1] are packed */ 
00280         in1 = (q15_t) x0; 
00281         in2 = (q15_t) x1; 
00282  
00283         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 
00284  
00285         /* y[srcBLen - 1]   and y[srcBLen - 2] are packed */ 
00286         in1 = (q15_t) c0; 
00287         in2 = (q15_t) c1; 
00288  
00289         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 
00290  
00291         /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2]  */ 
00292         acc0 = __SMLAD(input1, input2, acc0); 
00293  
00294         /* x[1] and x[2] are packed */ 
00295         in1 = (q15_t) x1; 
00296         in2 = (q15_t) x2; 
00297  
00298         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 
00299  
00300         /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2]  */ 
00301         acc1 = __SMLAD(input1, input2, acc1); 
00302  
00303         /* x[2] and x[3] are packed */ 
00304         in1 = (q15_t) x2; 
00305         in2 = (q15_t) x3; 
00306  
00307         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 
00308  
00309         /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2]  */ 
00310         acc2 = __SMLAD(input1, input2, acc2); 
00311  
00312         /* Read x[4] sample */ 
00313         x0 = *(px++); 
00314  
00315         /* x[3] and x[4] are packed */ 
00316         in1 = (q15_t) x3; 
00317         in2 = (q15_t) x0; 
00318  
00319         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 
00320  
00321         /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2]  */ 
00322         acc3 = __SMLAD(input1, input2, acc3); 
00323  
00324         /* Read y[srcBLen - 3] sample */ 
00325         c0 = *(py--); 
00326         /* Read y[srcBLen - 4] sample */ 
00327         c1 = *(py--); 
00328  
00329         /* Read x[5] sample */ 
00330         x1 = *(px++); 
00331  
00332         /* x[2] and x[3] are packed */ 
00333         in1 = (q15_t) x2; 
00334         in2 = (q15_t) x3; 
00335  
00336         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 
00337  
00338         /* y[srcBLen - 3] and y[srcBLen - 4] are packed */ 
00339         in1 = (q15_t) c0; 
00340         in2 = (q15_t) c1; 
00341  
00342         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 
00343  
00344         /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4]  */ 
00345         acc0 = __SMLAD(input1, input2, acc0); 
00346  
00347         /* x[3] and x[4] are packed */ 
00348         in1 = (q15_t) x3; 
00349         in2 = (q15_t) x0; 
00350  
00351         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 
00352  
00353         /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4]  */ 
00354         acc1 = __SMLAD(input1, input2, acc1); 
00355  
00356         /* x[4] and x[5] are packed */ 
00357         in1 = (q15_t) x0; 
00358         in2 = (q15_t) x1; 
00359  
00360         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 
00361  
00362         /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4]  */ 
00363         acc2 = __SMLAD(input1, input2, acc2); 
00364  
00365         /* Read x[6] sample */ 
00366         x2 = *(px++); 
00367  
00368         /* x[5] and x[6] are packed */ 
00369         in1 = (q15_t) x1; 
00370         in2 = (q15_t) x2; 
00371  
00372         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 
00373  
00374         /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4]  */ 
00375         acc3 = __SMLAD(input1, input2, acc3); 
00376  
00377       } while(--k); 
00378  
00379       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.  
00380        ** No loop unrolling is used. */ 
00381       k = srcBLen % 0x4u; 
00382  
00383       while(k > 0u) 
00384       { 
00385         /* Read y[srcBLen - 5] sample */ 
00386         c0 = *(py--); 
00387  
00388         /* Read x[7] sample */ 
00389         x3 = *(px++); 
00390  
00391         /* Perform the multiply-accumulates */ 
00392         /* acc0 +=  x[4] * y[srcBLen - 5] */ 
00393         acc0 += ((q15_t) x0 * c0); 
00394         /* acc1 +=  x[5] * y[srcBLen - 5] */ 
00395         acc1 += ((q15_t) x1 * c0); 
00396         /* acc2 +=  x[6] * y[srcBLen - 5] */ 
00397         acc2 += ((q15_t) x2 * c0); 
00398         /* acc3 +=  x[7] * y[srcBLen - 5] */ 
00399         acc3 += ((q15_t) x3 * c0); 
00400  
00401         /* Reuse the present samples for the next MAC */ 
00402         x0 = x1; 
00403         x1 = x2; 
00404         x2 = x3; 
00405  
00406         /* Decrement the loop counter */ 
00407         k--; 
00408       } 
00409  
00410  
00411       /* Store the result in the accumulator in the destination buffer. */ 
00412       *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8)); 
00413       *pOut++ = (q7_t) (__SSAT(acc1 >> 7u, 8)); 
00414       *pOut++ = (q7_t) (__SSAT(acc2 >> 7u, 8)); 
00415       *pOut++ = (q7_t) (__SSAT(acc3 >> 7u, 8)); 
00416  
00417       /* Update the inputA and inputB pointers for next MAC calculation */ 
00418       px = pIn1 + (count * 4u); 
00419       py = pSrc2; 
00420  
00421       /* Increment the pointer pIn1 index, count by 1 */ 
00422       count++; 
00423  
00424       /* Decrement the loop counter */ 
00425       blkCnt--; 
00426     } 
00427  
00428     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.  
00429      ** No loop unrolling is used. */ 
00430     blkCnt = blockSize2 % 0x4u; 
00431  
00432     while(blkCnt > 0u) 
00433     { 
00434       /* Accumulator is made zero for every iteration */ 
00435       sum = 0; 
00436  
00437       /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00438       k = srcBLen >> 2u; 
00439  
00440       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00441        ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00442       while(k > 0u) 
00443       { 
00444  
00445         /* Reading two inputs of SrcA buffer and packing */ 
00446         in1 = (q15_t) * px++; 
00447         in2 = (q15_t) * px++; 
00448         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 
00449  
00450         /* Reading two inputs of SrcB buffer and packing */ 
00451         in1 = (q15_t) * py--; 
00452         in2 = (q15_t) * py--; 
00453         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 
00454  
00455         /* Perform the multiply-accumulates */ 
00456         sum = __SMLAD(input1, input2, sum); 
00457  
00458         /* Reading two inputs of SrcA buffer and packing */ 
00459         in1 = (q15_t) * px++; 
00460         in2 = (q15_t) * px++; 
00461         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 
00462  
00463         /* Reading two inputs of SrcB buffer and packing */ 
00464         in1 = (q15_t) * py--; 
00465         in2 = (q15_t) * py--; 
00466         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 
00467  
00468         /* Perform the multiply-accumulates */ 
00469         sum = __SMLAD(input1, input2, sum); 
00470  
00471         /* Decrement the loop counter */ 
00472         k--; 
00473       } 
00474  
00475       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.  
00476        ** No loop unrolling is used. */ 
00477       k = srcBLen % 0x4u; 
00478  
00479       while(k > 0u) 
00480       { 
00481         /* Perform the multiply-accumulates */ 
00482         sum += ((q15_t) * px++ * *py--); 
00483  
00484         /* Decrement the loop counter */ 
00485         k--; 
00486       } 
00487  
00488       /* Store the result in the accumulator in the destination buffer. */ 
00489       *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8)); 
00490  
00491       /* Update the inputA and inputB pointers for next MAC calculation */ 
00492       px = pIn1 + count; 
00493       py = pSrc2; 
00494  
00495       /* Increment the pointer pIn1 index, count by 1 */ 
00496       count++; 
00497  
00498       /* Decrement the loop counter */ 
00499       blkCnt--; 
00500     } 
00501   } 
00502   else 
00503   { 
00504     /* If the srcBLen is not a multiple of 4,  
00505      * the blockSize2 loop cannot be unrolled by 4 */ 
00506     blkCnt = blockSize2; 
00507  
00508     while(blkCnt > 0u) 
00509     { 
00510       /* Accumulator is made zero for every iteration */ 
00511       sum = 0; 
00512  
00513       /* srcBLen number of MACS should be performed */ 
00514       k = srcBLen; 
00515  
00516       while(k > 0u) 
00517       { 
00518         /* Perform the multiply-accumulate */ 
00519         sum += ((q15_t) * px++ * *py--); 
00520  
00521         /* Decrement the loop counter */ 
00522         k--; 
00523       } 
00524  
00525       /* Store the result in the accumulator in the destination buffer. */ 
00526       *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8)); 
00527  
00528       /* Update the inputA and inputB pointers for next MAC calculation */ 
00529       px = pIn1 + count; 
00530       py = pSrc2; 
00531  
00532       /* Increment the MAC count */ 
00533       count++; 
00534  
00535       /* Decrement the loop counter */ 
00536       blkCnt--; 
00537     } 
00538   } 
00539  
00540  
00541   /* --------------------------  
00542    * Initializations of stage3  
00543    * -------------------------*/ 
00544  
00545   /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]  
00546    * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]  
00547    * ....  
00548    * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]  
00549    * sum +=  x[srcALen-1] * y[srcBLen-1]  
00550    */ 
00551  
00552   /* In this stage the MAC operations are decreased by 1 for every iteration.  
00553      The blockSize3 variable holds the number of MAC operations performed */ 
00554  
00555   /* Working pointer of inputA */ 
00556   pSrc1 = pIn1 + (srcALen - (srcBLen - 1u)); 
00557   px = pSrc1; 
00558  
00559   /* Working pointer of inputB */ 
00560   pSrc2 = pIn2 + (srcBLen - 1u); 
00561   py = pSrc2; 
00562  
00563   /* -------------------  
00564    * Stage3 process  
00565    * ------------------*/ 
00566  
00567   while(blockSize3 > 0u) 
00568   { 
00569     /* Accumulator is made zero for every iteration */ 
00570     sum = 0; 
00571  
00572     /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00573     k = blockSize3 >> 2u; 
00574  
00575     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00576      ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00577     while(k > 0u) 
00578     { 
00579       /* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */ 
00580       in1 = (q15_t) * px++; 
00581       in2 = (q15_t) * px++; 
00582       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 
00583  
00584       /* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */ 
00585       in1 = (q15_t) * py--; 
00586       in2 = (q15_t) * py--; 
00587       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 
00588  
00589       /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ 
00590       /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ 
00591       sum = __SMLAD(input1, input2, sum); 
00592  
00593       /* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */ 
00594       in1 = (q15_t) * px++; 
00595       in2 = (q15_t) * px++; 
00596       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 
00597  
00598       /* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */ 
00599       in1 = (q15_t) * py--; 
00600       in2 = (q15_t) * py--; 
00601       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 
00602  
00603       /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ 
00604       /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ 
00605       sum = __SMLAD(input1, input2, sum); 
00606  
00607       /* Decrement the loop counter */ 
00608       k--; 
00609     } 
00610  
00611     /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.  
00612      ** No loop unrolling is used. */ 
00613     k = blockSize3 % 0x4u; 
00614  
00615     while(k > 0u) 
00616     { 
00617       /* Perform the multiply-accumulates */ 
00618       sum += ((q15_t) * px++ * *py--); 
00619  
00620       /* Decrement the loop counter */ 
00621       k--; 
00622     } 
00623  
00624     /* Store the result in the accumulator in the destination buffer. */ 
00625     *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8)); 
00626  
00627     /* Update the inputA and inputB pointers for next MAC calculation */ 
00628     px = ++pSrc1; 
00629     py = pSrc2; 
00630  
00631     /* Decrement the loop counter */ 
00632     blockSize3--; 
00633   } 
00634  
00635 } 
00636  
00637 /**  
00638  * @} end of Conv group  
00639  */