CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details

Dependents:   K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_partial_fast_q15.c Source File

arm_conv_partial_fast_q15.c

00001 /* ----------------------------------------------------------------------  
00002 * Copyright (C) 2010 ARM Limited. All rights reserved.  
00003 *  
00004 * $Date:        29. November 2010  
00005 * $Revision:    V1.0.3  
00006 *  
00007 * Project:      CMSIS DSP Library  
00008 * Title:        arm_conv_partial_fast_q15.c  
00009 *  
00010 * Description:  Fast Q15 Partial convolution.  
00011 *  
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Version 1.0.3 2010/11/29 
00015 *    Re-organized the CMSIS folders and updated documentation.  
00016 *   
00017 * Version 1.0.2 2010/11/11  
00018 *    Documentation updated.   
00019 *  
00020 * Version 1.0.1 2010/10/05   
00021 *    Production release and review comments incorporated.  
00022 *  
00023 * Version 1.0.0 2010/09/20   
00024 *    Production release and review comments incorporated.  
00025 * -------------------------------------------------------------------- */ 
00026  
00027 #include "arm_math.h" 
00028  
00029 /**  
00030  * @ingroup groupFilters  
00031  */ 
00032  
00033 /**  
00034  * @addtogroup PartialConv  
00035  * @{  
00036  */ 
00037  
00038 /**  
00039  * @brief Partial convolution of Q15 sequences (fast version).  
00040  * @param[in]       *pSrcA points to the first input sequence.  
00041  * @param[in]       srcALen length of the first input sequence.  
00042  * @param[in]       *pSrcB points to the second input sequence.  
00043  * @param[in]       srcBLen length of the second input sequence.  
00044  * @param[out]      *pDst points to the location where the output result is written.  
00045  * @param[in]       firstIndex is the first output sample to start with.  
00046  * @param[in]       numPoints is the number of output points to be computed.  
00047  * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].  
00048  *  
00049  * See <code>arm_conv_partial_q15()</code> for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.  
00050  */ 
00051  
00052  
00053 arm_status arm_conv_partial_fast_q15( 
00054   q15_t * pSrcA, 
00055   uint32_t srcALen, 
00056   q15_t * pSrcB, 
00057   uint32_t srcBLen, 
00058   q15_t * pDst, 
00059   uint32_t firstIndex, 
00060   uint32_t numPoints) 
00061 { 
00062   q15_t *pIn1;                                   /* inputA pointer               */ 
00063   q15_t *pIn2;                                   /* inputB pointer               */ 
00064   q15_t *pOut = pDst;                            /* output pointer               */ 
00065   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulator                  */ 
00066   q15_t *px;                                     /* Intermediate inputA pointer  */ 
00067   q15_t *py;                                     /* Intermediate inputB pointer  */ 
00068   q15_t *pSrc1, *pSrc2;                          /* Intermediate pointers        */ 
00069   q31_t x0, x1, x2, x3, c0; 
00070   uint32_t j, k, count, check, blkCnt; 
00071   int32_t blockSize1, blockSize2, blockSize3;    /* loop counters                 */ 
00072   arm_status status;                             /* status of Partial convolution */ 
00073   q31_t *pb;                                     /* 32 bit pointer for inputB buffer */ 
00074  
00075   /* Check for range of output samples to be calculated */ 
00076   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 
00077   { 
00078     /* Set status as ARM_MATH_ARGUMENT_ERROR */ 
00079     status = ARM_MATH_ARGUMENT_ERROR; 
00080   } 
00081   else 
00082   { 
00083  
00084     /* The algorithm implementation is based on the lengths of the inputs. */ 
00085     /* srcB is always made to slide across srcA. */ 
00086     /* So srcBLen is always considered as shorter or equal to srcALen */ 
00087     if(srcALen >= srcBLen) 
00088     { 
00089       /* Initialization of inputA pointer */ 
00090       pIn1 = pSrcA; 
00091  
00092       /* Initialization of inputB pointer */ 
00093       pIn2 = pSrcB; 
00094     } 
00095     else 
00096     { 
00097       /* Initialization of inputA pointer */ 
00098       pIn1 = pSrcB; 
00099  
00100       /* Initialization of inputB pointer */ 
00101       pIn2 = pSrcA; 
00102  
00103       /* srcBLen is always considered as shorter or equal to srcALen */ 
00104       j = srcBLen; 
00105       srcBLen = srcALen; 
00106       srcALen = j; 
00107     } 
00108  
00109     /* Conditions to check which loopCounter holds  
00110      * the first and last indices of the output samples to be calculated. */ 
00111     check = firstIndex + numPoints; 
00112     blockSize3 = ((int32_t) check - (int32_t) srcALen); 
00113     blockSize3 = (blockSize3 > 0) ? blockSize3 : 0; 
00114     blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); 
00115     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :  
00116                                                                (int32_t) numPoints) : 0; 
00117     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +  
00118                  (int32_t) firstIndex); 
00119     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; 
00120  
00121     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 
00122     /* The function is internally  
00123      * divided into three stages according to the number of multiplications that has to be  
00124      * taken place between inputA samples and inputB samples. In the first stage of the  
00125      * algorithm, the multiplications increase by one for every iteration.  
00126      * In the second stage of the algorithm, srcBLen number of multiplications are done.  
00127      * In the third stage of the algorithm, the multiplications decrease by one  
00128      * for every iteration. */ 
00129  
00130     /* Set the output pointer to point to the firstIndex  
00131      * of the output sample to be calculated. */ 
00132     pOut = pDst + firstIndex; 
00133  
00134     /* --------------------------  
00135      * Initializations of stage1  
00136      * -------------------------*/ 
00137  
00138     /* sum = x[0] * y[0]  
00139      * sum = x[0] * y[1] + x[1] * y[0]  
00140      * ....  
00141      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]  
00142      */ 
00143  
00144     /* In this stage the MAC operations are increased by 1 for every iteration.  
00145        The count variable holds the number of MAC operations performed.  
00146        Since the partial convolution starts from firstIndex  
00147        Number of Macs to be performed is firstIndex + 1 */ 
00148     count = 1u + firstIndex; 
00149  
00150     /* Working pointer of inputA */ 
00151     px = pIn1; 
00152  
00153     /* Working pointer of inputB */ 
00154     pSrc2 = pIn2 + firstIndex; 
00155     py = pSrc2; 
00156  
00157     /* ------------------------  
00158      * Stage1 process  
00159      * ----------------------*/ 
00160  
00161     /* For loop unrolling by 4, this stage is divided into two. */ 
00162     /* First part of this stage computes the MAC operations less than 4 */ 
00163     /* Second part of this stage computes the MAC operations greater than or equal to 4 */ 
00164  
00165     /* The first part of the stage starts here */ 
00166     while((count < 4u) && (blockSize1 > 0)) 
00167     { 
00168       /* Accumulator is made zero for every iteration */ 
00169       sum = 0; 
00170  
00171       /* Loop over number of MAC operations between  
00172        * inputA samples and inputB samples */ 
00173       k = count; 
00174  
00175       while(k > 0u) 
00176       { 
00177         /* Perform the multiply-accumulates */ 
00178         sum = __SMLAD(*px++, *py--, sum); 
00179  
00180         /* Decrement the loop counter */ 
00181         k--; 
00182       } 
00183  
00184       /* Store the result in the accumulator in the destination buffer. */ 
00185       *pOut++ = (q15_t) (sum >> 15); 
00186  
00187       /* Update the inputA and inputB pointers for next MAC calculation */ 
00188       py = ++pSrc2; 
00189       px = pIn1; 
00190  
00191       /* Increment the MAC count */ 
00192       count++; 
00193  
00194       /* Decrement the loop counter */ 
00195       blockSize1--; 
00196     } 
00197  
00198     /* The second part of the stage starts here */ 
00199     /* The internal loop, over count, is unrolled by 4 */ 
00200     /* To, read the last two inputB samples using SIMD:  
00201      * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ 
00202     py = py - 1; 
00203  
00204     while(blockSize1 > 0) 
00205     { 
00206       /* Accumulator is made zero for every iteration */ 
00207       sum = 0; 
00208  
00209       /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00210       k = count >> 2u; 
00211  
00212       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00213        ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00214       while(k > 0u) 
00215       { 
00216         /* Perform the multiply-accumulates */ 
00217         /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */ 
00218         sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 
00219         /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */ 
00220         sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 
00221  
00222         /* Decrement the loop counter */ 
00223         k--; 
00224       } 
00225  
00226       /* For the next MAC operations, the pointer py is used without SIMD  
00227        * So, py is incremented by 1 */ 
00228       py = py + 1u; 
00229  
00230       /* If the count is not a multiple of 4, compute any remaining MACs here.  
00231        ** No loop unrolling is used. */ 
00232       k = count % 0x4u; 
00233  
00234       while(k > 0u) 
00235       { 
00236         /* Perform the multiply-accumulates */ 
00237         sum = __SMLAD(*px++, *py--, sum); 
00238  
00239         /* Decrement the loop counter */ 
00240         k--; 
00241       } 
00242  
00243       /* Store the result in the accumulator in the destination buffer. */ 
00244       *pOut++ = (q15_t) (sum >> 15); 
00245  
00246       /* Update the inputA and inputB pointers for next MAC calculation */ 
00247       py = ++pSrc2 - 1u; 
00248       px = pIn1; 
00249  
00250       /* Increment the MAC count */ 
00251       count++; 
00252  
00253       /* Decrement the loop counter */ 
00254       blockSize1--; 
00255     } 
00256  
00257     /* --------------------------  
00258      * Initializations of stage2  
00259      * ------------------------*/ 
00260  
00261     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]  
00262      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]  
00263      * ....  
00264      * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]  
00265      */ 
00266  
00267     /* Working pointer of inputA */ 
00268     px = pIn1; 
00269  
00270     /* Working pointer of inputB */ 
00271     pSrc2 = pIn2 + (srcBLen - 1u); 
00272     py = pSrc2; 
00273  
00274     /* Initialize inputB pointer of type q31 */ 
00275     pb = (q31_t *) (py - 1u); 
00276  
00277     /* count is the index by which the pointer pIn1 to be incremented */ 
00278     count = 1u; 
00279  
00280  
00281     /* --------------------  
00282      * Stage2 process  
00283      * -------------------*/ 
00284  
00285     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.  
00286      * So, to loop unroll over blockSize2,  
00287      * srcBLen should be greater than or equal to 4 */ 
00288     if(srcBLen >= 4u) 
00289     { 
00290       /* Loop unroll over blockSize2, by 4 */ 
00291       blkCnt = ((uint32_t) blockSize2 >> 2u); 
00292  
00293       while(blkCnt > 0u) 
00294       { 
00295         /* Set all accumulators to zero */ 
00296         acc0 = 0; 
00297         acc1 = 0; 
00298         acc2 = 0; 
00299         acc3 = 0; 
00300  
00301  
00302         /* read x[0], x[1] samples */ 
00303         x0 = *(q31_t *) (px++); 
00304         /* read x[1], x[2] samples */ 
00305         x1 = *(q31_t *) (px++); 
00306  
00307  
00308         /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00309         k = srcBLen >> 2u; 
00310  
00311         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00312          ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00313         do 
00314         { 
00315           /* Read the last two inputB samples using SIMD:  
00316            * y[srcBLen - 1] and y[srcBLen - 2] */ 
00317           c0 = *(pb--); 
00318  
00319           /* acc0 +=  x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ 
00320           acc0 = __SMLADX(x0, c0, acc0); 
00321  
00322           /* acc1 +=  x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ 
00323           acc1 = __SMLADX(x1, c0, acc1); 
00324  
00325           /* Read x[2], x[3] */ 
00326           x2 = *(q31_t *) (px++); 
00327  
00328           /* Read x[3], x[4] */ 
00329           x3 = *(q31_t *) (px++); 
00330  
00331           /* acc2 +=  x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ 
00332           acc2 = __SMLADX(x2, c0, acc2); 
00333  
00334           /* acc3 +=  x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ 
00335           acc3 = __SMLADX(x3, c0, acc3); 
00336  
00337           /* Read y[srcBLen - 3] and y[srcBLen - 4] */ 
00338           c0 = *(pb--); 
00339  
00340           /* acc0 +=  x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ 
00341           acc0 = __SMLADX(x2, c0, acc0); 
00342  
00343           /* acc1 +=  x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ 
00344           acc1 = __SMLADX(x3, c0, acc1); 
00345  
00346           /* Read x[4], x[5] */ 
00347           x0 = *(q31_t *) (px++); 
00348  
00349           /* Read x[5], x[6] */ 
00350           x1 = *(q31_t *) (px++); 
00351  
00352           /* acc2 +=  x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ 
00353           acc2 = __SMLADX(x0, c0, acc2); 
00354  
00355           /* acc3 +=  x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ 
00356           acc3 = __SMLADX(x1, c0, acc3); 
00357  
00358         } while(--k); 
00359  
00360         /* For the next MAC operations, SIMD is not used  
00361          * So, the 16 bit pointer if inputB, py is updated */ 
00362         py = (q15_t *) pb; 
00363         py = py + 1; 
00364  
00365         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.  
00366          ** No loop unrolling is used. */ 
00367         k = srcBLen % 0x4u; 
00368  
00369         if(k == 1u) 
00370         { 
00371           /* Read y[srcBLen - 5] */ 
00372           c0 = *(py); 
00373  
00374           /* Read x[7] */ 
00375           x3 = *(q31_t *) px++; 
00376  
00377           /* Perform the multiply-accumulates */ 
00378           acc0 = __SMLAD(x0, c0, acc0); 
00379           acc1 = __SMLAD(x1, c0, acc1); 
00380           acc2 = __SMLADX(x1, c0, acc2); 
00381           acc3 = __SMLADX(x3, c0, acc3); 
00382         } 
00383  
00384         if(k == 2u) 
00385         { 
00386           /* Read y[srcBLen - 5], y[srcBLen - 6] */ 
00387           c0 = *(pb); 
00388  
00389           /* Read x[7], x[8] */ 
00390           x3 = *(q31_t *) px++; 
00391  
00392           /* Read x[9] */ 
00393           x2 = *(q31_t *) px++; 
00394  
00395           /* Perform the multiply-accumulates */ 
00396           acc0 = __SMLADX(x0, c0, acc0); 
00397           acc1 = __SMLADX(x1, c0, acc1); 
00398           acc2 = __SMLADX(x3, c0, acc2); 
00399           acc3 = __SMLADX(x2, c0, acc3); 
00400         } 
00401  
00402         if(k == 3u) 
00403         { 
00404           /* Read y[srcBLen - 5], y[srcBLen - 6] */ 
00405           c0 = *pb--; 
00406  
00407           /* Read x[7], x[8] */ 
00408           x3 = *(q31_t *) px++; 
00409  
00410           /* Read x[9] */ 
00411           x2 = *(q31_t *) px++; 
00412  
00413           /* Perform the multiply-accumulates */ 
00414           acc0 = __SMLADX(x0, c0, acc0); 
00415           acc1 = __SMLADX(x1, c0, acc1); 
00416           acc2 = __SMLADX(x3, c0, acc2); 
00417           acc3 = __SMLADX(x2, c0, acc3); 
00418  
00419           /* Read y[srcBLen - 7] */ 
00420           c0 = (q15_t) (*pb >> 16); 
00421  
00422           /* Read x[10] */ 
00423           x3 = *(q31_t *) px++; 
00424  
00425           /* Perform the multiply-accumulates */ 
00426           acc0 = __SMLADX(x1, c0, acc0); 
00427           acc1 = __SMLAD(x2, c0, acc1); 
00428           acc2 = __SMLADX(x2, c0, acc2); 
00429           acc3 = __SMLADX(x3, c0, acc3); 
00430         } 
00431  
00432         /* Store the results in the accumulators in the destination buffer. */ 
00433         *__SIMD32(pOut)++ = __PKHBT(acc0 >> 15, acc1 >> 15, 16); 
00434         *__SIMD32(pOut)++ = __PKHBT(acc2 >> 15, acc3 >> 15, 16); 
00435  
00436         /* Update the inputA and inputB pointers for next MAC calculation */ 
00437         px = pIn1 + (count * 4u); 
00438         py = pSrc2; 
00439         pb = (q31_t *) (py - 1); 
00440  
00441         /* Increment the pointer pIn1 index, count by 1 */ 
00442         count++; 
00443  
00444         /* Decrement the loop counter */ 
00445         blkCnt--; 
00446       } 
00447  
00448       /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.  
00449        ** No loop unrolling is used. */ 
00450       blkCnt = (uint32_t) blockSize2 % 0x4u; 
00451  
00452       while(blkCnt > 0u) 
00453       { 
00454         /* Accumulator is made zero for every iteration */ 
00455         sum = 0; 
00456  
00457         /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00458         k = srcBLen >> 2u; 
00459  
00460         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00461          ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00462         while(k > 0u) 
00463         { 
00464           /* Perform the multiply-accumulates */ 
00465           sum += ((q31_t) * px++ * *py--); 
00466           sum += ((q31_t) * px++ * *py--); 
00467           sum += ((q31_t) * px++ * *py--); 
00468           sum += ((q31_t) * px++ * *py--); 
00469  
00470           /* Decrement the loop counter */ 
00471           k--; 
00472         } 
00473  
00474         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.  
00475          ** No loop unrolling is used. */ 
00476         k = srcBLen % 0x4u; 
00477  
00478         while(k > 0u) 
00479         { 
00480           /* Perform the multiply-accumulates */ 
00481           sum += ((q31_t) * px++ * *py--); 
00482  
00483           /* Decrement the loop counter */ 
00484           k--; 
00485         } 
00486  
00487         /* Store the result in the accumulator in the destination buffer. */ 
00488         *pOut++ = (q15_t) (sum >> 15); 
00489  
00490         /* Update the inputA and inputB pointers for next MAC calculation */ 
00491         px = pIn1 + count; 
00492         py = pSrc2; 
00493  
00494         /* Increment the pointer pIn1 index, count by 1 */ 
00495         count++; 
00496  
00497         /* Decrement the loop counter */ 
00498         blkCnt--; 
00499       } 
00500     } 
00501     else 
00502     { 
00503       /* If the srcBLen is not a multiple of 4,  
00504        * the blockSize2 loop cannot be unrolled by 4 */ 
00505       blkCnt = (uint32_t) blockSize2; 
00506  
00507       while(blkCnt > 0u) 
00508       { 
00509         /* Accumulator is made zero for every iteration */ 
00510         sum = 0; 
00511  
00512         /* srcBLen number of MACS should be performed */ 
00513         k = srcBLen; 
00514  
00515         while(k > 0u) 
00516         { 
00517           /* Perform the multiply-accumulate */ 
00518           sum += ((q31_t) * px++ * *py--); 
00519  
00520           /* Decrement the loop counter */ 
00521           k--; 
00522         } 
00523  
00524         /* Store the result in the accumulator in the destination buffer. */ 
00525         *pOut++ = (q15_t) (sum >> 15); 
00526  
00527         /* Update the inputA and inputB pointers for next MAC calculation */ 
00528         px = pIn1 + count; 
00529         py = pSrc2; 
00530  
00531         /* Increment the MAC count */ 
00532         count++; 
00533  
00534         /* Decrement the loop counter */ 
00535         blkCnt--; 
00536       } 
00537     } 
00538  
00539  
00540     /* --------------------------  
00541      * Initializations of stage3  
00542      * -------------------------*/ 
00543  
00544     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]  
00545      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]  
00546      * ....  
00547      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]  
00548      * sum +=  x[srcALen-1] * y[srcBLen-1]  
00549      */ 
00550  
00551     /* In this stage the MAC operations are decreased by 1 for every iteration.  
00552        The count variable holds the number of MAC operations performed */ 
00553     count = srcBLen - 1u; 
00554  
00555     /* Working pointer of inputA */ 
00556     pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 
00557     px = pSrc1; 
00558  
00559     /* Working pointer of inputB */ 
00560     pSrc2 = pIn2 + (srcBLen - 1u); 
00561     pIn2 = pSrc2 - 1u; 
00562     py = pIn2; 
00563  
00564     /* -------------------  
00565      * Stage3 process  
00566      * ------------------*/ 
00567  
00568     /* For loop unrolling by 4, this stage is divided into two. */ 
00569     /* First part of this stage computes the MAC operations greater than 4 */ 
00570     /* Second part of this stage computes the MAC operations less than or equal to 4 */ 
00571  
00572     /* The first part of the stage starts here */ 
00573     j = count >> 2u; 
00574  
00575     while((j > 0u) && (blockSize3 > 0)) 
00576     { 
00577       /* Accumulator is made zero for every iteration */ 
00578       sum = 0; 
00579  
00580       /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00581       k = count >> 2u; 
00582  
00583       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00584        ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00585       while(k > 0u) 
00586       { 
00587         /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied  
00588          * with y[srcBLen - 1], y[srcBLen - 2] respectively */ 
00589         sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 
00590         /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied  
00591          * with y[srcBLen - 3], y[srcBLen - 4] respectively */ 
00592         sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 
00593  
00594         /* Decrement the loop counter */ 
00595         k--; 
00596       } 
00597  
00598       /* For the next MAC operations, the pointer py is used without SIMD  
00599        * So, py is incremented by 1 */ 
00600       py = py + 1u; 
00601  
00602       /* If the count is not a multiple of 4, compute any remaining MACs here.  
00603        ** No loop unrolling is used. */ 
00604       k = count % 0x4u; 
00605  
00606       while(k > 0u) 
00607       { 
00608         /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */ 
00609         sum = __SMLAD(*px++, *py--, sum); 
00610  
00611         /* Decrement the loop counter */ 
00612         k--; 
00613       } 
00614  
00615       /* Store the result in the accumulator in the destination buffer. */ 
00616       *pOut++ = (q15_t) (sum >> 15); 
00617  
00618       /* Update the inputA and inputB pointers for next MAC calculation */ 
00619       px = ++pSrc1; 
00620       py = pIn2; 
00621  
00622       /* Decrement the MAC count */ 
00623       count--; 
00624  
00625       /* Decrement the loop counter */ 
00626       blockSize3--; 
00627  
00628       j--; 
00629     } 
00630  
00631     /* The second part of the stage starts here */ 
00632     /* SIMD is not used for the next MAC operations,  
00633      * so pointer py is updated to read only one sample at a time */ 
00634     py = py + 1u; 
00635  
00636     while(blockSize3 > 0) 
00637     { 
00638       /* Accumulator is made zero for every iteration */ 
00639       sum = 0; 
00640  
00641       /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00642       k = count; 
00643  
00644       while(k > 0u) 
00645       { 
00646         /* Perform the multiply-accumulates */ 
00647         /* sum +=  x[srcALen-1] * y[srcBLen-1] */ 
00648         sum = __SMLAD(*px++, *py--, sum); 
00649  
00650         /* Decrement the loop counter */ 
00651         k--; 
00652       } 
00653  
00654       /* Store the result in the accumulator in the destination buffer. */ 
00655       *pOut++ = (q15_t) (sum >> 15); 
00656  
00657       /* Update the inputA and inputB pointers for next MAC calculation */ 
00658       px = ++pSrc1; 
00659       py = pSrc2; 
00660  
00661       /* Decrement the MAC count */ 
00662       count--; 
00663  
00664       /* Decrement the loop counter */ 
00665       blockSize3--; 
00666     } 
00667  
00668     /* set status as ARM_MATH_SUCCESS */ 
00669     status = ARM_MATH_SUCCESS; 
00670   } 
00671  
00672   /* Return to application */ 
00673   return (status); 
00674  
00675 } 
00676  
00677 /**  
00678  * @} end of PartialConv group  
00679  */