CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details

Dependents:   K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_partial_q15.c Source File

arm_conv_partial_q15.c

00001 /* ----------------------------------------------------------------------  
00002 * Copyright (C) 2010 ARM Limited. All rights reserved.  
00003 *  
00004 * $Date:        29. November 2010  
00005 * $Revision:    V1.0.3  
00006 *  
00007 * Project:      CMSIS DSP Library  
00008 * Title:        arm_conv_partial_q15.c  
00009 *  
00010 * Description:  Q15 Partial convolution.  
00011 *  
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Version 1.0.3 2010/11/29 
00015 *    Re-organized the CMSIS folders and updated documentation.  
00016 *   
00017 * Version 1.0.2 2010/11/11  
00018 *    Documentation updated.   
00019 *  
00020 * Version 1.0.1 2010/10/05   
00021 *    Production release and review comments incorporated.  
00022 *  
00023 * Version 1.0.0 2010/09/20   
00024 *    Production release and review comments incorporated  
00025 *  
00026 * Version 0.0.7  2010/06/10   
00027 *    Misra-C changes done  
00028 *  
00029 * -------------------------------------------------------------------- */ 
00030  
00031 #include "arm_math.h" 
00032  
00033 /**  
00034  * @ingroup groupFilters  
00035  */ 
00036  
00037 /**  
00038  * @addtogroup PartialConv  
00039  * @{  
00040  */ 
00041  
00042 /**  
00043  * @brief Computes a requested subset of the convolution of two Q15 sequences.  
00044  * @param[in]       *pSrcA points to the first input sequence.  
00045  * @param[in]       srcALen length of the first input sequence.  
00046  * @param[in]       *pSrcB points to the second input sequence.  
00047  * @param[in]       srcBLen length of the second input sequence.  
00048  * @param[out]      *pDst points to the output buffer; results are written starting at pDst[firstIndex].  
00049  * @param[in]       firstIndex is the index of the first output sample to compute.  
00050  * @param[in]       numPoints is the number of output points to be computed.  
00051  * @return  ARM_MATH_SUCCESS if the function completed correctly, or ARM_MATH_ARGUMENT_ERROR if firstIndex + numPoints exceeds srcALen + srcBLen - 1 (the requested subset is not in the range [0, srcALen+srcBLen-2]).  
00052  *  
00053  * Refer to <code>arm_conv_partial_fast_q15()</code> for a faster but less precise version of this function.  
00054  */ 
00055  
00056  
00057 arm_status arm_conv_partial_q15( 
00058   q15_t * pSrcA, 
00059   uint32_t srcALen, 
00060   q15_t * pSrcB, 
00061   uint32_t srcBLen, 
00062   q15_t * pDst, 
00063   uint32_t firstIndex, 
00064   uint32_t numPoints) 
00065 { 
00066   q15_t *pIn1;                                   /* inputA pointer (the longer sequence after the swap below) */ 
00067   q15_t *pIn2;                                   /* inputB pointer (the shorter sequence after the swap below) */ 
00068   q15_t *pOut = pDst;                            /* output pointer               */ 
00069   q63_t sum, acc0, acc1, acc2, acc3;             /* Accumulators                 */ 
00070   q15_t *px;                                     /* Intermediate inputA pointer  */ 
00071   q15_t *py;                                     /* Intermediate inputB pointer  */ 
00072   q15_t *pSrc1, *pSrc2;                          /* Intermediate pointers        */ 
00073   q31_t x0, x1, x2, x3, c0;                      /* Temporary input variables */ 
00074   uint32_t j, k, count, check, blkCnt; 
00075   int32_t blockSize1, blockSize2, blockSize3;    /* number of outputs per stage  */ 
00076   arm_status status;                             /* status of Partial convolution */ 
00077   q31_t *pb;                                     /* 32 bit pointer for inputB buffer */ 
00078  
00079   /* Reject requests that extend past the last output sample (index srcALen + srcBLen - 2) */ 
00080   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 
00081   { 
00082     /* Set status as ARM_MATH_ARGUMENT_ERROR */ 
00083     status = ARM_MATH_ARGUMENT_ERROR; 
00084   } 
00085   else 
00086   { 
00087  
00088     /* The algorithm implementation is based on the lengths of the inputs. */ 
00089     /* srcB is always made to slide across srcA. */ 
00090     /* So srcBLen is always considered as shorter or equal to srcALen */ 
00091     if(srcALen >= srcBLen) 
00092     { 
00093       /* Initialization of inputA pointer */ 
00094       pIn1 = pSrcA; 
00095  
00096       /* Initialization of inputB pointer */ 
00097       pIn2 = pSrcB; 
00098     } 
00099     else 
00100     { 
00101       /* Initialization of inputA pointer */ 
00102       pIn1 = pSrcB; 
00103  
00104       /* Initialization of inputB pointer */ 
00105       pIn2 = pSrcA; 
00106  
00107       /* Swap the lengths so that srcBLen is the shorter (or equal) one */ 
00108       j = srcBLen; 
00109       srcBLen = srcALen; 
00110       srcALen = j; 
00111     } 
00112  
00113     /* Split the requested outputs across the three stages:  
00114      * blockSize1, blockSize2 and blockSize3 are the output counts of stage 1, 2 and 3. */ 
00115     check = firstIndex + numPoints; 
00116     blockSize3 = ((int32_t) check - (int32_t) srcALen); 
00117     blockSize3 = (blockSize3 > 0) ? blockSize3 : 0; 
00118     blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); 
00119     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 : 
00120                                     (int32_t) numPoints) : 0; 
00121     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + 
00122                                     (int32_t) firstIndex); 
00123     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; 
00124  
00125     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 
00126     /* The function is internally  
00127      * divided into three stages according to the number of multiplications that has to be  
00128      * taken place between inputA samples and inputB samples. In the first stage of the  
00129      * algorithm, the multiplications increase by one for every iteration.  
00130      * In the second stage of the algorithm, srcBLen number of multiplications are done.  
00131      * In the third stage of the algorithm, the multiplications decrease by one  
00132      * for every iteration. */ 
00133  
00134     /* Set the output pointer to point to the firstIndex  
00135      * of the output sample to be calculated. */ 
00136     pOut = pDst + firstIndex; 
00137  
00138     /* --------------------------  
00139      * Initializations of stage1  
00140      * -------------------------*/ 
00141  
00142     /* sum = x[0] * y[0]  
00143      * sum = x[0] * y[1] + x[1] * y[0]  
00144      * ....  
00145      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]  
00146      */ 
00147  
00148     /* In this stage the MAC operations are increased by 1 for every iteration.  
00149        The count variable holds the number of MAC operations performed.  
00150        Since the partial convolution starts from firstIndex  
00151        Number of Macs to be performed is firstIndex + 1 */ 
00152     count = 1u + firstIndex; 
00153  
00154     /* Working pointer of inputA */ 
00155     px = pIn1; 
00156  
00157     /* Working pointer of inputB: starts at y[firstIndex] and is walked backwards */ 
00158     pSrc2 = pIn2 + firstIndex; 
00159     py = pSrc2; 
00160  
00161     /* ------------------------  
00162      * Stage1 process  
00163      * ----------------------*/ 
00164  
00165     /* For loop unrolling by 4, this stage is divided into two. */ 
00166     /* First part of this stage computes the MAC operations less than 4 */ 
00167     /* Second part of this stage computes the MAC operations greater than or equal to 4 */ 
00168  
00169     /* The first part of the stage starts here */ 
00170     while((count < 4u) && (blockSize1 > 0)) 
00171     { 
00172       /* Accumulator is made zero for every iteration */ 
00173       sum = 0; 
00174  
00175       /* Loop over number of MAC operations between  
00176        * inputA samples and inputB samples */ 
00177       k = count; 
00178  
00179       while(k > 0u) 
00180       { 
00181         /* Perform the multiply-accumulates */ 
00182         sum = __SMLALD(*px++, *py--, sum); 
00183  
00184         /* Decrement the loop counter */ 
00185         k--; 
00186       } 
00187  
00188       /* Scale the accumulator down by 15 bits, saturate to 16 bits and store it in the destination buffer. */ 
00189       *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 
00190  
00191       /* Update the inputA and inputB pointers for next MAC calculation */ 
00192       py = ++pSrc2; 
00193       px = pIn1; 
00194  
00195       /* Increment the MAC count */ 
00196       count++; 
00197  
00198       /* Decrement the loop counter */ 
00199       blockSize1--; 
00200     } 
00201  
00202     /* The second part of the stage starts here */ 
00203     /* The internal loop, over count, is unrolled by 4 */ 
00204     /* To read two inputB samples with one SIMD access  
00205      * (y[count - 1] and y[count - 2]), py is decremented by 1 */ 
00206     py = py - 1; 
00207  
00208     while(blockSize1 > 0) 
00209     { 
00210       /* Accumulator is made zero for every iteration */ 
00211       sum = 0; 
00212  
00213       /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00214       k = count >> 2u; 
00215  
00216       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00217        ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00218       while(k > 0u) 
00219       { 
00220         /* Perform the multiply-accumulates */ 
00221         /* x[0], x[1] are multiplied with y[count - 1], y[count - 2] respectively */ 
00222         sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 
00223         /* x[2], x[3] are multiplied with y[count - 3], y[count - 4] respectively */ 
00224         sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 
00225  
00226         /* Decrement the loop counter */ 
00227         k--; 
00228       } 
00229  
00230       /* For the next MAC operations, the pointer py is used without SIMD  
00231        * So, py is incremented by 1 */ 
00232       py = py + 1u; 
00233  
00234       /* If the count is not a multiple of 4, compute any remaining MACs here.  
00235        ** No loop unrolling is used. */ 
00236       k = count % 0x4u; 
00237  
00238       while(k > 0u) 
00239       { 
00240         /* Perform the multiply-accumulates */ 
00241         sum = __SMLALD(*px++, *py--, sum); 
00242  
00243         /* Decrement the loop counter */ 
00244         k--; 
00245       } 
00246  
00247       /* Scale the accumulator down by 15 bits, saturate to 16 bits and store it in the destination buffer. */ 
00248       *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 
00249  
00250       /* Update the inputA and inputB pointers for next MAC calculation */ 
00251       py = ++pSrc2 - 1u; 
00252       px = pIn1; 
00253  
00254       /* Increment the MAC count */ 
00255       count++; 
00256  
00257       /* Decrement the loop counter */ 
00258       blockSize1--; 
00259     } 
00260  
00261     /* --------------------------  
00262      * Initializations of stage2  
00263      * ------------------------*/ 
00264  
00265     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]  
00266      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]  
00267      * ....  
00268      * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]  
00269      */ 
00270  
00271     /* Working pointer of inputA. NOTE(review): always x[0], regardless of firstIndex - confirm for firstIndex > srcBLen - 1 */ 
00272     px = pIn1; 
00273  
00274     /* Working pointer of inputB: last sample, y[srcBLen - 1] */ 
00275     pSrc2 = pIn2 + (srcBLen - 1u); 
00276     py = pSrc2; 
00277  
00278     /* Initialize inputB pointer of type q31 so that one load covers y[srcBLen - 2], y[srcBLen - 1] */ 
00279     pb = (q31_t *) (py - 1u); 
00280  
00281     /* count is the index by which the pointer pIn1 to be incremented */ 
00282     count = 1u; 
00283  
00284  
00285     /* --------------------  
00286      * Stage2 process  
00287      * -------------------*/ 
00288  
00289     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.  
00290      * So, to loop unroll over blockSize2,  
00291      * srcBLen should be greater than or equal to 4 */ 
00292     if(srcBLen >= 4u) 
00293     { 
00294       /* Loop unroll over blockSize2, by 4 */ 
00295       blkCnt = ((uint32_t) blockSize2 >> 2u); 
00296  
00297       while(blkCnt > 0u) 
00298       { 
00299         /* Set all accumulators to zero */ 
00300         acc0 = 0; 
00301         acc1 = 0; 
00302         acc2 = 0; 
00303         acc3 = 0; 
00304  
00305  
00306         /* read x[0], x[1] samples (px advances by one 16-bit sample, so the 32-bit reads overlap) */ 
00307         x0 = *(q31_t *) (px++); 
00308         /* read x[1], x[2] samples */ 
00309         x1 = *(q31_t *) (px++); 
00310  
00311  
00312         /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00313         k = srcBLen >> 2u; 
00314  
00315         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00316          ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00317         do 
00318         { 
00319           /* Read the last two inputB samples using SIMD:  
00320            * y[srcBLen - 1] and y[srcBLen - 2] */ 
00321           c0 = *(pb--); 
00322  
00323           /* acc0 +=  x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ 
00324           acc0 = __SMLALDX(x0, c0, acc0); 
00325  
00326           /* acc1 +=  x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ 
00327           acc1 = __SMLALDX(x1, c0, acc1); 
00328  
00329           /* Read x[2], x[3] */ 
00330           x2 = *(q31_t *) (px++); 
00331  
00332           /* Read x[3], x[4] */ 
00333           x3 = *(q31_t *) (px++); 
00334  
00335           /* acc2 +=  x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ 
00336           acc2 = __SMLALDX(x2, c0, acc2); 
00337  
00338           /* acc3 +=  x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ 
00339           acc3 = __SMLALDX(x3, c0, acc3); 
00340  
00341           /* Read y[srcBLen - 3] and y[srcBLen - 4] */ 
00342           c0 = *(pb--); 
00343  
00344           /* acc0 +=  x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ 
00345           acc0 = __SMLALDX(x2, c0, acc0); 
00346  
00347           /* acc1 +=  x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ 
00348           acc1 = __SMLALDX(x3, c0, acc1); 
00349  
00350           /* Read x[4], x[5] */ 
00351           x0 = *(q31_t *) (px++); 
00352  
00353           /* Read x[5], x[6] */ 
00354           x1 = *(q31_t *) (px++); 
00355  
00356           /* acc2 +=  x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ 
00357           acc2 = __SMLALDX(x0, c0, acc2); 
00358  
00359           /* acc3 +=  x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ 
00360           acc3 = __SMLALDX(x1, c0, acc3); 
00361  
00362         } while(--k); 
00363  
00364         /* For the next MAC operations, SIMD is not used  
00365          * So, the 16 bit pointer of inputB, py is updated */ 
00366         py = (q15_t *) pb; 
00367         py = py + 1; 
00368  
00369         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.  
00370          ** No loop unrolling is used. */ 
00371         k = srcBLen % 0x4u; 
00372  
00373         if(k == 1u) 
00374         { 
00375           /* Read y[srcBLen - 5]. NOTE(review): the q15 sample is widened into q31 c0 without masking the upper halfword - confirm the dual MACs below are unaffected for negative samples */ 
00376           c0 = *(py); 
00377  
00378           /* Read x[6], x[7] */ 
00379           x3 = *(q31_t *) px++; 
00380  
00381           /* Perform the multiply-accumulates */ 
00382           acc0 = __SMLALD(x0, c0, acc0); 
00383           acc1 = __SMLALD(x1, c0, acc1); 
00384           acc2 = __SMLALDX(x1, c0, acc2); 
00385           acc3 = __SMLALDX(x3, c0, acc3); 
00386         } 
00387  
00388         if(k == 2u) 
00389         { 
00390           /* Read y[srcBLen - 5], y[srcBLen - 6] */ 
00391           c0 = *(pb); 
00392  
00393           /* Read x[6], x[7] */ 
00394           x3 = *(q31_t *) px++; 
00395  
00396           /* Read x[7], x[8] */ 
00397           x2 = *(q31_t *) px++; 
00398  
00399           /* Perform the multiply-accumulates */ 
00400           acc0 = __SMLALDX(x0, c0, acc0); 
00401           acc1 = __SMLALDX(x1, c0, acc1); 
00402           acc2 = __SMLALDX(x3, c0, acc2); 
00403           acc3 = __SMLALDX(x2, c0, acc3); 
00404         } 
00405  
00406         if(k == 3u) 
00407         { 
00408           /* Read y[srcBLen - 5], y[srcBLen - 6] */ 
00409           c0 = *pb--; 
00410  
00411           /* Read x[6], x[7] */ 
00412           x3 = *(q31_t *) px++; 
00413  
00414           /* Read x[7], x[8] */ 
00415           x2 = *(q31_t *) px++; 
00416  
00417           /* Perform the multiply-accumulates */ 
00418           acc0 = __SMLALDX(x0, c0, acc0); 
00419           acc1 = __SMLALDX(x1, c0, acc1); 
00420           acc2 = __SMLALDX(x3, c0, acc2); 
00421           acc3 = __SMLALDX(x2, c0, acc3); 
00422  
00423           /* Read y[srcBLen - 7] from the upper half of *pb. NOTE(review): the 32-bit read starts one sample before inputB and c0's upper halfword is not masked - confirm */ 
00424           c0 = (q15_t) (*pb >> 16); 
00425  
00426           /* Read x[8], x[9] */ 
00427           x3 = *(q31_t *) px++; 
00428  
00429           /* Perform the multiply-accumulates */ 
00430           acc0 = __SMLALDX(x1, c0, acc0); 
00431           acc1 = __SMLALD(x2, c0, acc1); 
00432           acc2 = __SMLALDX(x2, c0, acc2); 
00433           acc3 = __SMLALDX(x3, c0, acc3); 
00434         } 
00435  
00436         /* Scale, saturate and pack the four results; they are stored as two 32-bit writes. */ 
00437         *__SIMD32(pOut)++ = 
00438           __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); 
00439         *__SIMD32(pOut)++ = 
00440           __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); 
00441  
00442         /* Update the inputA and inputB pointers for next MAC calculation */ 
00443         px = pIn1 + (count * 4u); 
00444         py = pSrc2; 
00445         pb = (q31_t *) (py - 1); 
00446  
00447         /* Increment the pointer pIn1 index, count by 1 */ 
00448         count++; 
00449  
00450         /* Decrement the loop counter */ 
00451         blkCnt--; 
00452       } 
00453  
00454       /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.  
00455        ** No loop unrolling is used. */ 
00456       blkCnt = (uint32_t) blockSize2 % 0x4u; 
00457  
00458       while(blkCnt > 0u) 
00459       { 
00460         /* Accumulator is made zero for every iteration */ 
00461         sum = 0; 
00462  
00463         /* Number of groups of 4 MACs for this output sample */ 
00464         k = srcBLen >> 2u; 
00465  
00466         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00467          ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00468         while(k > 0u) 
00469         { 
00470           /* Perform the multiply-accumulates */ 
00471           sum += (q63_t) ((q31_t) * px++ * *py--); 
00472           sum += (q63_t) ((q31_t) * px++ * *py--); 
00473           sum += (q63_t) ((q31_t) * px++ * *py--); 
00474           sum += (q63_t) ((q31_t) * px++ * *py--); 
00475  
00476           /* Decrement the loop counter */ 
00477           k--; 
00478         } 
00479  
00480         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.  
00481          ** No loop unrolling is used. */ 
00482         k = srcBLen % 0x4u; 
00483  
00484         while(k > 0u) 
00485         { 
00486           /* Perform the multiply-accumulates */ 
00487           sum += (q63_t) ((q31_t) * px++ * *py--); 
00488  
00489           /* Decrement the loop counter */ 
00490           k--; 
00491         } 
00492  
00493         /* Scale the accumulator down by 15 bits, saturate to 16 bits and store it in the destination buffer. */ 
00494         *pOut++ = (q15_t) (__SSAT(sum >> 15, 16)); 
00495  
00496         /* Update the inputA and inputB pointers for next MAC calculation */ 
00497         px = pIn1 + count; 
00498         py = pSrc2; 
00499  
00500         /* Increment the pointer pIn1 index, count by 1 */ 
00501         count++; 
00502  
00503         /* Decrement the loop counter */ 
00504         blkCnt--; 
00505       } 
00506     } 
00507     else 
00508     { 
00509       /* srcBLen is less than 4, so the blockSize2 loop is not unrolled;  
00510        * each output sample is computed with srcBLen individual MACs */ 
00511       blkCnt = (uint32_t) blockSize2; 
00512  
00513       while(blkCnt > 0u) 
00514       { 
00515         /* Accumulator is made zero for every iteration */ 
00516         sum = 0; 
00517  
00518         /* srcBLen number of MACS should be performed */ 
00519         k = srcBLen; 
00520  
00521         while(k > 0u) 
00522         { 
00523           /* Perform the multiply-accumulate */ 
00524           sum += (q63_t) ((q31_t) * px++ * *py--); 
00525  
00526           /* Decrement the loop counter */ 
00527           k--; 
00528         } 
00529  
00530         /* Scale the accumulator down by 15 bits, saturate to 16 bits and store it in the destination buffer. */ 
00531         *pOut++ = (q15_t) (__SSAT(sum >> 15, 16)); 
00532  
00533         /* Update the inputA and inputB pointers for next MAC calculation */ 
00534         px = pIn1 + count; 
00535         py = pSrc2; 
00536  
00537         /* Increment the pointer pIn1 index, count by 1 */ 
00538         count++; 
00539  
00540         /* Decrement the loop counter */ 
00541         blkCnt--; 
00542       } 
00543     } 
00544  
00545  
00546     /* --------------------------  
00547      * Initializations of stage3  
00548      * -------------------------*/ 
00549  
00550     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]  
00551      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]  
00552      * ....  
00553      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]  
00554      * sum +=  x[srcALen-1] * y[srcBLen-1]  
00555      */ 
00556  
00557     /* In this stage the MAC operations are decreased by 1 for every iteration.  
00558        The count variable holds the number of MAC operations performed */ 
00559     count = srcBLen - 1u; 
00560  
00561     /* Working pointer of inputA. NOTE(review): start position and count ignore firstIndex - confirm for firstIndex > srcALen */ 
00562     pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 
00563     px = pSrc1; 
00564  
00565     /* Working pointer of inputB: pSrc2 is y[srcBLen - 1]; pIn2 is one sample lower for the SIMD reads */ 
00566     pSrc2 = pIn2 + (srcBLen - 1u); 
00567     pIn2 = pSrc2 - 1u; 
00568     py = pIn2; 
00569  
00570     /* -------------------  
00571      * Stage3 process  
00572      * ------------------*/ 
00573  
00574     /* For loop unrolling by 4, this stage is divided into two. */ 
00575     /* First part computes the first (count >> 2) outputs of this stage using SIMD reads */ 
00576     /* Second part computes the remaining outputs of this stage one MAC at a time */ 
00577  
00578     /* The first part of the stage starts here */ 
00579     j = count >> 2u; 
00580  
00581     while((j > 0u) && (blockSize3 > 0)) 
00582     { 
00583       /* Accumulator is made zero for every iteration */ 
00584       sum = 0; 
00585  
00586       /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00587       k = count >> 2u; 
00588  
00589       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00590        ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00591       while(k > 0u) 
00592       { 
00593         /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied  
00594          * with y[srcBLen - 1], y[srcBLen - 2] respectively */ 
00595         sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 
00596         /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied  
00597          * with y[srcBLen - 3], y[srcBLen - 4] respectively */ 
00598         sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 
00599  
00600         /* Decrement the loop counter */ 
00601         k--; 
00602       } 
00603  
00604       /* For the next MAC operations, the pointer py is used without SIMD  
00605        * So, py is incremented by 1 */ 
00606       py = py + 1u; 
00607  
00608       /* If the count is not a multiple of 4, compute any remaining MACs here.  
00609        ** No loop unrolling is used. */ 
00610       k = count % 0x4u; 
00611  
00612       while(k > 0u) 
00613       { 
00614         /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */ 
00615         sum = __SMLALD(*px++, *py--, sum); 
00616  
00617         /* Decrement the loop counter */ 
00618         k--; 
00619       } 
00620  
00621       /* Scale the accumulator down by 15 bits, saturate to 16 bits and store it in the destination buffer. */ 
00622       *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 
00623  
00624       /* Update the inputA and inputB pointers for next MAC calculation */ 
00625       px = ++pSrc1; 
00626       py = pIn2; 
00627  
00628       /* Decrement the MAC count */ 
00629       count--; 
00630  
00631       /* Decrement the loop counter */ 
00632       blockSize3--; 
00633  
00634       j--; 
00635     } 
00636  
00637     /* The second part of the stage starts here */ 
00638     /* SIMD is not used for the next MAC operations,  
00639      * so pointer py is updated to read only one sample at a time */ 
00640     py = py + 1u; 
00641  
00642     while(blockSize3 > 0) 
00643     { 
00644       /* Accumulator is made zero for every iteration */ 
00645       sum = 0; 
00646  
00647       /* Loop over the count MACs of this output sample, one at a time (no unrolling). */ 
00648       k = count; 
00649  
00650       while(k > 0u) 
00651       { 
00652         /* Perform the multiply-accumulates */ 
00653         /* sum +=  x[srcALen-1] * y[srcBLen-1] */ 
00654         sum = __SMLALD(*px++, *py--, sum); 
00655  
00656         /* Decrement the loop counter */ 
00657         k--; 
00658       } 
00659  
00660       /* Scale the accumulator down by 15 bits, saturate to 16 bits and store it in the destination buffer. */ 
00661       *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 
00662  
00663       /* Update the inputA and inputB pointers for next MAC calculation */ 
00664       px = ++pSrc1; 
00665       py = pSrc2; 
00666  
00667       /* Decrement the MAC count */ 
00668       count--; 
00669  
00670       /* Decrement the loop counter */ 
00671       blockSize3--; 
00672     } 
00673  
00674     /* set status as ARM_MATH_SUCCESS */ 
00675     status = ARM_MATH_SUCCESS; 
00676   } 
00677  
00678   /* Return to application */ 
00679   return (status); 
00680  
00681 } 
00682  
00683 /**  
00684  * @} end of PartialConv group  
00685  */