CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details

Dependents:   K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_partial_q7.c Source File

arm_conv_partial_q7.c

00001 /* ----------------------------------------------------------------------  
00002 * Copyright (C) 2010 ARM Limited. All rights reserved.  
00003 *  
00004 * $Date:        29. November 2010  
00005 * $Revision:    V1.0.3  
00006 *  
00007 * Project:      CMSIS DSP Library  
00008 * Title:        arm_conv_partial_q7.c  
00009 *  
00010 * Description:  Q7 Partial convolution.  
00011 *  
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Version 1.0.3 2010/11/29 
00015 *    Re-organized the CMSIS folders and updated documentation.  
00016 *   
00017 * Version 1.0.2 2010/11/11  
00018 *    Documentation updated.   
00019 *  
00020 * Version 1.0.1 2010/10/05   
00021 *    Production release and review comments incorporated.  
00022 *  
00023 * Version 1.0.0 2010/09/20   
00024 *    Production release and review comments incorporated  
00025 *  
00026 * Version 0.0.7  2010/06/10   
00027 *    Misra-C changes done  
00028 *  
00029 * -------------------------------------------------------------------- */ 
00030  
00031 #include "arm_math.h" 
00032  
00033 /**  
00034  * @ingroup groupFilters  
00035  */ 
00036  
00037 /**  
00038  * @addtogroup PartialConv  
00039  * @{  
00040  */ 
00041  
00042 /**  
00043  * @brief Partial convolution of Q7 sequences  
00044  * @param[in]       *pSrcA points to the first input sequence.  
00045  * @param[in]       srcALen length of the first input sequence.  
00046  * @param[in]       *pSrcB points to the second input sequence.  
00047  * @param[in]       srcBLen length of the second input sequence.  
00048  * @param[out]      *pDst points to the location where the output result is written.  
00049  * @param[in]       firstIndex is the first output sample to start with.  
00050  * @param[in]       numPoints is the number of output points to be computed.  
00051  * @return  Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].  
00052  *  
00053  */ 
00054  
00055 arm_status arm_conv_partial_q7( 
00056   q7_t * pSrcA, 
00057   uint32_t srcALen, 
00058   q7_t * pSrcB, 
00059   uint32_t srcBLen, 
00060   q7_t * pDst, 
00061   uint32_t firstIndex, 
00062   uint32_t numPoints) 
00063 { 
00064   q7_t *pIn1;                                    /* inputA pointer */ 
00065   q7_t *pIn2;                                    /* inputB pointer */ 
00066   q7_t *pOut = pDst;                             /* output pointer */ 
00067   q7_t *px;                                      /* Intermediate inputA pointer */ 
00068   q7_t *py;                                      /* Intermediate inputB pointer */ 
00069   q7_t *pSrc1, *pSrc2;                           /* Intermediate pointers */ 
00070   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulator */ 
00071   q31_t input1, input2; 
00072   q15_t in1, in2; 
00073   q7_t x0, x1, x2, x3, c0, c1; 
00074   uint32_t j, k, count, check, blkCnt; 
00075   int32_t blockSize1, blockSize2, blockSize3;    /* loop counter */ 
00076   arm_status status; 
00077  
00078  
00079   /* Check for range of output samples to be calculated */ 
00080   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 
00081   { 
00082     /* Set status as ARM_MATH_ARGUMENT_ERROR */ 
00083     status = ARM_MATH_ARGUMENT_ERROR; 
00084   } 
00085   else 
00086   { 
00087  
00088     /* The algorithm implementation is based on the lengths of the inputs. */ 
00089     /* srcB is always made to slide across srcA. */ 
00090     /* So srcBLen is always considered as shorter or equal to srcALen */ 
00091     if(srcALen >= srcBLen) 
00092     { 
00093       /* Initialization of inputA pointer */ 
00094       pIn1 = pSrcA; 
00095  
00096       /* Initialization of inputB pointer */ 
00097       pIn2 = pSrcB; 
00098     } 
00099     else 
00100     { 
00101       /* Initialization of inputA pointer */ 
00102       pIn1 = pSrcB; 
00103  
00104       /* Initialization of inputB pointer */ 
00105       pIn2 = pSrcA; 
00106  
00107       /* srcBLen is always considered as shorter or equal to srcALen */ 
00108       j = srcBLen; 
00109       srcBLen = srcALen; 
00110       srcALen = j; 
00111     } 
00112  
00113     /* Conditions to check which loopCounter holds  
00114      * the first and last indices of the output samples to be calculated. */ 
00115     check = firstIndex + numPoints; 
00116     blockSize3 = ((int32_t) check - (int32_t) srcALen); 
00117     blockSize3 = (blockSize3 > 0) ? blockSize3 : 0; 
00118     blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); 
00119     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 : 
00120                                     (int32_t) numPoints) : 0; 
00121     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +  
00122                                     (int32_t) firstIndex); 
00123     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; 
00124  
00125     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 
00126     /* The function is internally  
00127      * divided into three stages according to the number of multiplications that has to be  
00128      * taken place between inputA samples and inputB samples. In the first stage of the  
00129      * algorithm, the multiplications increase by one for every iteration.  
00130      * In the second stage of the algorithm, srcBLen number of multiplications are done.  
00131      * In the third stage of the algorithm, the multiplications decrease by one  
00132      * for every iteration. */ 
00133  
00134     /* Set the output pointer to point to the firstIndex  
00135      * of the output sample to be calculated. */ 
00136     pOut = pDst + firstIndex; 
00137  
00138     /* --------------------------  
00139      * Initializations of stage1  
00140      * -------------------------*/ 
00141  
00142     /* sum = x[0] * y[0]  
00143      * sum = x[0] * y[1] + x[1] * y[0]  
00144      * ....  
00145      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]  
00146      */ 
00147  
00148     /* In this stage the MAC operations are increased by 1 for every iteration.  
00149        The count variable holds the number of MAC operations performed.  
00150        Since the partial convolution starts from from firstIndex  
00151        Number of Macs to be performed is firstIndex + 1 */ 
00152     count = 1u + firstIndex; 
00153  
00154     /* Working pointer of inputA */ 
00155     px = pIn1; 
00156  
00157     /* Working pointer of inputB */ 
00158     pSrc2 = pIn2 + firstIndex; 
00159     py = pSrc2; 
00160  
00161     /* ------------------------  
00162      * Stage1 process  
00163      * ----------------------*/ 
00164  
00165     /* The first stage starts here */ 
00166     while(blockSize1 > 0) 
00167     { 
00168       /* Accumulator is made zero for every iteration */ 
00169       sum = 0; 
00170  
00171       /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00172       k = count >> 2u; 
00173  
00174       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00175        ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00176       while(k > 0u) 
00177       { 
00178         /* x[0] , x[1] */ 
00179         in1 = (q15_t) * px++; 
00180         in2 = (q15_t) * px++; 
00181         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 
00182  
00183         /* y[srcBLen - 1] , y[srcBLen - 2] */ 
00184         in1 = (q15_t) * py--; 
00185         in2 = (q15_t) * py--; 
00186         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 
00187  
00188         /* x[0] * y[srcBLen - 1] */ 
00189         /* x[1] * y[srcBLen - 2] */ 
00190         sum = __SMLAD(input1, input2, sum); 
00191  
00192         /* x[2] , x[3] */ 
00193         in1 = (q15_t) * px++; 
00194         in2 = (q15_t) * px++; 
00195         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 
00196  
00197         /* y[srcBLen - 3] , y[srcBLen - 4] */ 
00198         in1 = (q15_t) * py--; 
00199         in2 = (q15_t) * py--; 
00200         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 
00201  
00202         /* x[2] * y[srcBLen - 3] */ 
00203         /* x[3] * y[srcBLen - 4] */ 
00204         sum = __SMLAD(input1, input2, sum); 
00205  
00206         /* Decrement the loop counter */ 
00207         k--; 
00208       } 
00209  
00210       /* If the count is not a multiple of 4, compute any remaining MACs here.  
00211        ** No loop unrolling is used. */ 
00212       k = count % 0x4u; 
00213  
00214       while(k > 0u) 
00215       { 
00216         /* Perform the multiply-accumulates */ 
00217         sum += ((q31_t) * px++ * *py--); 
00218  
00219         /* Decrement the loop counter */ 
00220         k--; 
00221       } 
00222  
00223       /* Store the result in the accumulator in the destination buffer. */ 
00224       *pOut++ = (q7_t) (__SSAT(sum >> 7, 8)); 
00225  
00226       /* Update the inputA and inputB pointers for next MAC calculation */ 
00227       py = ++pSrc2; 
00228       px = pIn1; 
00229  
00230       /* Increment the MAC count */ 
00231       count++; 
00232  
00233       /* Decrement the loop counter */ 
00234       blockSize1--; 
00235     } 
00236  
00237     /* --------------------------  
00238      * Initializations of stage2  
00239      * ------------------------*/ 
00240  
00241     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]  
00242      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]  
00243      * ....  
00244      * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]  
00245      */ 
00246  
00247     /* Working pointer of inputA */ 
00248     px = pIn1; 
00249  
00250     /* Working pointer of inputB */ 
00251     pSrc2 = pIn2 + (srcBLen - 1u); 
00252     py = pSrc2; 
00253  
00254     /* count is index by which the pointer pIn1 to be incremented */ 
00255     count = 1u; 
00256  
00257     /* -------------------  
00258      * Stage2 process  
00259      * ------------------*/ 
00260  
00261     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.  
00262      * So, to loop unroll over blockSize2,  
00263      * srcBLen should be greater than or equal to 4 */ 
00264     if(srcBLen >= 4u) 
00265     { 
00266       /* Loop unroll over blockSize2, by 4 */ 
00267       blkCnt = ((uint32_t) blockSize2 >> 2u); 
00268  
00269       while(blkCnt > 0u) 
00270       { 
00271         /* Set all accumulators to zero */ 
00272         acc0 = 0; 
00273         acc1 = 0; 
00274         acc2 = 0; 
00275         acc3 = 0; 
00276  
00277         /* read x[0], x[1], x[2] samples */ 
00278         x0 = *(px++); 
00279         x1 = *(px++); 
00280         x2 = *(px++); 
00281  
00282         /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00283         k = srcBLen >> 2u; 
00284  
00285         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00286          ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00287         do 
00288         { 
00289           /* Read y[srcBLen - 1] sample */ 
00290           c0 = *(py--); 
00291           /* Read y[srcBLen - 2] sample */ 
00292           c1 = *(py--); 
00293  
00294           /* Read x[3] sample */ 
00295           x3 = *(px++); 
00296  
00297           /* x[0] and x[1] are packed */ 
00298           in1 = (q15_t) x0; 
00299           in2 = (q15_t) x1; 
00300  
00301           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 
00302  
00303           /* y[srcBLen - 1]   and y[srcBLen - 2] are packed */ 
00304           in1 = (q15_t) c0; 
00305           in2 = (q15_t) c1; 
00306  
00307           input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 
00308  
00309           /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2]  */ 
00310           acc0 = __SMLAD(input1, input2, acc0); 
00311  
00312           /* x[1] and x[2] are packed */ 
00313           in1 = (q15_t) x1; 
00314           in2 = (q15_t) x2; 
00315  
00316           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 
00317  
00318           /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2]  */ 
00319           acc1 = __SMLAD(input1, input2, acc1); 
00320  
00321           /* x[2] and x[3] are packed */ 
00322           in1 = (q15_t) x2; 
00323           in2 = (q15_t) x3; 
00324  
00325           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 
00326  
00327           /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2]  */ 
00328           acc2 = __SMLAD(input1, input2, acc2); 
00329  
00330           /* Read x[4] sample */ 
00331           x0 = *(px++); 
00332  
00333           /* x[3] and x[4] are packed */ 
00334           in1 = (q15_t) x3; 
00335           in2 = (q15_t) x0; 
00336  
00337           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 
00338  
00339           /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2]  */ 
00340           acc3 = __SMLAD(input1, input2, acc3); 
00341  
00342           /* Read y[srcBLen - 3] sample */ 
00343           c0 = *(py--); 
00344           /* Read y[srcBLen - 4] sample */ 
00345           c1 = *(py--); 
00346  
00347           /* Read x[5] sample */ 
00348           x1 = *(px++); 
00349  
00350           /* x[2] and x[3] are packed */ 
00351           in1 = (q15_t) x2; 
00352           in2 = (q15_t) x3; 
00353  
00354           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 
00355  
00356           /* y[srcBLen - 3] and y[srcBLen - 4] are packed */ 
00357           in1 = (q15_t) c0; 
00358           in2 = (q15_t) c1; 
00359  
00360           input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 
00361  
00362           /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4]  */ 
00363           acc0 = __SMLAD(input1, input2, acc0); 
00364  
00365           /* x[3] and x[4] are packed */ 
00366           in1 = (q15_t) x3; 
00367           in2 = (q15_t) x0; 
00368  
00369           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 
00370  
00371           /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4]  */ 
00372           acc1 = __SMLAD(input1, input2, acc1); 
00373  
00374           /* x[4] and x[5] are packed */ 
00375           in1 = (q15_t) x0; 
00376           in2 = (q15_t) x1; 
00377  
00378           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 
00379  
00380           /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4]  */ 
00381           acc2 = __SMLAD(input1, input2, acc2); 
00382  
00383           /* Read x[6] sample */ 
00384           x2 = *(px++); 
00385  
00386           /* x[5] and x[6] are packed */ 
00387           in1 = (q15_t) x1; 
00388           in2 = (q15_t) x2; 
00389  
00390           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 
00391  
00392           /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4]  */ 
00393           acc3 = __SMLAD(input1, input2, acc3); 
00394  
00395         } while(--k); 
00396  
00397         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.  
00398          ** No loop unrolling is used. */ 
00399         k = srcBLen % 0x4u; 
00400  
00401         while(k > 0u) 
00402         { 
00403           /* Read y[srcBLen - 5] sample */ 
00404           c0 = *(py--); 
00405  
00406           /* Read x[7] sample */ 
00407           x3 = *(px++); 
00408  
00409           /* Perform the multiply-accumulates */ 
00410           /* acc0 +=  x[4] * y[srcBLen - 5] */ 
00411           acc0 += ((q31_t) x0 * c0); 
00412           /* acc1 +=  x[5] * y[srcBLen - 5] */ 
00413           acc1 += ((q31_t) x1 * c0); 
00414           /* acc2 +=  x[6] * y[srcBLen - 5] */ 
00415           acc2 += ((q31_t) x2 * c0); 
00416           /* acc3 +=  x[7] * y[srcBLen - 5] */ 
00417           acc3 += ((q31_t) x3 * c0); 
00418  
00419           /* Reuse the present samples for the next MAC */ 
00420           x0 = x1; 
00421           x1 = x2; 
00422           x2 = x3; 
00423  
00424           /* Decrement the loop counter */ 
00425           k--; 
00426         } 
00427  
00428         /* Store the result in the accumulator in the destination buffer. */ 
00429         *pOut++ = (q7_t) (__SSAT(acc0 >> 7, 8)); 
00430         *pOut++ = (q7_t) (__SSAT(acc1 >> 7, 8)); 
00431         *pOut++ = (q7_t) (__SSAT(acc2 >> 7, 8)); 
00432         *pOut++ = (q7_t) (__SSAT(acc3 >> 7, 8)); 
00433  
00434         /* Update the inputA and inputB pointers for next MAC calculation */ 
00435         px = pIn1 + count * 4u; 
00436         py = pSrc2; 
00437  
00438         /* Increment the pointer pIn1 index, count by 1 */ 
00439         count++; 
00440  
00441         /* Decrement the loop counter */ 
00442         blkCnt--; 
00443       } 
00444  
00445       /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.  
00446        ** No loop unrolling is used. */ 
00447       blkCnt = (uint32_t) blockSize2 % 0x4u; 
00448  
00449       while(blkCnt > 0u) 
00450       { 
00451         /* Accumulator is made zero for every iteration */ 
00452         sum = 0; 
00453  
00454         /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00455         k = srcBLen >> 2u; 
00456  
00457         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00458          ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00459         while(k > 0u) 
00460         { 
00461  
00462           /* Reading two inputs of SrcA buffer and packing */ 
00463           in1 = (q15_t) * px++; 
00464           in2 = (q15_t) * px++; 
00465           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 
00466  
00467           /* Reading two inputs of SrcB buffer and packing */ 
00468           in1 = (q15_t) * py--; 
00469           in2 = (q15_t) * py--; 
00470           input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 
00471  
00472           /* Perform the multiply-accumulates */ 
00473           sum = __SMLAD(input1, input2, sum); 
00474  
00475           /* Reading two inputs of SrcA buffer and packing */ 
00476           in1 = (q15_t) * px++; 
00477           in2 = (q15_t) * px++; 
00478           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 
00479  
00480           /* Reading two inputs of SrcB buffer and packing */ 
00481           in1 = (q15_t) * py--; 
00482           in2 = (q15_t) * py--; 
00483           input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 
00484  
00485           /* Perform the multiply-accumulates */ 
00486           sum = __SMLAD(input1, input2, sum); 
00487  
00488           /* Decrement the loop counter */ 
00489           k--; 
00490         } 
00491  
00492         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.  
00493          ** No loop unrolling is used. */ 
00494         k = srcBLen % 0x4u; 
00495  
00496         while(k > 0u) 
00497         { 
00498           /* Perform the multiply-accumulates */ 
00499           sum += ((q31_t) * px++ * *py--); 
00500  
00501           /* Decrement the loop counter */ 
00502           k--; 
00503         } 
00504  
00505         /* Store the result in the accumulator in the destination buffer. */ 
00506         *pOut++ = (q7_t) (__SSAT(sum >> 7, 8)); 
00507  
00508         /* Update the inputA and inputB pointers for next MAC calculation */ 
00509         px = pIn1 + count; 
00510         py = pSrc2; 
00511  
00512         /* Increment the pointer pIn1 index, count by 1 */ 
00513         count++; 
00514  
00515         /* Decrement the loop counter */ 
00516         blkCnt--; 
00517       } 
00518     } 
00519     else 
00520     { 
00521       /* If the srcBLen is not a multiple of 4,  
00522        * the blockSize2 loop cannot be unrolled by 4 */ 
00523       blkCnt = (uint32_t) blockSize2; 
00524  
00525       while(blkCnt > 0u) 
00526       { 
00527         /* Accumulator is made zero for every iteration */ 
00528         sum = 0; 
00529  
00530         /* srcBLen number of MACS should be performed */ 
00531         k = srcBLen; 
00532  
00533         while(k > 0u) 
00534         { 
00535           /* Perform the multiply-accumulate */ 
00536           sum += ((q31_t) * px++ * *py--); 
00537  
00538           /* Decrement the loop counter */ 
00539           k--; 
00540         } 
00541  
00542         /* Store the result in the accumulator in the destination buffer. */ 
00543         *pOut++ = (q7_t) (__SSAT(sum >> 7, 8)); 
00544  
00545         /* Update the inputA and inputB pointers for next MAC calculation */ 
00546         px = pIn1 + count; 
00547         py = pSrc2; 
00548  
00549         /* Increment the MAC count */ 
00550         count++; 
00551  
00552         /* Decrement the loop counter */ 
00553         blkCnt--; 
00554       } 
00555     } 
00556  
00557  
00558     /* --------------------------  
00559      * Initializations of stage3  
00560      * -------------------------*/ 
00561  
00562     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]  
00563      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]  
00564      * ....  
00565      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]  
00566      * sum +=  x[srcALen-1] * y[srcBLen-1]  
00567      */ 
00568  
00569     /* In this stage the MAC operations are decreased by 1 for every iteration.  
00570        The count variable holds the number of MAC operations performed */ 
00571     count = srcBLen - 1u; 
00572  
00573     /* Working pointer of inputA */ 
00574     pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 
00575     px = pSrc1; 
00576  
00577     /* Working pointer of inputB */ 
00578     pSrc2 = pIn2 + (srcBLen - 1u); 
00579     py = pSrc2; 
00580  
00581     /* -------------------  
00582      * Stage3 process  
00583      * ------------------*/ 
00584  
00585     while(blockSize3 > 0) 
00586     { 
00587       /* Accumulator is made zero for every iteration */ 
00588       sum = 0; 
00589  
00590       /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00591       k = count >> 2u; 
00592  
00593       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00594        ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00595       while(k > 0u) 
00596       { 
00597         /* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */ 
00598         in1 = (q15_t) * px++; 
00599         in2 = (q15_t) * px++; 
00600         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 
00601  
00602         /* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */ 
00603         in1 = (q15_t) * py--; 
00604         in2 = (q15_t) * py--; 
00605         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 
00606  
00607         /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ 
00608         /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ 
00609         sum = __SMLAD(input1, input2, sum); 
00610  
00611         /* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */ 
00612         in1 = (q15_t) * px++; 
00613         in2 = (q15_t) * px++; 
00614         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 
00615  
00616         /* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */ 
00617         in1 = (q15_t) * py--; 
00618         in2 = (q15_t) * py--; 
00619         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 
00620  
00621         /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ 
00622         /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ 
00623         sum = __SMLAD(input1, input2, sum); 
00624  
00625         /* Decrement the loop counter */ 
00626         k--; 
00627       } 
00628  
00629       /* If the count is not a multiple of 4, compute any remaining MACs here.  
00630        ** No loop unrolling is used. */ 
00631       k = count % 0x4u; 
00632  
00633       while(k > 0u) 
00634       { 
00635         /* Perform the multiply-accumulates */ 
00636         /* sum +=  x[srcALen-1] * y[srcBLen-1] */ 
00637         sum += ((q31_t) * px++ * *py--); 
00638  
00639         /* Decrement the loop counter */ 
00640         k--; 
00641       } 
00642  
00643       /* Store the result in the accumulator in the destination buffer. */ 
00644       *pOut++ = (q7_t) (__SSAT(sum >> 7, 8)); 
00645  
00646       /* Update the inputA and inputB pointers for next MAC calculation */ 
00647       px = ++pSrc1; 
00648       py = pSrc2; 
00649  
00650       /* Decrement the MAC count */ 
00651       count--; 
00652  
00653       /* Decrement the loop counter */ 
00654       blockSize3--; 
00655  
00656     } 
00657  
00658     /* set status as ARM_MATH_SUCCESS */ 
00659     status = ARM_MATH_SUCCESS; 
00660   } 
00661  
00662   /* Return to application */ 
00663   return (status); 
00664  
00665 } 
00666  
00667 /**  
00668  * @} end of PartialConv group  
00669  */