CMSIS DSP library

Dependents:   performance_timer Surfboard_ gps2rtty Capstone ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_partial_fast_q31.c Source File

arm_conv_partial_fast_q31.c

00001 /* ----------------------------------------------------------------------    
00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
00003 *    
00004 * $Date:        19. March 2015
00005 * $Revision:    V.1.4.5
00006 *    
00007 * Project:      CMSIS DSP Library    
00008 * Title:        arm_conv_partial_fast_q31.c    
00009 *    
00010 * Description:  Fast Q31 Partial convolution.    
00011 *    
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Redistribution and use in source and binary forms, with or without 
00015 * modification, are permitted provided that the following conditions
00016 * are met:
00017 *   - Redistributions of source code must retain the above copyright
00018 *     notice, this list of conditions and the following disclaimer.
00019 *   - Redistributions in binary form must reproduce the above copyright
00020 *     notice, this list of conditions and the following disclaimer in
00021 *     the documentation and/or other materials provided with the 
00022 *     distribution.
00023 *   - Neither the name of ARM LIMITED nor the names of its contributors
00024 *     may be used to endorse or promote products derived from this
00025 *     software without specific prior written permission.
00026 *
00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00038 * POSSIBILITY OF SUCH DAMAGE.  
00039 * -------------------------------------------------------------------- */
00040 
00041 #include "arm_math.h"
00042 
00043 /**    
00044  * @ingroup groupFilters    
00045  */
00046 
00047 /**    
00048  * @addtogroup PartialConv    
00049  * @{    
00050  */
00051 
/*
 * Private helper: one "fast" Q31 multiply-accumulate.
 *
 * Adds the 64-bit product x * y to the accumulator (treated as the upper
 * word of a 64-bit value) and keeps only the upper 32 bits of the result,
 * so the low 32 bits of every product are discarded.
 * The accumulator is scaled with a multiplication by 2^32 rather than a
 * left shift because shifting a negative signed value is undefined in C;
 * the numerical result is the same.
 */
static q31_t conv_partial_fast_q31_mac(
  q31_t acc,
  q31_t x,
  q31_t y)
{
  return (q31_t) ((((q63_t) acc * ((q63_t) 1 << 32)) + ((q63_t) x * y)) >> 32);
}

/*
 * Private helper: convert an accumulator produced by
 * conv_partial_fast_q31_mac() to the value stored in the output buffer,
 * i.e. the accumulator shifted left by one bit.
 * The shift is done on the unsigned representation so that it is defined
 * for negative accumulators as well.
 */
static q31_t conv_partial_fast_q31_store(
  q31_t acc)
{
  return (q31_t) ((uint32_t) acc << 1);
}

/*
 * Private helper: accumulator for a single output sample.
 *
 * Computes x[0]*y[macs-1] + x[1]*y[macs-2] + ... + x[macs-1]*y[0] with
 * conv_partial_fast_q31_mac(), walking x forwards and y backwards.
 * y is entered one past its last element and pre-decremented, so no
 * pointer before the start of y is ever formed.
 * The loop is unrolled by four; the products are accumulated in exactly
 * the same order as a plain loop would.
 */
static q31_t conv_partial_fast_q31_dot(
  const q31_t * x,
  const q31_t * y,
  uint32_t macs)
{
  const q31_t *py = y + macs;
  q31_t sum = 0;
  uint32_t k;

  /* Four MACs per iteration */
  for(k = macs >> 2u; k > 0u; k--)
  {
    sum = conv_partial_fast_q31_mac(sum, *x++, *--py);
    sum = conv_partial_fast_q31_mac(sum, *x++, *--py);
    sum = conv_partial_fast_q31_mac(sum, *x++, *--py);
    sum = conv_partial_fast_q31_mac(sum, *x++, *--py);
  }

  /* Remaining 0 to 3 MACs */
  for(k = macs & 0x3u; k > 0u; k--)
  {
    sum = conv_partial_fast_q31_mac(sum, *x++, *--py);
  }

  return sum;
}

/*
 * Private helper: four consecutive full-overlap output samples.
 *
 * px points to the first inputA sample of the first of the four outputs,
 * pyEnd points one past the last inputB sample and taps is the number of
 * inputB samples.  Each inputB sample is read once and applied to four
 * accumulators; taps + 3 samples are read from px.  The four results are
 * written to pOut[0..3].
 */
static void conv_partial_fast_q31_quad(
  const q31_t * px,
  const q31_t * pyEnd,
  uint32_t taps,
  q31_t * pOut)
{
  q31_t acc0 = 0, acc1 = 0, acc2 = 0, acc3 = 0;
  q31_t x0, x1, x2, x3, c0;
  uint32_t k;

  /* Preload the first three samples of the sliding window */
  x0 = *px++;
  x1 = *px++;
  x2 = *px++;

  /* Four coefficients per iteration; the window registers are rotated
   * by renaming instead of copying. */
  for(k = taps >> 2u; k > 0u; k--)
  {
    c0 = *--pyEnd;
    x3 = *px++;
    acc0 = conv_partial_fast_q31_mac(acc0, x0, c0);
    acc1 = conv_partial_fast_q31_mac(acc1, x1, c0);
    acc2 = conv_partial_fast_q31_mac(acc2, x2, c0);
    acc3 = conv_partial_fast_q31_mac(acc3, x3, c0);

    c0 = *--pyEnd;
    x0 = *px++;
    acc0 = conv_partial_fast_q31_mac(acc0, x1, c0);
    acc1 = conv_partial_fast_q31_mac(acc1, x2, c0);
    acc2 = conv_partial_fast_q31_mac(acc2, x3, c0);
    acc3 = conv_partial_fast_q31_mac(acc3, x0, c0);

    c0 = *--pyEnd;
    x1 = *px++;
    acc0 = conv_partial_fast_q31_mac(acc0, x2, c0);
    acc1 = conv_partial_fast_q31_mac(acc1, x3, c0);
    acc2 = conv_partial_fast_q31_mac(acc2, x0, c0);
    acc3 = conv_partial_fast_q31_mac(acc3, x1, c0);

    c0 = *--pyEnd;
    x2 = *px++;
    acc0 = conv_partial_fast_q31_mac(acc0, x3, c0);
    acc1 = conv_partial_fast_q31_mac(acc1, x0, c0);
    acc2 = conv_partial_fast_q31_mac(acc2, x1, c0);
    acc3 = conv_partial_fast_q31_mac(acc3, x2, c0);
  }

  /* Remaining 0 to 3 coefficients, one at a time */
  for(k = taps & 0x3u; k > 0u; k--)
  {
    c0 = *--pyEnd;
    x3 = *px++;
    acc0 = conv_partial_fast_q31_mac(acc0, x0, c0);
    acc1 = conv_partial_fast_q31_mac(acc1, x1, c0);
    acc2 = conv_partial_fast_q31_mac(acc2, x2, c0);
    acc3 = conv_partial_fast_q31_mac(acc3, x3, c0);

    /* Slide the window by one sample */
    x0 = x1;
    x1 = x2;
    x2 = x3;
  }

  pOut[0] = conv_partial_fast_q31_store(acc0);
  pOut[1] = conv_partial_fast_q31_store(acc1);
  pOut[2] = conv_partial_fast_q31_store(acc2);
  pOut[3] = conv_partial_fast_q31_store(acc3);
}

/**    
 * @brief Partial convolution of Q31 sequences (fast version) for Cortex-M3 and Cortex-M4.    
 * @param[in]       *pSrcA points to the first input sequence.    
 * @param[in]       srcALen length of the first input sequence.    
 * @param[in]       *pSrcB points to the second input sequence.    
 * @param[in]       srcBLen length of the second input sequence.    
 * @param[out]      *pDst points to the location where the output result is written.    
 * @param[in]       firstIndex is the first output sample to start with.    
 * @param[in]       numPoints is the number of output points to be computed.    
 * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2] or if one of the input sequences is empty.    
 *    
 * \par    
 * Only the samples pDst[firstIndex] ... pDst[firstIndex+numPoints-1] are written;    
 * the rest of the output buffer is left untouched.    
 *    
 * \par    
 * Every multiply-accumulate keeps only the upper 32 bits of the 64-bit intermediate    
 * value and the final accumulator is shifted left by one bit before it is stored.    
 *    
 * \par    
 * See <code>arm_conv_partial_q31()</code> for a slower implementation of this function which uses a 64-bit accumulator to provide higher precision.    
 */

arm_status arm_conv_partial_fast_q31(
  q31_t * pSrcA,
  uint32_t srcALen,
  q31_t * pSrcB,
  uint32_t srcBLen,
  q31_t * pDst,
  uint32_t firstIndex,
  uint32_t numPoints)
{
  const q31_t *pIn1;                             /* longer input sequence  (x)              */
  const q31_t *pIn2;                             /* shorter input sequence (y)              */
  const q31_t *px;                               /* working pointer into x                  */
  q31_t *pOut;                                   /* output pointer                          */
  uint32_t longLen, shortLen;                    /* lengths of x and y                      */
  uint32_t outLen;                               /* length of the full convolution          */
  uint32_t lastIndex;                            /* one past the last requested output      */
  uint32_t rampEnd;                              /* first output index with full overlap    */
  uint32_t n;                                    /* index of the next output to compute     */
  uint32_t end, blkCnt, quads, macs;             /* loop limits and counters                */

  /* An empty sequence has no convolution; it would also make the
   * unsigned length arithmetic below wrap around. */
  if((srcALen == 0u) || (srcBLen == 0u))
  {
    return (ARM_MATH_ARGUMENT_ERROR);
  }

  /* Check the requested range [firstIndex, firstIndex + numPoints).
   * The comparison is written so that firstIndex + numPoints cannot wrap. */
  outLen = srcALen + (srcBLen - 1u);

  if((firstIndex > outLen) || (numPoints > (outLen - firstIndex)))
  {
    return (ARM_MATH_ARGUMENT_ERROR);
  }

  lastIndex = firstIndex + numPoints;

  /* The shorter sequence (y) is always made to slide across the longer one (x) */
  if(srcALen >= srcBLen)
  {
    pIn1 = pSrcA;
    longLen = srcALen;
    pIn2 = pSrcB;
    shortLen = srcBLen;
  }
  else
  {
    pIn1 = pSrcB;
    longLen = srcBLen;
    pIn2 = pSrcA;
    shortLen = srcALen;
  }

  /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1]
   *
   * The output is produced in three stages according to the number of
   * multiplications needed per output sample:
   *   stage 1, n in [0, shortLen-1)     : n + 1 MACs, growing by one per sample
   *   stage 2, n in [shortLen-1, longLen): shortLen MACs
   *   stage 3, n in [longLen, outLen)   : outLen - n MACs, shrinking by one per sample
   * n always holds the next output index to compute, so each stage simply
   * continues where the previous one stopped and a request may start and
   * end inside any stage. */
  rampEnd = shortLen - 1u;
  n = firstIndex;
  pOut = pDst + firstIndex;

  /* ------------------------
   * Stage1 process
   * ----------------------*/

  /* out[n] = x[0] * y[n] + x[1] * y[n-1] + ... + x[n] * y[0] */
  end = (lastIndex < rampEnd) ? lastIndex : rampEnd;

  for(; n < end; n++)
  {
    *pOut++ = conv_partial_fast_q31_store(
                conv_partial_fast_q31_dot(pIn1, pIn2, n + 1u));
  }

  /* -------------------
   * Stage2 process
   * ------------------*/

  /* out[n] = x[n-shortLen+1] * y[shortLen-1] + ... + x[n] * y[0] */
  end = (lastIndex < longLen) ? lastIndex : longLen;

  if(n < end)
  {
    /* n >= rampEnd holds here.  The input window starts at x[n - rampEnd]
     * and moves with every output, also when the request starts in the
     * middle of this stage. */
    px = pIn1 + (n - rampEnd);
    blkCnt = end - n;

    /* Four outputs at a time are only computed when at least
     * four MACs per output are needed. */
    if(shortLen >= 4u)
    {
      for(quads = blkCnt >> 2u; quads > 0u; quads--)
      {
        conv_partial_fast_q31_quad(px, pIn2 + shortLen, shortLen, pOut);
        px += 4;
        pOut += 4;
      }

      blkCnt &= 0x3u;
    }

    /* Remaining outputs of this stage, one at a time */
    for(; blkCnt > 0u; blkCnt--)
    {
      *pOut++ = conv_partial_fast_q31_store(
                  conv_partial_fast_q31_dot(px, pIn2, shortLen));
      px++;
    }

    n = end;
  }

  /* -------------------
   * Stage3 process
   * ------------------*/

  /* out[n] = x[n-shortLen+1] * y[shortLen-1] + ... + x[longLen-1] * y[n-longLen+1]
   * Both the MAC count and the window start are derived from n, so a
   * request that starts inside this stage is handled as well. */
  for(; n < lastIndex; n++)
  {
    macs = outLen - n;

    *pOut++ = conv_partial_fast_q31_store(
                conv_partial_fast_q31_dot(pIn1 + (n - rampEnd),
                                          pIn2 + (shortLen - macs), macs));
  }

  /* Return to application */
  return (ARM_MATH_SUCCESS);

}
00608 
00609 /**    
00610  * @} end of PartialConv group    
00611  */