CMSIS DSP library

Dependents:   KL25Z_FFT_Demo Hat_Board_v5_1 KL25Z_FFT_Demo_tony KL25Z_FFT_Demo_tony ... more

Fork of mbed-dsp by mbed official

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_partial_fast_q31.c Source File

arm_conv_partial_fast_q31.c

00001 /* ----------------------------------------------------------------------    
00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.    
00003 *    
00004 * $Date:        17. January 2013
00005 * $Revision:    V1.4.1
00006 *    
00007 * Project:      CMSIS DSP Library    
00008 * Title:        arm_conv_partial_fast_q31.c    
00009 *    
00010 * Description:  Fast Q31 Partial convolution.    
00011 *    
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Redistribution and use in source and binary forms, with or without 
00015 * modification, are permitted provided that the following conditions
00016 * are met:
00017 *   - Redistributions of source code must retain the above copyright
00018 *     notice, this list of conditions and the following disclaimer.
00019 *   - Redistributions in binary form must reproduce the above copyright
00020 *     notice, this list of conditions and the following disclaimer in
00021 *     the documentation and/or other materials provided with the 
00022 *     distribution.
00023 *   - Neither the name of ARM LIMITED nor the names of its contributors
00024 *     may be used to endorse or promote products derived from this
00025 *     software without specific prior written permission.
00026 *
00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00038 * POSSIBILITY OF SUCH DAMAGE.  
00039 * -------------------------------------------------------------------- */
00040 
00041 #include "arm_math.h"
00042 
00043 /**    
00044  * @ingroup groupFilters    
00045  */
00046 
00047 /**    
00048  * @addtogroup PartialConv    
00049  * @{    
00050  */
00051 
00052 /**    
00053  * @brief Partial convolution of Q31 sequences (fast version) for Cortex-M3 and Cortex-M4.    
00054  * @param[in]       *pSrcA points to the first input sequence.    
00055  * @param[in]       srcALen length of the first input sequence.    
00056  * @param[in]       *pSrcB points to the second input sequence.    
00057  * @param[in]       srcBLen length of the second input sequence.    
00058  * @param[out]      *pDst points to the location where the output result is written.    
00059  * @param[in]       firstIndex is the first output sample to start with.    
00060  * @param[in]       numPoints is the number of output points to be computed.    
00061  * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].    
00062  *    
00063  * \par    
00064  * See <code>arm_conv_partial_q31()</code> for a slower implementation of this function which uses a 64-bit accumulator to provide higher precision.    
00065  */
00066 
00067 arm_status arm_conv_partial_fast_q31(
00068   q31_t * pSrcA,
00069   uint32_t srcALen,
00070   q31_t * pSrcB,
00071   uint32_t srcBLen,
00072   q31_t * pDst,
00073   uint32_t firstIndex,
00074   uint32_t numPoints)
00075 {
00076   q31_t *pIn1;                                   /* inputA pointer               */
00077   q31_t *pIn2;                                   /* inputB pointer               */
00078   q31_t *pOut = pDst;                            /* output pointer               */
00079   q31_t *px;                                     /* Intermediate inputA pointer  */
00080   q31_t *py;                                     /* Intermediate inputB pointer  */
00081   q31_t *pSrc1, *pSrc2;                          /* Intermediate pointers        */
00082   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulators                  */
00083   q31_t x0, x1, x2, x3, c0;
00084   uint32_t j, k, count, check, blkCnt;
00085   int32_t blockSize1, blockSize2, blockSize3;    /* loop counters                 */
00086   arm_status status;                             /* status of Partial convolution */
00087 
00088 
00089   /* Check for range of output samples to be calculated */
00090   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
00091   {
00092     /* Set status as ARM_MATH_ARGUMENT_ERROR */
00093     status = ARM_MATH_ARGUMENT_ERROR;
00094   }
00095   else
00096   {
00097 
00098     /* The algorithm implementation is based on the lengths of the inputs. */
00099     /* srcB is always made to slide across srcA. */
00100     /* So srcBLen is always considered as shorter or equal to srcALen */
00101     if(srcALen >= srcBLen)
00102     {
00103       /* Initialization of inputA pointer */
00104       pIn1 = pSrcA;
00105 
00106       /* Initialization of inputB pointer */
00107       pIn2 = pSrcB;
00108     }
00109     else
00110     {
00111       /* Initialization of inputA pointer */
00112       pIn1 = pSrcB;
00113 
00114       /* Initialization of inputB pointer */
00115       pIn2 = pSrcA;
00116 
00117       /* srcBLen is always considered as shorter or equal to srcALen */
00118       j = srcBLen;
00119       srcBLen = srcALen;
00120       srcALen = j;
00121     }
00122 
00123     /* Conditions to check which loopCounter holds    
00124      * the first and last indices of the output samples to be calculated. */
00125     check = firstIndex + numPoints;
00126     blockSize3 = ((int32_t) check - (int32_t) srcALen);
00127     blockSize3 = (blockSize3 > 0) ? blockSize3 : 0;
00128     blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
00129     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
00130                                      (int32_t) numPoints) : 0;
00131     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
00132                                     (int32_t) firstIndex);
00133     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
00134 
00135     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
00136     /* The function is internally    
00137      * divided into three stages according to the number of multiplications that has to be    
00138      * taken place between inputA samples and inputB samples. In the first stage of the    
00139      * algorithm, the multiplications increase by one for every iteration.    
00140      * In the second stage of the algorithm, srcBLen number of multiplications are done.    
00141      * In the third stage of the algorithm, the multiplications decrease by one    
00142      * for every iteration. */
00143 
00144     /* Set the output pointer to point to the firstIndex    
00145      * of the output sample to be calculated. */
00146     pOut = pDst + firstIndex;
00147 
00148     /* --------------------------    
00149      * Initializations of stage1    
00150      * -------------------------*/
00151 
00152     /* sum = x[0] * y[0]    
00153      * sum = x[0] * y[1] + x[1] * y[0]    
00154      * ....    
00155      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]    
00156      */
00157 
00158     /* In this stage the MAC operations are increased by 1 for every iteration.    
00159        The count variable holds the number of MAC operations performed.    
00160        Since the partial convolution starts from firstIndex    
00161        Number of Macs to be performed is firstIndex + 1 */
00162     count = 1u + firstIndex;
00163 
00164     /* Working pointer of inputA */
00165     px = pIn1;
00166 
00167     /* Working pointer of inputB */
00168     pSrc2 = pIn2 + firstIndex;
00169     py = pSrc2;
00170 
00171     /* ------------------------    
00172      * Stage1 process    
00173      * ----------------------*/
00174 
00175     /* The first loop starts here */
00176     while(blockSize1 > 0)
00177     {
00178       /* Accumulator is made zero for every iteration */
00179       sum = 0;
00180 
00181       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00182       k = count >> 2u;
00183 
00184       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.    
00185        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00186       while(k > 0u)
00187       {
00188         /* x[0] * y[srcBLen - 1] */
00189         sum = (q31_t) ((((q63_t) sum << 32) +
00190                         ((q63_t) * px++ * (*py--))) >> 32);
00191 
00192         /* x[1] * y[srcBLen - 2] */
00193         sum = (q31_t) ((((q63_t) sum << 32) +
00194                         ((q63_t) * px++ * (*py--))) >> 32);
00195 
00196         /* x[2] * y[srcBLen - 3] */
00197         sum = (q31_t) ((((q63_t) sum << 32) +
00198                         ((q63_t) * px++ * (*py--))) >> 32);
00199 
00200         /* x[3] * y[srcBLen - 4] */
00201         sum = (q31_t) ((((q63_t) sum << 32) +
00202                         ((q63_t) * px++ * (*py--))) >> 32);
00203 
00204         /* Decrement the loop counter */
00205         k--;
00206       }
00207 
00208       /* If the count is not a multiple of 4, compute any remaining MACs here.    
00209        ** No loop unrolling is used. */
00210       k = count % 0x4u;
00211 
00212       while(k > 0u)
00213       {
00214         /* Perform the multiply-accumulates */
00215         sum = (q31_t) ((((q63_t) sum << 32) +
00216                         ((q63_t) * px++ * (*py--))) >> 32);
00217 
00218         /* Decrement the loop counter */
00219         k--;
00220       }
00221 
00222       /* Store the result in the accumulator in the destination buffer. */
00223       *pOut++ = sum << 1;
00224 
00225       /* Update the inputA and inputB pointers for next MAC calculation */
00226       py = ++pSrc2;
00227       px = pIn1;
00228 
00229       /* Increment the MAC count */
00230       count++;
00231 
00232       /* Decrement the loop counter */
00233       blockSize1--;
00234     }
00235 
00236     /* --------------------------    
00237      * Initializations of stage2    
00238      * ------------------------*/
00239 
00240     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]    
00241      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]    
00242      * ....    
00243      * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]    
00244      */
00245 
00246     /* Working pointer of inputA */
00247     px = pIn1;
00248 
00249     /* Working pointer of inputB */
00250     pSrc2 = pIn2 + (srcBLen - 1u);
00251     py = pSrc2;
00252 
00253     /* count is index by which the pointer pIn1 to be incremented */
00254     count = 0u;
00255 
00256     /* -------------------    
00257      * Stage2 process    
00258      * ------------------*/
00259 
00260     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.    
00261      * So, to loop unroll over blockSize2,    
00262      * srcBLen should be greater than or equal to 4 */
00263     if(srcBLen >= 4u)
00264     {
00265       /* Loop unroll over blockSize2 */
00266       blkCnt = ((uint32_t) blockSize2 >> 2u);
00267 
00268       while(blkCnt > 0u)
00269       {
00270         /* Set all accumulators to zero */
00271         acc0 = 0;
00272         acc1 = 0;
00273         acc2 = 0;
00274         acc3 = 0;
00275 
00276         /* read x[0], x[1], x[2] samples */
00277         x0 = *(px++);
00278         x1 = *(px++);
00279         x2 = *(px++);
00280 
00281         /* Apply loop unrolling and compute 4 MACs simultaneously. */
00282         k = srcBLen >> 2u;
00283 
00284         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.    
00285          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00286         do
00287         {
00288           /* Read y[srcBLen - 1] sample */
00289           c0 = *(py--);
00290 
00291           /* Read x[3] sample */
00292           x3 = *(px++);
00293 
00294           /* Perform the multiply-accumulate */
00295           /* acc0 +=  x[0] * y[srcBLen - 1] */
00296           acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
00297 
00298           /* acc1 +=  x[1] * y[srcBLen - 1] */
00299           acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
00300 
00301           /* acc2 +=  x[2] * y[srcBLen - 1] */
00302           acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
00303 
00304           /* acc3 +=  x[3] * y[srcBLen - 1] */
00305           acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
00306 
00307           /* Read y[srcBLen - 2] sample */
00308           c0 = *(py--);
00309 
00310           /* Read x[4] sample */
00311           x0 = *(px++);
00312 
00313           /* Perform the multiply-accumulate */
00314           /* acc0 +=  x[1] * y[srcBLen - 2] */
00315           acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32);
00316           /* acc1 +=  x[2] * y[srcBLen - 2] */
00317           acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32);
00318           /* acc2 +=  x[3] * y[srcBLen - 2] */
00319           acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32);
00320           /* acc3 +=  x[4] * y[srcBLen - 2] */
00321           acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32);
00322 
00323           /* Read y[srcBLen - 3] sample */
00324           c0 = *(py--);
00325 
00326           /* Read x[5] sample */
00327           x1 = *(px++);
00328 
00329           /* Perform the multiply-accumulates */
00330           /* acc0 +=  x[2] * y[srcBLen - 3] */
00331           acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32);
00332           /* acc1 +=  x[3] * y[srcBLen - 2] */
00333           acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32);
00334           /* acc2 +=  x[4] * y[srcBLen - 2] */
00335           acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32);
00336           /* acc3 +=  x[5] * y[srcBLen - 2] */
00337           acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32);
00338 
00339           /* Read y[srcBLen - 4] sample */
00340           c0 = *(py--);
00341 
00342           /* Read x[6] sample */
00343           x2 = *(px++);
00344 
00345           /* Perform the multiply-accumulates */
00346           /* acc0 +=  x[3] * y[srcBLen - 4] */
00347           acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32);
00348           /* acc1 +=  x[4] * y[srcBLen - 4] */
00349           acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32);
00350           /* acc2 +=  x[5] * y[srcBLen - 4] */
00351           acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32);
00352           /* acc3 +=  x[6] * y[srcBLen - 4] */
00353           acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32);
00354 
00355 
00356         } while(--k);
00357 
00358         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.    
00359          ** No loop unrolling is used. */
00360         k = srcBLen % 0x4u;
00361 
00362         while(k > 0u)
00363         {
00364           /* Read y[srcBLen - 5] sample */
00365           c0 = *(py--);
00366 
00367           /* Read x[7] sample */
00368           x3 = *(px++);
00369 
00370           /* Perform the multiply-accumulates */
00371           /* acc0 +=  x[4] * y[srcBLen - 5] */
00372           acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
00373           /* acc1 +=  x[5] * y[srcBLen - 5] */
00374           acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
00375           /* acc2 +=  x[6] * y[srcBLen - 5] */
00376           acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
00377           /* acc3 +=  x[7] * y[srcBLen - 5] */
00378           acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
00379 
00380           /* Reuse the present samples for the next MAC */
00381           x0 = x1;
00382           x1 = x2;
00383           x2 = x3;
00384 
00385           /* Decrement the loop counter */
00386           k--;
00387         }
00388 
00389         /* Store the result in the accumulator in the destination buffer. */
00390         *pOut++ = (q31_t) (acc0 << 1);
00391         *pOut++ = (q31_t) (acc1 << 1);
00392         *pOut++ = (q31_t) (acc2 << 1);
00393         *pOut++ = (q31_t) (acc3 << 1);
00394 
00395         /* Increment the pointer pIn1 index, count by 4 */
00396         count += 4u;
00397 
00398         /* Update the inputA and inputB pointers for next MAC calculation */
00399         px = pIn1 + count;
00400         py = pSrc2;
00401 
00402         /* Decrement the loop counter */
00403         blkCnt--;
00404       }
00405 
00406       /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.    
00407        ** No loop unrolling is used. */
00408       blkCnt = (uint32_t) blockSize2 % 0x4u;
00409 
00410       while(blkCnt > 0u)
00411       {
00412         /* Accumulator is made zero for every iteration */
00413         sum = 0;
00414 
00415         /* Apply loop unrolling and compute 4 MACs simultaneously. */
00416         k = srcBLen >> 2u;
00417 
00418         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.    
00419          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00420         while(k > 0u)
00421         {
00422           /* Perform the multiply-accumulates */
00423           sum = (q31_t) ((((q63_t) sum << 32) +
00424                           ((q63_t) * px++ * (*py--))) >> 32);
00425           sum = (q31_t) ((((q63_t) sum << 32) +
00426                           ((q63_t) * px++ * (*py--))) >> 32);
00427           sum = (q31_t) ((((q63_t) sum << 32) +
00428                           ((q63_t) * px++ * (*py--))) >> 32);
00429           sum = (q31_t) ((((q63_t) sum << 32) +
00430                           ((q63_t) * px++ * (*py--))) >> 32);
00431 
00432           /* Decrement the loop counter */
00433           k--;
00434         }
00435 
00436         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.    
00437          ** No loop unrolling is used. */
00438         k = srcBLen % 0x4u;
00439 
00440         while(k > 0u)
00441         {
00442           /* Perform the multiply-accumulate */
00443           sum = (q31_t) ((((q63_t) sum << 32) +
00444                           ((q63_t) * px++ * (*py--))) >> 32);
00445 
00446           /* Decrement the loop counter */
00447           k--;
00448         }
00449 
00450         /* Store the result in the accumulator in the destination buffer. */
00451         *pOut++ = sum << 1;
00452 
00453         /* Increment the MAC count */
00454         count++;
00455 
00456         /* Update the inputA and inputB pointers for next MAC calculation */
00457         px = pIn1 + count;
00458         py = pSrc2;
00459 
00460         /* Decrement the loop counter */
00461         blkCnt--;
00462       }
00463     }
00464     else
00465     {
00466       /* If the srcBLen is not a multiple of 4,    
00467        * the blockSize2 loop cannot be unrolled by 4 */
00468       blkCnt = (uint32_t) blockSize2;
00469 
00470       while(blkCnt > 0u)
00471       {
00472         /* Accumulator is made zero for every iteration */
00473         sum = 0;
00474 
00475         /* srcBLen number of MACS should be performed */
00476         k = srcBLen;
00477 
00478         while(k > 0u)
00479         {
00480           /* Perform the multiply-accumulate */
00481           sum = (q31_t) ((((q63_t) sum << 32) +
00482                           ((q63_t) * px++ * (*py--))) >> 32);
00483 
00484           /* Decrement the loop counter */
00485           k--;
00486         }
00487 
00488         /* Store the result in the accumulator in the destination buffer. */
00489         *pOut++ = sum << 1;
00490 
00491         /* Increment the MAC count */
00492         count++;
00493 
00494         /* Update the inputA and inputB pointers for next MAC calculation */
00495         px = pIn1 + count;
00496         py = pSrc2;
00497 
00498         /* Decrement the loop counter */
00499         blkCnt--;
00500       }
00501     }
00502 
00503 
00504     /* --------------------------    
00505      * Initializations of stage3    
00506      * -------------------------*/
00507 
00508     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]    
00509      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]    
00510      * ....    
00511      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]    
00512      * sum +=  x[srcALen-1] * y[srcBLen-1]    
00513      */
00514 
00515     /* In this stage the MAC operations are decreased by 1 for every iteration.    
00516        The count variable holds the number of MAC operations performed */
00517     count = srcBLen - 1u;
00518 
00519     /* Working pointer of inputA */
00520     pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
00521     px = pSrc1;
00522 
00523     /* Working pointer of inputB */
00524     pSrc2 = pIn2 + (srcBLen - 1u);
00525     py = pSrc2;
00526 
00527     /* -------------------    
00528      * Stage3 process    
00529      * ------------------*/
00530 
00531     while(blockSize3 > 0)
00532     {
00533       /* Accumulator is made zero for every iteration */
00534       sum = 0;
00535 
00536       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00537       k = count >> 2u;
00538 
00539       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.    
00540        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00541       while(k > 0u)
00542       {
00543         /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
00544         sum = (q31_t) ((((q63_t) sum << 32) +
00545                         ((q63_t) * px++ * (*py--))) >> 32);
00546 
00547         /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
00548         sum = (q31_t) ((((q63_t) sum << 32) +
00549                         ((q63_t) * px++ * (*py--))) >> 32);
00550 
00551         /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
00552         sum = (q31_t) ((((q63_t) sum << 32) +
00553                         ((q63_t) * px++ * (*py--))) >> 32);
00554 
00555         /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
00556         sum = (q31_t) ((((q63_t) sum << 32) +
00557                         ((q63_t) * px++ * (*py--))) >> 32);
00558 
00559         /* Decrement the loop counter */
00560         k--;
00561       }
00562 
00563       /* If the count is not a multiple of 4, compute any remaining MACs here.    
00564        ** No loop unrolling is used. */
00565       k = count % 0x4u;
00566 
00567       while(k > 0u)
00568       {
00569         /* Perform the multiply-accumulates */
00570         /* sum +=  x[srcALen-1] * y[srcBLen-1] */
00571         sum = (q31_t) ((((q63_t) sum << 32) +
00572                         ((q63_t) * px++ * (*py--))) >> 32);
00573 
00574         /* Decrement the loop counter */
00575         k--;
00576       }
00577 
00578       /* Store the result in the accumulator in the destination buffer. */
00579       *pOut++ = sum << 1;
00580 
00581       /* Update the inputA and inputB pointers for next MAC calculation */
00582       px = ++pSrc1;
00583       py = pSrc2;
00584 
00585       /* Decrement the MAC count */
00586       count--;
00587 
00588       /* Decrement the loop counter */
00589       blockSize3--;
00590 
00591     }
00592 
00593     /* set status as ARM_MATH_SUCCESS */
00594     status = ARM_MATH_SUCCESS;
00595   }
00596 
00597   /* Return to application */
00598   return (status);
00599 
00600 }
00601 
00602 /**    
00603  * @} end of PartialConv group    
00604  */