Knight KE / Mbed OS Game_Master
Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_fast_q15.c Source File

arm_conv_fast_q15.c

00001 /* ----------------------------------------------------------------------   
00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.   
00003 *   
00004 * $Date:        19. March 2015
00005 * $Revision:    V.1.4.5
00006 *   
00007 * Project:      CMSIS DSP Library   
00008 * Title:        arm_conv_fast_q15.c   
00009 *   
00010 * Description:  Fast Q15 Convolution.   
00011 *   
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 * 
00014 * Redistribution and use in source and binary forms, with or without 
00015 * modification, are permitted provided that the following conditions
00016 * are met:
00017 *   - Redistributions of source code must retain the above copyright
00018 *     notice, this list of conditions and the following disclaimer.
00019 *   - Redistributions in binary form must reproduce the above copyright
00020 *     notice, this list of conditions and the following disclaimer in
00021 *     the documentation and/or other materials provided with the 
00022 *     distribution.
00023 *   - Neither the name of ARM LIMITED nor the names of its contributors
00024 *     may be used to endorse or promote products derived from this
00025 *     software without specific prior written permission.
00026 *
00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00038 * POSSIBILITY OF SUCH DAMAGE.    
00039 * -------------------------------------------------------------------- */
00040 
00041 #include "arm_math.h"
00042 
00043 /**   
00044  * @ingroup groupFilters   
00045  */
00046 
00047 /**   
00048  * @addtogroup Conv   
00049  * @{   
00050  */
00051 
00052 /**   
00053  * @brief Convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.   
00054  * @param[in] *pSrcA points to the first input sequence.   
00055  * @param[in] srcALen length of the first input sequence.   
00056  * @param[in] *pSrcB points to the second input sequence.   
00057  * @param[in] srcBLen length of the second input sequence.   
00058  * @param[out] *pDst points to the location where the output result is written.  Length srcALen+srcBLen-1.   
00059  * @return none.   
00060  *   
00061  * <b>Scaling and Overflow Behavior:</b>   
00062  *   
00063  * \par   
00064  * This fast version uses a 32-bit accumulator with 2.30 format.   
00065  * The accumulator maintains full precision of the intermediate multiplication results   
00066  * but provides only a single guard bit. There is no saturation on intermediate additions.   
00067  * Thus, if the accumulator overflows it wraps around and distorts the result.   
00068  * The input signals should be scaled down to avoid intermediate overflows.   
00069  * Scale down the inputs by log2(min(srcALen, srcBLen)) bits (log2 is the base-2 logarithm) to avoid overflows,   
00070  * as at most min(srcALen, srcBLen) additions are carried out internally.   
00071  * The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.   
00072  *   
00073  * \par   
00074  * See <code>arm_conv_q15()</code> for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.   
00075  */
00076 
00077 void arm_conv_fast_q15(
00078   q15_t * pSrcA,
00079   uint32_t srcALen,
00080   q15_t * pSrcB,
00081   uint32_t srcBLen,
00082   q15_t * pDst)
00083 {
00084 #ifndef UNALIGNED_SUPPORT_DISABLE
00085   q15_t *pIn1;                                   /* inputA pointer */
00086   q15_t *pIn2;                                   /* inputB pointer */
00087   q15_t *pOut = pDst;                            /* output pointer */
00088   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulator */
00089   q15_t *px;                                     /* Intermediate inputA pointer  */
00090   q15_t *py;                                     /* Intermediate inputB pointer  */
00091   q15_t *pSrc1, *pSrc2;                          /* Intermediate pointers */
00092   q31_t x0, x1, x2, x3, c0;                      /* Temporary variables to hold state and coefficient values */
00093   uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt;     /* loop counter */
00094 
00095   /* The algorithm implementation is based on the lengths of the inputs. */
00096   /* srcB is always made to slide across srcA. */
00097   /* So srcBLen is always considered as shorter or equal to srcALen */
00098   if(srcALen >= srcBLen)
00099   {
00100     /* Initialization of inputA pointer */
00101     pIn1 = pSrcA;
00102 
00103     /* Initialization of inputB pointer */
00104     pIn2 = pSrcB;
00105   }
00106   else
00107   {
00108     /* Initialization of inputA pointer */
00109     pIn1 = pSrcB;
00110 
00111     /* Initialization of inputB pointer */
00112     pIn2 = pSrcA;
00113 
00114     /* srcBLen is always considered as shorter or equal to srcALen */
00115     j = srcBLen;
00116     srcBLen = srcALen;
00117     srcALen = j;
00118   }
00119 
00120   /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
00121   /* The function is internally   
00122    * divided into three stages according to the number of multiplications that have to   
00123    * take place between inputA samples and inputB samples. In the first stage of the   
00124    * algorithm, the number of multiplications increases by one for every iteration.   
00125    * In the second stage of the algorithm, srcBLen multiplications are done per iteration.   
00126    * In the third stage of the algorithm, the number of multiplications decreases by one   
00127    * for every iteration. */
00128 
00129   /* The algorithm is implemented in three stages.   
00130      The loop counters of each stage is initiated here. */
00131   blockSize1 = srcBLen - 1u;
00132   blockSize2 = srcALen - (srcBLen - 1u);
00133   blockSize3 = blockSize1;
00134 
00135   /* --------------------------   
00136    * Initializations of stage1   
00137    * -------------------------*/
00138 
00139   /* sum = x[0] * y[0]   
00140    * sum = x[0] * y[1] + x[1] * y[0]   
00141    * ....   
00142    * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]   
00143    */
00144 
00145   /* In this stage the MAC operations are increased by 1 for every iteration.   
00146      The count variable holds the number of MAC operations performed */
00147   count = 1u;
00148 
00149   /* Working pointer of inputA */
00150   px = pIn1;
00151 
00152   /* Working pointer of inputB */
00153   py = pIn2;
00154 
00155 
00156   /* ------------------------   
00157    * Stage1 process   
00158    * ----------------------*/
00159 
00160   /* For loop unrolling by 4, this stage is divided into two. */
00161   /* First part of this stage computes the MAC operations less than 4 */
00162   /* Second part of this stage computes the MAC operations greater than or equal to 4 */
00163 
00164   /* The first part of the stage starts here */
00165   while((count < 4u) && (blockSize1 > 0u))
00166   {
00167     /* Accumulator is made zero for every iteration */
00168     sum = 0;
00169 
00170     /* Loop over number of MAC operations between   
00171      * inputA samples and inputB samples */
00172     k = count;
00173 
00174     while(k > 0u)
00175     {
00176       /* Perform the multiply-accumulates */
00177       sum = __SMLAD(*px++, *py--, sum);
00178 
00179       /* Decrement the loop counter */
00180       k--;
00181     }
00182 
00183     /* Store the result in the accumulator in the destination buffer. */
00184     *pOut++ = (q15_t) (sum >> 15);
00185 
00186     /* Update the inputA and inputB pointers for next MAC calculation */
00187     py = pIn2 + count;
00188     px = pIn1;
00189 
00190     /* Increment the MAC count */
00191     count++;
00192 
00193     /* Decrement the loop counter */
00194     blockSize1--;
00195   }
00196 
00197   /* The second part of the stage starts here */
00198   /* The internal loop, over count, is unrolled by 4 */
00199   /* To read two adjacent inputB samples with a single SIMD access,   
00200    * py is decremented by 1 so that it points at the lower of the pair */
00201   py = py - 1;
00202 
00203   while(blockSize1 > 0u)
00204   {
00205     /* Accumulator is made zero for every iteration */
00206     sum = 0;
00207 
00208     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00209     k = count >> 2u;
00210 
00211     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00212      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00213     while(k > 0u)
00214     {
00215       /* Perform the multiply-accumulates */
00216       /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
00217       sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00218       /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
00219       sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00220 
00221       /* Decrement the loop counter */
00222       k--;
00223     }
00224 
00225     /* For the next MAC operations, the pointer py is used without SIMD   
00226      * So, py is incremented by 1 */
00227     py = py + 1u;
00228 
00229     /* If the count is not a multiple of 4, compute any remaining MACs here.   
00230      ** No loop unrolling is used. */
00231     k = count % 0x4u;
00232 
00233     while(k > 0u)
00234     {
00235       /* Perform the multiply-accumulates */
00236       sum = __SMLAD(*px++, *py--, sum);
00237 
00238       /* Decrement the loop counter */
00239       k--;
00240     }
00241 
00242     /* Store the result in the accumulator in the destination buffer. */
00243     *pOut++ = (q15_t) (sum >> 15);
00244 
00245     /* Update the inputA and inputB pointers for next MAC calculation */
00246     py = pIn2 + (count - 1u);
00247     px = pIn1;
00248 
00249     /* Increment the MAC count */
00250     count++;
00251 
00252     /* Decrement the loop counter */
00253     blockSize1--;
00254   }
00255 
00256   /* --------------------------   
00257    * Initializations of stage2   
00258    * ------------------------*/
00259 
00260   /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]   
00261    * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]   
00262    * ....   
00263    * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]   
00264    */
00265 
00266   /* Working pointer of inputA */
00267   px = pIn1;
00268 
00269   /* Working pointer of inputB */
00270   pSrc2 = pIn2 + (srcBLen - 1u);
00271   py = pSrc2;
00272 
00273   /* count is the index by which the pointer pIn1 to be incremented */
00274   count = 0u;
00275 
00276 
00277   /* --------------------   
00278    * Stage2 process   
00279    * -------------------*/
00280 
00281   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.   
00282    * So, to loop unroll over blockSize2,   
00283    * srcBLen should be greater than or equal to 4 */
00284   if(srcBLen >= 4u)
00285   {
00286     /* Loop unroll over blockSize2, by 4 */
00287     blkCnt = blockSize2 >> 2u;
00288 
00289     while(blkCnt > 0u)
00290     {
00291       py = py - 1u;
00292 
00293       /* Set all accumulators to zero */
00294       acc0 = 0;
00295       acc1 = 0;
00296       acc2 = 0;
00297       acc3 = 0;
00298 
00299 
00300       /* read x[0], x[1] samples */
00301       x0 = *__SIMD32(px);
00302       /* read x[1], x[2] samples */
00303       x1 = _SIMD32_OFFSET(px+1);
00304       px+= 2u;
00305 
00306 
00307       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00308       k = srcBLen >> 2u;
00309 
00310       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00311        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00312       do
00313       {
00314         /* Read the last two inputB samples using SIMD:   
00315          * y[srcBLen - 1] and y[srcBLen - 2] */
00316         c0 = *__SIMD32(py)--;
00317 
00318         /* acc0 +=  x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
00319         acc0 = __SMLADX(x0, c0, acc0);
00320 
00321         /* acc1 +=  x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
00322         acc1 = __SMLADX(x1, c0, acc1);
00323 
00324         /* Read x[2], x[3] */
00325         x2 = *__SIMD32(px);
00326 
00327         /* Read x[3], x[4] */
00328         x3 = _SIMD32_OFFSET(px+1);
00329 
00330         /* acc2 +=  x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
00331         acc2 = __SMLADX(x2, c0, acc2);
00332 
00333         /* acc3 +=  x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
00334         acc3 = __SMLADX(x3, c0, acc3);
00335 
00336         /* Read y[srcBLen - 3] and y[srcBLen - 4] */
00337         c0 = *__SIMD32(py)--;
00338 
00339         /* acc0 +=  x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
00340         acc0 = __SMLADX(x2, c0, acc0);
00341 
00342         /* acc1 +=  x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
00343         acc1 = __SMLADX(x3, c0, acc1);
00344 
00345         /* Read x[4], x[5] */
00346         x0 = _SIMD32_OFFSET(px+2);
00347 
00348         /* Read x[5], x[6] */
00349         x1 = _SIMD32_OFFSET(px+3);
00350         px += 4u;
00351 
00352         /* acc2 +=  x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
00353         acc2 = __SMLADX(x0, c0, acc2);
00354 
00355         /* acc3 +=  x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
00356         acc3 = __SMLADX(x1, c0, acc3);
00357 
00358       } while(--k);
00359 
00360       /* For the next MAC operations, SIMD is not used   
00361        * So, the 16 bit pointer if inputB, py is updated */
00362 
00363       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.   
00364        ** No loop unrolling is used. */
00365       k = srcBLen % 0x4u;
00366 
00367       if(k == 1u)
00368       {
00369         /* Read y[srcBLen - 5] */
00370         c0 = *(py+1);
00371 
00372 #ifdef  ARM_MATH_BIG_ENDIAN
00373 
00374         c0 = c0 << 16u;
00375 
00376 #else
00377 
00378         c0 = c0 & 0x0000FFFF;
00379 
00380 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
00381 
00382         /* Read x[7] */
00383         x3 = *__SIMD32(px);
00384         px++;
00385 
00386         /* Perform the multiply-accumulates */
00387         acc0 = __SMLAD(x0, c0, acc0);
00388         acc1 = __SMLAD(x1, c0, acc1);
00389         acc2 = __SMLADX(x1, c0, acc2);
00390         acc3 = __SMLADX(x3, c0, acc3);
00391       }
00392 
00393       if(k == 2u)
00394       {
00395         /* Read y[srcBLen - 5], y[srcBLen - 6] */
00396         c0 = _SIMD32_OFFSET(py);
00397 
00398         /* Read x[7], x[8] */
00399         x3 = *__SIMD32(px);
00400 
00401         /* Read x[9] */
00402         x2 = _SIMD32_OFFSET(px+1);
00403         px += 2u;
00404 
00405         /* Perform the multiply-accumulates */
00406         acc0 = __SMLADX(x0, c0, acc0);
00407         acc1 = __SMLADX(x1, c0, acc1);
00408         acc2 = __SMLADX(x3, c0, acc2);
00409         acc3 = __SMLADX(x2, c0, acc3);
00410       }
00411 
00412       if(k == 3u)
00413       {
00414         /* Read y[srcBLen - 5], y[srcBLen - 6] */
00415         c0 = _SIMD32_OFFSET(py);
00416 
00417         /* Read x[7], x[8] */
00418         x3 = *__SIMD32(px);
00419 
00420         /* Read x[9] */
00421         x2 = _SIMD32_OFFSET(px+1);
00422 
00423         /* Perform the multiply-accumulates */
00424         acc0 = __SMLADX(x0, c0, acc0);
00425         acc1 = __SMLADX(x1, c0, acc1);
00426         acc2 = __SMLADX(x3, c0, acc2);
00427         acc3 = __SMLADX(x2, c0, acc3);
00428 
00429         /* Read y[srcBLen - 7] */
00430         c0 = *(py-1);
00431 #ifdef  ARM_MATH_BIG_ENDIAN
00432 
00433         c0 = c0 << 16u;
00434 #else
00435 
00436         c0 = c0 & 0x0000FFFF;
00437 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
00438 
00439         /* Read x[10] */
00440         x3 =  _SIMD32_OFFSET(px+2);
00441         px += 3u;
00442 
00443         /* Perform the multiply-accumulates */
00444         acc0 = __SMLADX(x1, c0, acc0);
00445         acc1 = __SMLAD(x2, c0, acc1);
00446         acc2 = __SMLADX(x2, c0, acc2);
00447         acc3 = __SMLADX(x3, c0, acc3);
00448       }
00449 
00450       /* Store the results in the accumulators in the destination buffer. */
00451 #ifndef ARM_MATH_BIG_ENDIAN
00452 
00453       *__SIMD32(pOut)++ = __PKHBT((acc0 >> 15), (acc1 >> 15), 16);
00454       *__SIMD32(pOut)++ = __PKHBT((acc2 >> 15), (acc3 >> 15), 16);
00455 
00456 #else
00457 
00458       *__SIMD32(pOut)++ = __PKHBT((acc1 >> 15), (acc0 >> 15), 16);
00459       *__SIMD32(pOut)++ = __PKHBT((acc3 >> 15), (acc2 >> 15), 16);
00460 
00461 #endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */
00462 
00463       /* Increment the pointer pIn1 index, count by 4 */
00464       count += 4u;
00465 
00466       /* Update the inputA and inputB pointers for next MAC calculation */
00467       px = pIn1 + count;
00468       py = pSrc2;
00469 
00470       /* Decrement the loop counter */
00471       blkCnt--;
00472     }
00473 
00474     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.   
00475      ** No loop unrolling is used. */
00476     blkCnt = blockSize2 % 0x4u;
00477 
00478     while(blkCnt > 0u)
00479     {
00480       /* Accumulator is made zero for every iteration */
00481       sum = 0;
00482 
00483       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00484       k = srcBLen >> 2u;
00485 
00486       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00487        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00488       while(k > 0u)
00489       {
00490         /* Perform the multiply-accumulates */
00491         sum += ((q31_t) * px++ * *py--);
00492         sum += ((q31_t) * px++ * *py--);
00493         sum += ((q31_t) * px++ * *py--);
00494         sum += ((q31_t) * px++ * *py--);
00495 
00496         /* Decrement the loop counter */
00497         k--;
00498       }
00499 
00500       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.   
00501        ** No loop unrolling is used. */
00502       k = srcBLen % 0x4u;
00503 
00504       while(k > 0u)
00505       {
00506         /* Perform the multiply-accumulates */
00507         sum += ((q31_t) * px++ * *py--);
00508 
00509         /* Decrement the loop counter */
00510         k--;
00511       }
00512 
00513       /* Store the result in the accumulator in the destination buffer. */
00514       *pOut++ = (q15_t) (sum >> 15);
00515 
00516       /* Increment the pointer pIn1 index, count by 1 */
00517       count++;
00518 
00519       /* Update the inputA and inputB pointers for next MAC calculation */
00520       px = pIn1 + count;
00521       py = pSrc2;
00522 
00523       /* Decrement the loop counter */
00524       blkCnt--;
00525     }
00526   }
00527   else
00528   {
00529     /* If srcBLen is less than 4,   
00530      * the blockSize2 loop is not unrolled by 4 */
00531     blkCnt = blockSize2;
00532 
00533     while(blkCnt > 0u)
00534     {
00535       /* Accumulator is made zero for every iteration */
00536       sum = 0;
00537 
00538       /* srcBLen number of MACS should be performed */
00539       k = srcBLen;
00540 
00541       while(k > 0u)
00542       {
00543         /* Perform the multiply-accumulate */
00544         sum += ((q31_t) * px++ * *py--);
00545 
00546         /* Decrement the loop counter */
00547         k--;
00548       }
00549 
00550       /* Store the result in the accumulator in the destination buffer. */
00551       *pOut++ = (q15_t) (sum >> 15);
00552 
00553       /* Increment the MAC count */
00554       count++;
00555 
00556       /* Update the inputA and inputB pointers for next MAC calculation */
00557       px = pIn1 + count;
00558       py = pSrc2;
00559 
00560       /* Decrement the loop counter */
00561       blkCnt--;
00562     }
00563   }
00564 
00565 
00566   /* --------------------------   
00567    * Initializations of stage3   
00568    * -------------------------*/
00569 
00570   /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]   
00571    * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]   
00572    * ....   
00573    * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]   
00574    * sum +=  x[srcALen-1] * y[srcBLen-1]   
00575    */
00576 
00577   /* In this stage the MAC operations are decreased by 1 for every iteration.   
00578      The blockSize3 variable holds the number of MAC operations performed */
00579 
00580   /* Working pointer of inputA */
00581   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
00582   px = pSrc1;
00583 
00584   /* Working pointer of inputB */
00585   pSrc2 = pIn2 + (srcBLen - 1u);
00586   pIn2 = pSrc2 - 1u;
00587   py = pIn2;
00588 
00589   /* -------------------   
00590    * Stage3 process   
00591    * ------------------*/
00592 
00593   /* For loop unrolling by 4, this stage is divided into two. */
00594   /* First part of this stage computes the MAC operations greater than 4 */
00595   /* Second part of this stage computes the MAC operations less than or equal to 4 */
00596 
00597   /* The first part of the stage starts here */
00598   j = blockSize3 >> 2u;
00599 
00600   while((j > 0u) && (blockSize3 > 0u))
00601   {
00602     /* Accumulator is made zero for every iteration */
00603     sum = 0;
00604 
00605     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00606     k = blockSize3 >> 2u;
00607 
00608     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00609      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00610     while(k > 0u)
00611     {
00612       /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied   
00613        * with y[srcBLen - 1], y[srcBLen - 2] respectively */
00614       sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00615       /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied   
00616        * with y[srcBLen - 3], y[srcBLen - 4] respectively */
00617       sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00618 
00619       /* Decrement the loop counter */
00620       k--;
00621     }
00622 
00623     /* For the next MAC operations, the pointer py is used without SIMD   
00624      * So, py is incremented by 1 */
00625     py = py + 1u;
00626 
00627     /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.   
00628      ** No loop unrolling is used. */
00629     k = blockSize3 % 0x4u;
00630 
00631     while(k > 0u)
00632     {
00633       /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
00634       sum = __SMLAD(*px++, *py--, sum);
00635 
00636       /* Decrement the loop counter */
00637       k--;
00638     }
00639 
00640     /* Store the result in the accumulator in the destination buffer. */
00641     *pOut++ = (q15_t) (sum >> 15);
00642 
00643     /* Update the inputA and inputB pointers for next MAC calculation */
00644     px = ++pSrc1;
00645     py = pIn2;
00646 
00647     /* Decrement the loop counter */
00648     blockSize3--;
00649 
00650     j--;
00651   }
00652 
00653   /* The second part of the stage starts here */
00654   /* SIMD is not used for the next MAC operations,   
00655    * so pointer py is updated to read only one sample at a time */
00656   py = py + 1u;
00657 
00658   while(blockSize3 > 0u)
00659   {
00660     /* Accumulator is made zero for every iteration */
00661     sum = 0;
00662 
00663     /* blockSize3 MACs remain for this output; no loop unrolling is used. */
00664     k = blockSize3;
00665 
00666     while(k > 0u)
00667     {
00668       /* Perform the multiply-accumulates */
00669       /* sum +=  x[srcALen-1] * y[srcBLen-1] */
00670       sum = __SMLAD(*px++, *py--, sum);
00671 
00672       /* Decrement the loop counter */
00673       k--;
00674     }
00675 
00676     /* Store the result in the accumulator in the destination buffer. */
00677     *pOut++ = (q15_t) (sum >> 15);
00678 
00679     /* Update the inputA and inputB pointers for next MAC calculation */
00680     px = ++pSrc1;
00681     py = pSrc2;
00682 
00683     /* Decrement the loop counter */
00684     blockSize3--;
00685   }
00686 
00687 #else
00688   q15_t *pIn1;                                   /* inputA pointer */
00689   q15_t *pIn2;                                   /* inputB pointer */
00690   q15_t *pOut = pDst;                            /* output pointer */
00691   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulator */
00692   q15_t *px;                                     /* Intermediate inputA pointer  */
00693   q15_t *py;                                     /* Intermediate inputB pointer  */
00694   q15_t *pSrc1, *pSrc2;                          /* Intermediate pointers */
00695   q31_t x0, x1, x2, x3, c0;                      /* Temporary variables to hold state and coefficient values */
00696   uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt;     /* loop counter */
00697   q15_t a, b;
00698 
00699   /* The algorithm implementation is based on the lengths of the inputs. */
00700   /* srcB is always made to slide across srcA. */
00701   /* So srcBLen is always considered as shorter or equal to srcALen */
00702   if(srcALen >= srcBLen)
00703   {
00704     /* Initialization of inputA pointer */
00705     pIn1 = pSrcA;
00706 
00707     /* Initialization of inputB pointer */
00708     pIn2 = pSrcB;
00709   }
00710   else
00711   {
00712     /* Initialization of inputA pointer */
00713     pIn1 = pSrcB;
00714 
00715     /* Initialization of inputB pointer */
00716     pIn2 = pSrcA;
00717 
00718     /* srcBLen is always considered as shorter or equal to srcALen */
00719     j = srcBLen;
00720     srcBLen = srcALen;
00721     srcALen = j;
00722   }
00723 
00724   /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
00725   /* The function is internally   
00726    * divided into three stages according to the number of multiplications that have to   
00727    * take place between inputA samples and inputB samples. In the first stage of the   
00728    * algorithm, the number of multiplications increases by one for every iteration.   
00729    * In the second stage of the algorithm, srcBLen multiplications are done per iteration.   
00730    * In the third stage of the algorithm, the number of multiplications decreases by one   
00731    * for every iteration. */
00732 
00733   /* The algorithm is implemented in three stages.   
00734      The loop counters of each stage is initiated here. */
00735   blockSize1 = srcBLen - 1u;
00736   blockSize2 = srcALen - (srcBLen - 1u);
00737   blockSize3 = blockSize1;
00738 
00739   /* --------------------------   
00740    * Initializations of stage1   
00741    * -------------------------*/
00742 
00743   /* sum = x[0] * y[0]   
00744    * sum = x[0] * y[1] + x[1] * y[0]   
00745    * ....   
00746    * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]   
00747    */
00748 
00749   /* In this stage the MAC operations are increased by 1 for every iteration.   
00750      The count variable holds the number of MAC operations performed */
00751   count = 1u;
00752 
00753   /* Working pointer of inputA */
00754   px = pIn1;
00755 
00756   /* Working pointer of inputB */
00757   py = pIn2;
00758 
00759 
00760   /* ------------------------   
00761    * Stage1 process   
00762    * ----------------------*/
00763 
00764   /* For loop unrolling by 4, this stage is divided into two. */
00765   /* First part of this stage computes the MAC operations less than 4 */
00766   /* Second part of this stage computes the MAC operations greater than or equal to 4 */
00767 
00768   /* The first part of the stage starts here */
00769   while((count < 4u) && (blockSize1 > 0u))
00770   {
00771     /* Accumulator is made zero for every iteration */
00772     sum = 0;
00773 
00774     /* Loop over number of MAC operations between   
00775      * inputA samples and inputB samples */
00776     k = count;
00777 
00778     while(k > 0u)
00779     {
00780       /* Perform the multiply-accumulates */
00781       sum += ((q31_t) * px++ * *py--);
00782 
00783       /* Decrement the loop counter */
00784       k--;
00785     }
00786 
00787     /* Store the result in the accumulator in the destination buffer. */
00788     *pOut++ = (q15_t) (sum >> 15);
00789 
00790     /* Update the inputA and inputB pointers for next MAC calculation */
00791     py = pIn2 + count;
00792     px = pIn1;
00793 
00794     /* Increment the MAC count */
00795     count++;
00796 
00797     /* Decrement the loop counter */
00798     blockSize1--;
00799   }
00800 
00801   /* The second part of the stage starts here */
00802   /* The internal loop, over count, is unrolled by 4 */
00803   /* To, read the last two inputB samples using SIMD:   
00804    * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
00805   py = py - 1;
00806 
00807   while(blockSize1 > 0u)
00808   {
00809     /* Accumulator is made zero for every iteration */
00810     sum = 0;
00811 
00812     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00813     k = count >> 2u;
00814 
00815     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00816      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00817     py++;
00818 
00819     while(k > 0u)
00820     {
00821       /* Perform the multiply-accumulates */
00822         sum += ((q31_t) * px++ * *py--);
00823         sum += ((q31_t) * px++ * *py--);
00824         sum += ((q31_t) * px++ * *py--);
00825         sum += ((q31_t) * px++ * *py--);
00826 
00827       /* Decrement the loop counter */
00828       k--;
00829     }
00830 
00831     /* If the count is not a multiple of 4, compute any remaining MACs here.   
00832      ** No loop unrolling is used. */
00833     k = count % 0x4u;
00834 
00835     while(k > 0u)
00836     {
00837       /* Perform the multiply-accumulates */
00838       sum += ((q31_t) * px++ * *py--);
00839 
00840       /* Decrement the loop counter */
00841       k--;
00842     }
00843 
00844     /* Store the result in the accumulator in the destination buffer. */
00845     *pOut++ = (q15_t) (sum >> 15);
00846 
00847     /* Update the inputA and inputB pointers for next MAC calculation */
00848     py = pIn2 + (count - 1u);
00849     px = pIn1;
00850 
00851     /* Increment the MAC count */
00852     count++;
00853 
00854     /* Decrement the loop counter */
00855     blockSize1--;
00856   }
00857 
00858   /* --------------------------   
00859    * Initializations of stage2   
00860    * ------------------------*/
00861 
00862   /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]   
00863    * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]   
00864    * ....   
00865    * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]   
00866    */
00867 
00868   /* Working pointer of inputA */
00869   px = pIn1;
00870 
00871   /* Working pointer of inputB */
00872   pSrc2 = pIn2 + (srcBLen - 1u);
00873   py = pSrc2;
00874 
00875   /* count is the index by which the pointer pIn1 is to be incremented */
00876   count = 0u;
00877 
00878 
00879   /* --------------------   
00880    * Stage2 process   
00881    * -------------------*/
00882 
00883   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.   
00884    * So, to loop unroll over blockSize2,   
00885    * srcBLen should be greater than or equal to 4 */
00886   if(srcBLen >= 4u)
00887   {
00888     /* Loop unroll over blockSize2, by 4 */
00889     blkCnt = blockSize2 >> 2u;
00890 
00891     while(blkCnt > 0u)
00892     {
00893       py = py - 1u;
00894 
00895       /* Set all accumulators to zero */
00896       acc0 = 0;
00897       acc1 = 0;
00898       acc2 = 0;
00899       acc3 = 0;   
00900 
00901       /* read x[0], x[1] samples */
00902       a = *px++;
00903       b = *px++;
00904 
00905 #ifndef ARM_MATH_BIG_ENDIAN
00906     
00907       x0 = __PKHBT(a, b, 16);
00908       a = *px;
00909       x1 = __PKHBT(b, a, 16);
00910 
00911 #else
00912 
00913       x0 = __PKHBT(b, a, 16);
00914       a = *px;
00915       x1 = __PKHBT(a, b, 16);
00916 
00917 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN    */
00918 
00919       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00920       k = srcBLen >> 2u;
00921 
00922       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00923        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00924       do
00925       {
00926         /* Read the last two inputB samples using SIMD:   
00927          * y[srcBLen - 1] and y[srcBLen - 2] */
00928         a = *py;
00929         b = *(py+1);
00930         py -= 2;
00931 
00932 #ifndef ARM_MATH_BIG_ENDIAN
00933 
00934         c0 = __PKHBT(a, b, 16);
00935 
00936 #else
00937 
00938         c0 = __PKHBT(b, a, 16);;
00939 
00940 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
00941 
00942         /* acc0 +=  x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
00943         acc0 = __SMLADX(x0, c0, acc0);
00944 
00945         /* acc1 +=  x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
00946         acc1 = __SMLADX(x1, c0, acc1);
00947 
00948       a = *px;
00949       b = *(px + 1);
00950 
00951 #ifndef ARM_MATH_BIG_ENDIAN
00952     
00953       x2 = __PKHBT(a, b, 16);
00954       a = *(px + 2);
00955       x3 = __PKHBT(b, a, 16);
00956 
00957 #else
00958 
00959       x2 = __PKHBT(b, a, 16);
00960       a = *(px + 2);
00961       x3 = __PKHBT(a, b, 16);
00962 
00963 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN    */
00964 
00965         /* acc2 +=  x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
00966         acc2 = __SMLADX(x2, c0, acc2);
00967 
00968         /* acc3 +=  x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
00969         acc3 = __SMLADX(x3, c0, acc3);
00970 
00971         /* Read y[srcBLen - 3] and y[srcBLen - 4] */
00972         a = *py;
00973         b = *(py+1);
00974         py -= 2;
00975 
00976 #ifndef ARM_MATH_BIG_ENDIAN
00977 
00978         c0 = __PKHBT(a, b, 16);
00979 
00980 #else
00981 
00982         c0 = __PKHBT(b, a, 16);;
00983 
00984 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
00985 
00986         /* acc0 +=  x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
00987         acc0 = __SMLADX(x2, c0, acc0);
00988 
00989         /* acc1 +=  x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
00990         acc1 = __SMLADX(x3, c0, acc1);
00991 
00992         /* Read x[4], x[5], x[6] */
00993       a = *(px + 2);
00994       b = *(px + 3);
00995 
00996 #ifndef ARM_MATH_BIG_ENDIAN
00997     
00998       x0 = __PKHBT(a, b, 16);
00999       a = *(px + 4);
01000       x1 = __PKHBT(b, a, 16);
01001 
01002 #else
01003 
01004       x0 = __PKHBT(b, a, 16);
01005       a = *(px + 4);
01006       x1 = __PKHBT(a, b, 16);
01007 
01008 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN    */
01009 
01010         px += 4u;
01011 
01012         /* acc2 +=  x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
01013         acc2 = __SMLADX(x0, c0, acc2);
01014 
01015         /* acc3 +=  x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
01016         acc3 = __SMLADX(x1, c0, acc3);
01017 
01018       } while(--k);
01019 
01020       /* For the next MAC operations, SIMD is not used   
01021        * So, the 16 bit pointer of inputB, py is updated */
01022 
01023       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.   
01024        ** No loop unrolling is used. */
01025       k = srcBLen % 0x4u;
01026 
01027       if(k == 1u)
01028       {
01029         /* Read y[srcBLen - 5] */
01030         c0 = *(py+1);
01031 
01032 #ifdef  ARM_MATH_BIG_ENDIAN
01033 
01034         c0 = c0 << 16u;
01035 
01036 #else
01037 
01038         c0 = c0 & 0x0000FFFF;
01039 
01040 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
01041 
01042         /* Read x[7] */
01043         a = *px;
01044         b = *(px+1);
01045         px++;
01046 
01047 #ifndef ARM_MATH_BIG_ENDIAN
01048 
01049         x3 = __PKHBT(a, b, 16);
01050 
01051 #else
01052 
01053         x3 = __PKHBT(b, a, 16);;
01054 
01055 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
01056 
01057 
01058         /* Perform the multiply-accumulates */
01059         acc0 = __SMLAD(x0, c0, acc0);
01060         acc1 = __SMLAD(x1, c0, acc1);
01061         acc2 = __SMLADX(x1, c0, acc2);
01062         acc3 = __SMLADX(x3, c0, acc3);
01063       }
01064 
01065       if(k == 2u)
01066       {
01067         /* Read y[srcBLen - 5], y[srcBLen - 6] */
01068         a = *py;
01069         b = *(py+1);
01070 
01071 #ifndef ARM_MATH_BIG_ENDIAN
01072 
01073         c0 = __PKHBT(a, b, 16);
01074 
01075 #else
01076 
01077         c0 = __PKHBT(b, a, 16);;
01078 
01079 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
01080 
01081         /* Read x[7], x[8], x[9] */
01082       a = *px;
01083       b = *(px + 1);
01084 
01085 #ifndef ARM_MATH_BIG_ENDIAN
01086     
01087       x3 = __PKHBT(a, b, 16);
01088       a = *(px + 2);
01089       x2 = __PKHBT(b, a, 16);
01090 
01091 #else
01092 
01093       x3 = __PKHBT(b, a, 16);
01094       a = *(px + 2);
01095       x2 = __PKHBT(a, b, 16);
01096 
01097 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN    */
01098         px += 2u;
01099 
01100         /* Perform the multiply-accumulates */
01101         acc0 = __SMLADX(x0, c0, acc0);
01102         acc1 = __SMLADX(x1, c0, acc1);
01103         acc2 = __SMLADX(x3, c0, acc2);
01104         acc3 = __SMLADX(x2, c0, acc3);
01105       }
01106 
01107       if(k == 3u)
01108       {
01109         /* Read y[srcBLen - 5], y[srcBLen - 6] */
01110         a = *py;
01111         b = *(py+1);
01112 
01113 #ifndef ARM_MATH_BIG_ENDIAN
01114 
01115         c0 = __PKHBT(a, b, 16);
01116 
01117 #else
01118 
01119         c0 = __PKHBT(b, a, 16);;
01120 
01121 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
01122 
01123         /* Read x[7], x[8], x[9] */
01124       a = *px;
01125       b = *(px + 1);
01126 
01127 #ifndef ARM_MATH_BIG_ENDIAN
01128     
01129       x3 = __PKHBT(a, b, 16);
01130       a = *(px + 2);
01131       x2 = __PKHBT(b, a, 16);
01132 
01133 #else
01134 
01135       x3 = __PKHBT(b, a, 16);
01136       a = *(px + 2);
01137       x2 = __PKHBT(a, b, 16);
01138 
01139 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN    */
01140 
01141         /* Perform the multiply-accumulates */
01142         acc0 = __SMLADX(x0, c0, acc0);
01143         acc1 = __SMLADX(x1, c0, acc1);
01144         acc2 = __SMLADX(x3, c0, acc2);
01145         acc3 = __SMLADX(x2, c0, acc3);
01146 
01147         /* Read y[srcBLen - 7] */
01148         c0 = *(py-1);
01149 #ifdef  ARM_MATH_BIG_ENDIAN
01150 
01151         c0 = c0 << 16u;
01152 #else
01153 
01154         c0 = c0 & 0x0000FFFF;
01155 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
01156 
01157         /* Read x[10] */
01158         a = *(px+2);
01159         b = *(px+3);
01160 
01161 #ifndef ARM_MATH_BIG_ENDIAN
01162 
01163         x3 = __PKHBT(a, b, 16);
01164 
01165 #else
01166 
01167         x3 = __PKHBT(b, a, 16);;
01168 
01169 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
01170 
01171         px += 3u;
01172 
01173         /* Perform the multiply-accumulates */
01174         acc0 = __SMLADX(x1, c0, acc0);
01175         acc1 = __SMLAD(x2, c0, acc1);
01176         acc2 = __SMLADX(x2, c0, acc2);
01177         acc3 = __SMLADX(x3, c0, acc3);
01178       }
01179 
01180       /* Store the results in the accumulators in the destination buffer. */
01181       *pOut++ = (q15_t)(acc0 >> 15);
01182       *pOut++ = (q15_t)(acc1 >> 15);
01183       *pOut++ = (q15_t)(acc2 >> 15);
01184       *pOut++ = (q15_t)(acc3 >> 15);
01185 
01186       /* Increment the pointer pIn1 index, count by 4 */
01187       count += 4u;
01188 
01189       /* Update the inputA and inputB pointers for next MAC calculation */
01190       px = pIn1 + count;
01191       py = pSrc2;
01192 
01193       /* Decrement the loop counter */
01194       blkCnt--;
01195     }
01196 
01197     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.   
01198      ** No loop unrolling is used. */
01199     blkCnt = blockSize2 % 0x4u;
01200 
01201     while(blkCnt > 0u)
01202     {
01203       /* Accumulator is made zero for every iteration */
01204       sum = 0;
01205 
01206       /* Apply loop unrolling and compute 4 MACs simultaneously. */
01207       k = srcBLen >> 2u;
01208 
01209       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
01210        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
01211       while(k > 0u)
01212       {
01213         /* Perform the multiply-accumulates */
01214         sum += ((q31_t) * px++ * *py--);
01215         sum += ((q31_t) * px++ * *py--);
01216         sum += ((q31_t) * px++ * *py--);
01217         sum += ((q31_t) * px++ * *py--);
01218 
01219         /* Decrement the loop counter */
01220         k--;
01221       }
01222 
01223       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.   
01224        ** No loop unrolling is used. */
01225       k = srcBLen % 0x4u;
01226 
01227       while(k > 0u)
01228       {
01229         /* Perform the multiply-accumulates */
01230         sum += ((q31_t) * px++ * *py--);
01231 
01232         /* Decrement the loop counter */
01233         k--;
01234       }
01235 
01236       /* Store the result in the accumulator in the destination buffer. */
01237       *pOut++ = (q15_t) (sum >> 15);
01238 
01239       /* Increment the pointer pIn1 index, count by 1 */
01240       count++;
01241 
01242       /* Update the inputA and inputB pointers for next MAC calculation */
01243       px = pIn1 + count;
01244       py = pSrc2;
01245 
01246       /* Decrement the loop counter */
01247       blkCnt--;
01248     }
01249   }
01250   else
01251   {
01252     /* If srcBLen is less than 4,   
01253      * the blockSize2 loop cannot be unrolled by 4 */
01254     blkCnt = blockSize2;
01255 
01256     while(blkCnt > 0u)
01257     {
01258       /* Accumulator is made zero for every iteration */
01259       sum = 0;
01260 
01261       /* srcBLen number of MACS should be performed */
01262       k = srcBLen;
01263 
01264       while(k > 0u)
01265       {
01266         /* Perform the multiply-accumulate */
01267         sum += ((q31_t) * px++ * *py--);
01268 
01269         /* Decrement the loop counter */
01270         k--;
01271       }
01272 
01273       /* Store the result in the accumulator in the destination buffer. */
01274       *pOut++ = (q15_t) (sum >> 15);
01275 
01276       /* Increment the MAC count */
01277       count++;
01278 
01279       /* Update the inputA and inputB pointers for next MAC calculation */
01280       px = pIn1 + count;
01281       py = pSrc2;
01282 
01283       /* Decrement the loop counter */
01284       blkCnt--;
01285     }
01286   }
01287 
01288 
01289   /* --------------------------   
01290    * Initializations of stage3   
01291    * -------------------------*/
01292 
01293   /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]   
01294    * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]   
01295    * ....   
01296    * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]   
01297    * sum +=  x[srcALen-1] * y[srcBLen-1]   
01298    */
01299 
01300   /* In this stage the MAC operations are decreased by 1 for every iteration.   
01301      The blockSize3 variable holds the number of MAC operations performed */
01302 
01303   /* Working pointer of inputA */
01304   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
01305   px = pSrc1;
01306 
01307   /* Working pointer of inputB */
01308   pSrc2 = pIn2 + (srcBLen - 1u);
01309   pIn2 = pSrc2 - 1u;
01310   py = pIn2;
01311 
01312   /* -------------------   
01313    * Stage3 process   
01314    * ------------------*/
01315 
01316   /* For loop unrolling by 4, this stage is divided into two. */
01317   /* First part of this stage computes the first (blockSize3 / 4) output samples with the MAC loop unrolled by 4 */
01318   /* Second part of this stage computes the remaining output samples one MAC at a time */
01319 
01320   /* The first part of the stage starts here */
01321   j = blockSize3 >> 2u;
01322 
01323   while((j > 0u) && (blockSize3 > 0u))
01324   {
01325     /* Accumulator is made zero for every iteration */
01326     sum = 0;
01327 
01328     /* Apply loop unrolling and compute 4 MACs simultaneously. */
01329     k = blockSize3 >> 2u;
01330 
01331     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
01332      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
01333     py++;
01334 
01335     while(k > 0u)
01336     {   
01337         sum += ((q31_t) * px++ * *py--);
01338         sum += ((q31_t) * px++ * *py--);
01339         sum += ((q31_t) * px++ * *py--);
01340         sum += ((q31_t) * px++ * *py--);
01341       /* Decrement the loop counter */
01342       k--;
01343     }
01344 
01345     /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.   
01346      ** No loop unrolling is used. */
01347     k = blockSize3 % 0x4u;
01348 
01349     while(k > 0u)
01350     {
01351       /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
01352         sum += ((q31_t) * px++ * *py--);
01353 
01354       /* Decrement the loop counter */
01355       k--;
01356     }
01357 
01358     /* Store the result in the accumulator in the destination buffer. */
01359     *pOut++ = (q15_t) (sum >> 15);
01360 
01361     /* Update the inputA and inputB pointers for next MAC calculation */
01362     px = ++pSrc1;
01363     py = pIn2;
01364 
01365     /* Decrement the loop counter */
01366     blockSize3--;
01367 
01368     j--;
01369   }
01370 
01371   /* The second part of the stage starts here */
01372   /* SIMD is not used for the next MAC operations,   
01373    * so pointer py is updated to read only one sample at a time */
01374   py = py + 1u;
01375 
01376   while(blockSize3 > 0u)
01377   {
01378     /* Accumulator is made zero for every iteration */
01379     sum = 0;
01380 
01381     /* blockSize3 MACs are performed for this output sample, without loop unrolling */
01382     k = blockSize3;
01383 
01384     while(k > 0u)
01385     {
01386       /* Perform the multiply-accumulates */
01387       /* sum +=  x[srcALen-1] * y[srcBLen-1] */
01388         sum += ((q31_t) * px++ * *py--);
01389 
01390       /* Decrement the loop counter */
01391       k--;
01392     }
01393 
01394     /* Store the result in the accumulator in the destination buffer. */
01395     *pOut++ = (q15_t) (sum >> 15);
01396 
01397     /* Update the inputA and inputB pointers for next MAC calculation */
01398     px = ++pSrc1;
01399     py = pSrc2;
01400 
01401     /* Decrement the loop counter */
01402     blockSize3--;
01403   }
01404 
01405 #endif  /*  #ifndef UNALIGNED_SUPPORT_DISABLE   */
01406 }
01407 
01408 /**   
01409  * @} end of Conv group   
01410  */