CMSIS DSP library

Dependents:   KL25Z_FFT_Demo Hat_Board_v5_1 KL25Z_FFT_Demo_tony KL25Z_FFT_Demo_tony ... more

Fork of mbed-dsp by mbed official

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_q15.c Source File

arm_conv_q15.c

00001 /* ----------------------------------------------------------------------   
00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.   
00003 *   
00004 * $Date:        17. January 2013
00005 * $Revision:    V1.4.1
00006 *   
00007 * Project:      CMSIS DSP Library   
00008 * Title:        arm_conv_q15.c   
00009 *   
00010 * Description:  Convolution of Q15 sequences.     
00011 *   
00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
00013 *  
00014 * Redistribution and use in source and binary forms, with or without 
00015 * modification, are permitted provided that the following conditions
00016 * are met:
00017 *   - Redistributions of source code must retain the above copyright
00018 *     notice, this list of conditions and the following disclaimer.
00019 *   - Redistributions in binary form must reproduce the above copyright
00020 *     notice, this list of conditions and the following disclaimer in
00021 *     the documentation and/or other materials provided with the 
00022 *     distribution.
00023 *   - Neither the name of ARM LIMITED nor the names of its contributors
00024 *     may be used to endorse or promote products derived from this
00025 *     software without specific prior written permission.
00026 *
00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00038 * POSSIBILITY OF SUCH DAMAGE.  
00039 * -------------------------------------------------------------------- */
00040 
00041 #include "arm_math.h"
00042 
00043 /**   
00044  * @ingroup groupFilters   
00045  */
00046 
00047 /**   
00048  * @addtogroup Conv   
00049  * @{   
00050  */
00051 
00052 /**
00053  * @brief Convolution of Q15 sequences.
00054  * @param[in] *pSrcA points to the first input sequence.
00055  * @param[in] srcALen length of the first input sequence.
00056  * @param[in] *pSrcB points to the second input sequence.
00057  * @param[in] srcBLen length of the second input sequence.
00058  * @param[out] *pDst points to the location where the output result is written.  Length srcALen+srcBLen-1.
00059  * @return none.
00060  *
00061  * @details
00062  * <b>Scaling and Overflow Behavior:</b>
00063  *
00064  * \par
00065  * The function is implemented using a 64-bit internal accumulator.
00066  * Both inputs are in 1.15 format and multiplications yield a 2.30 result.
00067  * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
00068  * This approach provides 33 guard bits and there is no risk of overflow.
00069  * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
00070  *
00071  * \par
00072  * Refer to <code>arm_conv_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
00073  *
00074  * \par
00075  * Refer to the function <code>arm_conv_opt_q15()</code> for a faster implementation of this function using scratch buffers.
00076  * NOTE(review): the length arithmetic below is unsigned (e.g. srcBLen - 1u), so zero-length inputs do not appear to be supported - confirm with callers.
00077  */
00078 
00079 void arm_conv_q15(
00080   q15_t * pSrcA,
00081   uint32_t srcALen,
00082   q15_t * pSrcB,
00083   uint32_t srcBLen,
00084   q15_t * pDst)
00085 {
00086 
00087 #if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE)
00088 
00089   /* Run the below code for Cortex-M4 and Cortex-M3 when UNALIGNED_SUPPORT_DISABLE is not defined */
00090 
00091   q15_t *pIn1;                                   /* inputA pointer (the longer sequence) */
00092   q15_t *pIn2;                                   /* inputB pointer (the shorter sequence) */
00093   q15_t *pOut = pDst;                            /* output pointer */
00094   q63_t sum, acc0, acc1, acc2, acc3;             /* 64-bit accumulators */
00095   q15_t *px;                                     /* Intermediate inputA pointer  */
00096   q15_t *py;                                     /* Intermediate inputB pointer  */
00097   q15_t *pSrc1, *pSrc2;                          /* Intermediate pointers */
00098   q31_t x0, x1, x2, x3, c0;                      /* Temporary variables to hold state and coefficient values */
00099   uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt;     /* loop counters */
00100 
00101   /* The algorithm implementation is based on the lengths of the inputs. */
00102   /* srcB is always made to slide across srcA. */
00103   /* So srcBLen is always considered as shorter or equal to srcALen */
00104   if(srcALen >= srcBLen)
00105   {
00106     /* Initialization of inputA pointer */
00107     pIn1 = pSrcA;
00108 
00109     /* Initialization of inputB pointer */
00110     pIn2 = pSrcB;
00111   }
00112   else
00113   {
00114     /* srcB is the longer sequence, so it becomes inputA */
00115     pIn1 = pSrcB;
00116 
00117     /* srcA is the shorter sequence, so it becomes inputB */
00118     pIn2 = pSrcA;
00119 
00120     /* Swap the lengths (j is the temporary) so that srcBLen holds the shorter length */
00121     j = srcBLen;
00122     srcBLen = srcALen;
00123     srcALen = j;
00124   }
00125 
00126   /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
00127   /* The function is internally
00128    * divided into three stages according to the number of multiplications that have to
00129    * take place between inputA samples and inputB samples. In the first stage of the
00130    * algorithm, the multiplications increase by one for every iteration.
00131    * In the second stage of the algorithm, srcBLen number of multiplications are done.
00132    * In the third stage of the algorithm, the multiplications decrease by one
00133    * for every iteration. */
00134 
00135   /* The algorithm is implemented in three stages.
00136      The loop counters of stage 1 and stage 2 are initialized here. */
00137   blockSize1 = srcBLen - 1u;
00138   blockSize2 = srcALen - (srcBLen - 1u);
00139 
00140   /* --------------------------
00141    * Initializations of stage1
00142    * -------------------------*/
00143 
00144   /* sum = x[0] * y[0]
00145    * sum = x[0] * y[1] + x[1] * y[0]
00146    * ....
00147    * sum = x[0] * y[srcBLen - 2] + x[1] * y[srcBLen - 3] +...+ x[srcBLen - 2] * y[0]
00148    */
00149 
00150   /* In this stage the MAC operations are increased by 1 for every iteration.
00151      The count variable holds the number of MAC operations performed */
00152   count = 1u;
00153 
00154   /* Working pointer of inputA */
00155   px = pIn1;
00156 
00157   /* Working pointer of inputB */
00158   py = pIn2;
00159 
00160 
00161   /* ------------------------
00162    * Stage1 process
00163    * ----------------------*/
00164 
00165   /* For loop unrolling by 4, this stage is divided into two. */
00166   /* First part of this stage computes the MAC operations less than 4 */
00167   /* Second part of this stage computes the MAC operations greater than or equal to 4 */
00168 
00169   /* The first part of the stage starts here */
00170   while((count < 4u) && (blockSize1 > 0u))
00171   {
00172     /* Accumulator is made zero for every iteration */
00173     sum = 0;
00174 
00175     /* Loop over number of MAC operations between
00176      * inputA samples and inputB samples */
00177     k = count;
00178 
00179     while(k > 0u)
00180     {
00181       /* Perform the multiply-accumulates */
00182       sum = __SMLALD(*px++, *py--, sum);
00183 
00184       /* Decrement the loop counter */
00185       k--;
00186     }
00187 
00188     /* Truncate the accumulator by 15 bits, saturate to 16 bits and store it in the destination buffer. */
00189     *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
00190 
00191     /* Update the inputA and inputB pointers for next MAC calculation */
00192     py = pIn2 + count;
00193     px = pIn1;
00194 
00195     /* Increment the MAC count */
00196     count++;
00197 
00198     /* Decrement the loop counter */
00199     blockSize1--;
00200   }
00201 
00202   /* The second part of the stage starts here */
00203   /* The internal loop, over count, is unrolled by 4 */
00204   /* To read two inputB samples at a time using SIMD
00205    * (y[count - 1] and y[count - 2]), py is decremented by 1 */
00206   py = py - 1;
00207 
00208   while(blockSize1 > 0u)
00209   {
00210     /* Accumulator is made zero for every iteration */
00211     sum = 0;
00212 
00213     /* Apply loop unrolling and compute 4 MACs per iteration. */
00214     k = count >> 2u;
00215 
00216     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00217      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00218     while(k > 0u)
00219     {
00220       /* Perform the multiply-accumulates */
00221       /* x[0], x[1] are multiplied with y[count - 1], y[count - 2] respectively */
00222       sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00223       /* x[2], x[3] are multiplied with y[count - 3], y[count - 4] respectively */
00224       sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00225 
00226       /* Decrement the loop counter */
00227       k--;
00228     }
00229 
00230     /* For the next MAC operations, the pointer py is used without SIMD
00231      * So, py is incremented by 1 */
00232     py = py + 1u;
00233 
00234     /* If the count is not a multiple of 4, compute any remaining MACs here.
00235      ** No loop unrolling is used. */
00236     k = count % 0x4u;
00237 
00238     while(k > 0u)
00239     {
00240       /* Perform the multiply-accumulates */
00241       sum = __SMLALD(*px++, *py--, sum);
00242 
00243       /* Decrement the loop counter */
00244       k--;
00245     }
00246 
00247     /* Truncate the accumulator by 15 bits, saturate to 16 bits and store it in the destination buffer. */
00248     *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
00249 
00250     /* Update the inputA and inputB pointers for next MAC calculation.
00251      * py is set one sample below the next starting sample, ready for the SIMD reads */
00251     py = pIn2 + (count - 1u);
00252     px = pIn1;
00253 
00254     /* Increment the MAC count */
00255     count++;
00256 
00257     /* Decrement the loop counter */
00258     blockSize1--;
00259   }
00260 
00261   /* --------------------------
00262    * Initializations of stage2
00263    * ------------------------*/
00264 
00265   /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
00266    * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
00267    * ....
00268    * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
00269    */
00270 
00271   /* Working pointer of inputA */
00272   px = pIn1;
00273 
00274   /* Working pointer of inputB: starts at the last inputB sample */
00275   pSrc2 = pIn2 + (srcBLen - 1u);
00276   py = pSrc2;
00277 
00278   /* count is the index by which the pointer pIn1 is to be advanced */
00279   count = 0u;
00280 
00281 
00282   /* --------------------
00283    * Stage2 process
00284    * -------------------*/
00285 
00286   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
00287    * So, to loop unroll over blockSize2,
00288    * srcBLen should be greater than or equal to 4 */
00289   if(srcBLen >= 4u)
00290   {
00291     /* Loop unroll over blockSize2, by 4 */
00292     blkCnt = blockSize2 >> 2u;
00293 
00294     while(blkCnt > 0u)
00295     {
00296       py = py - 1u;
00297 
00298       /* Set all accumulators to zero */
00299       acc0 = 0;
00300       acc1 = 0;
00301       acc2 = 0;
00302       acc3 = 0;
00303 
00304 
00305       /* read x[0], x[1] samples */
00306       x0 = *__SIMD32(px);
00307       /* read x[1], x[2] samples */
00308       x1 = _SIMD32_OFFSET(px+1);
00309       px+= 2u;
00310 
00311 
00312       /* Apply loop unrolling and compute 4 MACs per accumulator in each iteration. */
00313       k = srcBLen >> 2u;
00314 
00315       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00316        ** The do-while runs at least once; k is non-zero here because srcBLen >= 4. */
00317       do
00318       {
00319         /* Read the last two inputB samples using SIMD:
00320          * y[srcBLen - 1] and y[srcBLen - 2] */
00321         c0 = *__SIMD32(py)--;
00322 
00323         /* acc0 +=  x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
00324         acc0 = __SMLALDX(x0, c0, acc0);
00325 
00326         /* acc1 +=  x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
00327         acc1 = __SMLALDX(x1, c0, acc1);
00328 
00329         /* Read x[2], x[3] */
00330         x2 = *__SIMD32(px);
00331 
00332         /* Read x[3], x[4] */
00333         x3 = _SIMD32_OFFSET(px+1);
00334 
00335         /* acc2 +=  x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
00336         acc2 = __SMLALDX(x2, c0, acc2);
00337 
00338         /* acc3 +=  x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
00339         acc3 = __SMLALDX(x3, c0, acc3);
00340 
00341         /* Read y[srcBLen - 3] and y[srcBLen - 4] */
00342         c0 = *__SIMD32(py)--;
00343 
00344         /* acc0 +=  x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
00345         acc0 = __SMLALDX(x2, c0, acc0);
00346 
00347         /* acc1 +=  x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
00348         acc1 = __SMLALDX(x3, c0, acc1);
00349 
00350         /* Read x[4], x[5] */
00351         x0 = _SIMD32_OFFSET(px+2);
00352 
00353         /* Read x[5], x[6] */
00354         x1 = _SIMD32_OFFSET(px+3);
00355         px += 4u;
00356 
00357         /* acc2 +=  x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
00358         acc2 = __SMLALDX(x0, c0, acc2);
00359 
00360         /* acc3 +=  x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
00361         acc3 = __SMLALDX(x1, c0, acc3);
00362 
00363       } while(--k);
00364 
00365       /* The leftover inputB samples are read relative to py below;
00366        * py itself is not modified again until it is reset for the next output group */
00367 
00368       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
00369        ** No loop is used: each remainder (1, 2 or 3) has its own case. */
00370       k = srcBLen % 0x4u;
00371 
00372       if(k == 1u)
00373       {
00374         /* Read y[srcBLen - 5] */
00375         c0 = *(py+1);
00376 
00377 #ifdef  ARM_MATH_BIG_ENDIAN
00378 
00379         c0 = c0 << 16u;
00380 
00381 #else
00382 
00383         c0 = c0 & 0x0000FFFF;
00384 
00385 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
00386         /* Read x[6], x[7] */
00387         x3 = *__SIMD32(px);
00388         px++;
00389 
00390         /* Perform the multiply-accumulates (only one 16-bit half of c0 is non-zero) */
00391         acc0 = __SMLALD(x0, c0, acc0);
00392         acc1 = __SMLALD(x1, c0, acc1);
00393         acc2 = __SMLALDX(x1, c0, acc2);
00394         acc3 = __SMLALDX(x3, c0, acc3);
00395       }
00396 
00397       if(k == 2u)
00398       {
00399         /* Read y[srcBLen - 5], y[srcBLen - 6] */
00400         c0 = _SIMD32_OFFSET(py);
00401 
00402         /* Read x[6], x[7] */
00403         x3 = *__SIMD32(px);
00404 
00405         /* Read x[7], x[8] */
00406         x2 = _SIMD32_OFFSET(px+1);
00407         px += 2u;
00408 
00409         /* Perform the multiply-accumulates */
00410         acc0 = __SMLALDX(x0, c0, acc0);
00411         acc1 = __SMLALDX(x1, c0, acc1);
00412         acc2 = __SMLALDX(x3, c0, acc2);
00413         acc3 = __SMLALDX(x2, c0, acc3);
00414       }
00415 
00416       if(k == 3u)
00417       {
00418         /* Read y[srcBLen - 5], y[srcBLen - 6] */
00419         c0 = _SIMD32_OFFSET(py);
00420 
00421         /* Read x[6], x[7] */
00422         x3 = *__SIMD32(px);
00423 
00424         /* Read x[7], x[8] */
00425         x2 = _SIMD32_OFFSET(px+1);
00426 
00427         /* Perform the multiply-accumulates */
00428         acc0 = __SMLALDX(x0, c0, acc0);
00429         acc1 = __SMLALDX(x1, c0, acc1);
00430         acc2 = __SMLALDX(x3, c0, acc2);
00431         acc3 = __SMLALDX(x2, c0, acc3);
00432 
00433         c0 = *(py-1);                            /* Read y[srcBLen - 7] */
00434 
00435 #ifdef  ARM_MATH_BIG_ENDIAN
00436 
00437         c0 = c0 << 16u;
00438 #else
00439 
00440         c0 = c0 & 0x0000FFFF;
00441 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
00442         /* Read x[8], x[9] */
00443         x3 =  _SIMD32_OFFSET(px+2);
00444         px += 3u;
00445 
00446         /* Perform the multiply-accumulates (only one 16-bit half of c0 is non-zero) */
00447         acc0 = __SMLALDX(x1, c0, acc0);
00448         acc1 = __SMLALD(x2, c0, acc1);
00449         acc2 = __SMLALDX(x2, c0, acc2);
00450         acc3 = __SMLALDX(x3, c0, acc3);
00451       }
00452 
00453 
00454       /* Truncate and saturate the four accumulators, then store them two at a time; the packing order depends on endianness. */
00455 
00456 #ifndef  ARM_MATH_BIG_ENDIAN
00457 
00458       *__SIMD32(pOut)++ =
00459         __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
00460       *__SIMD32(pOut)++ =
00461         __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
00462 
00463 #else
00464 
00465       *__SIMD32(pOut)++ =
00466         __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
00467       *__SIMD32(pOut)++ =
00468         __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
00469 
00470 #endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */
00471 
00472       /* Increment the pointer pIn1 index, count by 4 */
00473       count += 4u;
00474 
00475       /* Update the inputA and inputB pointers for next MAC calculation */
00476       px = pIn1 + count;
00477       py = pSrc2;
00478 
00479       /* Decrement the loop counter */
00480       blkCnt--;
00481     }
00482 
00483     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
00484      ** One output sample is produced per iteration. */
00485     blkCnt = blockSize2 % 0x4u;
00486 
00487     while(blkCnt > 0u)
00488     {
00489       /* Accumulator is made zero for every iteration */
00490       sum = 0;
00491 
00492       /* Apply loop unrolling and compute 4 MACs per iteration. */
00493       k = srcBLen >> 2u;
00494 
00495       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00496        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00497       while(k > 0u)
00498       {
00499         /* Perform the multiply-accumulates (plain 16x16 products, no SIMD) */
00500         sum += (q63_t) ((q31_t) * px++ * *py--);
00501         sum += (q63_t) ((q31_t) * px++ * *py--);
00502         sum += (q63_t) ((q31_t) * px++ * *py--);
00503         sum += (q63_t) ((q31_t) * px++ * *py--);
00504 
00505         /* Decrement the loop counter */
00506         k--;
00507       }
00508 
00509       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
00510        ** No loop unrolling is used. */
00511       k = srcBLen % 0x4u;
00512 
00513       while(k > 0u)
00514       {
00515         /* Perform the multiply-accumulates */
00516         sum += (q63_t) ((q31_t) * px++ * *py--);
00517 
00518         /* Decrement the loop counter */
00519         k--;
00520       }
00521 
00522       /* Truncate the accumulator by 15 bits, saturate to 16 bits and store it in the destination buffer. */
00523       *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
00524 
00525       /* Increment the pointer pIn1 index, count by 1 */
00526       count++;
00527 
00528       /* Update the inputA and inputB pointers for next MAC calculation */
00529       px = pIn1 + count;
00530       py = pSrc2;
00531 
00532       /* Decrement the loop counter */
00533       blkCnt--;
00534     }
00535   }
00536   else
00537   {
00538     /* srcBLen is less than 4 here,
00539      * so the blockSize2 loop is not unrolled by 4 */
00540     blkCnt = blockSize2;
00541 
00542     while(blkCnt > 0u)
00543     {
00544       /* Accumulator is made zero for every iteration */
00545       sum = 0;
00546 
00547       /* srcBLen number of MACS should be performed */
00548       k = srcBLen;
00549 
00550       while(k > 0u)
00551       {
00552         /* Perform the multiply-accumulate */
00553         sum += (q63_t) ((q31_t) * px++ * *py--);
00554 
00555         /* Decrement the loop counter */
00556         k--;
00557       }
00558 
00559       /* Truncate the accumulator by 15 bits, saturate to 16 bits and store it in the destination buffer. */
00560       *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
00561 
00562       /* Increment the pointer pIn1 index, count by 1 */
00563       count++;
00564 
00565       /* Update the inputA and inputB pointers for next MAC calculation */
00566       px = pIn1 + count;
00567       py = pSrc2;
00568 
00569       /* Decrement the loop counter */
00570       blkCnt--;
00571     }
00572   }
00573 
00574 
00575   /* --------------------------
00576    * Initializations of stage3
00577    * -------------------------*/
00578 
00579   /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
00580    * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
00581    * ....
00582    * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
00583    * sum +=  x[srcALen-1] * y[srcBLen-1]
00584    */
00585 
00586   /* In this stage the MAC operations are decreased by 1 for every iteration.
00587      The blockSize3 variable holds the number of MAC operations performed */
00588 
00589   blockSize3 = srcBLen - 1u;
00590 
00591   /* Working pointer of inputA */
00592   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
00593   px = pSrc1;
00594 
00595   /* Working pointer of inputB: pIn2 is set one sample below the last inputB sample, ready for the SIMD reads */
00596   pSrc2 = pIn2 + (srcBLen - 1u);
00597   pIn2 = pSrc2 - 1u;
00598   py = pIn2;
00599 
00600   /* -------------------
00601    * Stage3 process
00602    * ------------------*/
00603 
00604   /* For loop unrolling by 4, this stage is divided into two. */
00605   /* First part of this stage computes the first (blockSize3 >> 2) output samples using SIMD MACs unrolled by 4 */
00606   /* Second part of this stage computes the remaining output samples one MAC at a time */
00607 
00608   /* The first part of the stage starts here */
00609   j = blockSize3 >> 2u;
00610 
00611   while((j > 0u) && (blockSize3 > 0u))
00612   {
00613     /* Accumulator is made zero for every iteration */
00614     sum = 0;
00615 
00616     /* Apply loop unrolling and compute 4 MACs per iteration. */
00617     k = blockSize3 >> 2u;
00618 
00619     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00620      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00621     while(k > 0u)
00622     {
00623       /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
00624        * with y[srcBLen - 1], y[srcBLen - 2] respectively */
00625       sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00626       /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
00627        * with y[srcBLen - 3], y[srcBLen - 4] respectively */
00628       sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00629 
00630       /* Decrement the loop counter */
00631       k--;
00632     }
00633 
00634     /* For the next MAC operations, the pointer py is used without SIMD
00635      * So, py is incremented by 1 */
00636     py = py + 1u;
00637 
00638     /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
00639      ** No loop unrolling is used. */
00640     k = blockSize3 % 0x4u;
00641 
00642     while(k > 0u)
00643     {
00644       /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
00645       sum = __SMLALD(*px++, *py--, sum);
00646 
00647       /* Decrement the loop counter */
00648       k--;
00649     }
00650 
00651     /* Truncate the accumulator by 15 bits, saturate to 16 bits and store it in the destination buffer. */
00652     *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
00653 
00654     /* Update the inputA and inputB pointers for next MAC calculation */
00655     px = ++pSrc1;
00656     py = pIn2;
00657 
00658     /* Decrement the loop counter */
00659     blockSize3--;
00660 
00661     j--;
00662   }
00663 
00664   /* The second part of the stage starts here */
00665   /* SIMD is not used for the next MAC operations,
00666    * so pointer py is updated to read only one sample at a time */
00667   py = py + 1u;
00668 
00669   while(blockSize3 > 0u)
00670   {
00671     /* Accumulator is made zero for every iteration */
00672     sum = 0;
00673 
00674     /* blockSize3 MACs are performed for this output sample; no loop unrolling is used. */
00675     k = blockSize3;
00676 
00677     while(k > 0u)
00678     {
00679       /* Perform the multiply-accumulates */
00680       /* sum +=  x[srcALen-1] * y[srcBLen-1] */
00681       sum = __SMLALD(*px++, *py--, sum);
00682 
00683       /* Decrement the loop counter */
00684       k--;
00685     }
00686 
00687     /* Truncate the accumulator by 15 bits, saturate to 16 bits and store it in the destination buffer. */
00688     *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
00689 
00690     /* Update the inputA and inputB pointers for next MAC calculation */
00691     px = ++pSrc1;
00692     py = pSrc2;
00693 
00694     /* Decrement the loop counter */
00695     blockSize3--;
00696   }
00697 
00698 #else
00699 
00700 /* Run the below code for Cortex-M0, or when UNALIGNED_SUPPORT_DISABLE is defined */
00701 
00702   q15_t *pIn1 = pSrcA;                           /* first input sequence pointer */
00703   q15_t *pIn2 = pSrcB;                           /* second input sequence pointer */
00704   q63_t sum;                                     /* Accumulator */
00705   uint32_t i, j;                                 /* loop counters */
00706 
00707   /* Loop to calculate output of convolution for output length number of times */
00708   for (i = 0; i < (srcALen + srcBLen - 1); i++)
00709   {
00710     /* Initialize sum with zero to carry on MAC operations */
00711     sum = 0;
00712 
00713     /* Loop to perform MAC operations according to convolution equation */
00714     for (j = 0; j <= i; j++)
00715     {
00716       /* Skip index pairs that fall outside either input sequence */
00717       if(((i - j) < srcBLen) && (j < srcALen))
00718       {
00719         /* z[i] += x[j] * y[i-j] */
00720         sum += (q31_t) pIn1[j] * (pIn2[i - j]);
00721       }
00722     }
00723 
00724     /* Truncate by 15 bits, saturate to 16 bits and store the output in the destination buffer */
00725     pDst[i] = (q15_t) __SSAT((sum >> 15u), 16u);
00726   }
00727 
00728 #endif /*  #if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE)*/
00729 
00730 }
00731 
00732 /**   
00733  * @} end of Conv group   
00734  */