CMSIS DSP library

Dependents:   performance_timer Surfboard_ gps2rtty Capstone ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_partial_q15.c Source File

arm_conv_partial_q15.c

00001 /* ----------------------------------------------------------------------   
00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.   
00003 *   
00004 * $Date:        19. March 2015
00005 * $Revision:    V.1.4.5
00006 *   
00007 * Project:      CMSIS DSP Library   
00008 * Title:        arm_conv_partial_q15.c   
00009 *   
00010 * Description:  Partial convolution of Q15 sequences.  
00011 *   
00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
00013 * 
00014 * Redistribution and use in source and binary forms, with or without 
00015 * modification, are permitted provided that the following conditions
00016 * are met:
00017 *   - Redistributions of source code must retain the above copyright
00018 *     notice, this list of conditions and the following disclaimer.
00019 *   - Redistributions in binary form must reproduce the above copyright
00020 *     notice, this list of conditions and the following disclaimer in
00021 *     the documentation and/or other materials provided with the 
00022 *     distribution.
00023 *   - Neither the name of ARM LIMITED nor the names of its contributors
00024 *     may be used to endorse or promote products derived from this
00025 *     software without specific prior written permission.
00026 *
00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00038 * POSSIBILITY OF SUCH DAMAGE.  
00039 * -------------------------------------------------------------------- */
00040 
00041 #include "arm_math.h"
00042 
00043 /**   
00044  * @ingroup groupFilters   
00045  */
00046 
00047 /**   
00048  * @addtogroup PartialConv   
00049  * @{   
00050  */
00051 
00052 /**   
00053  * @brief Partial convolution of Q15 sequences.   
00054  * @param[in]       *pSrcA points to the first input sequence.   
00055  * @param[in]       srcALen length of the first input sequence.   
00056  * @param[in]       *pSrcB points to the second input sequence.   
00057  * @param[in]       srcBLen length of the second input sequence.   
00058  * @param[out]      *pDst points to the location where the output result is written.   
00059  * @param[in]       firstIndex is the first output sample to start with.   
00060  * @param[in]       numPoints is the number of output points to be computed.   
00061  * @return  Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].   
00062  *   
00063  * Refer to <code>arm_conv_partial_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.  
00064  * 
00065  * \par    
00066  * Refer the function <code>arm_conv_partial_opt_q15()</code> for a faster implementation of this function using scratch buffers.
00067  * 
00068  */
00069 
00070 
00071 arm_status arm_conv_partial_q15(
00072   q15_t * pSrcA,
00073   uint32_t srcALen,
00074   q15_t * pSrcB,
00075   uint32_t srcBLen,
00076   q15_t * pDst,
00077   uint32_t firstIndex,
00078   uint32_t numPoints)
00079 {
00080 
00081 #if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE)
00082 
00083   /* Run the below code for Cortex-M4 and Cortex-M3 */
00084 
00085   q15_t *pIn1;                                   /* inputA pointer               */
00086   q15_t *pIn2;                                   /* inputB pointer               */
00087   q15_t *pOut = pDst;                            /* output pointer               */
00088   q63_t sum, acc0, acc1, acc2, acc3;             /* Accumulator                  */
00089   q15_t *px;                                     /* Intermediate inputA pointer  */
00090   q15_t *py;                                     /* Intermediate inputB pointer  */
00091   q15_t *pSrc1, *pSrc2;                          /* Intermediate pointers        */
00092   q31_t x0, x1, x2, x3, c0;                      /* Temporary input variables */
00093   uint32_t j, k, count, check, blkCnt;
00094   int32_t blockSize1, blockSize2, blockSize3;    /* loop counter                 */
00095   arm_status status;                             /* status of Partial convolution */
00096 
00097   /* Check for range of output samples to be calculated */
00098   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
00099   {
00100     /* Set status as ARM_MATH_ARGUMENT_ERROR */
00101     status = ARM_MATH_ARGUMENT_ERROR;
00102   }
00103   else
00104   {
00105 
00106     /* The algorithm implementation is based on the lengths of the inputs. */
00107     /* srcB is always made to slide across srcA. */
00108     /* So srcBLen is always considered as shorter or equal to srcALen */
00109     if(srcALen >= srcBLen)
00110     {
00111       /* Initialization of inputA pointer */
00112       pIn1 = pSrcA;
00113 
00114       /* Initialization of inputB pointer */
00115       pIn2 = pSrcB;
00116     }
00117     else
00118     {
00119       /* Initialization of inputA pointer */
00120       pIn1 = pSrcB;
00121 
00122       /* Initialization of inputB pointer */
00123       pIn2 = pSrcA;
00124 
00125       /* srcBLen is always considered as shorter or equal to srcALen */
00126       j = srcBLen;
00127       srcBLen = srcALen;
00128       srcALen = j;
00129     }
00130 
00131     /* Conditions to check which loopCounter holds   
00132      * the first and last indices of the output samples to be calculated. */
00133     check = firstIndex + numPoints;
00134     blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
00135     blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
00136     blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
00137     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
00138                                      (int32_t) numPoints) : 0;
00139     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
00140                                     (int32_t) firstIndex);
00141     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
00142 
00143     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
00144     /* The function is internally   
00145      * divided into three stages according to the number of multiplications that has to be   
00146      * taken place between inputA samples and inputB samples. In the first stage of the   
00147      * algorithm, the multiplications increase by one for every iteration.   
00148      * In the second stage of the algorithm, srcBLen number of multiplications are done.   
00149      * In the third stage of the algorithm, the multiplications decrease by one   
00150      * for every iteration. */
00151 
00152     /* Set the output pointer to point to the firstIndex   
00153      * of the output sample to be calculated. */
00154     pOut = pDst + firstIndex;
00155 
00156     /* --------------------------   
00157      * Initializations of stage1   
00158      * -------------------------*/
00159 
00160     /* sum = x[0] * y[0]   
00161      * sum = x[0] * y[1] + x[1] * y[0]   
00162      * ....   
00163      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]   
00164      */
00165 
00166     /* In this stage the MAC operations are increased by 1 for every iteration.   
00167        The count variable holds the number of MAC operations performed.   
00168        Since the partial convolution starts from firstIndex   
00169        Number of Macs to be performed is firstIndex + 1 */
00170     count = 1u + firstIndex;
00171 
00172     /* Working pointer of inputA */
00173     px = pIn1;
00174 
00175     /* Working pointer of inputB */
00176     pSrc2 = pIn2 + firstIndex;
00177     py = pSrc2;
00178 
00179     /* ------------------------   
00180      * Stage1 process   
00181      * ----------------------*/
00182 
00183     /* For loop unrolling by 4, this stage is divided into two. */
00184     /* First part of this stage computes the MAC operations less than 4 */
00185     /* Second part of this stage computes the MAC operations greater than or equal to 4 */
00186 
00187     /* The first part of the stage starts here */
00188     while((count < 4u) && (blockSize1 > 0))
00189     {
00190       /* Accumulator is made zero for every iteration */
00191       sum = 0;
00192 
00193       /* Loop over number of MAC operations between   
00194        * inputA samples and inputB samples */
00195       k = count;
00196 
00197       while(k > 0u)
00198       {
00199         /* Perform the multiply-accumulates */
00200         sum = __SMLALD(*px++, *py--, sum);
00201 
00202         /* Decrement the loop counter */
00203         k--;
00204       }
00205 
00206       /* Store the result in the accumulator in the destination buffer. */
00207       *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
00208 
00209       /* Update the inputA and inputB pointers for next MAC calculation */
00210       py = ++pSrc2;
00211       px = pIn1;
00212 
00213       /* Increment the MAC count */
00214       count++;
00215 
00216       /* Decrement the loop counter */
00217       blockSize1--;
00218     }
00219 
00220     /* The second part of the stage starts here */
00221     /* The internal loop, over count, is unrolled by 4 */
00222     /* To, read the last two inputB samples using SIMD:   
00223      * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
00224     py = py - 1;
00225 
00226     while(blockSize1 > 0)
00227     {
00228       /* Accumulator is made zero for every iteration */
00229       sum = 0;
00230 
00231       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00232       k = count >> 2u;
00233 
00234       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00235        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00236       while(k > 0u)
00237       {
00238         /* Perform the multiply-accumulates */
00239         /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
00240         sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00241         /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
00242         sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00243 
00244         /* Decrement the loop counter */
00245         k--;
00246       }
00247 
00248       /* For the next MAC operations, the pointer py is used without SIMD   
00249        * So, py is incremented by 1 */
00250       py = py + 1u;
00251 
00252       /* If the count is not a multiple of 4, compute any remaining MACs here.   
00253        ** No loop unrolling is used. */
00254       k = count % 0x4u;
00255 
00256       while(k > 0u)
00257       {
00258         /* Perform the multiply-accumulates */
00259         sum = __SMLALD(*px++, *py--, sum);
00260 
00261         /* Decrement the loop counter */
00262         k--;
00263       }
00264 
00265       /* Store the result in the accumulator in the destination buffer. */
00266       *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
00267 
00268       /* Update the inputA and inputB pointers for next MAC calculation */
00269       py = ++pSrc2 - 1u;
00270       px = pIn1;
00271 
00272       /* Increment the MAC count */
00273       count++;
00274 
00275       /* Decrement the loop counter */
00276       blockSize1--;
00277     }
00278 
00279     /* --------------------------   
00280      * Initializations of stage2   
00281      * ------------------------*/
00282 
00283     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]   
00284      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]   
00285      * ....   
00286      * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]   
00287      */
00288 
00289     /* Working pointer of inputA */
00290     if((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
00291     {
00292       px = pIn1 + firstIndex - srcBLen + 1;
00293     }
00294     else
00295     {
00296       px = pIn1;
00297     }
00298 
00299     /* Working pointer of inputB */
00300     pSrc2 = pIn2 + (srcBLen - 1u);
00301     py = pSrc2;
00302 
00303   /* count is the index by which the pointer pIn1 to be incremented */
00304   count = 0u;
00305 
00306 
00307   /* --------------------   
00308    * Stage2 process   
00309    * -------------------*/
00310 
00311   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.   
00312    * So, to loop unroll over blockSize2,   
00313    * srcBLen should be greater than or equal to 4 */
00314   if(srcBLen >= 4u)
00315   {
00316     /* Loop unroll over blockSize2, by 4 */
00317     blkCnt = blockSize2 >> 2u;
00318 
00319     while(blkCnt > 0u)
00320     {
00321       py = py - 1u;
00322 
00323       /* Set all accumulators to zero */
00324       acc0 = 0;
00325       acc1 = 0;
00326       acc2 = 0;
00327       acc3 = 0;
00328 
00329 
00330       /* read x[0], x[1] samples */
00331       x0 = *__SIMD32(px);
00332       /* read x[1], x[2] samples */
00333       x1 = _SIMD32_OFFSET(px+1);
00334       px+= 2u;
00335 
00336 
00337       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00338       k = srcBLen >> 2u;
00339 
00340       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00341        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00342       do
00343       {
00344         /* Read the last two inputB samples using SIMD:   
00345          * y[srcBLen - 1] and y[srcBLen - 2] */
00346         c0 = *__SIMD32(py)--;
00347 
00348         /* acc0 +=  x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
00349         acc0 = __SMLALDX(x0, c0, acc0);
00350 
00351         /* acc1 +=  x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
00352         acc1 = __SMLALDX(x1, c0, acc1);
00353 
00354         /* Read x[2], x[3] */
00355         x2 = *__SIMD32(px);
00356 
00357         /* Read x[3], x[4] */
00358         x3 = _SIMD32_OFFSET(px+1);
00359 
00360         /* acc2 +=  x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
00361         acc2 = __SMLALDX(x2, c0, acc2);
00362 
00363         /* acc3 +=  x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
00364         acc3 = __SMLALDX(x3, c0, acc3);
00365 
00366         /* Read y[srcBLen - 3] and y[srcBLen - 4] */
00367         c0 = *__SIMD32(py)--;
00368 
00369         /* acc0 +=  x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
00370         acc0 = __SMLALDX(x2, c0, acc0);
00371 
00372         /* acc1 +=  x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
00373         acc1 = __SMLALDX(x3, c0, acc1);
00374 
00375         /* Read x[4], x[5] */
00376         x0 = _SIMD32_OFFSET(px+2);
00377 
00378         /* Read x[5], x[6] */
00379         x1 = _SIMD32_OFFSET(px+3);
00380         px += 4u;
00381 
00382         /* acc2 +=  x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
00383         acc2 = __SMLALDX(x0, c0, acc2);
00384 
00385         /* acc3 +=  x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
00386         acc3 = __SMLALDX(x1, c0, acc3);
00387 
00388       } while(--k);
00389 
00390       /* For the next MAC operations, SIMD is not used   
00391        * So, the 16 bit pointer if inputB, py is updated */
00392 
00393       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.   
00394        ** No loop unrolling is used. */
00395       k = srcBLen % 0x4u;
00396 
00397       if(k == 1u)
00398       {
00399         /* Read y[srcBLen - 5] */
00400         c0 = *(py+1);
00401 
00402 #ifdef  ARM_MATH_BIG_ENDIAN
00403 
00404         c0 = c0 << 16u;
00405 
00406 #else
00407 
00408         c0 = c0 & 0x0000FFFF;
00409 
00410 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
00411 
00412         /* Read x[7] */
00413         x3 = *__SIMD32(px);
00414         px++;
00415 
00416         /* Perform the multiply-accumulates */
00417         acc0 = __SMLALD(x0, c0, acc0);
00418         acc1 = __SMLALD(x1, c0, acc1);
00419         acc2 = __SMLALDX(x1, c0, acc2);
00420         acc3 = __SMLALDX(x3, c0, acc3);
00421       }
00422 
00423       if(k == 2u)
00424       {
00425         /* Read y[srcBLen - 5], y[srcBLen - 6] */
00426         c0 = _SIMD32_OFFSET(py);
00427 
00428         /* Read x[7], x[8] */
00429         x3 = *__SIMD32(px);
00430 
00431         /* Read x[9] */
00432         x2 = _SIMD32_OFFSET(px+1);
00433         px += 2u;
00434 
00435         /* Perform the multiply-accumulates */
00436         acc0 = __SMLALDX(x0, c0, acc0);
00437         acc1 = __SMLALDX(x1, c0, acc1);
00438         acc2 = __SMLALDX(x3, c0, acc2);
00439         acc3 = __SMLALDX(x2, c0, acc3);
00440       }
00441 
00442       if(k == 3u)
00443       {
00444         /* Read y[srcBLen - 5], y[srcBLen - 6] */
00445         c0 = _SIMD32_OFFSET(py);
00446 
00447         /* Read x[7], x[8] */
00448         x3 = *__SIMD32(px);
00449 
00450         /* Read x[9] */
00451         x2 = _SIMD32_OFFSET(px+1);
00452 
00453         /* Perform the multiply-accumulates */
00454         acc0 = __SMLALDX(x0, c0, acc0);
00455         acc1 = __SMLALDX(x1, c0, acc1);
00456         acc2 = __SMLALDX(x3, c0, acc2);
00457         acc3 = __SMLALDX(x2, c0, acc3);
00458 
00459         c0 = *(py-1);
00460 
00461 #ifdef  ARM_MATH_BIG_ENDIAN
00462 
00463         c0 = c0 << 16u;
00464 #else
00465 
00466         c0 = c0 & 0x0000FFFF;
00467 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
00468 
00469         /* Read x[10] */
00470         x3 =  _SIMD32_OFFSET(px+2);
00471         px += 3u;
00472 
00473         /* Perform the multiply-accumulates */
00474         acc0 = __SMLALDX(x1, c0, acc0);
00475         acc1 = __SMLALD(x2, c0, acc1);
00476         acc2 = __SMLALDX(x2, c0, acc2);
00477         acc3 = __SMLALDX(x3, c0, acc3);
00478       }
00479 
00480 
00481       /* Store the results in the accumulators in the destination buffer. */
00482 
00483 #ifndef  ARM_MATH_BIG_ENDIAN
00484 
00485       *__SIMD32(pOut)++ =
00486         __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
00487       *__SIMD32(pOut)++ =
00488         __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
00489 
00490 #else
00491 
00492       *__SIMD32(pOut)++ =
00493         __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
00494       *__SIMD32(pOut)++ =
00495         __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
00496 
00497 #endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */
00498 
00499       /* Increment the pointer pIn1 index, count by 4 */
00500       count += 4u;
00501 
00502       /* Update the inputA and inputB pointers for next MAC calculation */
00503       px = pIn1 + count;
00504       py = pSrc2;
00505 
00506         /* Decrement the loop counter */
00507         blkCnt--;
00508       }
00509 
00510       /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.   
00511        ** No loop unrolling is used. */
00512       blkCnt = (uint32_t) blockSize2 % 0x4u;
00513       
00514       while(blkCnt > 0u)
00515       {
00516         /* Accumulator is made zero for every iteration */
00517         sum = 0;
00518 
00519         /* Apply loop unrolling and compute 4 MACs simultaneously. */
00520         k = srcBLen >> 2u;
00521 
00522         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00523          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00524         while(k > 0u)
00525         {
00526           /* Perform the multiply-accumulates */
00527           sum += (q63_t) ((q31_t) * px++ * *py--);
00528           sum += (q63_t) ((q31_t) * px++ * *py--);
00529           sum += (q63_t) ((q31_t) * px++ * *py--);
00530           sum += (q63_t) ((q31_t) * px++ * *py--);
00531 
00532           /* Decrement the loop counter */
00533           k--;
00534         }
00535 
00536         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.   
00537          ** No loop unrolling is used. */
00538         k = srcBLen % 0x4u;
00539 
00540         while(k > 0u)
00541         {
00542           /* Perform the multiply-accumulates */
00543           sum += (q63_t) ((q31_t) * px++ * *py--);
00544 
00545           /* Decrement the loop counter */
00546           k--;
00547         }
00548 
00549         /* Store the result in the accumulator in the destination buffer. */
00550         *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
00551 
00552         /* Increment the pointer pIn1 index, count by 1 */
00553         count++;
00554 
00555         /* Update the inputA and inputB pointers for next MAC calculation */
00556         px = pIn1 + count;
00557         py = pSrc2;
00558 
00559         /* Decrement the loop counter */
00560         blkCnt--;
00561       }
00562     }
00563     else
00564     {
00565       /* If the srcBLen is not a multiple of 4,   
00566        * the blockSize2 loop cannot be unrolled by 4 */
00567       blkCnt = (uint32_t) blockSize2;
00568 
00569       while(blkCnt > 0u)
00570       {
00571         /* Accumulator is made zero for every iteration */
00572         sum = 0;
00573 
00574         /* srcBLen number of MACS should be performed */
00575         k = srcBLen;
00576 
00577         while(k > 0u)
00578         {
00579           /* Perform the multiply-accumulate */
00580           sum += (q63_t) ((q31_t) * px++ * *py--);
00581 
00582           /* Decrement the loop counter */
00583           k--;
00584         }
00585 
00586         /* Store the result in the accumulator in the destination buffer. */
00587         *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
00588 
00589         /* Increment the MAC count */
00590         count++;
00591 
00592         /* Update the inputA and inputB pointers for next MAC calculation */
00593         px = pIn1 + count;
00594         py = pSrc2;
00595   
00596         /* Decrement the loop counter */
00597         blkCnt--;
00598       }
00599     }
00600 
00601 
00602     /* --------------------------   
00603      * Initializations of stage3   
00604      * -------------------------*/
00605 
00606     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]   
00607      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]   
00608      * ....   
00609      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]   
00610      * sum +=  x[srcALen-1] * y[srcBLen-1]   
00611      */
00612 
00613     /* In this stage the MAC operations are decreased by 1 for every iteration.   
00614        The count variable holds the number of MAC operations performed */
00615     count = srcBLen - 1u;
00616 
00617     /* Working pointer of inputA */
00618     pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
00619     px = pSrc1;
00620 
00621     /* Working pointer of inputB */
00622     pSrc2 = pIn2 + (srcBLen - 1u);
00623     pIn2 = pSrc2 - 1u;
00624     py = pIn2;
00625 
00626     /* -------------------   
00627      * Stage3 process   
00628      * ------------------*/
00629 
00630     /* For loop unrolling by 4, this stage is divided into two. */
00631     /* First part of this stage computes the MAC operations greater than 4 */
00632     /* Second part of this stage computes the MAC operations less than or equal to 4 */
00633 
00634     /* The first part of the stage starts here */
00635     j = count >> 2u;
00636 
00637     while((j > 0u) && (blockSize3 > 0))
00638     {
00639       /* Accumulator is made zero for every iteration */
00640       sum = 0;
00641 
00642       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00643       k = count >> 2u;
00644 
00645       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00646        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00647       while(k > 0u)
00648       {
00649         /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied   
00650          * with y[srcBLen - 1], y[srcBLen - 2] respectively */
00651         sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00652         /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied   
00653          * with y[srcBLen - 3], y[srcBLen - 4] respectively */
00654         sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00655 
00656         /* Decrement the loop counter */
00657         k--;
00658       }
00659 
00660       /* For the next MAC operations, the pointer py is used without SIMD   
00661        * So, py is incremented by 1 */
00662       py = py + 1u;
00663 
00664       /* If the count is not a multiple of 4, compute any remaining MACs here.   
00665        ** No loop unrolling is used. */
00666       k = count % 0x4u;
00667 
00668       while(k > 0u)
00669       {
00670         /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
00671         sum = __SMLALD(*px++, *py--, sum);
00672 
00673         /* Decrement the loop counter */
00674         k--;
00675       }
00676 
00677       /* Store the result in the accumulator in the destination buffer. */
00678       *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
00679 
00680       /* Update the inputA and inputB pointers for next MAC calculation */
00681       px = ++pSrc1;
00682       py = pIn2;
00683 
00684       /* Decrement the MAC count */
00685       count--;
00686 
00687       /* Decrement the loop counter */
00688       blockSize3--;
00689 
00690       j--;
00691     }
00692 
00693     /* The second part of the stage starts here */
00694     /* SIMD is not used for the next MAC operations,   
00695      * so pointer py is updated to read only one sample at a time */
00696     py = py + 1u;
00697 
00698     while(blockSize3 > 0)
00699     {
00700       /* Accumulator is made zero for every iteration */
00701       sum = 0;
00702 
00703       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00704       k = count;
00705 
00706       while(k > 0u)
00707       {
00708         /* Perform the multiply-accumulates */
00709         /* sum +=  x[srcALen-1] * y[srcBLen-1] */
00710         sum = __SMLALD(*px++, *py--, sum);
00711 
00712         /* Decrement the loop counter */
00713         k--;
00714       }
00715 
00716       /* Store the result in the accumulator in the destination buffer. */
00717       *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
00718 
00719       /* Update the inputA and inputB pointers for next MAC calculation */
00720       px = ++pSrc1;
00721       py = pSrc2;
00722 
00723       /* Decrement the MAC count */
00724       count--;
00725 
00726       /* Decrement the loop counter */
00727       blockSize3--;
00728     }
00729 
00730     /* set status as ARM_MATH_SUCCESS */
00731     status = ARM_MATH_SUCCESS;
00732   }
00733 
00734   /* Return to application */
00735   return (status);
00736 
00737 #else
00738 
00739   /* Run the below code for Cortex-M0 */
00740 
00741   q15_t *pIn1 = pSrcA;                           /* inputA pointer */
00742   q15_t *pIn2 = pSrcB;                           /* inputB pointer */
00743   q63_t sum;                                     /* Accumulator */
00744   uint32_t i, j;                                 /* loop counters */
00745   arm_status status;                             /* status of Partial convolution */
00746 
00747   /* Check for range of output samples to be calculated */
00748   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
00749   {
00750     /* Set status as ARM_ARGUMENT_ERROR */
00751     status = ARM_MATH_ARGUMENT_ERROR;
00752   }
00753   else
00754   {
00755     /* Loop to calculate convolution for output length number of values */
00756     for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
00757     {
00758       /* Initialize sum with zero to carry on MAC operations */
00759       sum = 0;
00760 
00761       /* Loop to perform MAC operations according to convolution equation */
00762       for (j = 0; j <= i; j++)
00763       {
00764         /* Check the array limitations */
00765         if(((i - j) < srcBLen) && (j < srcALen))
00766         {
00767           /* z[i] += x[i-j] * y[j] */
00768           sum += ((q31_t) pIn1[j] * (pIn2[i - j]));
00769         }
00770       }
00771 
00772       /* Store the output in the destination buffer */
00773       pDst[i] = (q15_t) __SSAT((sum >> 15u), 16u);
00774     }
00775     /* set status as ARM_SUCCESS as there are no argument errors */
00776     status = ARM_MATH_SUCCESS;
00777   }
00778   return (status);
00779 
00780 #endif /* #if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE)  */
00781 
00782 }
00783 
00784 /**   
00785  * @} end of PartialConv group   
00786  */