CMSIS DSP library

Dependents:   KL25Z_FFT_Demo Hat_Board_v5_1 KL25Z_FFT_Demo_tony KL25Z_FFT_Demo_tony ... more

Fork of mbed-dsp by mbed official

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_partial_q15.c Source File

arm_conv_partial_q15.c

00001 /* ----------------------------------------------------------------------   
00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.   
00003 *   
00004 * $Date:        17. January 2013
00005 * $Revision:    V1.4.1
00006 *   
00007 * Project:      CMSIS DSP Library   
00008 * Title:        arm_conv_partial_q15.c   
00009 *   
00010 * Description:  Partial convolution of Q15 sequences.  
00011 *   
00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
00013 * 
00014 * Redistribution and use in source and binary forms, with or without 
00015 * modification, are permitted provided that the following conditions
00016 * are met:
00017 *   - Redistributions of source code must retain the above copyright
00018 *     notice, this list of conditions and the following disclaimer.
00019 *   - Redistributions in binary form must reproduce the above copyright
00020 *     notice, this list of conditions and the following disclaimer in
00021 *     the documentation and/or other materials provided with the 
00022 *     distribution.
00023 *   - Neither the name of ARM LIMITED nor the names of its contributors
00024 *     may be used to endorse or promote products derived from this
00025 *     software without specific prior written permission.
00026 *
00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00038 * POSSIBILITY OF SUCH DAMAGE.  
00039 * -------------------------------------------------------------------- */
00040 
00041 #include "arm_math.h"
00042 
00043 /**   
00044  * @ingroup groupFilters   
00045  */
00046 
00047 /**   
00048  * @addtogroup PartialConv   
00049  * @{   
00050  */
00051 
00052 /**   
00053  * @brief Partial convolution of Q15 sequences.   
00054  * @param[in]       *pSrcA points to the first input sequence.   
00055  * @param[in]       srcALen length of the first input sequence.   
00056  * @param[in]       *pSrcB points to the second input sequence.   
00057  * @param[in]       srcBLen length of the second input sequence.   
00058  * @param[out]      *pDst points to the location where the output result is written.   
00059  * @param[in]       firstIndex is the first output sample to start with.   
00060  * @param[in]       numPoints is the number of output points to be computed.   
00061  * @return  Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].   
00062  *   
00063  * Refer to <code>arm_conv_partial_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.  
00064  * 
00065  * \par    
00066  * Refer the function <code>arm_conv_partial_opt_q15()</code> for a faster implementation of this function using scratch buffers.
00067  * 
00068  */
00069 
00070 
00071 arm_status arm_conv_partial_q15(
00072   q15_t * pSrcA,
00073   uint32_t srcALen,
00074   q15_t * pSrcB,
00075   uint32_t srcBLen,
00076   q15_t * pDst,
00077   uint32_t firstIndex,
00078   uint32_t numPoints)
00079 {
00080 
00081 #if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE)
00082 
00083   /* Run the below code for Cortex-M4 and Cortex-M3 */
00084 
00085   q15_t *pIn1;                                   /* inputA pointer               */
00086   q15_t *pIn2;                                   /* inputB pointer               */
00087   q15_t *pOut = pDst;                            /* output pointer               */
00088   q63_t sum, acc0, acc1, acc2, acc3;             /* Accumulator                  */
00089   q15_t *px;                                     /* Intermediate inputA pointer  */
00090   q15_t *py;                                     /* Intermediate inputB pointer  */
00091   q15_t *pSrc1, *pSrc2;                          /* Intermediate pointers        */
00092   q31_t x0, x1, x2, x3, c0;                      /* Temporary input variables */
00093   uint32_t j, k, count, check, blkCnt;
00094   int32_t blockSize1, blockSize2, blockSize3;    /* loop counter                 */
00095   arm_status status;                             /* status of Partial convolution */
00096 
00097   /* Check for range of output samples to be calculated */
00098   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
00099   {
00100     /* Set status as ARM_MATH_ARGUMENT_ERROR */
00101     status = ARM_MATH_ARGUMENT_ERROR;
00102   }
00103   else
00104   {
00105 
00106     /* The algorithm implementation is based on the lengths of the inputs. */
00107     /* srcB is always made to slide across srcA. */
00108     /* So srcBLen is always considered as shorter or equal to srcALen */
00109     if(srcALen >= srcBLen)
00110     {
00111       /* Initialization of inputA pointer */
00112       pIn1 = pSrcA;
00113 
00114       /* Initialization of inputB pointer */
00115       pIn2 = pSrcB;
00116     }
00117     else
00118     {
00119       /* Initialization of inputA pointer */
00120       pIn1 = pSrcB;
00121 
00122       /* Initialization of inputB pointer */
00123       pIn2 = pSrcA;
00124 
00125       /* srcBLen is always considered as shorter or equal to srcALen */
00126       j = srcBLen;
00127       srcBLen = srcALen;
00128       srcALen = j;
00129     }
00130 
00131     /* Conditions to check which loopCounter holds   
00132      * the first and last indices of the output samples to be calculated. */
00133     check = firstIndex + numPoints;
00134     blockSize3 = ((int32_t) check - (int32_t) srcALen);
00135     blockSize3 = (blockSize3 > 0) ? blockSize3 : 0;
00136     blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
00137     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
00138                                      (int32_t) numPoints) : 0;
00139     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
00140                                     (int32_t) firstIndex);
00141     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
00142 
00143     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
00144     /* The function is internally   
00145      * divided into three stages according to the number of multiplications that has to be   
00146      * taken place between inputA samples and inputB samples. In the first stage of the   
00147      * algorithm, the multiplications increase by one for every iteration.   
00148      * In the second stage of the algorithm, srcBLen number of multiplications are done.   
00149      * In the third stage of the algorithm, the multiplications decrease by one   
00150      * for every iteration. */
00151 
00152     /* Set the output pointer to point to the firstIndex   
00153      * of the output sample to be calculated. */
00154     pOut = pDst + firstIndex;
00155 
00156     /* --------------------------   
00157      * Initializations of stage1   
00158      * -------------------------*/
00159 
00160     /* sum = x[0] * y[0]   
00161      * sum = x[0] * y[1] + x[1] * y[0]   
00162      * ....   
00163      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]   
00164      */
00165 
00166     /* In this stage the MAC operations are increased by 1 for every iteration.   
00167        The count variable holds the number of MAC operations performed.   
00168        Since the partial convolution starts from firstIndex   
00169        Number of Macs to be performed is firstIndex + 1 */
00170     count = 1u + firstIndex;
00171 
00172     /* Working pointer of inputA */
00173     px = pIn1;
00174 
00175     /* Working pointer of inputB */
00176     pSrc2 = pIn2 + firstIndex;
00177     py = pSrc2;
00178 
00179     /* ------------------------   
00180      * Stage1 process   
00181      * ----------------------*/
00182 
00183     /* For loop unrolling by 4, this stage is divided into two. */
00184     /* First part of this stage computes the MAC operations less than 4 */
00185     /* Second part of this stage computes the MAC operations greater than or equal to 4 */
00186 
00187     /* The first part of the stage starts here */
00188     while((count < 4u) && (blockSize1 > 0))
00189     {
00190       /* Accumulator is made zero for every iteration */
00191       sum = 0;
00192 
00193       /* Loop over number of MAC operations between   
00194        * inputA samples and inputB samples */
00195       k = count;
00196 
00197       while(k > 0u)
00198       {
00199         /* Perform the multiply-accumulates */
00200         sum = __SMLALD(*px++, *py--, sum);
00201 
00202         /* Decrement the loop counter */
00203         k--;
00204       }
00205 
00206       /* Store the result in the accumulator in the destination buffer. */
00207       *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
00208 
00209       /* Update the inputA and inputB pointers for next MAC calculation */
00210       py = ++pSrc2;
00211       px = pIn1;
00212 
00213       /* Increment the MAC count */
00214       count++;
00215 
00216       /* Decrement the loop counter */
00217       blockSize1--;
00218     }
00219 
00220     /* The second part of the stage starts here */
00221     /* The internal loop, over count, is unrolled by 4 */
00222     /* To, read the last two inputB samples using SIMD:   
00223      * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
00224     py = py - 1;
00225 
00226     while(blockSize1 > 0)
00227     {
00228       /* Accumulator is made zero for every iteration */
00229       sum = 0;
00230 
00231       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00232       k = count >> 2u;
00233 
00234       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00235        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00236       while(k > 0u)
00237       {
00238         /* Perform the multiply-accumulates */
00239         /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
00240         sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00241         /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
00242         sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00243 
00244         /* Decrement the loop counter */
00245         k--;
00246       }
00247 
00248       /* For the next MAC operations, the pointer py is used without SIMD   
00249        * So, py is incremented by 1 */
00250       py = py + 1u;
00251 
00252       /* If the count is not a multiple of 4, compute any remaining MACs here.   
00253        ** No loop unrolling is used. */
00254       k = count % 0x4u;
00255 
00256       while(k > 0u)
00257       {
00258         /* Perform the multiply-accumulates */
00259         sum = __SMLALD(*px++, *py--, sum);
00260 
00261         /* Decrement the loop counter */
00262         k--;
00263       }
00264 
00265       /* Store the result in the accumulator in the destination buffer. */
00266       *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
00267 
00268       /* Update the inputA and inputB pointers for next MAC calculation */
00269       py = ++pSrc2 - 1u;
00270       px = pIn1;
00271 
00272       /* Increment the MAC count */
00273       count++;
00274 
00275       /* Decrement the loop counter */
00276       blockSize1--;
00277     }
00278 
00279     /* --------------------------   
00280      * Initializations of stage2   
00281      * ------------------------*/
00282 
00283     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]   
00284      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]   
00285      * ....   
00286      * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]   
00287      */
00288 
00289     /* Working pointer of inputA */
00290     px = pIn1;
00291 
00292     /* Working pointer of inputB */
00293     pSrc2 = pIn2 + (srcBLen - 1u);
00294     py = pSrc2;
00295 
00296   /* count is the index by which the pointer pIn1 to be incremented */
00297   count = 0u;
00298 
00299 
00300   /* --------------------   
00301    * Stage2 process   
00302    * -------------------*/
00303 
00304   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.   
00305    * So, to loop unroll over blockSize2,   
00306    * srcBLen should be greater than or equal to 4 */
00307   if(srcBLen >= 4u)
00308   {
00309     /* Loop unroll over blockSize2, by 4 */
00310     blkCnt = blockSize2 >> 2u;
00311 
00312     while(blkCnt > 0u)
00313     {
00314       py = py - 1u;
00315 
00316       /* Set all accumulators to zero */
00317       acc0 = 0;
00318       acc1 = 0;
00319       acc2 = 0;
00320       acc3 = 0;
00321 
00322 
00323       /* read x[0], x[1] samples */
00324       x0 = *__SIMD32(px);
00325       /* read x[1], x[2] samples */
00326       x1 = _SIMD32_OFFSET(px+1);
00327       px+= 2u;
00328 
00329 
00330       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00331       k = srcBLen >> 2u;
00332 
00333       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00334        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00335       do
00336       {
00337         /* Read the last two inputB samples using SIMD:   
00338          * y[srcBLen - 1] and y[srcBLen - 2] */
00339         c0 = *__SIMD32(py)--;
00340 
00341         /* acc0 +=  x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
00342         acc0 = __SMLALDX(x0, c0, acc0);
00343 
00344         /* acc1 +=  x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
00345         acc1 = __SMLALDX(x1, c0, acc1);
00346 
00347         /* Read x[2], x[3] */
00348         x2 = *__SIMD32(px);
00349 
00350         /* Read x[3], x[4] */
00351         x3 = _SIMD32_OFFSET(px+1);
00352 
00353         /* acc2 +=  x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
00354         acc2 = __SMLALDX(x2, c0, acc2);
00355 
00356         /* acc3 +=  x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
00357         acc3 = __SMLALDX(x3, c0, acc3);
00358 
00359         /* Read y[srcBLen - 3] and y[srcBLen - 4] */
00360         c0 = *__SIMD32(py)--;
00361 
00362         /* acc0 +=  x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
00363         acc0 = __SMLALDX(x2, c0, acc0);
00364 
00365         /* acc1 +=  x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
00366         acc1 = __SMLALDX(x3, c0, acc1);
00367 
00368         /* Read x[4], x[5] */
00369         x0 = _SIMD32_OFFSET(px+2);
00370 
00371         /* Read x[5], x[6] */
00372         x1 = _SIMD32_OFFSET(px+3);
00373         px += 4u;
00374 
00375         /* acc2 +=  x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
00376         acc2 = __SMLALDX(x0, c0, acc2);
00377 
00378         /* acc3 +=  x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
00379         acc3 = __SMLALDX(x1, c0, acc3);
00380 
00381       } while(--k);
00382 
00383       /* For the next MAC operations, SIMD is not used   
00384        * So, the 16 bit pointer if inputB, py is updated */
00385 
00386       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.   
00387        ** No loop unrolling is used. */
00388       k = srcBLen % 0x4u;
00389 
00390       if(k == 1u)
00391       {
00392         /* Read y[srcBLen - 5] */
00393         c0 = *(py+1);
00394 
00395 #ifdef  ARM_MATH_BIG_ENDIAN
00396 
00397         c0 = c0 << 16u;
00398 
00399 #else
00400 
00401         c0 = c0 & 0x0000FFFF;
00402 
00403 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
00404 
00405         /* Read x[7] */
00406         x3 = *__SIMD32(px);
00407         px++;
00408 
00409         /* Perform the multiply-accumulates */
00410         acc0 = __SMLALD(x0, c0, acc0);
00411         acc1 = __SMLALD(x1, c0, acc1);
00412         acc2 = __SMLALDX(x1, c0, acc2);
00413         acc3 = __SMLALDX(x3, c0, acc3);
00414       }
00415 
00416       if(k == 2u)
00417       {
00418         /* Read y[srcBLen - 5], y[srcBLen - 6] */
00419         c0 = _SIMD32_OFFSET(py);
00420 
00421         /* Read x[7], x[8] */
00422         x3 = *__SIMD32(px);
00423 
00424         /* Read x[9] */
00425         x2 = _SIMD32_OFFSET(px+1);
00426         px += 2u;
00427 
00428         /* Perform the multiply-accumulates */
00429         acc0 = __SMLALDX(x0, c0, acc0);
00430         acc1 = __SMLALDX(x1, c0, acc1);
00431         acc2 = __SMLALDX(x3, c0, acc2);
00432         acc3 = __SMLALDX(x2, c0, acc3);
00433       }
00434 
00435       if(k == 3u)
00436       {
00437         /* Read y[srcBLen - 5], y[srcBLen - 6] */
00438         c0 = _SIMD32_OFFSET(py);
00439 
00440         /* Read x[7], x[8] */
00441         x3 = *__SIMD32(px);
00442 
00443         /* Read x[9] */
00444         x2 = _SIMD32_OFFSET(px+1);
00445 
00446         /* Perform the multiply-accumulates */
00447         acc0 = __SMLALDX(x0, c0, acc0);
00448         acc1 = __SMLALDX(x1, c0, acc1);
00449         acc2 = __SMLALDX(x3, c0, acc2);
00450         acc3 = __SMLALDX(x2, c0, acc3);
00451 
00452         c0 = *(py-1);
00453 
00454 #ifdef  ARM_MATH_BIG_ENDIAN
00455 
00456         c0 = c0 << 16u;
00457 #else
00458 
00459         c0 = c0 & 0x0000FFFF;
00460 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
00461 
00462         /* Read x[10] */
00463         x3 =  _SIMD32_OFFSET(px+2);
00464         px += 3u;
00465 
00466         /* Perform the multiply-accumulates */
00467         acc0 = __SMLALDX(x1, c0, acc0);
00468         acc1 = __SMLALD(x2, c0, acc1);
00469         acc2 = __SMLALDX(x2, c0, acc2);
00470         acc3 = __SMLALDX(x3, c0, acc3);
00471       }
00472 
00473 
00474       /* Store the results in the accumulators in the destination buffer. */
00475 
00476 #ifndef  ARM_MATH_BIG_ENDIAN
00477 
00478       *__SIMD32(pOut)++ =
00479         __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
00480       *__SIMD32(pOut)++ =
00481         __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
00482 
00483 #else
00484 
00485       *__SIMD32(pOut)++ =
00486         __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
00487       *__SIMD32(pOut)++ =
00488         __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
00489 
00490 #endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */
00491 
00492       /* Increment the pointer pIn1 index, count by 4 */
00493       count += 4u;
00494 
00495       /* Update the inputA and inputB pointers for next MAC calculation */
00496       px = pIn1 + count;
00497       py = pSrc2;
00498 
00499         /* Decrement the loop counter */
00500         blkCnt--;
00501       }
00502 
00503       /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.   
00504        ** No loop unrolling is used. */
00505       blkCnt = (uint32_t) blockSize2 % 0x4u;
00506       
00507       while(blkCnt > 0u)
00508       {
00509         /* Accumulator is made zero for every iteration */
00510         sum = 0;
00511 
00512         /* Apply loop unrolling and compute 4 MACs simultaneously. */
00513         k = srcBLen >> 2u;
00514 
00515         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00516          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00517         while(k > 0u)
00518         {
00519           /* Perform the multiply-accumulates */
00520           sum += (q63_t) ((q31_t) * px++ * *py--);
00521           sum += (q63_t) ((q31_t) * px++ * *py--);
00522           sum += (q63_t) ((q31_t) * px++ * *py--);
00523           sum += (q63_t) ((q31_t) * px++ * *py--);
00524 
00525           /* Decrement the loop counter */
00526           k--;
00527         }
00528 
00529         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.   
00530          ** No loop unrolling is used. */
00531         k = srcBLen % 0x4u;
00532 
00533         while(k > 0u)
00534         {
00535           /* Perform the multiply-accumulates */
00536           sum += (q63_t) ((q31_t) * px++ * *py--);
00537 
00538           /* Decrement the loop counter */
00539           k--;
00540         }
00541 
00542         /* Store the result in the accumulator in the destination buffer. */
00543         *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
00544 
00545         /* Increment the pointer pIn1 index, count by 1 */
00546         count++;
00547 
00548         /* Update the inputA and inputB pointers for next MAC calculation */
00549         px = pIn1 + count;
00550         py = pSrc2;
00551 
00552         /* Decrement the loop counter */
00553         blkCnt--;
00554       }
00555     }
00556     else
00557     {
00558       /* If the srcBLen is not a multiple of 4,   
00559        * the blockSize2 loop cannot be unrolled by 4 */
00560       blkCnt = (uint32_t) blockSize2;
00561 
00562       while(blkCnt > 0u)
00563       {
00564         /* Accumulator is made zero for every iteration */
00565         sum = 0;
00566 
00567         /* srcBLen number of MACS should be performed */
00568         k = srcBLen;
00569 
00570         while(k > 0u)
00571         {
00572           /* Perform the multiply-accumulate */
00573           sum += (q63_t) ((q31_t) * px++ * *py--);
00574 
00575           /* Decrement the loop counter */
00576           k--;
00577         }
00578 
00579         /* Store the result in the accumulator in the destination buffer. */
00580         *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
00581 
00582         /* Increment the MAC count */
00583         count++;
00584 
00585         /* Update the inputA and inputB pointers for next MAC calculation */
00586         px = pIn1 + count;
00587         py = pSrc2;
00588   
00589         /* Decrement the loop counter */
00590         blkCnt--;
00591       }
00592     }
00593 
00594 
00595     /* --------------------------   
00596      * Initializations of stage3   
00597      * -------------------------*/
00598 
00599     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]   
00600      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]   
00601      * ....   
00602      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]   
00603      * sum +=  x[srcALen-1] * y[srcBLen-1]   
00604      */
00605 
00606     /* In this stage the MAC operations are decreased by 1 for every iteration.   
00607        The count variable holds the number of MAC operations performed */
00608     count = srcBLen - 1u;
00609 
00610     /* Working pointer of inputA */
00611     pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
00612     px = pSrc1;
00613 
00614     /* Working pointer of inputB */
00615     pSrc2 = pIn2 + (srcBLen - 1u);
00616     pIn2 = pSrc2 - 1u;
00617     py = pIn2;
00618 
00619     /* -------------------   
00620      * Stage3 process   
00621      * ------------------*/
00622 
00623     /* For loop unrolling by 4, this stage is divided into two. */
00624     /* First part of this stage computes the MAC operations greater than 4 */
00625     /* Second part of this stage computes the MAC operations less than or equal to 4 */
00626 
00627     /* The first part of the stage starts here */
00628     j = count >> 2u;
00629 
00630     while((j > 0u) && (blockSize3 > 0))
00631     {
00632       /* Accumulator is made zero for every iteration */
00633       sum = 0;
00634 
00635       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00636       k = count >> 2u;
00637 
00638       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00639        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00640       while(k > 0u)
00641       {
00642         /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied   
00643          * with y[srcBLen - 1], y[srcBLen - 2] respectively */
00644         sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00645         /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied   
00646          * with y[srcBLen - 3], y[srcBLen - 4] respectively */
00647         sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00648 
00649         /* Decrement the loop counter */
00650         k--;
00651       }
00652 
00653       /* For the next MAC operations, the pointer py is used without SIMD   
00654        * So, py is incremented by 1 */
00655       py = py + 1u;
00656 
00657       /* If the count is not a multiple of 4, compute any remaining MACs here.   
00658        ** No loop unrolling is used. */
00659       k = count % 0x4u;
00660 
00661       while(k > 0u)
00662       {
00663         /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
00664         sum = __SMLALD(*px++, *py--, sum);
00665 
00666         /* Decrement the loop counter */
00667         k--;
00668       }
00669 
00670       /* Store the result in the accumulator in the destination buffer. */
00671       *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
00672 
00673       /* Update the inputA and inputB pointers for next MAC calculation */
00674       px = ++pSrc1;
00675       py = pIn2;
00676 
00677       /* Decrement the MAC count */
00678       count--;
00679 
00680       /* Decrement the loop counter */
00681       blockSize3--;
00682 
00683       j--;
00684     }
00685 
00686     /* The second part of the stage starts here */
00687     /* SIMD is not used for the next MAC operations,   
00688      * so pointer py is updated to read only one sample at a time */
00689     py = py + 1u;
00690 
00691     while(blockSize3 > 0)
00692     {
00693       /* Accumulator is made zero for every iteration */
00694       sum = 0;
00695 
00696       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00697       k = count;
00698 
00699       while(k > 0u)
00700       {
00701         /* Perform the multiply-accumulates */
00702         /* sum +=  x[srcALen-1] * y[srcBLen-1] */
00703         sum = __SMLALD(*px++, *py--, sum);
00704 
00705         /* Decrement the loop counter */
00706         k--;
00707       }
00708 
00709       /* Store the result in the accumulator in the destination buffer. */
00710       *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
00711 
00712       /* Update the inputA and inputB pointers for next MAC calculation */
00713       px = ++pSrc1;
00714       py = pSrc2;
00715 
00716       /* Decrement the MAC count */
00717       count--;
00718 
00719       /* Decrement the loop counter */
00720       blockSize3--;
00721     }
00722 
00723     /* set status as ARM_MATH_SUCCESS */
00724     status = ARM_MATH_SUCCESS;
00725   }
00726 
00727   /* Return to application */
00728   return (status);
00729 
00730 #else
00731 
00732   /* Run the below code for Cortex-M0 */
00733 
00734   q15_t *pIn1 = pSrcA;                           /* inputA pointer */
00735   q15_t *pIn2 = pSrcB;                           /* inputB pointer */
00736   q63_t sum;                                     /* Accumulator */
00737   uint32_t i, j;                                 /* loop counters */
00738   arm_status status;                             /* status of Partial convolution */
00739 
00740   /* Check for range of output samples to be calculated */
00741   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
00742   {
00743     /* Set status as ARM_ARGUMENT_ERROR */
00744     status = ARM_MATH_ARGUMENT_ERROR;
00745   }
00746   else
00747   {
00748     /* Loop to calculate convolution for output length number of values */
00749     for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
00750     {
00751       /* Initialize sum with zero to carry on MAC operations */
00752       sum = 0;
00753 
00754       /* Loop to perform MAC operations according to convolution equation */
00755       for (j = 0; j <= i; j++)
00756       {
00757         /* Check the array limitations */
00758         if(((i - j) < srcBLen) && (j < srcALen))
00759         {
00760           /* z[i] += x[i-j] * y[j] */
00761           sum += ((q31_t) pIn1[j] * (pIn2[i - j]));
00762         }
00763       }
00764 
00765       /* Store the output in the destination buffer */
00766       pDst[i] = (q15_t) __SSAT((sum >> 15u), 16u);
00767     }
00768     /* set status as ARM_SUCCESS as there are no argument errors */
00769     status = ARM_MATH_SUCCESS;
00770   }
00771   return (status);
00772 
00773 #endif /* #if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE)  */
00774 
00775 }
00776 
00777 /**   
00778  * @} end of PartialConv group   
00779  */