CMSIS DSP library

Dependents:   performance_timer Surfboard_ gps2rtty Capstone ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_partial_q7.c Source File

arm_conv_partial_q7.c

00001 /* ----------------------------------------------------------------------   
00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.   
00003 *   
00004 * $Date:        19. March 2015
00005 * $Revision:    V.1.4.5
00006 *   
00007 * Project:      CMSIS DSP Library   
00008 * Title:        arm_conv_partial_q7.c   
00009 *   
00010 * Description:  Partial convolution of Q7 sequences.   
00011 *   
00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
00013 *  
00014 * Redistribution and use in source and binary forms, with or without 
00015 * modification, are permitted provided that the following conditions
00016 * are met:
00017 *   - Redistributions of source code must retain the above copyright
00018 *     notice, this list of conditions and the following disclaimer.
00019 *   - Redistributions in binary form must reproduce the above copyright
00020 *     notice, this list of conditions and the following disclaimer in
00021 *     the documentation and/or other materials provided with the 
00022 *     distribution.
00023 *   - Neither the name of ARM LIMITED nor the names of its contributors
00024 *     may be used to endorse or promote products derived from this
00025 *     software without specific prior written permission.
00026 *
00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00038 * POSSIBILITY OF SUCH DAMAGE.    
00039 * -------------------------------------------------------------------- */
00040 
00041 #include "arm_math.h"
00042 
00043 /**   
00044  * @ingroup groupFilters   
00045  */
00046 
00047 /**   
00048  * @addtogroup PartialConv   
00049  * @{   
00050  */
00051 
00052 /**   
00053  * @brief Partial convolution of Q7 sequences.   
00054  * @param[in]       *pSrcA points to the first input sequence.   
00055  * @param[in]       srcALen length of the first input sequence.   
00056  * @param[in]       *pSrcB points to the second input sequence.   
00057  * @param[in]       srcBLen length of the second input sequence.   
00058  * @param[out]      *pDst points to the location where the output result is written.   
00059  * @param[in]       firstIndex is the first output sample to start with.   
00060  * @param[in]       numPoints is the number of output points to be computed.   
00061  * @return  Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].   
00062  *  
00063  * \par    
00064  * Refer the function <code>arm_conv_partial_opt_q7()</code> for a faster implementation of this function.
00065  *  
00066  */
00067 
00068 arm_status arm_conv_partial_q7(
00069   q7_t * pSrcA,
00070   uint32_t srcALen,
00071   q7_t * pSrcB,
00072   uint32_t srcBLen,
00073   q7_t * pDst,
00074   uint32_t firstIndex,
00075   uint32_t numPoints)
00076 {
00077 
00078 
00079 #ifndef ARM_MATH_CM0_FAMILY
00080 
00081   /* Run the below code for Cortex-M4 and Cortex-M3 */
00082 
00083   q7_t *pIn1;                                    /* inputA pointer */
00084   q7_t *pIn2;                                    /* inputB pointer */
00085   q7_t *pOut = pDst;                             /* output pointer */
00086   q7_t *px;                                      /* Intermediate inputA pointer */
00087   q7_t *py;                                      /* Intermediate inputB pointer */
00088   q7_t *pSrc1, *pSrc2;                           /* Intermediate pointers */
00089   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulator */
00090   q31_t input1, input2;
00091   q15_t in1, in2;
00092   q7_t x0, x1, x2, x3, c0, c1;
00093   uint32_t j, k, count, check, blkCnt;
00094   int32_t blockSize1, blockSize2, blockSize3;    /* loop counter */
00095   arm_status status;
00096 
00097 
00098   /* Check for range of output samples to be calculated */
00099   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
00100   {
00101     /* Set status as ARM_MATH_ARGUMENT_ERROR */
00102     status = ARM_MATH_ARGUMENT_ERROR;
00103   }
00104   else
00105   {
00106 
00107     /* The algorithm implementation is based on the lengths of the inputs. */
00108     /* srcB is always made to slide across srcA. */
00109     /* So srcBLen is always considered as shorter or equal to srcALen */
00110     if(srcALen >= srcBLen)
00111     {
00112       /* Initialization of inputA pointer */
00113       pIn1 = pSrcA;
00114 
00115       /* Initialization of inputB pointer */
00116       pIn2 = pSrcB;
00117     }
00118     else
00119     {
00120       /* Initialization of inputA pointer */
00121       pIn1 = pSrcB;
00122 
00123       /* Initialization of inputB pointer */
00124       pIn2 = pSrcA;
00125 
00126       /* srcBLen is always considered as shorter or equal to srcALen */
00127       j = srcBLen;
00128       srcBLen = srcALen;
00129       srcALen = j;
00130     }
00131 
00132     /* Conditions to check which loopCounter holds   
00133      * the first and last indices of the output samples to be calculated. */
00134     check = firstIndex + numPoints;
00135     blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
00136     blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
00137     blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
00138     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
00139                                      (int32_t) numPoints) : 0;
00140     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
00141                                     (int32_t) firstIndex);
00142     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
00143 
00144     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
00145     /* The function is internally   
00146      * divided into three stages according to the number of multiplications that has to be   
00147      * taken place between inputA samples and inputB samples. In the first stage of the   
00148      * algorithm, the multiplications increase by one for every iteration.   
00149      * In the second stage of the algorithm, srcBLen number of multiplications are done.   
00150      * In the third stage of the algorithm, the multiplications decrease by one   
00151      * for every iteration. */
00152 
00153     /* Set the output pointer to point to the firstIndex   
00154      * of the output sample to be calculated. */
00155     pOut = pDst + firstIndex;
00156 
00157     /* --------------------------   
00158      * Initializations of stage1   
00159      * -------------------------*/
00160 
00161     /* sum = x[0] * y[0]   
00162      * sum = x[0] * y[1] + x[1] * y[0]   
00163      * ....   
00164      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]   
00165      */
00166 
00167     /* In this stage the MAC operations are increased by 1 for every iteration.   
00168        The count variable holds the number of MAC operations performed.   
00169        Since the partial convolution starts from from firstIndex   
00170        Number of Macs to be performed is firstIndex + 1 */
00171     count = 1u + firstIndex;
00172 
00173     /* Working pointer of inputA */
00174     px = pIn1;
00175 
00176     /* Working pointer of inputB */
00177     pSrc2 = pIn2 + firstIndex;
00178     py = pSrc2;
00179 
00180     /* ------------------------   
00181      * Stage1 process   
00182      * ----------------------*/
00183 
00184     /* The first stage starts here */
00185     while(blockSize1 > 0)
00186     {
00187       /* Accumulator is made zero for every iteration */
00188       sum = 0;
00189 
00190       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00191       k = count >> 2u;
00192 
00193       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00194        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00195       while(k > 0u)
00196       {
00197         /* x[0] , x[1] */
00198         in1 = (q15_t) * px++;
00199         in2 = (q15_t) * px++;
00200         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00201 
00202         /* y[srcBLen - 1] , y[srcBLen - 2] */
00203         in1 = (q15_t) * py--;
00204         in2 = (q15_t) * py--;
00205         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00206 
00207         /* x[0] * y[srcBLen - 1] */
00208         /* x[1] * y[srcBLen - 2] */
00209         sum = __SMLAD(input1, input2, sum);
00210 
00211         /* x[2] , x[3] */
00212         in1 = (q15_t) * px++;
00213         in2 = (q15_t) * px++;
00214         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00215 
00216         /* y[srcBLen - 3] , y[srcBLen - 4] */
00217         in1 = (q15_t) * py--;
00218         in2 = (q15_t) * py--;
00219         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00220 
00221         /* x[2] * y[srcBLen - 3] */
00222         /* x[3] * y[srcBLen - 4] */
00223         sum = __SMLAD(input1, input2, sum);
00224 
00225         /* Decrement the loop counter */
00226         k--;
00227       }
00228 
00229       /* If the count is not a multiple of 4, compute any remaining MACs here.   
00230        ** No loop unrolling is used. */
00231       k = count % 0x4u;
00232 
00233       while(k > 0u)
00234       {
00235         /* Perform the multiply-accumulates */
00236         sum += ((q31_t) * px++ * *py--);
00237 
00238         /* Decrement the loop counter */
00239         k--;
00240       }
00241 
00242       /* Store the result in the accumulator in the destination buffer. */
00243       *pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
00244 
00245       /* Update the inputA and inputB pointers for next MAC calculation */
00246       py = ++pSrc2;
00247       px = pIn1;
00248 
00249       /* Increment the MAC count */
00250       count++;
00251 
00252       /* Decrement the loop counter */
00253       blockSize1--;
00254     }
00255 
00256     /* --------------------------   
00257      * Initializations of stage2   
00258      * ------------------------*/
00259 
00260     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]   
00261      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]   
00262      * ....   
00263      * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]   
00264      */
00265 
00266     /* Working pointer of inputA */
00267     if((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
00268     {
00269       px = pIn1 + firstIndex - srcBLen + 1;
00270     }
00271     else
00272     {
00273       px = pIn1;
00274     }
00275 
00276     /* Working pointer of inputB */
00277     pSrc2 = pIn2 + (srcBLen - 1u);
00278     py = pSrc2;
00279 
00280     /* count is index by which the pointer pIn1 to be incremented */
00281     count = 0u;
00282 
00283     /* -------------------   
00284      * Stage2 process   
00285      * ------------------*/
00286 
00287     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.   
00288      * So, to loop unroll over blockSize2,   
00289      * srcBLen should be greater than or equal to 4 */
00290     if(srcBLen >= 4u)
00291     {
00292       /* Loop unroll over blockSize2, by 4 */
00293       blkCnt = ((uint32_t) blockSize2 >> 2u);
00294 
00295       while(blkCnt > 0u)
00296       {
00297         /* Set all accumulators to zero */
00298         acc0 = 0;
00299         acc1 = 0;
00300         acc2 = 0;
00301         acc3 = 0;
00302 
00303         /* read x[0], x[1], x[2] samples */
00304         x0 = *(px++);
00305         x1 = *(px++);
00306         x2 = *(px++);
00307 
00308         /* Apply loop unrolling and compute 4 MACs simultaneously. */
00309         k = srcBLen >> 2u;
00310 
00311         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00312          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00313         do
00314         {
00315           /* Read y[srcBLen - 1] sample */
00316           c0 = *(py--);
00317           /* Read y[srcBLen - 2] sample */
00318           c1 = *(py--);
00319 
00320           /* Read x[3] sample */
00321           x3 = *(px++);
00322 
00323           /* x[0] and x[1] are packed */
00324           in1 = (q15_t) x0;
00325           in2 = (q15_t) x1;
00326 
00327           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00328 
00329           /* y[srcBLen - 1]   and y[srcBLen - 2] are packed */
00330           in1 = (q15_t) c0;
00331           in2 = (q15_t) c1;
00332 
00333           input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00334 
00335           /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2]  */
00336           acc0 = __SMLAD(input1, input2, acc0);
00337 
00338           /* x[1] and x[2] are packed */
00339           in1 = (q15_t) x1;
00340           in2 = (q15_t) x2;
00341 
00342           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00343 
00344           /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2]  */
00345           acc1 = __SMLAD(input1, input2, acc1);
00346 
00347           /* x[2] and x[3] are packed */
00348           in1 = (q15_t) x2;
00349           in2 = (q15_t) x3;
00350 
00351           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00352 
00353           /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2]  */
00354           acc2 = __SMLAD(input1, input2, acc2);
00355 
00356           /* Read x[4] sample */
00357           x0 = *(px++);
00358 
00359           /* x[3] and x[4] are packed */
00360           in1 = (q15_t) x3;
00361           in2 = (q15_t) x0;
00362 
00363           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00364 
00365           /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2]  */
00366           acc3 = __SMLAD(input1, input2, acc3);
00367 
00368           /* Read y[srcBLen - 3] sample */
00369           c0 = *(py--);
00370           /* Read y[srcBLen - 4] sample */
00371           c1 = *(py--);
00372 
00373           /* Read x[5] sample */
00374           x1 = *(px++);
00375 
00376           /* x[2] and x[3] are packed */
00377           in1 = (q15_t) x2;
00378           in2 = (q15_t) x3;
00379 
00380           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00381 
00382           /* y[srcBLen - 3] and y[srcBLen - 4] are packed */
00383           in1 = (q15_t) c0;
00384           in2 = (q15_t) c1;
00385 
00386           input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00387 
00388           /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4]  */
00389           acc0 = __SMLAD(input1, input2, acc0);
00390 
00391           /* x[3] and x[4] are packed */
00392           in1 = (q15_t) x3;
00393           in2 = (q15_t) x0;
00394 
00395           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00396 
00397           /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4]  */
00398           acc1 = __SMLAD(input1, input2, acc1);
00399 
00400           /* x[4] and x[5] are packed */
00401           in1 = (q15_t) x0;
00402           in2 = (q15_t) x1;
00403 
00404           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00405 
00406           /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4]  */
00407           acc2 = __SMLAD(input1, input2, acc2);
00408 
00409           /* Read x[6] sample */
00410           x2 = *(px++);
00411 
00412           /* x[5] and x[6] are packed */
00413           in1 = (q15_t) x1;
00414           in2 = (q15_t) x2;
00415 
00416           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00417 
00418           /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4]  */
00419           acc3 = __SMLAD(input1, input2, acc3);
00420 
00421         } while(--k);
00422 
00423         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.   
00424          ** No loop unrolling is used. */
00425         k = srcBLen % 0x4u;
00426 
00427         while(k > 0u)
00428         {
00429           /* Read y[srcBLen - 5] sample */
00430           c0 = *(py--);
00431 
00432           /* Read x[7] sample */
00433           x3 = *(px++);
00434 
00435           /* Perform the multiply-accumulates */
00436           /* acc0 +=  x[4] * y[srcBLen - 5] */
00437           acc0 += ((q31_t) x0 * c0);
00438           /* acc1 +=  x[5] * y[srcBLen - 5] */
00439           acc1 += ((q31_t) x1 * c0);
00440           /* acc2 +=  x[6] * y[srcBLen - 5] */
00441           acc2 += ((q31_t) x2 * c0);
00442           /* acc3 +=  x[7] * y[srcBLen - 5] */
00443           acc3 += ((q31_t) x3 * c0);
00444 
00445           /* Reuse the present samples for the next MAC */
00446           x0 = x1;
00447           x1 = x2;
00448           x2 = x3;
00449 
00450           /* Decrement the loop counter */
00451           k--;
00452         }
00453 
00454         /* Store the result in the accumulator in the destination buffer. */
00455         *pOut++ = (q7_t) (__SSAT(acc0 >> 7, 8));
00456         *pOut++ = (q7_t) (__SSAT(acc1 >> 7, 8));
00457         *pOut++ = (q7_t) (__SSAT(acc2 >> 7, 8));
00458         *pOut++ = (q7_t) (__SSAT(acc3 >> 7, 8));
00459 
00460         /* Increment the pointer pIn1 index, count by 4 */
00461         count += 4u;
00462 
00463         /* Update the inputA and inputB pointers for next MAC calculation */
00464         px = pIn1 + count;
00465         py = pSrc2;
00466 
00467 
00468         /* Decrement the loop counter */
00469         blkCnt--;
00470       }
00471 
00472       /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.   
00473        ** No loop unrolling is used. */
00474       blkCnt = (uint32_t) blockSize2 % 0x4u;
00475 
00476       while(blkCnt > 0u)
00477       {
00478         /* Accumulator is made zero for every iteration */
00479         sum = 0;
00480 
00481         /* Apply loop unrolling and compute 4 MACs simultaneously. */
00482         k = srcBLen >> 2u;
00483 
00484         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00485          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00486         while(k > 0u)
00487         {
00488 
00489           /* Reading two inputs of SrcA buffer and packing */
00490           in1 = (q15_t) * px++;
00491           in2 = (q15_t) * px++;
00492           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00493 
00494           /* Reading two inputs of SrcB buffer and packing */
00495           in1 = (q15_t) * py--;
00496           in2 = (q15_t) * py--;
00497           input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00498 
00499           /* Perform the multiply-accumulates */
00500           sum = __SMLAD(input1, input2, sum);
00501 
00502           /* Reading two inputs of SrcA buffer and packing */
00503           in1 = (q15_t) * px++;
00504           in2 = (q15_t) * px++;
00505           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00506 
00507           /* Reading two inputs of SrcB buffer and packing */
00508           in1 = (q15_t) * py--;
00509           in2 = (q15_t) * py--;
00510           input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00511 
00512           /* Perform the multiply-accumulates */
00513           sum = __SMLAD(input1, input2, sum);
00514 
00515           /* Decrement the loop counter */
00516           k--;
00517         }
00518 
00519         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.   
00520          ** No loop unrolling is used. */
00521         k = srcBLen % 0x4u;
00522 
00523         while(k > 0u)
00524         {
00525           /* Perform the multiply-accumulates */
00526           sum += ((q31_t) * px++ * *py--);
00527 
00528           /* Decrement the loop counter */
00529           k--;
00530         }
00531 
00532         /* Store the result in the accumulator in the destination buffer. */
00533         *pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
00534 
00535         /* Increment the pointer pIn1 index, count by 1 */
00536         count++;
00537 
00538         /* Update the inputA and inputB pointers for next MAC calculation */
00539         px = pIn1 + count;
00540         py = pSrc2; 
00541 
00542         /* Decrement the loop counter */
00543         blkCnt--;
00544       }
00545     }
00546     else
00547     {
00548       /* If the srcBLen is not a multiple of 4,   
00549        * the blockSize2 loop cannot be unrolled by 4 */
00550       blkCnt = (uint32_t) blockSize2;
00551 
00552       while(blkCnt > 0u)
00553       {
00554         /* Accumulator is made zero for every iteration */
00555         sum = 0;
00556 
00557         /* srcBLen number of MACS should be performed */
00558         k = srcBLen;
00559 
00560         while(k > 0u)
00561         {
00562           /* Perform the multiply-accumulate */
00563           sum += ((q31_t) * px++ * *py--);
00564 
00565           /* Decrement the loop counter */
00566           k--;
00567         }
00568 
00569         /* Store the result in the accumulator in the destination buffer. */
00570         *pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
00571 
00572         /* Increment the MAC count */
00573         count++;
00574 
00575         /* Update the inputA and inputB pointers for next MAC calculation */
00576         px = pIn1 + count;
00577         py = pSrc2;
00578 
00579         /* Decrement the loop counter */
00580         blkCnt--;
00581       }
00582     }
00583 
00584 
00585     /* --------------------------   
00586      * Initializations of stage3   
00587      * -------------------------*/
00588 
00589     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]   
00590      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]   
00591      * ....   
00592      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]   
00593      * sum +=  x[srcALen-1] * y[srcBLen-1]   
00594      */
00595 
00596     /* In this stage the MAC operations are decreased by 1 for every iteration.   
00597        The count variable holds the number of MAC operations performed */
00598     count = srcBLen - 1u;
00599 
00600     /* Working pointer of inputA */
00601     pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
00602     px = pSrc1;
00603 
00604     /* Working pointer of inputB */
00605     pSrc2 = pIn2 + (srcBLen - 1u);
00606     py = pSrc2;
00607 
00608     /* -------------------   
00609      * Stage3 process   
00610      * ------------------*/
00611 
00612     while(blockSize3 > 0)
00613     {
00614       /* Accumulator is made zero for every iteration */
00615       sum = 0;
00616 
00617       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00618       k = count >> 2u;
00619 
00620       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00621        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00622       while(k > 0u)
00623       {
00624         /* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */
00625         in1 = (q15_t) * px++;
00626         in2 = (q15_t) * px++;
00627         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00628 
00629         /* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */
00630         in1 = (q15_t) * py--;
00631         in2 = (q15_t) * py--;
00632         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00633 
00634         /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
00635         /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
00636         sum = __SMLAD(input1, input2, sum);
00637 
00638         /* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */
00639         in1 = (q15_t) * px++;
00640         in2 = (q15_t) * px++;
00641         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00642 
00643         /* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */
00644         in1 = (q15_t) * py--;
00645         in2 = (q15_t) * py--;
00646         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00647 
00648         /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
00649         /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
00650         sum = __SMLAD(input1, input2, sum);
00651 
00652         /* Decrement the loop counter */
00653         k--;
00654       }
00655 
00656       /* If the count is not a multiple of 4, compute any remaining MACs here.   
00657        ** No loop unrolling is used. */
00658       k = count % 0x4u;
00659 
00660       while(k > 0u)
00661       {
00662         /* Perform the multiply-accumulates */
00663         /* sum +=  x[srcALen-1] * y[srcBLen-1] */
00664         sum += ((q31_t) * px++ * *py--);
00665 
00666         /* Decrement the loop counter */
00667         k--;
00668       }
00669 
00670       /* Store the result in the accumulator in the destination buffer. */
00671       *pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
00672 
00673       /* Update the inputA and inputB pointers for next MAC calculation */
00674       px = ++pSrc1;
00675       py = pSrc2;
00676 
00677       /* Decrement the MAC count */
00678       count--;
00679 
00680       /* Decrement the loop counter */
00681       blockSize3--;
00682 
00683     }
00684 
00685     /* set status as ARM_MATH_SUCCESS */
00686     status = ARM_MATH_SUCCESS;
00687   }
00688 
00689   /* Return to application */
00690   return (status);
00691 
00692 #else
00693 
00694   /* Run the below code for Cortex-M0 */
00695 
00696   q7_t *pIn1 = pSrcA;                            /* inputA pointer */
00697   q7_t *pIn2 = pSrcB;                            /* inputB pointer */
00698   q31_t sum;                                     /* Accumulator */
00699   uint32_t i, j;                                 /* loop counters */
00700   arm_status status;                             /* status of Partial convolution */
00701 
00702   /* Check for range of output samples to be calculated */
00703   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
00704   {
00705     /* Set status as ARM_ARGUMENT_ERROR */
00706     status = ARM_MATH_ARGUMENT_ERROR;
00707   }
00708   else
00709   {
00710     /* Loop to calculate convolution for output length number of values */
00711     for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
00712     {
00713       /* Initialize sum with zero to carry on MAC operations */
00714       sum = 0;
00715 
00716       /* Loop to perform MAC operations according to convolution equation */
00717       for (j = 0; j <= i; j++)
00718       {
00719         /* Check the array limitations */
00720         if(((i - j) < srcBLen) && (j < srcALen))
00721         {
00722           /* z[i] += x[i-j] * y[j] */
00723           sum += ((q15_t) pIn1[j] * (pIn2[i - j]));
00724         }
00725       }
00726 
00727       /* Store the output in the destination buffer */
00728       pDst[i] = (q7_t) __SSAT((sum >> 7u), 8u);
00729     }
00730     /* set status as ARM_SUCCESS as there are no argument errors */
00731     status = ARM_MATH_SUCCESS;
00732   }
00733   return (status);
00734 
00735 #endif /*  #ifndef ARM_MATH_CM0_FAMILY */
00736 
00737 }
00738 
00739 /**   
00740  * @} end of PartialConv group   
00741  */