CMSIS DSP library

Dependents:   performance_timer Surfboard_ gps2rtty Capstone ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_partial_fast_q15.c Source File

arm_conv_partial_fast_q15.c

00001 /* ----------------------------------------------------------------------   
00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.   
00003 *   
00004 * $Date:        19. March 2015
00005 * $Revision:    V.1.4.5
00006 *   
00007 * Project:      CMSIS DSP Library   
00008 * Title:        arm_conv_partial_fast_q15.c   
00009 *   
00010 * Description:  Fast Q15 Partial convolution.   
00011 *   
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Redistribution and use in source and binary forms, with or without 
00015 * modification, are permitted provided that the following conditions
00016 * are met:
00017 *   - Redistributions of source code must retain the above copyright
00018 *     notice, this list of conditions and the following disclaimer.
00019 *   - Redistributions in binary form must reproduce the above copyright
00020 *     notice, this list of conditions and the following disclaimer in
00021 *     the documentation and/or other materials provided with the 
00022 *     distribution.
00023 *   - Neither the name of ARM LIMITED nor the names of its contributors
00024 *     may be used to endorse or promote products derived from this
00025 *     software without specific prior written permission.
00026 *
00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00038 * POSSIBILITY OF SUCH DAMAGE.  
00039 * -------------------------------------------------------------------- */
00040 
00041 #include "arm_math.h"
00042 
00043 /**   
00044  * @ingroup groupFilters   
00045  */
00046 
00047 /**   
00048  * @addtogroup PartialConv   
00049  * @{   
00050  */
00051 
00052 /**   
00053  * @brief Partial convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.   
00054  * @param[in]       *pSrcA points to the first input sequence.   
00055  * @param[in]       srcALen length of the first input sequence.   
00056  * @param[in]       *pSrcB points to the second input sequence.   
00057  * @param[in]       srcBLen length of the second input sequence.   
00058  * @param[out]      *pDst points to the location where the output result is written.   
00059  * @param[in]       firstIndex is the first output sample to start with.   
00060  * @param[in]       numPoints is the number of output points to be computed.   
00061  * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0, srcALen+srcBLen-2].
00062  *   
00063  * See <code>arm_conv_partial_q15()</code> for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.   
00064  */
00065 
00066 
00067 arm_status arm_conv_partial_fast_q15(
00068   q15_t * pSrcA,
00069   uint32_t srcALen,
00070   q15_t * pSrcB,
00071   uint32_t srcBLen,
00072   q15_t * pDst,
00073   uint32_t firstIndex,
00074   uint32_t numPoints)
00075 {
00076 #ifndef UNALIGNED_SUPPORT_DISABLE
00077 
00078   q15_t *pIn1;                                   /* inputA pointer               */
00079   q15_t *pIn2;                                   /* inputB pointer               */
00080   q15_t *pOut = pDst;                            /* output pointer               */
00081   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulator                  */
00082   q15_t *px;                                     /* Intermediate inputA pointer  */
00083   q15_t *py;                                     /* Intermediate inputB pointer  */
00084   q15_t *pSrc1, *pSrc2;                          /* Intermediate pointers        */
00085   q31_t x0, x1, x2, x3, c0;
00086   uint32_t j, k, count, check, blkCnt;
00087   int32_t blockSize1, blockSize2, blockSize3;    /* loop counters                 */
00088   arm_status status;                             /* status of Partial convolution */
00089 
00090   /* Check for range of output samples to be calculated */
00091   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
00092   {
00093     /* Set status as ARM_MATH_ARGUMENT_ERROR */
00094     status = ARM_MATH_ARGUMENT_ERROR;
00095   }
00096   else
00097   {
00098 
00099     /* The algorithm implementation is based on the lengths of the inputs. */
00100     /* srcB is always made to slide across srcA. */
00101     /* So srcBLen is always considered as shorter or equal to srcALen */
00102     if(srcALen >=srcBLen)
00103     {
00104       /* Initialization of inputA pointer */
00105       pIn1 = pSrcA;
00106 
00107       /* Initialization of inputB pointer */
00108       pIn2 = pSrcB;
00109     }
00110     else
00111     {
00112       /* Initialization of inputA pointer */
00113       pIn1 = pSrcB;
00114 
00115       /* Initialization of inputB pointer */
00116       pIn2 = pSrcA;
00117 
00118       /* srcBLen is always considered as shorter or equal to srcALen */
00119       j = srcBLen;
00120       srcBLen = srcALen;
00121       srcALen = j;
00122     }
00123 
00124     /* Conditions to check which loopCounter holds   
00125      * the first and last indices of the output samples to be calculated. */
00126     check = firstIndex + numPoints;
00127     blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
00128     blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
00129     blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
00130     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
00131                                      (int32_t) numPoints) : 0;
00132     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
00133                                     (int32_t) firstIndex);
00134     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
00135 
00136     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
00137     /* The function is internally   
00138      * divided into three stages according to the number of multiplications that have to
00139      * take place between inputA samples and inputB samples. In the first stage of the
00140      * algorithm, the multiplications increase by one for every iteration.   
00141      * In the second stage of the algorithm, srcBLen number of multiplications are done.   
00142      * In the third stage of the algorithm, the multiplications decrease by one   
00143      * for every iteration. */
00144 
00145     /* Set the output pointer to point to the firstIndex   
00146      * of the output sample to be calculated. */
00147     pOut = pDst + firstIndex;
00148 
00149     /* --------------------------   
00150      * Initializations of stage1   
00151      * -------------------------*/
00152 
00153     /* sum = x[0] * y[0]   
00154      * sum = x[0] * y[1] + x[1] * y[0]   
00155      * ....   
00156      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]   
00157      */
00158 
00159     /* In this stage the MAC operations are increased by 1 for every iteration.   
00160        The count variable holds the number of MAC operations performed.   
00161        Since the partial convolution starts from firstIndex   
00162        Number of Macs to be performed is firstIndex + 1 */
00163     count = 1u + firstIndex;
00164 
00165     /* Working pointer of inputA */
00166     px = pIn1;
00167 
00168     /* Working pointer of inputB */
00169     pSrc2 = pIn2 + firstIndex;
00170     py = pSrc2;
00171 
00172     /* ------------------------   
00173      * Stage1 process   
00174      * ----------------------*/
00175 
00176     /* For loop unrolling by 4, this stage is divided into two. */
00177     /* First part of this stage computes the MAC operations less than 4 */
00178     /* Second part of this stage computes the MAC operations greater than or equal to 4 */
00179 
00180     /* The first part of the stage starts here */
00181     while((count < 4u) && (blockSize1 > 0))
00182     {
00183       /* Accumulator is made zero for every iteration */
00184       sum = 0;
00185 
00186       /* Loop over number of MAC operations between   
00187        * inputA samples and inputB samples */
00188       k = count;
00189 
00190       while(k > 0u)
00191       {
00192         /* Perform the multiply-accumulates */
00193         sum = __SMLAD(*px++, *py--, sum);
00194 
00195         /* Decrement the loop counter */
00196         k--;
00197       }
00198 
00199       /* Store the result in the accumulator in the destination buffer. */
00200       *pOut++ = (q15_t) (sum >> 15);
00201 
00202       /* Update the inputA and inputB pointers for next MAC calculation */
00203       py = ++pSrc2;
00204       px = pIn1;
00205 
00206       /* Increment the MAC count */
00207       count++;
00208 
00209       /* Decrement the loop counter */
00210       blockSize1--;
00211     }
00212 
00213     /* The second part of the stage starts here */
00214     /* The internal loop, over count, is unrolled by 4 */
00215     /* To read two adjacent inputB samples with a single SIMD read,
00216      * py is decremented by 1 */
00217     py = py - 1;
00218 
00219     while(blockSize1 > 0)
00220     {
00221       /* Accumulator is made zero for every iteration */
00222       sum = 0;
00223 
00224       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00225       k = count >> 2u;
00226 
00227       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00228        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00229       while(k > 0u)
00230       {
00231         /* Perform the multiply-accumulates */
00232         /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
00233         sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00234         /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
00235         sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00236 
00237         /* Decrement the loop counter */
00238         k--;
00239       }
00240 
00241       /* For the next MAC operations, the pointer py is used without SIMD   
00242        * So, py is incremented by 1 */
00243       py = py + 1u;
00244 
00245       /* If the count is not a multiple of 4, compute any remaining MACs here.   
00246        ** No loop unrolling is used. */
00247       k = count % 0x4u;
00248 
00249       while(k > 0u)
00250       {
00251         /* Perform the multiply-accumulates */
00252         sum = __SMLAD(*px++, *py--, sum);
00253 
00254         /* Decrement the loop counter */
00255         k--;
00256       }
00257 
00258       /* Store the result in the accumulator in the destination buffer. */
00259       *pOut++ = (q15_t) (sum >> 15);
00260 
00261       /* Update the inputA and inputB pointers for next MAC calculation */
00262       py = ++pSrc2 - 1u;
00263       px = pIn1;
00264 
00265       /* Increment the MAC count */
00266       count++;
00267 
00268       /* Decrement the loop counter */
00269       blockSize1--;
00270     }
00271 
00272     /* --------------------------   
00273      * Initializations of stage2   
00274      * ------------------------*/
00275 
00276     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]   
00277      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]   
00278      * ....   
00279      * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
00280      */
00281 
00282     /* Working pointer of inputA */
00283     if((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
00284     {
00285       px = pIn1 + firstIndex - srcBLen + 1;
00286     }
00287     else
00288     {
00289       px = pIn1;
00290     }
00291 
00292     /* Working pointer of inputB */
00293     pSrc2 = pIn2 + (srcBLen - 1u);
00294     py = pSrc2;
00295 
00296     /* count is the index by which the pointer pIn1 is to be incremented */
00297     count = 0u;
00298 
00299 
00300     /* --------------------   
00301      * Stage2 process   
00302      * -------------------*/
00303 
00304     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.   
00305      * So, to loop unroll over blockSize2,   
00306      * srcBLen should be greater than or equal to 4 */
00307     if(srcBLen >= 4u)
00308     {
00309       /* Loop unroll over blockSize2, by 4 */
00310       blkCnt = ((uint32_t) blockSize2 >> 2u);
00311 
00312       while(blkCnt > 0u)
00313       {
00314       py = py - 1u;
00315 
00316         /* Set all accumulators to zero */
00317         acc0 = 0;
00318         acc1 = 0;
00319         acc2 = 0;
00320         acc3 = 0;
00321 
00322 
00323         /* read x[0], x[1] samples */
00324       x0 = *__SIMD32(px);
00325         /* read x[1], x[2] samples */
00326       x1 = _SIMD32_OFFSET(px+1);
00327       px+= 2u;
00328 
00329 
00330         /* Apply loop unrolling and compute 4 MACs simultaneously. */
00331         k = srcBLen >> 2u;
00332 
00333         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00334          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00335         do
00336         {
00337           /* Read the last two inputB samples using SIMD:   
00338            * y[srcBLen - 1] and y[srcBLen - 2] */
00339         c0 = *__SIMD32(py)--;
00340 
00341           /* acc0 +=  x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
00342           acc0 = __SMLADX(x0, c0, acc0);
00343 
00344           /* acc1 +=  x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
00345           acc1 = __SMLADX(x1, c0, acc1);
00346 
00347           /* Read x[2], x[3] */
00348         x2 = *__SIMD32(px);
00349 
00350           /* Read x[3], x[4] */
00351         x3 = _SIMD32_OFFSET(px+1);
00352 
00353           /* acc2 +=  x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
00354           acc2 = __SMLADX(x2, c0, acc2);
00355 
00356           /* acc3 +=  x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
00357           acc3 = __SMLADX(x3, c0, acc3);
00358 
00359           /* Read y[srcBLen - 3] and y[srcBLen - 4] */
00360         c0 = *__SIMD32(py)--;
00361 
00362           /* acc0 +=  x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
00363           acc0 = __SMLADX(x2, c0, acc0);
00364 
00365           /* acc1 +=  x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
00366           acc1 = __SMLADX(x3, c0, acc1);
00367 
00368           /* Read x[4], x[5] */
00369         x0 = _SIMD32_OFFSET(px+2);
00370 
00371           /* Read x[5], x[6] */
00372         x1 = _SIMD32_OFFSET(px+3);
00373         px += 4u;
00374 
00375           /* acc2 +=  x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
00376           acc2 = __SMLADX(x0, c0, acc2);
00377 
00378           /* acc3 +=  x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
00379           acc3 = __SMLADX(x1, c0, acc3);
00380 
00381         } while(--k);
00382 
00383         /* For the next MAC operations, SIMD is not used   
00384          * So, the 16 bit pointer if inputB, py is updated */
00385 
00386         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.   
00387          ** No loop unrolling is used. */
00388         k = srcBLen % 0x4u;
00389 
00390         if(k == 1u)
00391         {
00392           /* Read y[srcBLen - 5] */
00393         c0 = *(py+1);
00394 #ifdef  ARM_MATH_BIG_ENDIAN
00395 
00396         c0 = c0 << 16u;
00397 
00398 #else
00399 
00400         c0 = c0 & 0x0000FFFF;
00401 
00402 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
00403 
00404           /* Read x[7] */
00405         x3 = *__SIMD32(px);
00406         px++;
00407 
00408           /* Perform the multiply-accumulates */
00409           acc0 = __SMLAD(x0, c0, acc0);
00410           acc1 = __SMLAD(x1, c0, acc1);
00411           acc2 = __SMLADX(x1, c0, acc2);
00412           acc3 = __SMLADX(x3, c0, acc3);
00413         }
00414 
00415         if(k == 2u)
00416         {
00417           /* Read y[srcBLen - 5], y[srcBLen - 6] */
00418         c0 = _SIMD32_OFFSET(py);
00419 
00420           /* Read x[7], x[8] */
00421         x3 = *__SIMD32(px);
00422 
00423         /* Read x[9] */
00424         x2 = _SIMD32_OFFSET(px+1);
00425         px += 2u;
00426 
00427           /* Perform the multiply-accumulates */
00428           acc0 = __SMLADX(x0, c0, acc0);
00429           acc1 = __SMLADX(x1, c0, acc1);
00430           acc2 = __SMLADX(x3, c0, acc2);
00431           acc3 = __SMLADX(x2, c0, acc3);
00432         }
00433 
00434         if(k == 3u)
00435         {
00436           /* Read y[srcBLen - 5], y[srcBLen - 6] */
00437         c0 = _SIMD32_OFFSET(py);
00438 
00439           /* Read x[7], x[8] */
00440         x3 = *__SIMD32(px);
00441 
00442           /* Read x[9] */
00443         x2 = _SIMD32_OFFSET(px+1);
00444 
00445           /* Perform the multiply-accumulates */
00446           acc0 = __SMLADX(x0, c0, acc0);
00447           acc1 = __SMLADX(x1, c0, acc1);
00448           acc2 = __SMLADX(x3, c0, acc2);
00449           acc3 = __SMLADX(x2, c0, acc3);
00450 
00451         c0 = *(py-1);
00452 #ifdef  ARM_MATH_BIG_ENDIAN
00453 
00454         c0 = c0 << 16u;
00455 #else
00456 
00457         c0 = c0 & 0x0000FFFF;
00458 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
00459 
00460           /* Read x[10] */
00461         x3 =  _SIMD32_OFFSET(px+2);
00462         px += 3u;
00463 
00464           /* Perform the multiply-accumulates */
00465           acc0 = __SMLADX(x1, c0, acc0);
00466           acc1 = __SMLAD(x2, c0, acc1);
00467           acc2 = __SMLADX(x2, c0, acc2);
00468           acc3 = __SMLADX(x3, c0, acc3);
00469         }
00470 
00471         /* Store the results in the accumulators in the destination buffer. */
00472 #ifndef ARM_MATH_BIG_ENDIAN
00473 
00474         *__SIMD32(pOut)++ = __PKHBT(acc0 >> 15, acc1 >> 15, 16);
00475         *__SIMD32(pOut)++ = __PKHBT(acc2 >> 15, acc3 >> 15, 16);
00476 
00477 #else
00478 
00479         *__SIMD32(pOut)++ = __PKHBT(acc1 >> 15, acc0 >> 15, 16);
00480         *__SIMD32(pOut)++ = __PKHBT(acc3 >> 15, acc2 >> 15, 16);
00481 
00482 #endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */
00483 
00484         /* Increment the pointer pIn1 index, count by 4 */
00485         count += 4u;
00486 
00487         /* Update the inputA and inputB pointers for next MAC calculation */
00488         px = pIn1 + count;
00489         py = pSrc2;
00490 
00491         /* Decrement the loop counter */
00492         blkCnt--;
00493       }
00494 
00495       /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.   
00496        ** No loop unrolling is used. */
00497       blkCnt = (uint32_t) blockSize2 % 0x4u;
00498 
00499       while(blkCnt > 0u)
00500       {
00501         /* Accumulator is made zero for every iteration */
00502         sum = 0;
00503 
00504         /* Apply loop unrolling and compute 4 MACs simultaneously. */
00505         k = srcBLen >> 2u;
00506 
00507         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00508          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00509         while(k > 0u)
00510         {
00511           /* Perform the multiply-accumulates */
00512           sum += ((q31_t) * px++ * *py--);
00513           sum += ((q31_t) * px++ * *py--);
00514           sum += ((q31_t) * px++ * *py--);
00515           sum += ((q31_t) * px++ * *py--);
00516 
00517           /* Decrement the loop counter */
00518           k--;
00519         }
00520 
00521         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.   
00522          ** No loop unrolling is used. */
00523         k = srcBLen % 0x4u;
00524 
00525         while(k > 0u)
00526         {
00527           /* Perform the multiply-accumulates */
00528           sum += ((q31_t) * px++ * *py--);
00529 
00530           /* Decrement the loop counter */
00531           k--;
00532         }
00533 
00534         /* Store the result in the accumulator in the destination buffer. */
00535         *pOut++ = (q15_t) (sum >> 15);
00536 
00537         /* Increment the pointer pIn1 index, count by 1 */
00538         count++;
00539 
00540         /* Update the inputA and inputB pointers for next MAC calculation */
00541         px = pIn1 + count;
00542         py = pSrc2;
00543 
00544         /* Decrement the loop counter */
00545         blkCnt--;
00546       }
00547     }
00548     else
00549     {
00550       /* If srcBLen is less than 4,
00551        * the blockSize2 loop is not unrolled by 4 */
00552       blkCnt = (uint32_t) blockSize2;
00553 
00554       while(blkCnt > 0u)
00555       {
00556         /* Accumulator is made zero for every iteration */
00557         sum = 0;
00558 
00559         /* srcBLen number of MACS should be performed */
00560         k = srcBLen;
00561 
00562         while(k > 0u)
00563         {
00564           /* Perform the multiply-accumulate */
00565           sum += ((q31_t) * px++ * *py--);
00566 
00567           /* Decrement the loop counter */
00568           k--;
00569         }
00570 
00571         /* Store the result in the accumulator in the destination buffer. */
00572         *pOut++ = (q15_t) (sum >> 15);
00573 
00574         /* Increment the MAC count */
00575         count++;
00576 
00577         /* Update the inputA and inputB pointers for next MAC calculation */
00578         px = pIn1 + count;
00579         py = pSrc2;
00580 
00581         /* Decrement the loop counter */
00582         blkCnt--;
00583       }
00584     }
00585 
00586 
00587     /* --------------------------   
00588      * Initializations of stage3   
00589      * -------------------------*/
00590 
00591     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]   
00592      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]   
00593      * ....   
00594      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]   
00595      * sum +=  x[srcALen-1] * y[srcBLen-1]   
00596      */
00597 
00598     /* In this stage the MAC operations are decreased by 1 for every iteration.   
00599        The count variable holds the number of MAC operations performed */
00600     count = srcBLen - 1u;
00601 
00602     /* Working pointer of inputA */
00603     pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
00604     px = pSrc1;
00605 
00606     /* Working pointer of inputB */
00607     pSrc2 = pIn2 + (srcBLen - 1u);
00608     pIn2 = pSrc2 - 1u;
00609     py = pIn2;
00610 
00611     /* -------------------   
00612      * Stage3 process   
00613      * ------------------*/
00614 
00615     /* For loop unrolling by 4, this stage is divided into two. */
00616     /* First part of this stage computes the MAC operations greater than 4 */
00617     /* Second part of this stage computes the MAC operations less than or equal to 4 */
00618 
00619     /* The first part of the stage starts here */
00620     j = count >> 2u;
00621 
00622     while((j > 0u) && (blockSize3 > 0))
00623     {
00624       /* Accumulator is made zero for every iteration */
00625       sum = 0;
00626 
00627       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00628       k = count >> 2u;
00629 
00630       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00631        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00632       while(k > 0u)
00633       {
00634         /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied   
00635          * with y[srcBLen - 1], y[srcBLen - 2] respectively */
00636         sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00637         /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied   
00638          * with y[srcBLen - 3], y[srcBLen - 4] respectively */
00639         sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00640 
00641         /* Decrement the loop counter */
00642         k--;
00643       }
00644 
00645       /* For the next MAC operations, the pointer py is used without SIMD   
00646        * So, py is incremented by 1 */
00647       py = py + 1u;
00648 
00649       /* If the count is not a multiple of 4, compute any remaining MACs here.   
00650        ** No loop unrolling is used. */
00651       k = count % 0x4u;
00652 
00653       while(k > 0u)
00654       {
00655         /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
00656         sum = __SMLAD(*px++, *py--, sum);
00657 
00658         /* Decrement the loop counter */
00659         k--;
00660       }
00661 
00662       /* Store the result in the accumulator in the destination buffer. */
00663       *pOut++ = (q15_t) (sum >> 15);
00664 
00665       /* Update the inputA and inputB pointers for next MAC calculation */
00666       px = ++pSrc1;
00667       py = pIn2;
00668 
00669       /* Decrement the MAC count */
00670       count--;
00671 
00672       /* Decrement the loop counter */
00673       blockSize3--;
00674 
00675       j--;
00676     }
00677 
00678     /* The second part of the stage starts here */
00679     /* SIMD is not used for the next MAC operations,   
00680      * so pointer py is updated to read only one sample at a time */
00681     py = py + 1u;
00682 
00683     while(blockSize3 > 0)
00684     {
00685       /* Accumulator is made zero for every iteration */
00686       sum = 0;
00687 
00688       /* Perform the remaining count MACs one at a time; no loop unrolling is used. */
00689       k = count;
00690 
00691       while(k > 0u)
00692       {
00693         /* Perform the multiply-accumulates */
00694         /* sum +=  x[srcALen-1] * y[srcBLen-1] */
00695         sum = __SMLAD(*px++, *py--, sum);
00696 
00697         /* Decrement the loop counter */
00698         k--;
00699       }
00700 
00701       /* Store the result in the accumulator in the destination buffer. */
00702       *pOut++ = (q15_t) (sum >> 15);
00703 
00704       /* Update the inputA and inputB pointers for next MAC calculation */
00705       px = ++pSrc1;
00706       py = pSrc2;
00707 
00708       /* Decrement the MAC count */
00709       count--;
00710 
00711       /* Decrement the loop counter */
00712       blockSize3--;
00713     }
00714 
00715     /* set status as ARM_MATH_SUCCESS */
00716     status = ARM_MATH_SUCCESS;
00717   }
00718 
00719   /* Return to application */
00720   return (status);
00721 
00722 #else
00723 
00724   q15_t *pIn1;                                   /* inputA pointer               */
00725   q15_t *pIn2;                                   /* inputB pointer               */
00726   q15_t *pOut = pDst;                            /* output pointer               */
00727   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulator                  */
00728   q15_t *px;                                     /* Intermediate inputA pointer  */
00729   q15_t *py;                                     /* Intermediate inputB pointer  */
00730   q15_t *pSrc1, *pSrc2;                          /* Intermediate pointers        */
00731   q31_t x0, x1, x2, x3, c0;
00732   uint32_t j, k, count, check, blkCnt;
00733   int32_t blockSize1, blockSize2, blockSize3;    /* loop counters                 */
00734   arm_status status;                             /* status of Partial convolution */
00735   q15_t a, b;
00736 
00737   /* Check for range of output samples to be calculated */
00738   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
00739   {
00740     /* Set status as ARM_MATH_ARGUMENT_ERROR */
00741     status = ARM_MATH_ARGUMENT_ERROR;
00742   }
00743   else
00744   {
00745 
00746     /* The algorithm implementation is based on the lengths of the inputs. */
00747     /* srcB is always made to slide across srcA. */
00748     /* So srcBLen is always considered as shorter or equal to srcALen */
00749     if(srcALen >=srcBLen)
00750     {
00751       /* Initialization of inputA pointer */
00752       pIn1 = pSrcA;
00753 
00754       /* Initialization of inputB pointer */
00755       pIn2 = pSrcB;
00756     }
00757     else
00758     {
00759       /* Initialization of inputA pointer */
00760       pIn1 = pSrcB;
00761 
00762       /* Initialization of inputB pointer */
00763       pIn2 = pSrcA;
00764 
00765       /* srcBLen is always considered as shorter or equal to srcALen */
00766       j = srcBLen;
00767       srcBLen = srcALen;
00768       srcALen = j;
00769     }
00770 
00771     /* Conditions to check which loopCounter holds   
00772      * the first and last indices of the output samples to be calculated. */
00773     check = firstIndex + numPoints;
00774     blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
00775     blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
00776     blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
00777     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
00778                                      (int32_t) numPoints) : 0;
00779     blockSize2 = ((int32_t) check - blockSize3) -
00780       (blockSize1 + (int32_t) firstIndex);
00781     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
00782 
00783     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
00784     /* The function is internally   
00785      * divided into three stages according to the number of multiplications that have to
00786      * take place between inputA samples and inputB samples. In the first stage of the
00787      * algorithm, the multiplications increase by one for every iteration.   
00788      * In the second stage of the algorithm, srcBLen number of multiplications are done.   
00789      * In the third stage of the algorithm, the multiplications decrease by one   
00790      * for every iteration. */
00791 
00792     /* Set the output pointer to point to the firstIndex   
00793      * of the output sample to be calculated. */
00794     pOut = pDst + firstIndex;
00795 
00796     /* --------------------------   
00797      * Initializations of stage1   
00798      * -------------------------*/
00799 
00800     /* sum = x[0] * y[0]   
00801      * sum = x[0] * y[1] + x[1] * y[0]   
00802      * ....   
00803      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]   
00804      */
00805 
00806     /* In this stage the MAC operations are increased by 1 for every iteration.   
00807        The count variable holds the number of MAC operations performed.   
00808        Since the partial convolution starts from firstIndex   
00809        Number of Macs to be performed is firstIndex + 1 */
00810     count = 1u + firstIndex;
00811 
00812     /* Working pointer of inputA */
00813     px = pIn1;
00814 
00815     /* Working pointer of inputB */
00816     pSrc2 = pIn2 + firstIndex;
00817     py = pSrc2;
00818 
00819     /* ------------------------   
00820      * Stage1 process   
00821      * ----------------------*/
00822 
00823     /* For loop unrolling by 4, this stage is divided into two. */
00824     /* First part of this stage computes the MAC operations less than 4 */
00825     /* Second part of this stage computes the MAC operations greater than or equal to 4 */
00826 
00827     /* The first part of the stage starts here */
00828   while((count < 4u) && (blockSize1 > 0))
00829     {
00830       /* Accumulator is made zero for every iteration */
00831       sum = 0;
00832 
00833       /* Loop over number of MAC operations between   
00834        * inputA samples and inputB samples */
00835       k = count;
00836 
00837       while(k > 0u)
00838       {
00839         /* Perform the multiply-accumulates */
00840       sum += ((q31_t) * px++ * *py--);
00841 
00842         /* Decrement the loop counter */
00843         k--;
00844       }
00845 
00846       /* Store the result in the accumulator in the destination buffer. */
00847       *pOut++ = (q15_t) (sum >> 15);
00848 
00849       /* Update the inputA and inputB pointers for next MAC calculation */
00850       py = ++pSrc2;
00851       px = pIn1;
00852 
00853       /* Increment the MAC count */
00854       count++;
00855 
00856       /* Decrement the loop counter */
00857       blockSize1--;
00858     }
00859 
00860     /* The second part of the stage starts here */
00861     /* The internal loop, over count, is unrolled by 4 */
00862     /* To, read the last two inputB samples using SIMD:   
00863      * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
00864     py = py - 1;
00865 
00866   while(blockSize1 > 0)
00867     {
00868       /* Accumulator is made zero for every iteration */
00869       sum = 0;
00870 
00871       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00872       k = count >> 2u;
00873 
00874       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00875        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00876     py++;
00877 
00878     while(k > 0u)
00879     {
00880       /* Perform the multiply-accumulates */
00881         sum += ((q31_t) * px++ * *py--);
00882         sum += ((q31_t) * px++ * *py--);
00883         sum += ((q31_t) * px++ * *py--);
00884         sum += ((q31_t) * px++ * *py--);
00885 
00886       /* Decrement the loop counter */
00887       k--;
00888     }
00889 
00890       /* If the count is not a multiple of 4, compute any remaining MACs here.   
00891        ** No loop unrolling is used. */
00892       k = count % 0x4u;
00893 
00894       while(k > 0u)
00895       {
00896         /* Perform the multiply-accumulates */
00897       sum += ((q31_t) * px++ * *py--);
00898 
00899         /* Decrement the loop counter */
00900         k--;
00901       }
00902 
00903       /* Store the result in the accumulator in the destination buffer. */
00904       *pOut++ = (q15_t) (sum >> 15);
00905 
00906       /* Update the inputA and inputB pointers for next MAC calculation */
00907       py = ++pSrc2 - 1u;
00908       px = pIn1;
00909 
00910       /* Increment the MAC count */
00911       count++;
00912 
00913       /* Decrement the loop counter */
00914       blockSize1--;
00915     }
00916 
00917     /* --------------------------   
00918      * Initializations of stage2   
00919      * ------------------------*/
00920 
00921     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]   
00922      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]   
00923      * ....   
00924      * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]   
00925      */
00926 
00927     /* Working pointer of inputA */
00928     if((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
00929     {
00930       px = pIn1 + firstIndex - srcBLen + 1;
00931     }
00932     else
00933     {
00934       px = pIn1;
00935     }
00936 
00937     /* Working pointer of inputB */
00938     pSrc2 = pIn2 + (srcBLen - 1u);
00939     py = pSrc2;
00940 
00941     /* count is the offset added to pIn1 to position px for each subsequent output sample */
00942     count = 0u;
00943 
00944 
00945     /* --------------------   
00946      * Stage2 process   
00947      * -------------------*/
00948 
00949     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.   
00950      * So, to loop unroll over blockSize2,   
00951      * srcBLen should be greater than or equal to 4 */
00952     if(srcBLen >= 4u)
00953     {
00954       /* Loop unroll over blockSize2, by 4 */
00955       blkCnt = ((uint32_t) blockSize2 >> 2u);
00956 
00957       while(blkCnt > 0u)
00958       {
00959       py = py - 1u;
00960 
00961         /* Set all accumulators to zero */
00962         acc0 = 0;
00963         acc1 = 0;
00964         acc2 = 0;
00965         acc3 = 0;
00966 
00967       /* read x[0], x[1] samples */
00968       a = *px++;
00969       b = *px++;
00970 
00971 #ifndef ARM_MATH_BIG_ENDIAN
00972     
00973       x0 = __PKHBT(a, b, 16);
00974       a = *px;
00975       x1 = __PKHBT(b, a, 16);
00976 
00977 #else
00978 
00979       x0 = __PKHBT(b, a, 16);
00980       a = *px;
00981       x1 = __PKHBT(a, b, 16);
00982 
00983 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN    */
00984 
00985       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00986       k = srcBLen >> 2u;
00987 
00988       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00989        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00990       do
00991       {
00992         /* Read the last two inputB samples using SIMD:   
00993          * y[srcBLen - 1] and y[srcBLen - 2] */
00994         a = *py;
00995         b = *(py+1);
00996         py -= 2;
00997 
00998 #ifndef ARM_MATH_BIG_ENDIAN
00999 
01000         c0 = __PKHBT(a, b, 16);
01001 
01002 #else
01003 
01004         c0 = __PKHBT(b, a, 16);;
01005 
01006 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
01007 
01008         /* acc0 +=  x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
01009         acc0 = __SMLADX(x0, c0, acc0);
01010 
01011         /* acc1 +=  x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
01012         acc1 = __SMLADX(x1, c0, acc1);
01013 
01014       a = *px;
01015       b = *(px + 1);
01016 
01017 #ifndef ARM_MATH_BIG_ENDIAN
01018     
01019       x2 = __PKHBT(a, b, 16);
01020       a = *(px + 2);
01021       x3 = __PKHBT(b, a, 16);
01022 
01023 #else
01024 
01025       x2 = __PKHBT(b, a, 16);
01026       a = *(px + 2);
01027       x3 = __PKHBT(a, b, 16);
01028 
01029 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN    */
01030 
01031         /* acc2 +=  x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
01032         acc2 = __SMLADX(x2, c0, acc2);
01033 
01034         /* acc3 +=  x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
01035         acc3 = __SMLADX(x3, c0, acc3);
01036 
01037         /* Read y[srcBLen - 3] and y[srcBLen - 4] */
01038         a = *py;
01039         b = *(py+1);
01040         py -= 2;
01041 
01042 #ifndef ARM_MATH_BIG_ENDIAN
01043 
01044         c0 = __PKHBT(a, b, 16);
01045 
01046 #else
01047 
01048         c0 = __PKHBT(b, a, 16);;
01049 
01050 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
01051 
01052         /* acc0 +=  x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
01053         acc0 = __SMLADX(x2, c0, acc0);
01054 
01055         /* acc1 +=  x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
01056         acc1 = __SMLADX(x3, c0, acc1);
01057 
01058         /* Read x[4], x[5], x[6] */
01059       a = *(px + 2);
01060       b = *(px + 3);
01061 
01062 #ifndef ARM_MATH_BIG_ENDIAN
01063     
01064       x0 = __PKHBT(a, b, 16);
01065       a = *(px + 4);
01066       x1 = __PKHBT(b, a, 16);
01067 
01068 #else
01069 
01070       x0 = __PKHBT(b, a, 16);
01071       a = *(px + 4);
01072       x1 = __PKHBT(a, b, 16);
01073 
01074 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN    */
01075 
01076         px += 4u;
01077 
01078         /* acc2 +=  x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
01079         acc2 = __SMLADX(x0, c0, acc2);
01080 
01081         /* acc3 +=  x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
01082         acc3 = __SMLADX(x1, c0, acc3);
01083 
01084       } while(--k);
01085 
01086       /* For the next MAC operations, SIMD is not used   
01087        * So, the 16 bit pointer of inputB, py is updated */
01088 
01089       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.   
01090        ** No loop unrolling is used. */
01091       k = srcBLen % 0x4u;
01092 
01093       if(k == 1u)
01094       {
01095         /* Read y[srcBLen - 5] */
01096         c0 = *(py+1);
01097 
01098 #ifdef  ARM_MATH_BIG_ENDIAN
01099 
01100         c0 = c0 << 16u;
01101 
01102 #else
01103 
01104         c0 = c0 & 0x0000FFFF;
01105 
01106 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
01107 
01108         /* Read x[7] */
01109         a = *px;
01110         b = *(px+1);
01111         px++;
01112 
01113 #ifndef ARM_MATH_BIG_ENDIAN
01114 
01115         x3 = __PKHBT(a, b, 16);
01116 
01117 #else
01118 
01119         x3 = __PKHBT(b, a, 16);;
01120 
01121 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
01122 
01123 
01124         /* Perform the multiply-accumulates */
01125         acc0 = __SMLAD(x0, c0, acc0);
01126         acc1 = __SMLAD(x1, c0, acc1);
01127         acc2 = __SMLADX(x1, c0, acc2);
01128         acc3 = __SMLADX(x3, c0, acc3);
01129       }
01130 
01131       if(k == 2u)
01132       {
01133         /* Read y[srcBLen - 5], y[srcBLen - 6] */
01134         a = *py;
01135         b = *(py+1);
01136 
01137 #ifndef ARM_MATH_BIG_ENDIAN
01138 
01139         c0 = __PKHBT(a, b, 16);
01140 
01141 #else
01142 
01143         c0 = __PKHBT(b, a, 16);;
01144 
01145 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
01146 
01147         /* Read x[7], x[8], x[9] */
01148       a = *px;
01149       b = *(px + 1);
01150 
01151 #ifndef ARM_MATH_BIG_ENDIAN
01152     
01153       x3 = __PKHBT(a, b, 16);
01154       a = *(px + 2);
01155       x2 = __PKHBT(b, a, 16);
01156 
01157 #else
01158 
01159       x3 = __PKHBT(b, a, 16);
01160       a = *(px + 2);
01161       x2 = __PKHBT(a, b, 16);
01162 
01163 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN    */
01164         px += 2u;
01165 
01166         /* Perform the multiply-accumulates */
01167         acc0 = __SMLADX(x0, c0, acc0);
01168         acc1 = __SMLADX(x1, c0, acc1);
01169         acc2 = __SMLADX(x3, c0, acc2);
01170         acc3 = __SMLADX(x2, c0, acc3);
01171       }
01172 
01173       if(k == 3u)
01174       {
01175         /* Read y[srcBLen - 5], y[srcBLen - 6] */
01176         a = *py;
01177         b = *(py+1);
01178 
01179 #ifndef ARM_MATH_BIG_ENDIAN
01180 
01181         c0 = __PKHBT(a, b, 16);
01182 
01183 #else
01184 
01185         c0 = __PKHBT(b, a, 16);;
01186 
01187 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
01188 
01189         /* Read x[7], x[8], x[9] */
01190       a = *px;
01191       b = *(px + 1);
01192 
01193 #ifndef ARM_MATH_BIG_ENDIAN
01194     
01195       x3 = __PKHBT(a, b, 16);
01196       a = *(px + 2);
01197       x2 = __PKHBT(b, a, 16);
01198 
01199 #else
01200 
01201       x3 = __PKHBT(b, a, 16);
01202       a = *(px + 2);
01203       x2 = __PKHBT(a, b, 16);
01204 
01205 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN    */
01206 
01207         /* Perform the multiply-accumulates */
01208         acc0 = __SMLADX(x0, c0, acc0);
01209         acc1 = __SMLADX(x1, c0, acc1);
01210         acc2 = __SMLADX(x3, c0, acc2);
01211         acc3 = __SMLADX(x2, c0, acc3);
01212 
01213         /* Read y[srcBLen - 7] */
01214         c0 = *(py-1);
01215 #ifdef  ARM_MATH_BIG_ENDIAN
01216 
01217         c0 = c0 << 16u;
01218 #else
01219 
01220         c0 = c0 & 0x0000FFFF;
01221 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
01222 
01223         /* Read x[10] */
01224         a = *(px+2);
01225         b = *(px+3);
01226 
01227 #ifndef ARM_MATH_BIG_ENDIAN
01228 
01229         x3 = __PKHBT(a, b, 16);
01230 
01231 #else
01232 
01233         x3 = __PKHBT(b, a, 16);;
01234 
01235 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
01236 
01237         px += 3u;
01238 
01239         /* Perform the multiply-accumulates */
01240         acc0 = __SMLADX(x1, c0, acc0);
01241         acc1 = __SMLAD(x2, c0, acc1);
01242         acc2 = __SMLADX(x2, c0, acc2);
01243         acc3 = __SMLADX(x3, c0, acc3);
01244       }
01245 
01246       /* Store the results in the accumulators in the destination buffer. */
01247       *pOut++ = (q15_t)(acc0 >> 15);
01248       *pOut++ = (q15_t)(acc1 >> 15);
01249       *pOut++ = (q15_t)(acc2 >> 15);
01250       *pOut++ = (q15_t)(acc3 >> 15);
01251 
01252         /* Increment the pointer pIn1 index, count by 4 */
01253         count += 4u;
01254 
01255         /* Update the inputA and inputB pointers for next MAC calculation */
01256         px = pIn1 + count;
01257         py = pSrc2;
01258 
01259         /* Decrement the loop counter */
01260         blkCnt--;
01261       }
01262 
01263       /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.   
01264        ** No loop unrolling is used. */
01265       blkCnt = (uint32_t) blockSize2 % 0x4u;
01266 
01267       while(blkCnt > 0u)
01268       {
01269         /* Accumulator is made zero for every iteration */
01270         sum = 0;
01271 
01272         /* Apply loop unrolling and compute 4 MACs simultaneously. */
01273         k = srcBLen >> 2u;
01274 
01275         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
01276          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
01277         while(k > 0u)
01278         {
01279           /* Perform the multiply-accumulates */
01280           sum += ((q31_t) * px++ * *py--);
01281           sum += ((q31_t) * px++ * *py--);
01282           sum += ((q31_t) * px++ * *py--);
01283           sum += ((q31_t) * px++ * *py--);
01284 
01285           /* Decrement the loop counter */
01286           k--;
01287         }
01288 
01289         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.   
01290          ** No loop unrolling is used. */
01291         k = srcBLen % 0x4u;
01292 
01293         while(k > 0u)
01294         {
01295           /* Perform the multiply-accumulates */
01296           sum += ((q31_t) * px++ * *py--);
01297 
01298           /* Decrement the loop counter */
01299           k--;
01300         }
01301 
01302         /* Store the result in the accumulator in the destination buffer. */
01303         *pOut++ = (q15_t) (sum >> 15);
01304 
01305         /* Increment the pointer pIn1 index, count by 1 */
01306         count++;
01307 
01308         /* Update the inputA and inputB pointers for next MAC calculation */
01309         px = pIn1 + count;
01310         py = pSrc2;
01311 
01312         /* Decrement the loop counter */
01313         blkCnt--;
01314       }
01315     }
01316     else
01317     {
01318       /* If srcBLen is less than 4,   
01319        * the blockSize2 loop is not unrolled by 4 */
01320       blkCnt = (uint32_t) blockSize2;
01321 
01322       while(blkCnt > 0u)
01323       {
01324         /* Accumulator is made zero for every iteration */
01325         sum = 0;
01326 
01327         /* srcBLen number of MACS should be performed */
01328         k = srcBLen;
01329 
01330         while(k > 0u)
01331         {
01332           /* Perform the multiply-accumulate */
01333           sum += ((q31_t) * px++ * *py--);
01334 
01335           /* Decrement the loop counter */
01336           k--;
01337         }
01338 
01339         /* Store the result in the accumulator in the destination buffer. */
01340         *pOut++ = (q15_t) (sum >> 15);
01341 
01342         /* Increment the MAC count */
01343         count++;
01344 
01345         /* Update the inputA and inputB pointers for next MAC calculation */
01346         px = pIn1 + count;
01347         py = pSrc2;
01348 
01349         /* Decrement the loop counter */
01350         blkCnt--;
01351       }
01352     }
01353 
01354 
01355     /* --------------------------   
01356      * Initializations of stage3   
01357      * -------------------------*/
01358 
01359     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]   
01360      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]   
01361      * ....   
01362      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]   
01363      * sum +=  x[srcALen-1] * y[srcBLen-1]   
01364      */
01365 
01366     /* In this stage the MAC operations are decreased by 1 for every iteration.   
01367        The count variable holds the number of MAC operations performed */
01368     count = srcBLen - 1u;
01369 
01370     /* Working pointer of inputA */
01371     pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
01372     px = pSrc1;
01373 
01374     /* Working pointer of inputB */
01375     pSrc2 = pIn2 + (srcBLen - 1u);
01376     pIn2 = pSrc2 - 1u;
01377     py = pIn2;
01378 
01379     /* -------------------   
01380      * Stage3 process   
01381      * ------------------*/
01382 
01383     /* For loop unrolling by 4, this stage is divided into two. */
01384     /* First part of this stage computes the MAC operations greater than 4 */
01385     /* Second part of this stage computes the MAC operations less than or equal to 4 */
01386 
01387     /* The first part of the stage starts here */
01388     j = count >> 2u;
01389 
01390     while((j > 0u) && (blockSize3 > 0))
01391     {
01392       /* Accumulator is made zero for every iteration */
01393       sum = 0;
01394 
01395       /* Apply loop unrolling and compute 4 MACs simultaneously. */
01396       k = count >> 2u;
01397 
01398       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
01399        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
01400     py++;
01401 
01402     while(k > 0u)
01403     {   
01404       /* Perform the multiply-accumulates */
01405         sum += ((q31_t) * px++ * *py--);
01406         sum += ((q31_t) * px++ * *py--);
01407         sum += ((q31_t) * px++ * *py--);
01408         sum += ((q31_t) * px++ * *py--);
01409       /* Decrement the loop counter */
01410       k--;
01411     }
01412 
01413 
01414       /* If the count is not a multiple of 4, compute any remaining MACs here.   
01415        ** No loop unrolling is used. */
01416       k = count % 0x4u;
01417 
01418       while(k > 0u)
01419       {
01420       /* Perform the multiply-accumulates */
01421         sum += ((q31_t) * px++ * *py--);
01422 
01423         /* Decrement the loop counter */
01424         k--;
01425       }
01426 
01427       /* Store the result in the accumulator in the destination buffer. */
01428       *pOut++ = (q15_t) (sum >> 15);
01429 
01430       /* Update the inputA and inputB pointers for next MAC calculation */
01431       px = ++pSrc1;
01432       py = pIn2;
01433 
01434       /* Decrement the MAC count */
01435       count--;
01436 
01437       /* Decrement the loop counter */
01438       blockSize3--;
01439 
01440       j--;
01441     }
01442 
01443     /* The second part of the stage starts here */
01444     /* SIMD is not used for the next MAC operations,   
01445      * so pointer py is updated to read only one sample at a time */
01446     py = py + 1u;
01447 
01448   while(blockSize3 > 0)
01449     {
01450       /* Accumulator is made zero for every iteration */
01451       sum = 0;
01452 
01453       /* Loop over the count MAC operations for this output sample; no loop unrolling is used. */
01454       k = count;
01455 
01456       while(k > 0u)
01457       {
01458         /* Perform the multiply-accumulates */
01459         /* sum +=  x[srcALen-1] * y[srcBLen-1] */
01460         sum += ((q31_t) * px++ * *py--);
01461 
01462         /* Decrement the loop counter */
01463         k--;
01464       }
01465 
01466       /* Store the result in the accumulator in the destination buffer. */
01467       *pOut++ = (q15_t) (sum >> 15);
01468 
01469       /* Update the inputA and inputB pointers for next MAC calculation */
01470       px = ++pSrc1;
01471       py = pSrc2;
01472 
01473       /* Decrement the MAC count */
01474       count--;
01475 
01476       /* Decrement the loop counter */
01477       blockSize3--;
01478     }
01479 
01480     /* set status as ARM_MATH_SUCCESS */
01481     status = ARM_MATH_SUCCESS;
01482   }
01483 
01484   /* Return to application */
01485   return (status);
01486 
01487 #endif /*     #ifndef UNALIGNED_SUPPORT_DISABLE      */
01488 }
01489 
01490 /**   
01491  * @} end of PartialConv group   
01492  */