Laxmi Kant Tiwari / mbed-dsp

Fork of mbed-dsp by mbed official

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_partial_fast_q15.c Source File

arm_conv_partial_fast_q15.c

00001 /* ----------------------------------------------------------------------   
00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.   
00003 *   
00004 * $Date:        17. January 2013
00005 * $Revision:    V1.4.1
00006 *   
00007 * Project:      CMSIS DSP Library   
00008 * Title:        arm_conv_partial_fast_q15.c   
00009 *   
00010 * Description:  Fast Q15 Partial convolution.   
00011 *   
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Redistribution and use in source and binary forms, with or without 
00015 * modification, are permitted provided that the following conditions
00016 * are met:
00017 *   - Redistributions of source code must retain the above copyright
00018 *     notice, this list of conditions and the following disclaimer.
00019 *   - Redistributions in binary form must reproduce the above copyright
00020 *     notice, this list of conditions and the following disclaimer in
00021 *     the documentation and/or other materials provided with the 
00022 *     distribution.
00023 *   - Neither the name of ARM LIMITED nor the names of its contributors
00024 *     may be used to endorse or promote products derived from this
00025 *     software without specific prior written permission.
00026 *
00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00038 * POSSIBILITY OF SUCH DAMAGE.  
00039 * -------------------------------------------------------------------- */
00040 
00041 #include "arm_math.h"
00042 
00043 /**   
00044  * @ingroup groupFilters   
00045  */
00046 
00047 /**   
00048  * @addtogroup PartialConv   
00049  * @{   
00050  */
00051 
00052 /**   
00053  * @brief Partial convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.   
00054  * @param[in]       *pSrcA points to the first input sequence.   
00055  * @param[in]       srcALen length of the first input sequence.   
00056  * @param[in]       *pSrcB points to the second input sequence.   
00057  * @param[in]       srcBLen length of the second input sequence.   
00058  * @param[out]      *pDst points to the location where the output result is written.   
00059  * @param[in]       firstIndex is the first output sample to start with.   
00060  * @param[in]       numPoints is the number of output points to be computed.   
00061  * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0, srcALen+srcBLen-2].
00062  *   
00063  * See <code>arm_conv_partial_q15()</code> for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.   
00064  */
00065 
00066 
00067 arm_status arm_conv_partial_fast_q15(
00068   q15_t * pSrcA,
00069   uint32_t srcALen,
00070   q15_t * pSrcB,
00071   uint32_t srcBLen,
00072   q15_t * pDst,
00073   uint32_t firstIndex,
00074   uint32_t numPoints)
00075 {
00076 #ifndef UNALIGNED_SUPPORT_DISABLE
00077 
00078   q15_t *pIn1;                                   /* inputA pointer               */
00079   q15_t *pIn2;                                   /* inputB pointer               */
00080   q15_t *pOut = pDst;                            /* output pointer               */
00081   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulator                  */
00082   q15_t *px;                                     /* Intermediate inputA pointer  */
00083   q15_t *py;                                     /* Intermediate inputB pointer  */
00084   q15_t *pSrc1, *pSrc2;                          /* Intermediate pointers        */
00085   q31_t x0, x1, x2, x3, c0;
00086   uint32_t j, k, count, check, blkCnt;
00087   int32_t blockSize1, blockSize2, blockSize3;    /* loop counters                 */
00088   arm_status status;                             /* status of Partial convolution */
00089 
00090   /* Check for range of output samples to be calculated */
00091   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
00092   {
00093     /* Set status as ARM_MATH_ARGUMENT_ERROR */
00094     status = ARM_MATH_ARGUMENT_ERROR;
00095   }
00096   else
00097   {
00098 
00099     /* The algorithm implementation is based on the lengths of the inputs. */
00100     /* srcB is always made to slide across srcA. */
00101     /* So srcBLen is always considered as shorter or equal to srcALen */
00102     if(srcALen >=srcBLen)
00103     {
00104       /* Initialization of inputA pointer */
00105       pIn1 = pSrcA;
00106 
00107       /* Initialization of inputB pointer */
00108       pIn2 = pSrcB;
00109     }
00110     else
00111     {
00112       /* Initialization of inputA pointer */
00113       pIn1 = pSrcB;
00114 
00115       /* Initialization of inputB pointer */
00116       pIn2 = pSrcA;
00117 
00118       /* srcBLen is always considered as shorter or equal to srcALen */
00119       j = srcBLen;
00120       srcBLen = srcALen;
00121       srcALen = j;
00122     }
00123 
00124     /* Conditions to check which loopCounter holds   
00125      * the first and last indices of the output samples to be calculated. */
00126     check = firstIndex + numPoints;
00127     blockSize3 = ((int32_t) check - (int32_t) srcALen);
00128     blockSize3 = (blockSize3 > 0) ? blockSize3 : 0;
00129     blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
00130     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
00131                                      (int32_t) numPoints) : 0;
00132     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
00133                                     (int32_t) firstIndex);
00134     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
00135 
00136     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
00137     /* The function is internally
00138      * divided into three stages according to the number of multiplications that have to
00139      * take place between inputA samples and inputB samples. In the first stage of the
00140      * algorithm, the number of multiplications increases by one for every iteration.
00141      * In the second stage of the algorithm, srcBLen multiplications are done per iteration.
00142      * In the third stage of the algorithm, the number of multiplications decreases by one
00143      * for every iteration. */
00144 
00145     /* Set the output pointer to point to the firstIndex   
00146      * of the output sample to be calculated. */
00147     pOut = pDst + firstIndex;
00148 
00149     /* --------------------------   
00150      * Initializations of stage1   
00151      * -------------------------*/
00152 
00153     /* sum = x[0] * y[0]   
00154      * sum = x[0] * y[1] + x[1] * y[0]   
00155      * ....   
00156      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]   
00157      */
00158 
00159     /* In this stage the MAC operations are increased by 1 for every iteration.   
00160        The count variable holds the number of MAC operations performed.   
00161        Since the partial convolution starts from firstIndex   
00162        Number of Macs to be performed is firstIndex + 1 */
00163     count = 1u + firstIndex;
00164 
00165     /* Working pointer of inputA */
00166     px = pIn1;
00167 
00168     /* Working pointer of inputB */
00169     pSrc2 = pIn2 + firstIndex;
00170     py = pSrc2;
00171 
00172     /* ------------------------   
00173      * Stage1 process   
00174      * ----------------------*/
00175 
00176     /* For loop unrolling by 4, this stage is divided into two. */
00177     /* First part of this stage computes the MAC operations less than 4 */
00178     /* Second part of this stage computes the MAC operations greater than or equal to 4 */
00179 
00180     /* The first part of the stage starts here */
00181     while((count < 4u) && (blockSize1 > 0))
00182     {
00183       /* Accumulator is made zero for every iteration */
00184       sum = 0;
00185 
00186       /* Loop over number of MAC operations between   
00187        * inputA samples and inputB samples */
00188       k = count;
00189 
00190       while(k > 0u)
00191       {
00192         /* Perform the multiply-accumulates */
00193         sum = __SMLAD(*px++, *py--, sum);
00194 
00195         /* Decrement the loop counter */
00196         k--;
00197       }
00198 
00199       /* Store the result in the accumulator in the destination buffer. */
00200       *pOut++ = (q15_t) (sum >> 15);
00201 
00202       /* Update the inputA and inputB pointers for next MAC calculation */
00203       py = ++pSrc2;
00204       px = pIn1;
00205 
00206       /* Increment the MAC count */
00207       count++;
00208 
00209       /* Decrement the loop counter */
00210       blockSize1--;
00211     }
00212 
00213     /* The second part of the stage starts here */
00214     /* The internal loop, over count, is unrolled by 4 */
00215     /* To read two adjacent inputB samples with a single SIMD (32-bit) access,
00216      * py is decremented by 1 so that it points to the lower of the two samples */
00217     py = py - 1;
00218 
00219     while(blockSize1 > 0)
00220     {
00221       /* Accumulator is made zero for every iteration */
00222       sum = 0;
00223 
00224       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00225       k = count >> 2u;
00226 
00227       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00228        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00229       while(k > 0u)
00230       {
00231         /* Perform the multiply-accumulates */
00232         /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
00233         sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00234         /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
00235         sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00236 
00237         /* Decrement the loop counter */
00238         k--;
00239       }
00240 
00241       /* For the next MAC operations, the pointer py is used without SIMD   
00242        * So, py is incremented by 1 */
00243       py = py + 1u;
00244 
00245       /* If the count is not a multiple of 4, compute any remaining MACs here.   
00246        ** No loop unrolling is used. */
00247       k = count % 0x4u;
00248 
00249       while(k > 0u)
00250       {
00251         /* Perform the multiply-accumulates */
00252         sum = __SMLAD(*px++, *py--, sum);
00253 
00254         /* Decrement the loop counter */
00255         k--;
00256       }
00257 
00258       /* Store the result in the accumulator in the destination buffer. */
00259       *pOut++ = (q15_t) (sum >> 15);
00260 
00261       /* Update the inputA and inputB pointers for next MAC calculation */
00262       py = ++pSrc2 - 1u;
00263       px = pIn1;
00264 
00265       /* Increment the MAC count */
00266       count++;
00267 
00268       /* Decrement the loop counter */
00269       blockSize1--;
00270     }
00271 
00272     /* --------------------------   
00273      * Initializations of stage2   
00274      * ------------------------*/
00275 
00276     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]   
00277      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]   
00278      * ....   
00279      * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
00280      */
00281 
00282     /* Working pointer of inputA */
00283     px = pIn1;
00284 
00285     /* Working pointer of inputB */
00286     pSrc2 = pIn2 + (srcBLen - 1u);
00287     py = pSrc2;
00288 
00289     /* count is the index by which the pointer pIn1 to be incremented */
00290     count = 0u;
00291 
00292 
00293     /* --------------------   
00294      * Stage2 process   
00295      * -------------------*/
00296 
00297     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.   
00298      * So, to loop unroll over blockSize2,   
00299      * srcBLen should be greater than or equal to 4 */
00300     if(srcBLen >= 4u)
00301     {
00302       /* Loop unroll over blockSize2, by 4 */
00303       blkCnt = ((uint32_t) blockSize2 >> 2u);
00304 
00305       while(blkCnt > 0u)
00306       {
00307       py = py - 1u;
00308 
00309         /* Set all accumulators to zero */
00310         acc0 = 0;
00311         acc1 = 0;
00312         acc2 = 0;
00313         acc3 = 0;
00314 
00315 
00316         /* read x[0], x[1] samples */
00317       x0 = *__SIMD32(px);
00318         /* read x[1], x[2] samples */
00319       x1 = _SIMD32_OFFSET(px+1);
00320       px+= 2u;
00321 
00322 
00323         /* Apply loop unrolling and compute 4 MACs simultaneously. */
00324         k = srcBLen >> 2u;
00325 
00326         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00327          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00328         do
00329         {
00330           /* Read the last two inputB samples using SIMD:   
00331            * y[srcBLen - 1] and y[srcBLen - 2] */
00332         c0 = *__SIMD32(py)--;
00333 
00334           /* acc0 +=  x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
00335           acc0 = __SMLADX(x0, c0, acc0);
00336 
00337           /* acc1 +=  x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
00338           acc1 = __SMLADX(x1, c0, acc1);
00339 
00340           /* Read x[2], x[3] */
00341         x2 = *__SIMD32(px);
00342 
00343           /* Read x[3], x[4] */
00344         x3 = _SIMD32_OFFSET(px+1);
00345 
00346           /* acc2 +=  x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
00347           acc2 = __SMLADX(x2, c0, acc2);
00348 
00349           /* acc3 +=  x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
00350           acc3 = __SMLADX(x3, c0, acc3);
00351 
00352           /* Read y[srcBLen - 3] and y[srcBLen - 4] */
00353         c0 = *__SIMD32(py)--;
00354 
00355           /* acc0 +=  x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
00356           acc0 = __SMLADX(x2, c0, acc0);
00357 
00358           /* acc1 +=  x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
00359           acc1 = __SMLADX(x3, c0, acc1);
00360 
00361           /* Read x[4], x[5] */
00362         x0 = _SIMD32_OFFSET(px+2);
00363 
00364           /* Read x[5], x[6] */
00365         x1 = _SIMD32_OFFSET(px+3);
00366         px += 4u;
00367 
00368           /* acc2 +=  x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
00369           acc2 = __SMLADX(x0, c0, acc2);
00370 
00371           /* acc3 +=  x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
00372           acc3 = __SMLADX(x1, c0, acc3);
00373 
00374         } while(--k);
00375 
00376         /* For the next MAC operations, SIMD is not used   
00377          * So, the 16 bit pointer of inputB, py is updated */
00378 
00379         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.   
00380          ** No loop unrolling is used. */
00381         k = srcBLen % 0x4u;
00382 
00383         if(k == 1u)
00384         {
00385           /* Read y[srcBLen - 5] */
00386         c0 = *(py+1);
00387 #ifdef  ARM_MATH_BIG_ENDIAN
00388 
00389         c0 = c0 << 16u;
00390 
00391 #else
00392 
00393         c0 = c0 & 0x0000FFFF;
00394 
00395 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
00396 
00397           /* Read x[7] */
00398         x3 = *__SIMD32(px);
00399         px++;
00400 
00401           /* Perform the multiply-accumulates */
00402           acc0 = __SMLAD(x0, c0, acc0);
00403           acc1 = __SMLAD(x1, c0, acc1);
00404           acc2 = __SMLADX(x1, c0, acc2);
00405           acc3 = __SMLADX(x3, c0, acc3);
00406         }
00407 
00408         if(k == 2u)
00409         {
00410           /* Read y[srcBLen - 5], y[srcBLen - 6] */
00411         c0 = _SIMD32_OFFSET(py);
00412 
00413           /* Read x[7], x[8] */
00414         x3 = *__SIMD32(px);
00415 
00416         /* Read x[9] */
00417         x2 = _SIMD32_OFFSET(px+1);
00418         px += 2u;
00419 
00420           /* Perform the multiply-accumulates */
00421           acc0 = __SMLADX(x0, c0, acc0);
00422           acc1 = __SMLADX(x1, c0, acc1);
00423           acc2 = __SMLADX(x3, c0, acc2);
00424           acc3 = __SMLADX(x2, c0, acc3);
00425         }
00426 
00427         if(k == 3u)
00428         {
00429           /* Read y[srcBLen - 5], y[srcBLen - 6] */
00430         c0 = _SIMD32_OFFSET(py);
00431 
00432           /* Read x[7], x[8] */
00433         x3 = *__SIMD32(px);
00434 
00435           /* Read x[9] */
00436         x2 = _SIMD32_OFFSET(px+1);
00437 
00438           /* Perform the multiply-accumulates */
00439           acc0 = __SMLADX(x0, c0, acc0);
00440           acc1 = __SMLADX(x1, c0, acc1);
00441           acc2 = __SMLADX(x3, c0, acc2);
00442           acc3 = __SMLADX(x2, c0, acc3);
00443 
00444         c0 = *(py-1);
00445 #ifdef  ARM_MATH_BIG_ENDIAN
00446 
00447         c0 = c0 << 16u;
00448 #else
00449 
00450         c0 = c0 & 0x0000FFFF;
00451 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
00452 
00453           /* Read x[10] */
00454         x3 =  _SIMD32_OFFSET(px+2);
00455         px += 3u;
00456 
00457           /* Perform the multiply-accumulates */
00458           acc0 = __SMLADX(x1, c0, acc0);
00459           acc1 = __SMLAD(x2, c0, acc1);
00460           acc2 = __SMLADX(x2, c0, acc2);
00461           acc3 = __SMLADX(x3, c0, acc3);
00462         }
00463 
00464         /* Store the results in the accumulators in the destination buffer. */
00465 #ifndef ARM_MATH_BIG_ENDIAN
00466 
00467         *__SIMD32(pOut)++ = __PKHBT(acc0 >> 15, acc1 >> 15, 16);
00468         *__SIMD32(pOut)++ = __PKHBT(acc2 >> 15, acc3 >> 15, 16);
00469 
00470 #else
00471 
00472         *__SIMD32(pOut)++ = __PKHBT(acc1 >> 15, acc0 >> 15, 16);
00473         *__SIMD32(pOut)++ = __PKHBT(acc3 >> 15, acc2 >> 15, 16);
00474 
00475 #endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */
00476 
00477         /* Increment the pointer pIn1 index, count by 4 */
00478         count += 4u;
00479 
00480         /* Update the inputA and inputB pointers for next MAC calculation */
00481         px = pIn1 + count;
00482         py = pSrc2;
00483 
00484         /* Decrement the loop counter */
00485         blkCnt--;
00486       }
00487 
00488       /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.   
00489        ** No loop unrolling is used. */
00490       blkCnt = (uint32_t) blockSize2 % 0x4u;
00491 
00492       while(blkCnt > 0u)
00493       {
00494         /* Accumulator is made zero for every iteration */
00495         sum = 0;
00496 
00497         /* Apply loop unrolling and compute 4 MACs simultaneously. */
00498         k = srcBLen >> 2u;
00499 
00500         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00501          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00502         while(k > 0u)
00503         {
00504           /* Perform the multiply-accumulates */
00505           sum += ((q31_t) * px++ * *py--);
00506           sum += ((q31_t) * px++ * *py--);
00507           sum += ((q31_t) * px++ * *py--);
00508           sum += ((q31_t) * px++ * *py--);
00509 
00510           /* Decrement the loop counter */
00511           k--;
00512         }
00513 
00514         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.   
00515          ** No loop unrolling is used. */
00516         k = srcBLen % 0x4u;
00517 
00518         while(k > 0u)
00519         {
00520           /* Perform the multiply-accumulates */
00521           sum += ((q31_t) * px++ * *py--);
00522 
00523           /* Decrement the loop counter */
00524           k--;
00525         }
00526 
00527         /* Store the result in the accumulator in the destination buffer. */
00528         *pOut++ = (q15_t) (sum >> 15);
00529 
00530         /* Increment the pointer pIn1 index, count by 1 */
00531         count++;
00532 
00533         /* Update the inputA and inputB pointers for next MAC calculation */
00534         px = pIn1 + count;
00535         py = pSrc2;
00536 
00537         /* Decrement the loop counter */
00538         blkCnt--;
00539       }
00540     }
00541     else
00542     {
00543       /* If srcBLen is less than 4,
00544        * the blockSize2 loop is not unrolled by 4 */
00545       blkCnt = (uint32_t) blockSize2;
00546 
00547       while(blkCnt > 0u)
00548       {
00549         /* Accumulator is made zero for every iteration */
00550         sum = 0;
00551 
00552         /* srcBLen number of MACS should be performed */
00553         k = srcBLen;
00554 
00555         while(k > 0u)
00556         {
00557           /* Perform the multiply-accumulate */
00558           sum += ((q31_t) * px++ * *py--);
00559 
00560           /* Decrement the loop counter */
00561           k--;
00562         }
00563 
00564         /* Store the result in the accumulator in the destination buffer. */
00565         *pOut++ = (q15_t) (sum >> 15);
00566 
00567         /* Increment the MAC count */
00568         count++;
00569 
00570         /* Update the inputA and inputB pointers for next MAC calculation */
00571         px = pIn1 + count;
00572         py = pSrc2;
00573 
00574         /* Decrement the loop counter */
00575         blkCnt--;
00576       }
00577     }
00578 
00579 
00580     /* --------------------------   
00581      * Initializations of stage3   
00582      * -------------------------*/
00583 
00584     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]   
00585      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]   
00586      * ....   
00587      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]   
00588      * sum +=  x[srcALen-1] * y[srcBLen-1]   
00589      */
00590 
00591     /* In this stage the MAC operations are decreased by 1 for every iteration.   
00592        The count variable holds the number of MAC operations performed */
00593     count = srcBLen - 1u;
00594 
00595     /* Working pointer of inputA */
00596     pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
00597     px = pSrc1;
00598 
00599     /* Working pointer of inputB */
00600     pSrc2 = pIn2 + (srcBLen - 1u);
00601     pIn2 = pSrc2 - 1u;
00602     py = pIn2;
00603 
00604     /* -------------------   
00605      * Stage3 process   
00606      * ------------------*/
00607 
00608     /* For loop unrolling by 4, this stage is divided into two. */
00609     /* First part of this stage computes the MAC operations greater than 4 */
00610     /* Second part of this stage computes the MAC operations less than or equal to 4 */
00611 
00612     /* The first part of the stage starts here */
00613     j = count >> 2u;
00614 
00615     while((j > 0u) && (blockSize3 > 0))
00616     {
00617       /* Accumulator is made zero for every iteration */
00618       sum = 0;
00619 
00620       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00621       k = count >> 2u;
00622 
00623       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00624        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00625       while(k > 0u)
00626       {
00627         /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied   
00628          * with y[srcBLen - 1], y[srcBLen - 2] respectively */
00629         sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00630         /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied   
00631          * with y[srcBLen - 3], y[srcBLen - 4] respectively */
00632         sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00633 
00634         /* Decrement the loop counter */
00635         k--;
00636       }
00637 
00638       /* For the next MAC operations, the pointer py is used without SIMD   
00639        * So, py is incremented by 1 */
00640       py = py + 1u;
00641 
00642       /* If the count is not a multiple of 4, compute any remaining MACs here.   
00643        ** No loop unrolling is used. */
00644       k = count % 0x4u;
00645 
00646       while(k > 0u)
00647       {
00648         /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
00649         sum = __SMLAD(*px++, *py--, sum);
00650 
00651         /* Decrement the loop counter */
00652         k--;
00653       }
00654 
00655       /* Store the result in the accumulator in the destination buffer. */
00656       *pOut++ = (q15_t) (sum >> 15);
00657 
00658       /* Update the inputA and inputB pointers for next MAC calculation */
00659       px = ++pSrc1;
00660       py = pIn2;
00661 
00662       /* Decrement the MAC count */
00663       count--;
00664 
00665       /* Decrement the loop counter */
00666       blockSize3--;
00667 
00668       j--;
00669     }
00670 
00671     /* The second part of the stage starts here */
00672     /* SIMD is not used for the next MAC operations,   
00673      * so pointer py is updated to read only one sample at a time */
00674     py = py + 1u;
00675 
00676     while(blockSize3 > 0)
00677     {
00678       /* Accumulator is made zero for every iteration */
00679       sum = 0;
00680 
00681       /* The remaining count MACs are computed one at a time; no loop unrolling is used. */
00682       k = count;
00683 
00684       while(k > 0u)
00685       {
00686         /* Perform the multiply-accumulates */
00687         /* sum +=  x[srcALen-1] * y[srcBLen-1] */
00688         sum = __SMLAD(*px++, *py--, sum);
00689 
00690         /* Decrement the loop counter */
00691         k--;
00692       }
00693 
00694       /* Store the result in the accumulator in the destination buffer. */
00695       *pOut++ = (q15_t) (sum >> 15);
00696 
00697       /* Update the inputA and inputB pointers for next MAC calculation */
00698       px = ++pSrc1;
00699       py = pSrc2;
00700 
00701       /* Decrement the MAC count */
00702       count--;
00703 
00704       /* Decrement the loop counter */
00705       blockSize3--;
00706     }
00707 
00708     /* set status as ARM_MATH_SUCCESS */
00709     status = ARM_MATH_SUCCESS;
00710   }
00711 
00712   /* Return to application */
00713   return (status);
00714 
00715 #else
00716 
00717   q15_t *pIn1;                                   /* inputA pointer               */
00718   q15_t *pIn2;                                   /* inputB pointer               */
00719   q15_t *pOut = pDst;                            /* output pointer               */
00720   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulator                  */
00721   q15_t *px;                                     /* Intermediate inputA pointer  */
00722   q15_t *py;                                     /* Intermediate inputB pointer  */
00723   q15_t *pSrc1, *pSrc2;                          /* Intermediate pointers        */
00724   q31_t x0, x1, x2, x3, c0;
00725   uint32_t j, k, count, check, blkCnt;
00726   int32_t blockSize1, blockSize2, blockSize3;    /* loop counters                 */
00727   arm_status status;                             /* status of Partial convolution */
00728   q15_t a, b;
00729 
00730   /* Check for range of output samples to be calculated */
00731   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
00732   {
00733     /* Set status as ARM_MATH_ARGUMENT_ERROR */
00734     status = ARM_MATH_ARGUMENT_ERROR;
00735   }
00736   else
00737   {
00738 
00739     /* The algorithm implementation is based on the lengths of the inputs. */
00740     /* srcB is always made to slide across srcA. */
00741     /* So srcBLen is always considered as shorter or equal to srcALen */
00742     if(srcALen >=srcBLen)
00743     {
00744       /* Initialization of inputA pointer */
00745       pIn1 = pSrcA;
00746 
00747       /* Initialization of inputB pointer */
00748       pIn2 = pSrcB;
00749     }
00750     else
00751     {
00752       /* Initialization of inputA pointer */
00753       pIn1 = pSrcB;
00754 
00755       /* Initialization of inputB pointer */
00756       pIn2 = pSrcA;
00757 
00758       /* srcBLen is always considered as shorter or equal to srcALen */
00759       j = srcBLen;
00760       srcBLen = srcALen;
00761       srcALen = j;
00762     }
00763 
00764     /* Conditions to check which loopCounter holds   
00765      * the first and last indices of the output samples to be calculated. */
00766     check = firstIndex + numPoints;
00767     blockSize3 = ((int32_t) check - (int32_t) srcALen);
00768     blockSize3 = (blockSize3 > 0) ? blockSize3 : 0;
00769     blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
00770     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
00771                                      (int32_t) numPoints) : 0;
00772     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
00773                                     (int32_t) firstIndex);
00774     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
00775 
00776     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
00777     /* The function is internally
00778      * divided into three stages according to the number of multiplications that have to
00779      * take place between inputA samples and inputB samples. In the first stage of the
00780      * algorithm, the number of multiplications increases by one for every iteration.
00781      * In the second stage of the algorithm, srcBLen multiplications are done per iteration.
00782      * In the third stage of the algorithm, the number of multiplications decreases by one
00783      * for every iteration. */
00784 
00785     /* Set the output pointer to point to the firstIndex   
00786      * of the output sample to be calculated. */
00787     pOut = pDst + firstIndex;
00788 
00789     /* --------------------------   
00790      * Initializations of stage1   
00791      * -------------------------*/
00792 
00793     /* sum = x[0] * y[0]   
00794      * sum = x[0] * y[1] + x[1] * y[0]   
00795      * ....   
00796      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]   
00797      */
00798 
00799     /* In this stage the MAC operations are increased by 1 for every iteration.   
00800        The count variable holds the number of MAC operations performed.   
00801        Since the partial convolution starts from firstIndex   
00802        Number of Macs to be performed is firstIndex + 1 */
00803     count = 1u + firstIndex;
00804 
00805     /* Working pointer of inputA */
00806     px = pIn1;
00807 
00808     /* Working pointer of inputB */
00809     pSrc2 = pIn2 + firstIndex;
00810     py = pSrc2;
00811 
00812     /* ------------------------   
00813      * Stage1 process   
00814      * ----------------------*/
00815 
00816     /* For loop unrolling by 4, this stage is divided into two. */
00817     /* First part of this stage computes the MAC operations less than 4 */
00818     /* Second part of this stage computes the MAC operations greater than or equal to 4 */
00819 
00820     /* The first part of the stage starts here */
00821   while((count < 4u) && (blockSize1 > 0u))
00822     {
00823       /* Accumulator is made zero for every iteration */
00824       sum = 0;
00825 
00826       /* Loop over number of MAC operations between   
00827        * inputA samples and inputB samples */
00828       k = count;
00829 
00830       while(k > 0u)
00831       {
00832         /* Perform the multiply-accumulates */
00833       sum += ((q31_t) * px++ * *py--);
00834 
00835         /* Decrement the loop counter */
00836         k--;
00837       }
00838 
00839       /* Store the result in the accumulator in the destination buffer. */
00840       *pOut++ = (q15_t) (sum >> 15);
00841 
00842       /* Update the inputA and inputB pointers for next MAC calculation */
00843       py = ++pSrc2;
00844       px = pIn1;
00845 
00846       /* Increment the MAC count */
00847       count++;
00848 
00849       /* Decrement the loop counter */
00850       blockSize1--;
00851     }
00852 
00853     /* The second part of the stage starts here */
00854     /* The internal loop, over count, is unrolled by 4 */
00855     /* py is stepped back by one element here; every pass of the loop below   
00856      * advances it again (py++) before the first inputB sample is read */
00857     py = py - 1;
00858 
00859   while(blockSize1 > 0u)
00860     {
00861       /* Accumulator is made zero for every iteration */
00862       sum = 0;
00863 
00864       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00865       k = count >> 2u;
00866 
00867       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00868        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00869     py++;
00870 
00871     while(k > 0u)
00872     {
00873       /* Perform the multiply-accumulates */
00874         sum += ((q31_t) * px++ * *py--);
00875         sum += ((q31_t) * px++ * *py--);
00876         sum += ((q31_t) * px++ * *py--);
00877         sum += ((q31_t) * px++ * *py--);
00878 
00879       /* Decrement the loop counter */
00880       k--;
00881     }
00882 
00883       /* If the count is not a multiple of 4, compute any remaining MACs here.   
00884        ** No loop unrolling is used. */
00885       k = count % 0x4u;
00886 
00887       while(k > 0u)
00888       {
00889         /* Perform the multiply-accumulates */
00890       sum += ((q31_t) * px++ * *py--);
00891 
00892         /* Decrement the loop counter */
00893         k--;
00894       }
00895 
00896       /* Store the result in the accumulator in the destination buffer. */
00897       *pOut++ = (q15_t) (sum >> 15);
00898 
00899       /* Update the inputA and inputB pointers for next MAC calculation */
00900       py = ++pSrc2 - 1u;
00901       px = pIn1;
00902 
00903       /* Increment the MAC count */
00904       count++;
00905 
00906       /* Decrement the loop counter */
00907       blockSize1--;
00908     }
00909 
00910     /* --------------------------   
00911      * Initializations of stage2   
00912      * ------------------------*/
00913 
00914     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]   
00915      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]   
00916      * ....   
00917      * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]   
00918      */
00919 
00920     /* Working pointer of inputA */
00921     px = pIn1;
00922 
00923     /* Working pointer of inputB */
00924     pSrc2 = pIn2 + (srcBLen - 1u);
00925     py = pSrc2;
00926 
00927     /* count is the index by which the pointer pIn1 to be incremented */
00928     count = 0u;
00929 
00930 
00931     /* --------------------   
00932      * Stage2 process   
00933      * -------------------*/
00934 
00935     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.   
00936      * So, to loop unroll over blockSize2,   
00937      * srcBLen should be greater than or equal to 4 */
00938     if(srcBLen >= 4u)
00939     {
00940       /* Loop unroll over blockSize2, by 4 */
00941       blkCnt = ((uint32_t) blockSize2 >> 2u);
00942 
00943       while(blkCnt > 0u)
00944       {
00945       py = py - 1u;
00946 
00947         /* Set all accumulators to zero */
00948         acc0 = 0;
00949         acc1 = 0;
00950         acc2 = 0;
00951         acc3 = 0;
00952 
00953       /* read x[0], x[1] samples */
00954       a = *px++;
00955       b = *px++;
00956 
00957 #ifndef ARM_MATH_BIG_ENDIAN
00958     
00959       x0 = __PKHBT(a, b, 16);
00960       a = *px;
00961       x1 = __PKHBT(b, a, 16);
00962 
00963 #else
00964 
00965       x0 = __PKHBT(b, a, 16);
00966       a = *px;
00967       x1 = __PKHBT(a, b, 16);
00968 
00969 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN    */
00970 
00971       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00972       k = srcBLen >> 2u;
00973 
00974       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00975        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00976       do
00977       {
00978         /* Read the last two inputB samples using SIMD:   
00979          * y[srcBLen - 1] and y[srcBLen - 2] */
00980         a = *py;
00981         b = *(py+1);
00982         py -= 2;
00983 
00984 #ifndef ARM_MATH_BIG_ENDIAN
00985 
00986         c0 = __PKHBT(a, b, 16);
00987 
00988 #else
00989 
00990         c0 = __PKHBT(b, a, 16);;
00991 
00992 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
00993 
00994         /* acc0 +=  x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
00995         acc0 = __SMLADX(x0, c0, acc0);
00996 
00997         /* acc1 +=  x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
00998         acc1 = __SMLADX(x1, c0, acc1);
00999 
01000       a = *px;
01001       b = *(px + 1);
01002 
01003 #ifndef ARM_MATH_BIG_ENDIAN
01004     
01005       x2 = __PKHBT(a, b, 16);
01006       a = *(px + 2);
01007       x3 = __PKHBT(b, a, 16);
01008 
01009 #else
01010 
01011       x2 = __PKHBT(b, a, 16);
01012       a = *(px + 2);
01013       x3 = __PKHBT(a, b, 16);
01014 
01015 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN    */
01016 
01017         /* acc2 +=  x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
01018         acc2 = __SMLADX(x2, c0, acc2);
01019 
01020         /* acc3 +=  x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
01021         acc3 = __SMLADX(x3, c0, acc3);
01022 
01023         /* Read y[srcBLen - 3] and y[srcBLen - 4] */
01024         a = *py;
01025         b = *(py+1);
01026         py -= 2;
01027 
01028 #ifndef ARM_MATH_BIG_ENDIAN
01029 
01030         c0 = __PKHBT(a, b, 16);
01031 
01032 #else
01033 
01034         c0 = __PKHBT(b, a, 16);;
01035 
01036 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
01037 
01038         /* acc0 +=  x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
01039         acc0 = __SMLADX(x2, c0, acc0);
01040 
01041         /* acc1 +=  x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
01042         acc1 = __SMLADX(x3, c0, acc1);
01043 
01044         /* Read x[4], x[5], x[6] */
01045       a = *(px + 2);
01046       b = *(px + 3);
01047 
01048 #ifndef ARM_MATH_BIG_ENDIAN
01049     
01050       x0 = __PKHBT(a, b, 16);
01051       a = *(px + 4);
01052       x1 = __PKHBT(b, a, 16);
01053 
01054 #else
01055 
01056       x0 = __PKHBT(b, a, 16);
01057       a = *(px + 4);
01058       x1 = __PKHBT(a, b, 16);
01059 
01060 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN    */
01061 
01062         px += 4u;
01063 
01064         /* acc2 +=  x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
01065         acc2 = __SMLADX(x0, c0, acc2);
01066 
01067         /* acc3 +=  x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
01068         acc3 = __SMLADX(x1, c0, acc3);
01069 
01070       } while(--k);
01071 
01072       /* For the next MAC operations, SIMD is not used   
01073        * So, the 16 bit pointer of inputB, py is updated */
01074 
01075       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.   
01076        ** No loop unrolling is used. */
01077       k = srcBLen % 0x4u;
01078 
01079       if(k == 1u)
01080       {
01081         /* Read y[srcBLen - 5] */
01082         c0 = *(py+1);
01083 
01084 #ifdef  ARM_MATH_BIG_ENDIAN
01085 
01086         c0 = c0 << 16u;
01087 
01088 #else
01089 
01090         c0 = c0 & 0x0000FFFF;
01091 
01092 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
01093 
01094         /* Read x[7] */
01095         a = *px;
01096         b = *(px+1);
01097         px++;
01098 
01099 #ifndef ARM_MATH_BIG_ENDIAN
01100 
01101         x3 = __PKHBT(a, b, 16);
01102 
01103 #else
01104 
01105         x3 = __PKHBT(b, a, 16);;
01106 
01107 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
01108 
01109 
01110         /* Perform the multiply-accumulates */
01111         acc0 = __SMLAD(x0, c0, acc0);
01112         acc1 = __SMLAD(x1, c0, acc1);
01113         acc2 = __SMLADX(x1, c0, acc2);
01114         acc3 = __SMLADX(x3, c0, acc3);
01115       }
01116 
01117       if(k == 2u)
01118       {
01119         /* Read y[srcBLen - 5], y[srcBLen - 6] */
01120         a = *py;
01121         b = *(py+1);
01122 
01123 #ifndef ARM_MATH_BIG_ENDIAN
01124 
01125         c0 = __PKHBT(a, b, 16);
01126 
01127 #else
01128 
01129         c0 = __PKHBT(b, a, 16);;
01130 
01131 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
01132 
01133         /* Read x[7], x[8], x[9] */
01134       a = *px;
01135       b = *(px + 1);
01136 
01137 #ifndef ARM_MATH_BIG_ENDIAN
01138     
01139       x3 = __PKHBT(a, b, 16);
01140       a = *(px + 2);
01141       x2 = __PKHBT(b, a, 16);
01142 
01143 #else
01144 
01145       x3 = __PKHBT(b, a, 16);
01146       a = *(px + 2);
01147       x2 = __PKHBT(a, b, 16);
01148 
01149 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN    */
01150         px += 2u;
01151 
01152         /* Perform the multiply-accumulates */
01153         acc0 = __SMLADX(x0, c0, acc0);
01154         acc1 = __SMLADX(x1, c0, acc1);
01155         acc2 = __SMLADX(x3, c0, acc2);
01156         acc3 = __SMLADX(x2, c0, acc3);
01157       }
01158 
01159       if(k == 3u)
01160       {
01161         /* Read y[srcBLen - 5], y[srcBLen - 6] */
01162         a = *py;
01163         b = *(py+1);
01164 
01165 #ifndef ARM_MATH_BIG_ENDIAN
01166 
01167         c0 = __PKHBT(a, b, 16);
01168 
01169 #else
01170 
01171         c0 = __PKHBT(b, a, 16);;
01172 
01173 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
01174 
01175         /* Read x[7], x[8], x[9] */
01176       a = *px;
01177       b = *(px + 1);
01178 
01179 #ifndef ARM_MATH_BIG_ENDIAN
01180     
01181       x3 = __PKHBT(a, b, 16);
01182       a = *(px + 2);
01183       x2 = __PKHBT(b, a, 16);
01184 
01185 #else
01186 
01187       x3 = __PKHBT(b, a, 16);
01188       a = *(px + 2);
01189       x2 = __PKHBT(a, b, 16);
01190 
01191 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN    */
01192 
01193         /* Perform the multiply-accumulates */
01194         acc0 = __SMLADX(x0, c0, acc0);
01195         acc1 = __SMLADX(x1, c0, acc1);
01196         acc2 = __SMLADX(x3, c0, acc2);
01197         acc3 = __SMLADX(x2, c0, acc3);
01198 
01199         /* Read y[srcBLen - 7] */
01200         c0 = *(py-1);
01201 #ifdef  ARM_MATH_BIG_ENDIAN
01202 
01203         c0 = c0 << 16u;
01204 #else
01205 
01206         c0 = c0 & 0x0000FFFF;
01207 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
01208 
01209         /* Read x[10] */
01210         a = *(px+2);
01211         b = *(px+3);
01212 
01213 #ifndef ARM_MATH_BIG_ENDIAN
01214 
01215         x3 = __PKHBT(a, b, 16);
01216 
01217 #else
01218 
01219         x3 = __PKHBT(b, a, 16);;
01220 
01221 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
01222 
01223         px += 3u;
01224 
01225         /* Perform the multiply-accumulates */
01226         acc0 = __SMLADX(x1, c0, acc0);
01227         acc1 = __SMLAD(x2, c0, acc1);
01228         acc2 = __SMLADX(x2, c0, acc2);
01229         acc3 = __SMLADX(x3, c0, acc3);
01230       }
01231 
01232       /* Store the results in the accumulators in the destination buffer. */
01233       *pOut++ = (q15_t)(acc0 >> 15);
01234       *pOut++ = (q15_t)(acc1 >> 15);
01235       *pOut++ = (q15_t)(acc2 >> 15);
01236       *pOut++ = (q15_t)(acc3 >> 15);
01237 
01238         /* Increment the pointer pIn1 index, count by 4 */
01239         count += 4u;
01240 
01241         /* Update the inputA and inputB pointers for next MAC calculation */
01242         px = pIn1 + count;
01243         py = pSrc2;
01244 
01245         /* Decrement the loop counter */
01246         blkCnt--;
01247       }
01248 
01249       /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.   
01250        ** No loop unrolling is used. */
01251       blkCnt = (uint32_t) blockSize2 % 0x4u;
01252 
01253       while(blkCnt > 0u)
01254       {
01255         /* Accumulator is made zero for every iteration */
01256         sum = 0;
01257 
01258         /* Apply loop unrolling and compute 4 MACs simultaneously. */
01259         k = srcBLen >> 2u;
01260 
01261         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
01262          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
01263         while(k > 0u)
01264         {
01265           /* Perform the multiply-accumulates */
01266           sum += ((q31_t) * px++ * *py--);
01267           sum += ((q31_t) * px++ * *py--);
01268           sum += ((q31_t) * px++ * *py--);
01269           sum += ((q31_t) * px++ * *py--);
01270 
01271           /* Decrement the loop counter */
01272           k--;
01273         }
01274 
01275         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.   
01276          ** No loop unrolling is used. */
01277         k = srcBLen % 0x4u;
01278 
01279         while(k > 0u)
01280         {
01281           /* Perform the multiply-accumulates */
01282           sum += ((q31_t) * px++ * *py--);
01283 
01284           /* Decrement the loop counter */
01285           k--;
01286         }
01287 
01288         /* Store the result in the accumulator in the destination buffer. */
01289         *pOut++ = (q15_t) (sum >> 15);
01290 
01291         /* Increment the pointer pIn1 index, count by 1 */
01292         count++;
01293 
01294         /* Update the inputA and inputB pointers for next MAC calculation */
01295         px = pIn1 + count;
01296         py = pSrc2;
01297 
01298         /* Decrement the loop counter */
01299         blkCnt--;
01300       }
01301     }
01302     else
01303     {
01304       /* If srcBLen is less than 4,   
01305        * the blockSize2 loop cannot be unrolled by 4 */
01306       blkCnt = (uint32_t) blockSize2;
01307 
01308       while(blkCnt > 0u)
01309       {
01310         /* Accumulator is made zero for every iteration */
01311         sum = 0;
01312 
01313         /* srcBLen number of MACS should be performed */
01314         k = srcBLen;
01315 
01316         while(k > 0u)
01317         {
01318           /* Perform the multiply-accumulate */
01319           sum += ((q31_t) * px++ * *py--);
01320 
01321           /* Decrement the loop counter */
01322           k--;
01323         }
01324 
01325         /* Store the result in the accumulator in the destination buffer. */
01326         *pOut++ = (q15_t) (sum >> 15);
01327 
01328         /* Increment the MAC count */
01329         count++;
01330 
01331         /* Update the inputA and inputB pointers for next MAC calculation */
01332         px = pIn1 + count;
01333         py = pSrc2;
01334 
01335         /* Decrement the loop counter */
01336         blkCnt--;
01337       }
01338     }
01339 
01340 
01341     /* --------------------------   
01342      * Initializations of stage3   
01343      * -------------------------*/
01344 
01345     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]   
01346      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]   
01347      * ....   
01348      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]   
01349      * sum +=  x[srcALen-1] * y[srcBLen-1]   
01350      */
01351 
01352     /* In this stage the MAC operations are decreased by 1 for every iteration.   
01353        The count variable holds the number of MAC operations performed */
01354     count = srcBLen - 1u;
01355 
01356     /* Working pointer of inputA */
01357     pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
01358     px = pSrc1;
01359 
01360     /* Working pointer of inputB */
01361     pSrc2 = pIn2 + (srcBLen - 1u);
01362     pIn2 = pSrc2 - 1u;
01363     py = pIn2;
01364 
01365     /* -------------------   
01366      * Stage3 process   
01367      * ------------------*/
01368 
01369     /* For loop unrolling by 4, this stage is divided into two. */
01370     /* First part of this stage computes the MAC operations greater than 4 */
01371     /* Second part of this stage computes the MAC operations less than or equal to 4 */
01372 
01373     /* The first part of the stage starts here */
01374     j = count >> 2u;
01375 
01376     while((j > 0u) && (blockSize3 > 0))
01377     {
01378       /* Accumulator is made zero for every iteration */
01379       sum = 0;
01380 
01381       /* Apply loop unrolling and compute 4 MACs simultaneously. */
01382       k = count >> 2u;
01383 
01384       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
01385        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
01386     py++;
01387 
01388     while(k > 0u)
01389     {   
01390       /* Perform the multiply-accumulates */
01391         sum += ((q31_t) * px++ * *py--);
01392         sum += ((q31_t) * px++ * *py--);
01393         sum += ((q31_t) * px++ * *py--);
01394         sum += ((q31_t) * px++ * *py--);
01395       /* Decrement the loop counter */
01396       k--;
01397     }
01398 
01399 
01400       /* If the count is not a multiple of 4, compute any remaining MACs here.   
01401        ** No loop unrolling is used. */
01402       k = count % 0x4u;
01403 
01404       while(k > 0u)
01405       {
01406       /* Perform the multiply-accumulates */
01407         sum += ((q31_t) * px++ * *py--);
01408 
01409         /* Decrement the loop counter */
01410         k--;
01411       }
01412 
01413       /* Store the result in the accumulator in the destination buffer. */
01414       *pOut++ = (q15_t) (sum >> 15);
01415 
01416       /* Update the inputA and inputB pointers for next MAC calculation */
01417       px = ++pSrc1;
01418       py = pIn2;
01419 
01420       /* Decrement the MAC count */
01421       count--;
01422 
01423       /* Decrement the loop counter */
01424       blockSize3--;
01425 
01426       j--;
01427     }
01428 
01429     /* The second part of the stage starts here */
01430     /* SIMD is not used for the next MAC operations,   
01431      * so pointer py is updated to read only one sample at a time */
01432     py = py + 1u;
01433 
01434   while(blockSize3 > 0u)
01435     {
01436       /* Accumulator is made zero for every iteration */
01437       sum = 0;
01438 
01439       /* No loop unrolling here: the remaining count MACs are computed one at a time. */
01440       k = count;
01441 
01442       while(k > 0u)
01443       {
01444         /* Perform the multiply-accumulates */
01445         /* sum +=  x[srcALen-1] * y[srcBLen-1] */
01446         sum += ((q31_t) * px++ * *py--);
01447 
01448         /* Decrement the loop counter */
01449         k--;
01450       }
01451 
01452       /* Store the result in the accumulator in the destination buffer. */
01453       *pOut++ = (q15_t) (sum >> 15);
01454 
01455       /* Update the inputA and inputB pointers for next MAC calculation */
01456       px = ++pSrc1;
01457       py = pSrc2;
01458 
01459       /* Decrement the MAC count */
01460       count--;
01461 
01462       /* Decrement the loop counter */
01463       blockSize3--;
01464     }
01465 
01466     /* set status as ARM_MATH_SUCCESS */
01467     status = ARM_MATH_SUCCESS;
01468   }
01469 
01470   /* Return to application */
01471   return (status);
01472 
01473 #endif /*     #ifndef UNALIGNED_SUPPORT_DISABLE      */
01474 }
01475 
01476 /**   
01477  * @} end of PartialConv group   
01478  */