Robert Lopez / CMSIS5
Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_partial_opt_q15.c Source File

arm_conv_partial_opt_q15.c

00001 /* ----------------------------------------------------------------------
00002  * Project:      CMSIS DSP Library
00003  * Title:        arm_conv_partial_opt_q15.c
00004  * Description:  Partial convolution of Q15 sequences
00005  *
00006  * $Date:        27. January 2017
00007  * $Revision:    V.1.5.1
00008  *
00009  * Target Processor: Cortex-M cores
00010  * -------------------------------------------------------------------- */
00011 /*
00012  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
00013  *
00014  * SPDX-License-Identifier: Apache-2.0
00015  *
00016  * Licensed under the Apache License, Version 2.0 (the License); you may
00017  * not use this file except in compliance with the License.
00018  * You may obtain a copy of the License at
00019  *
00020  * www.apache.org/licenses/LICENSE-2.0
00021  *
00022  * Unless required by applicable law or agreed to in writing, software
00023  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
00024  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00025  * See the License for the specific language governing permissions and
00026  * limitations under the License.
00027  */
00028 
00029 #include "arm_math.h"
00030 
00031 /**
00032  * @ingroup groupFilters
00033  */
00034 
00035 /**
00036  * @addtogroup PartialConv
00037  * @{
00038  */
00039 
00040 /**
00041  * @brief Partial convolution of Q15 sequences.
00042  * @param[in]       *pSrcA points to the first input sequence.
00043  * @param[in]       srcALen length of the first input sequence.
00044  * @param[in]       *pSrcB points to the second input sequence.
00045  * @param[in]       srcBLen length of the second input sequence.
00046  * @param[out]      *pDst points to the location where the output result is written.
00047  * @param[in]       firstIndex is the first output sample to start with.
00048  * @param[in]       numPoints is the number of output points to be computed.
00049  * @param[in]       *pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
00050  * @param[in]       *pScratch2 points to scratch buffer of size min(srcALen, srcBLen).
00051  * @return  Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
00052  *
00053  * \par Restrictions
00054  *  If the silicon does not support unaligned memory access, enable the macro UNALIGNED_SUPPORT_DISABLE.
00055  *  In this case the input, output and scratch buffers should be aligned to 32 bits.
00056  *
00057  * Refer to <code>arm_conv_partial_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
00058  *
00059  *
00060  */
00061 
00062 #ifndef UNALIGNED_SUPPORT_DISABLE
00063 
00064 arm_status arm_conv_partial_opt_q15(
00065   q15_t * pSrcA,
00066   uint32_t srcALen,
00067   q15_t * pSrcB,
00068   uint32_t srcBLen,
00069   q15_t * pDst,
00070   uint32_t firstIndex,
00071   uint32_t numPoints,
00072   q15_t * pScratch1,
00073   q15_t * pScratch2)
00074 {
00075   /* Variant compiled when UNALIGNED_SUPPORT_DISABLE is not defined: computes numPoints convolution outputs starting at output index firstIndex, reading samples in packed pairs through the SIMD32 macros. */
00076   q15_t *pOut = pDst;                            /* Output write pointer */
00077   q15_t *pScr1 = pScratch1;                      /* Working pointer into scratch1 */
00078   q15_t *pScr2 = pScratch2;                      /* Working pointer into scratch2 */
00079   q63_t acc0, acc1, acc2, acc3;                  /* Accumulators for four consecutive output samples */
00080   q31_t x1, x2, x3;                              /* Packed sample pairs taken from scratch1 (x3 holds a repacked pair) */
00081   q31_t y1, y2;                                  /* Packed sample pairs taken from scratch2 */
00082   q15_t *pIn1;                                   /* Longer input sequence */
00083   q15_t *pIn2;                                   /* Shorter input sequence; later its reversed copy in scratch2 */
00084   q15_t *px;                                     /* Read pointer over the shorter sequence while it is reversed */
00085   q15_t *py;                                     /* Start of scratch2, used to rewind pIn2 */
00086   uint32_t j, k, blkCnt;                         /* Swap temporary and loop counters */
00087   arm_status status;                             /* Status variable */
00088   uint32_t tapCnt;                               /* Tap loop counter */
00089 
00090   /* Reject requests that extend past the last output sample (index srcALen + srcBLen - 2). NOTE(review): firstIndex + numPoints is evaluated in uint32_t and could wrap for very large arguments - confirm callers cannot pass such values. */
00091   if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
00092   {
00093     /* Set status as ARM_MATH_ARGUMENT_ERROR */
00094     status = ARM_MATH_ARGUMENT_ERROR;
00095   }
00096   else
00097   {
00098 
00099     /* The algorithm implementation is based on the lengths of the inputs. */
00100     /* srcB is always made to slide across srcA. */
00101     /* So srcBLen is always treated as shorter than or equal to srcALen */
00102     if (srcALen >= srcBLen)
00103     {
00104       /* pIn1 takes the longer sequence (srcA) */
00105       pIn1 = pSrcA;
00106 
00107       /* pIn2 takes the shorter sequence (srcB) */
00108       pIn2 = pSrcB;
00109     }
00110     else
00111     {
00112       /* pIn1 takes the longer sequence (srcB) */
00113       pIn1 = pSrcB;
00114 
00115       /* pIn2 takes the shorter sequence (srcA) */
00116       pIn2 = pSrcA;
00117 
00118       /* Swap the lengths so that srcBLen is the shorter one from here on */
00119       j = srcBLen;
00120       srcBLen = srcALen;
00121       srcALen = j;
00122     }
00123 
00124     /* Remember the start of scratch2 so pIn2 can be rewound to it later */
00125     py = pScratch2;
00126 
00127     /* Point at the last element (index srcBLen - 1) of scratch2; the reversed copy is written back-to-front */
00128     pScr2 = pScratch2 + srcBLen - 1;
00129 
00130     /* points to smaller length sequence */
00131     px = pIn2;
00132 
00133     /* Apply loop unrolling and do 4 copies per iteration. */
00134     k = srcBLen >> 2U;
00135 
00136     /* First part of the processing with loop unrolling copies 4 data points at a time.
00137      ** A second loop below copies the remaining 1 to 3 samples. */
00138     while (k > 0U)
00139     {
00140       /* Copy the shorter sequence into scratch2 in reversed order */
00141       *pScr2-- = *px++;
00142       *pScr2-- = *px++;
00143       *pScr2-- = *px++;
00144       *pScr2-- = *px++;
00145 
00146       /* Decrement the loop counter */
00147       k--;
00148     }
00149 
00150     /* If the count is not a multiple of 4, copy remaining samples here.
00151      ** No loop unrolling is used. */
00152     k = srcBLen % 0x4U;
00153 
00154     while (k > 0U)
00155     {
00156       /* Reversed copy of the remaining samples */
00157       *pScr2-- = *px++;
00158 
00159       /* Decrement the loop counter */
00160       k--;
00161     }
00162 
00163     /* Initialize temporary scratch pointer */
00164     pScr1 = pScratch1;
00165 
00166     /* Fill (srcBLen - 1U) zeros at the start of scratch1. NOTE(review): srcBLen - 1U wraps if the shorter length is 0 - confirm callers never pass an empty sequence. */
00167     arm_fill_q15(0, pScr1, (srcBLen - 1U));
00168 
00169     /* Update temporary scratch pointer */
00170     pScr1 += (srcBLen - 1U);
00171 
00172     /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */
00173 
00174     /* Copy (srcALen) samples in scratch buffer */
00175     arm_copy_q15(pIn1, pScr1, srcALen);
00176 
00177     /* Update pointers */
00178     pScr1 += srcALen;
00179 
00180     /* Fill (srcBLen - 1U) zeros at end of scratch buffer */
00181     arm_fill_q15(0, pScr1, (srcBLen - 1U));
00182 
00183     /* Update pointer */
00184     pScr1 += (srcBLen - 1U);
00185 
00186     /* From here on pIn2 walks the reversed copy held in scratch2 */
00187     pIn2 = py;
00188     /* Start the sliding window in scratch1 at the first requested output sample */
00189     pScratch1 += firstIndex;
00190     /* Results are written starting at pDst[firstIndex] */
00191     pOut = pDst + firstIndex;
00192 
00193     /* Actual convolution process starts here: four output samples per outer iteration */
00194     blkCnt = (numPoints) >> 2;
00195 
00196     while (blkCnt > 0)
00197     {
00198       /* Initialize temporary scratch pointer as scratch1 */
00199       pScr1 = pScratch1;
00200 
00201       /* Clear Accumulators */
00202       acc0 = 0;
00203       acc1 = 0;
00204       acc2 = 0;
00205       acc3 = 0;
00206 
00207       /* Read two samples from scratch1 buffer */
00208       x1 = *__SIMD32(pScr1)++;
00209 
00210       /* Read next two samples from scratch1 buffer */
00211       x2 = *__SIMD32(pScr1)++;
00212 
00213       tapCnt = (srcBLen) >> 2U;
00214 
00215       while (tapCnt > 0U)
00216       {
00217 
00218         /* Read four samples from smaller buffer */
00219         y1 = _SIMD32_OFFSET(pIn2);
00220         y2 = _SIMD32_OFFSET(pIn2 + 2U);
00221 
00222         /* multiply and accumulate */
00223         acc0 = __SMLALD(x1, y1, acc0);
00224         acc2 = __SMLALD(x2, y1, acc2);
00225 
00226         /* pack input data: combine one halfword of x1 with one of x2 (operand order depends on endianness) */
00227 #ifndef ARM_MATH_BIG_ENDIAN
00228         x3 = __PKHBT(x2, x1, 0);
00229 #else
00230         x3 = __PKHBT(x1, x2, 0);
00231 #endif
00232 
00233         /* multiply and accumulate */
00234         acc1 = __SMLALDX(x3, y1, acc1);
00235 
00236         /* Read next two samples from scratch1 buffer */
00237         x1 = _SIMD32_OFFSET(pScr1);
00238 
00239         /* multiply and accumulate */
00240         acc0 = __SMLALD(x2, y2, acc0);
00241         acc2 = __SMLALD(x1, y2, acc2);
00242 
00243         /* pack input data: combine one halfword of x2 with one of the newly read x1 */
00244 #ifndef ARM_MATH_BIG_ENDIAN
00245         x3 = __PKHBT(x1, x2, 0);
00246 #else
00247         x3 = __PKHBT(x2, x1, 0);
00248 #endif
00249 
00250         acc3 = __SMLALDX(x3, y1, acc3);
00251         acc1 = __SMLALDX(x3, y2, acc1);
00252         /* Read the following two samples from scratch1 buffer */
00253         x2 = _SIMD32_OFFSET(pScr1 + 2U);
00254 
00255 #ifndef ARM_MATH_BIG_ENDIAN
00256         x3 = __PKHBT(x2, x1, 0);
00257 #else
00258         x3 = __PKHBT(x1, x2, 0);
00259 #endif
00260 
00261         acc3 = __SMLALDX(x3, y2, acc3);
00262 
00263         /* update scratch pointers */
00264         pIn2 += 4U;
00265         pScr1 += 4U;
00266 
00267 
00268         /* Decrement the loop counter */
00269         tapCnt--;
00270       }
00271 
00272       /* Step back over the four samples pre-read before the tap loop so pScr1 lines up with the next unprocessed tap */
00273       pScr1 -= 4U;
00274 
00275       /* Process the remaining (srcBLen & 3) taps one at a time for all four accumulators */
00276       tapCnt = (srcBLen) & 3U;
00277 
00278       while (tapCnt > 0U)
00279       {
00280         /* accumulate the results */
00281         acc0 += (*pScr1++ * *pIn2);
00282         acc1 += (*pScr1++ * *pIn2);
00283         acc2 += (*pScr1++ * *pIn2);
00284         acc3 += (*pScr1++ * *pIn2++);
00285         /* Four increments minus three: net advance of one sample per tap */
00286         pScr1 -= 3U;
00287 
00288         /* Decrement the loop counter */
00289         tapCnt--;
00290       }
00291 
00292       blkCnt--;
00293 
00294 
00295       /* Shift each accumulator right by 15, pass it through __SSAT(..., 16) and store the four results as two packed 32-bit writes. */
00296 
00297 #ifndef  ARM_MATH_BIG_ENDIAN
00298 
00299       *__SIMD32(pOut)++ =
00300         __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
00301       *__SIMD32(pOut)++ =
00302         __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
00303 
00304 #else
00305 
00306       *__SIMD32(pOut)++ =
00307         __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
00308       *__SIMD32(pOut)++ =
00309         __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
00310 
00311 #endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */
00312 
00313       /* Rewind pIn2 to the start of the reversed shorter sequence */
00314       pIn2 = py;
00315       /* Advance the window by the four outputs just produced */
00316       pScratch1 += 4U;
00317 
00318     }
00319 
00320 
00321     blkCnt = numPoints & 0x3;
00322 
00323     /* Compute the remaining (numPoints & 3) output samples one at a time */
00324     while (blkCnt > 0)
00325     {
00326       /* Initialize temporary scratch pointer as scratch1 */
00327       pScr1 = pScratch1;
00328 
00329       /* Clear Accumulator */
00330       acc0 = 0;
00331 
00332       tapCnt = (srcBLen) >> 1U;
00333 
00334       while (tapCnt > 0U)
00335       {
00336 
00337         /* Read next two samples from scratch1 buffer */
00338         x1 = *__SIMD32(pScr1)++;
00339 
00340         /* Read two samples from smaller buffer */
00341         y1 = *__SIMD32(pIn2)++;
00342 
00343         acc0 = __SMLALD(x1, y1, acc0);
00344 
00345         /* Decrement the loop counter */
00346         tapCnt--;
00347       }
00348 
00349       tapCnt = (srcBLen) & 1U;
00350 
00351       /* Handle the final tap when srcBLen is odd */
00352       while (tapCnt > 0U)
00353       {
00354 
00355         /* accumulate the results */
00356         acc0 += (*pScr1++ * *pIn2++);
00357 
00358         /* Decrement the loop counter */
00359         tapCnt--;
00360       }
00361 
00362       blkCnt--;
00363 
00364       /* Shift the accumulator right by 15, pass it through __SSAT(..., 16) and store it in the destination buffer. */
00365       *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));
00366 
00367       /* Rewind pIn2 to the start of the reversed shorter sequence */
00368       pIn2 = py;
00369       /* Advance the window by one output sample */
00370       pScratch1 += 1U;
00371 
00372     }
00373 
00374     /* set status as ARM_MATH_SUCCESS */
00375     status = ARM_MATH_SUCCESS;
00376 
00377   }
00378 
00379   /* Return to application */
00380   return (status);
00381 }
00382 
00383 #else
00384 
00385 arm_status arm_conv_partial_opt_q15(
00386   q15_t * pSrcA,
00387   uint32_t srcALen,
00388   q15_t * pSrcB,
00389   uint32_t srcBLen,
00390   q15_t * pDst,
00391   uint32_t firstIndex,
00392   uint32_t numPoints,
00393   q15_t * pScratch1,
00394   q15_t * pScratch2)
00395 {
00396   /* Variant compiled when UNALIGNED_SUPPORT_DISABLE is defined: computes numPoints convolution outputs starting at output index firstIndex, using individual q15_t loads and stores in the loops below. */
00397   q15_t *pOut = pDst;                            /* Output write pointer */
00398   q15_t *pScr1 = pScratch1;                      /* Working pointer into scratch1 */
00399   q15_t *pScr2 = pScratch2;                      /* Working pointer into scratch2 */
00400   q63_t acc0, acc1, acc2, acc3;                  /* Accumulators for four consecutive output samples */
00401   q15_t *pIn1;                                   /* Longer input sequence */
00402   q15_t *pIn2;                                   /* Shorter input sequence; later its reversed copy in scratch2 */
00403   q15_t *px;                                     /* Read pointer over the shorter sequence while it is reversed */
00404   q15_t *py;                                     /* Start of scratch2, used to rewind pIn2 */
00405   uint32_t j, k, blkCnt;                         /* Swap temporary and loop counters */
00406   arm_status status;                             /* Status variable */
00407   uint32_t tapCnt;                               /* Tap loop counter */
00408   q15_t x10, x11, x20, x21;                      /* Samples read from scratch1 */
00409   q15_t y10, y11;                                /* Samples read from scratch2 */
00410 
00411 
00412   /* Reject requests that extend past the last output sample (index srcALen + srcBLen - 2). NOTE(review): firstIndex + numPoints is evaluated in uint32_t and could wrap for very large arguments - confirm callers cannot pass such values. */
00413   if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
00414   {
00415     /* Set status as ARM_MATH_ARGUMENT_ERROR */
00416     status = ARM_MATH_ARGUMENT_ERROR;
00417   }
00418   else
00419   {
00420 
00421     /* The algorithm implementation is based on the lengths of the inputs. */
00422     /* srcB is always made to slide across srcA. */
00423     /* So srcBLen is always treated as shorter than or equal to srcALen */
00424     if (srcALen >= srcBLen)
00425     {
00426       /* pIn1 takes the longer sequence (srcA) */
00427       pIn1 = pSrcA;
00428 
00429       /* pIn2 takes the shorter sequence (srcB) */
00430       pIn2 = pSrcB;
00431     }
00432     else
00433     {
00434       /* pIn1 takes the longer sequence (srcB) */
00435       pIn1 = pSrcB;
00436 
00437       /* pIn2 takes the shorter sequence (srcA) */
00438       pIn2 = pSrcA;
00439 
00440       /* Swap the lengths so that srcBLen is the shorter one from here on */
00441       j = srcBLen;
00442       srcBLen = srcALen;
00443       srcALen = j;
00444     }
00445 
00446     /* Remember the start of scratch2 so pIn2 can be rewound to it later */
00447     py = pScratch2;
00448 
00449     /* Point at the last element (index srcBLen - 1) of scratch2; the reversed copy is written back-to-front */
00450     pScr2 = pScratch2 + srcBLen - 1;
00451 
00452     /* points to smaller length sequence */
00453     px = pIn2;
00454 
00455     /* Apply loop unrolling and do 4 copies per iteration. */
00456     k = srcBLen >> 2U;
00457 
00458     /* First part of the processing with loop unrolling copies 4 data points at a time.
00459      ** A second loop below copies the remaining 1 to 3 samples. */
00460     while (k > 0U)
00461     {
00462       /* Copy the shorter sequence into scratch2 in reversed order */
00463       *pScr2-- = *px++;
00464       *pScr2-- = *px++;
00465       *pScr2-- = *px++;
00466       *pScr2-- = *px++;
00467 
00468       /* Decrement the loop counter */
00469       k--;
00470     }
00471 
00472     /* If the count is not a multiple of 4, copy remaining samples here.
00473      ** No loop unrolling is used. */
00474     k = srcBLen % 0x4U;
00475 
00476     while (k > 0U)
00477     {
00478       /* Reversed copy of the remaining samples */
00479       *pScr2-- = *px++;
00480 
00481       /* Decrement the loop counter */
00482       k--;
00483     }
00484 
00485     /* Initialize temporary scratch pointer */
00486     pScr1 = pScratch1;
00487 
00488     /* Fill (srcBLen - 1U) zeros at the start of scratch1. NOTE(review): srcBLen - 1U wraps if the shorter length is 0 - confirm callers never pass an empty sequence. */
00489     arm_fill_q15(0, pScr1, (srcBLen - 1U));
00490 
00491     /* Update temporary scratch pointer */
00492     pScr1 += (srcBLen - 1U);
00493 
00494     /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */
00495 
00496 
00497     /* Apply loop unrolling and do 4 copies per iteration. */
00498     k = srcALen >> 2U;
00499 
00500     /* First part of the processing with loop unrolling copies 4 data points at a time.
00501      ** A second loop below copies the remaining 1 to 3 samples. */
00502     while (k > 0U)
00503     {
00504       /* Copy the longer sequence into scratch1 in forward order */
00505       *pScr1++ = *pIn1++;
00506       *pScr1++ = *pIn1++;
00507       *pScr1++ = *pIn1++;
00508       *pScr1++ = *pIn1++;
00509 
00510       /* Decrement the loop counter */
00511       k--;
00512     }
00513 
00514     /* If the count is not a multiple of 4, copy remaining samples here.
00515      ** No loop unrolling is used. */
00516     k = srcALen % 0x4U;
00517 
00518     while (k > 0U)
00519     {
00520       /* Forward copy of the remaining samples */
00521       *pScr1++ = *pIn1++;
00522 
00523       /* Decrement the loop counter */
00524       k--;
00525     }
00526 
00527 
00528     /* Zero-fill the trailing (srcBLen - 1U) samples of scratch1, four per iteration. */
00529     k = (srcBLen - 1U) >> 2U;
00530 
00531     /* First part of the processing with loop unrolling writes 4 zeros at a time.
00532      ** A second loop below writes the remaining 1 to 3 zeros. */
00533     while (k > 0U)
00534     {
00535       /* Write four zeros */
00536       *pScr1++ = 0;
00537       *pScr1++ = 0;
00538       *pScr1++ = 0;
00539       *pScr1++ = 0;
00540 
00541       /* Decrement the loop counter */
00542       k--;
00543     }
00544 
00545     /* If the count is not a multiple of 4, write the remaining zeros here.
00546      ** No loop unrolling is used. */
00547     k = (srcBLen - 1U) % 0x4U;
00548 
00549     while (k > 0U)
00550     {
00551       /* Write one zero */
00552       *pScr1++ = 0;
00553 
00554       /* Decrement the loop counter */
00555       k--;
00556     }
00557 
00558 
00559     /* From here on pIn2 walks the reversed copy held in scratch2 */
00560     pIn2 = py;
00561     /* Start the sliding window in scratch1 at the first requested output sample */
00562     pScratch1 += firstIndex;
00563     /* Results are written starting at pDst[firstIndex] */
00564     pOut = pDst + firstIndex;
00565 
00566     /* Actual convolution process starts here: four output samples per outer iteration */
00567     blkCnt = (numPoints) >> 2;
00568 
00569     while (blkCnt > 0)
00570     {
00571       /* Initialize temporary scratch pointer as scratch1 */
00572       pScr1 = pScratch1;
00573 
00574       /* Clear Accumulators */
00575       acc0 = 0;
00576       acc1 = 0;
00577       acc2 = 0;
00578       acc3 = 0;
00579 
00580       /* Read two samples from scratch1 buffer */
00581       x10 = *pScr1++;
00582       x11 = *pScr1++;
00583 
00584       /* Read next two samples from scratch1 buffer */
00585       x20 = *pScr1++;
00586       x21 = *pScr1++;
00587 
00588       tapCnt = (srcBLen) >> 2U;
00589 
00590       while (tapCnt > 0U)
00591       {
00592 
00593         /* Read two samples from smaller buffer */
00594         y10 = *pIn2;
00595         y11 = *(pIn2 + 1U);
00596 
00597         /* multiply and accumulate */
00598         acc0 += (q63_t) x10 *y10;
00599         acc0 += (q63_t) x11 *y11;
00600         acc2 += (q63_t) x20 *y10;
00601         acc2 += (q63_t) x21 *y11;
00602 
00603         /* multiply and accumulate */
00604         acc1 += (q63_t) x11 *y10;
00605         acc1 += (q63_t) x20 *y11;
00606 
00607         /* Read next two samples from scratch1 buffer */
00608         x10 = *pScr1;
00609         x11 = *(pScr1 + 1U);
00610 
00611         /* multiply and accumulate */
00612         acc3 += (q63_t) x21 *y10;
00613         acc3 += (q63_t) x10 *y11;
00614 
00615         /* Read next two samples from scratch2 buffer */
00616         y10 = *(pIn2 + 2U);
00617         y11 = *(pIn2 + 3U);
00618 
00619         /* multiply and accumulate */
00620         acc0 += (q63_t) x20 *y10;
00621         acc0 += (q63_t) x21 *y11;
00622         acc2 += (q63_t) x10 *y10;
00623         acc2 += (q63_t) x11 *y11;
00624         acc1 += (q63_t) x21 *y10;
00625         acc1 += (q63_t) x10 *y11;
00626 
00627         /* Read next two samples from scratch1 buffer */
00628         x20 = *(pScr1 + 2);
00629         x21 = *(pScr1 + 3);
00630 
00631         /* multiply and accumulate */
00632         acc3 += (q63_t) x11 *y10;
00633         acc3 += (q63_t) x20 *y11;
00634 
00635         /* update scratch pointers */
00636         pIn2 += 4U;
00637         pScr1 += 4U;
00638 
00639         /* Decrement the loop counter */
00640         tapCnt--;
00641       }
00642 
00643       /* Step back over the four samples pre-read before the tap loop so pScr1 lines up with the next unprocessed tap */
00644       pScr1 -= 4U;
00645 
00646       /* Process the remaining (srcBLen & 3) taps one at a time for all four accumulators */
00647       tapCnt = (srcBLen) & 3U;
00648 
00649       while (tapCnt > 0U)
00650       {
00651         /* accumulate the results */
00652         acc0 += (*pScr1++ * *pIn2);
00653         acc1 += (*pScr1++ * *pIn2);
00654         acc2 += (*pScr1++ * *pIn2);
00655         acc3 += (*pScr1++ * *pIn2++);
00656         /* Four increments minus three: net advance of one sample per tap */
00657         pScr1 -= 3U;
00658 
00659         /* Decrement the loop counter */
00660         tapCnt--;
00661       }
00662 
00663       blkCnt--;
00664 
00665 
00666       /* Shift each accumulator right by 15, pass it through __SSAT(..., 16) and store the four results in the destination buffer. */
00667       *pOut++ = __SSAT((acc0 >> 15), 16);
00668       *pOut++ = __SSAT((acc1 >> 15), 16);
00669       *pOut++ = __SSAT((acc2 >> 15), 16);
00670       *pOut++ = __SSAT((acc3 >> 15), 16);
00671 
00672 
00673       /* Rewind pIn2 to the start of the reversed shorter sequence */
00674       pIn2 = py;
00675       /* Advance the window by the four outputs just produced */
00676       pScratch1 += 4U;
00677 
00678     }
00679 
00680 
00681     blkCnt = numPoints & 0x3;
00682 
00683     /* Compute the remaining (numPoints & 3) output samples one at a time */
00684     while (blkCnt > 0)
00685     {
00686       /* Initialize temporary scratch pointer as scratch1 */
00687       pScr1 = pScratch1;
00688 
00689       /* Clear Accumulator */
00690       acc0 = 0;
00691 
00692       tapCnt = (srcBLen) >> 1U;
00693 
00694       while (tapCnt > 0U)
00695       {
00696 
00697         /* Read next two samples from scratch1 buffer */
00698         x10 = *pScr1++;
00699         x11 = *pScr1++;
00700 
00701         /* Read two samples from smaller buffer */
00702         y10 = *pIn2++;
00703         y11 = *pIn2++;
00704 
00705         /* multiply and accumulate */
00706         acc0 += (q63_t) x10 *y10;
00707         acc0 += (q63_t) x11 *y11;
00708 
00709         /* Decrement the loop counter */
00710         tapCnt--;
00711       }
00712 
00713       tapCnt = (srcBLen) & 1U;
00714 
00715       /* Handle the final tap when srcBLen is odd */
00716       while (tapCnt > 0U)
00717       {
00718 
00719         /* accumulate the results */
00720         acc0 += (*pScr1++ * *pIn2++);
00721 
00722         /* Decrement the loop counter */
00723         tapCnt--;
00724       }
00725 
00726       blkCnt--;
00727 
00728       /* Shift the accumulator right by 15, pass it through __SSAT(..., 16) and store it in the destination buffer. */
00729       *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));
00730 
00731 
00732       /* Rewind pIn2 to the start of the reversed shorter sequence */
00733       pIn2 = py;
00734       /* Advance the window by one output sample */
00735       pScratch1 += 1U;
00736 
00737     }
00738 
00739     /* set status as ARM_MATH_SUCCESS */
00740     status = ARM_MATH_SUCCESS;
00741 
00742   }
00743 
00744   /* Return to application */
00745   return (status);
00746 }
00747 
00748 #endif  /*  #ifndef UNALIGNED_SUPPORT_DISABLE   */
00749 
00750 
00751 /**
00752  * @} end of PartialConv group
00753  */
00754