Added CMSIS5 DSP and NN folder. Needs some work.

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_partial_opt_q7.c Source File

arm_conv_partial_opt_q7.c

00001 /* ----------------------------------------------------------------------
00002  * Project:      CMSIS DSP Library
00003  * Title:        arm_conv_partial_opt_q7.c
00004  * Description:  Partial convolution of Q7 sequences
00005  *
00006  * $Date:        27. January 2017
00007  * $Revision:    V.1.5.1
00008  *
00009  * Target Processor: Cortex-M cores
00010  * -------------------------------------------------------------------- */
00011 /*
00012  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
00013  *
00014  * SPDX-License-Identifier: Apache-2.0
00015  *
00016  * Licensed under the Apache License, Version 2.0 (the License); you may
00017  * not use this file except in compliance with the License.
00018  * You may obtain a copy of the License at
00019  *
00020  * www.apache.org/licenses/LICENSE-2.0
00021  *
00022  * Unless required by applicable law or agreed to in writing, software
00023  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
00024  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00025  * See the License for the specific language governing permissions and
00026  * limitations under the License.
00027  */
00028 
00029 #include "arm_math.h"
00030 
00031 /**
00032  * @ingroup groupFilters
00033  */
00034 
00035 /**
00036  * @addtogroup PartialConv
00037  * @{
00038  */
00039 
00040 /**
00041  * @brief Partial convolution of Q7 sequences.
00042  * @param[in]       *pSrcA points to the first input sequence.
00043  * @param[in]       srcALen length of the first input sequence.
00044  * @param[in]       *pSrcB points to the second input sequence.
00045  * @param[in]       srcBLen length of the second input sequence.
00046  * @param[out]      *pDst points to the location where the output result is written.
00047  * @param[in]       firstIndex is the first output sample to start with.
00048  * @param[in]       numPoints is the number of output points to be computed.
00049  * @param[in]      *pScratch1 points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
00050  * @param[in]      *pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
00051  * @return  Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
00052  *
00053  * \par Restrictions
00054  *  If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE
00055  *  In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit
00056  *
00057  *
00058  *
00059  */
00060 
00061 
00062 #ifndef UNALIGNED_SUPPORT_DISABLE
00063 
00064 arm_status arm_conv_partial_opt_q7(
00065   q7_t * pSrcA,
00066   uint32_t srcALen,
00067   q7_t * pSrcB,
00068   uint32_t srcBLen,
00069   q7_t * pDst,
00070   uint32_t firstIndex,
00071   uint32_t numPoints,
00072   q15_t * pScratch1,
00073   q15_t * pScratch2)
00074 {
00075 
00076   q15_t *pScr2, *pScr1;                          /* Intermediate pointers for scratch pointers */
00077   q15_t x4;                                      /* Temporary input variable */
00078   q7_t *pIn1, *pIn2;                             /* inputA and inputB pointer */
00079   uint32_t j, k, blkCnt, tapCnt;                 /* loop counter */
00080   q7_t *px;                                      /* Temporary input1 pointer */
00081   q15_t *py;                                     /* Temporary input2 pointer */
00082   q31_t acc0, acc1, acc2, acc3;                  /* Accumulator */
00083   q31_t x1, x2, x3, y1;                          /* Temporary input variables */
00084   arm_status status;
00085   q7_t *pOut = pDst;                             /* output pointer */
00086   q7_t out0, out1, out2, out3;                   /* temporary variables */
00087 
00088   /* Check for range of output samples to be calculated */
00089   if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
00090   {
00091     /* Set status as ARM_MATH_ARGUMENT_ERROR */
00092     status = ARM_MATH_ARGUMENT_ERROR;
00093   }
00094   else
00095   {
00096 
00097     /* The algorithm implementation is based on the lengths of the inputs. */
00098     /* srcB is always made to slide across srcA. */
00099     /* So srcBLen is always considered as shorter or equal to srcALen */
00100     if (srcALen >= srcBLen)
00101     {
00102       /* Initialization of inputA pointer */
00103       pIn1 = pSrcA;
00104 
00105       /* Initialization of inputB pointer */
00106       pIn2 = pSrcB;
00107     }
00108     else
00109     {
00110       /* Initialization of inputA pointer */
00111       pIn1 = pSrcB;
00112 
00113       /* Initialization of inputB pointer */
00114       pIn2 = pSrcA;
00115 
00116       /* srcBLen is always considered as shorter or equal to srcALen */
00117       j = srcBLen;
00118       srcBLen = srcALen;
00119       srcALen = j;
00120     }
00121 
00122     /* pointer to take end of scratch2 buffer */
00123     pScr2 = pScratch2;
00124 
00125     /* points to smaller length sequence */
00126     px = pIn2 + srcBLen - 1;
00127 
00128     /* Apply loop unrolling and do 4 Copies simultaneously. */
00129     k = srcBLen >> 2U;
00130 
00131     /* First part of the processing with loop unrolling copies 4 data points at a time.
00132      ** a second loop below copies for the remaining 1 to 3 samples. */
00133     while (k > 0U)
00134     {
00135       /* copy second buffer in reversal manner */
00136       x4 = (q15_t) * px--;
00137       *pScr2++ = x4;
00138       x4 = (q15_t) * px--;
00139       *pScr2++ = x4;
00140       x4 = (q15_t) * px--;
00141       *pScr2++ = x4;
00142       x4 = (q15_t) * px--;
00143       *pScr2++ = x4;
00144 
00145       /* Decrement the loop counter */
00146       k--;
00147     }
00148 
00149     /* If the count is not a multiple of 4, copy remaining samples here.
00150      ** No loop unrolling is used. */
00151     k = srcBLen % 0x4U;
00152 
00153     while (k > 0U)
00154     {
00155       /* copy second buffer in reversal manner for remaining samples */
00156       x4 = (q15_t) * px--;
00157       *pScr2++ = x4;
00158 
00159       /* Decrement the loop counter */
00160       k--;
00161     }
00162 
00163     /* Initialze temporary scratch pointer */
00164     pScr1 = pScratch1;
00165 
00166     /* Fill (srcBLen - 1U) zeros in scratch buffer */
00167     arm_fill_q15(0, pScr1, (srcBLen - 1U));
00168 
00169     /* Update temporary scratch pointer */
00170     pScr1 += (srcBLen - 1U);
00171 
00172     /* Copy (srcALen) samples in scratch buffer */
00173     /* Apply loop unrolling and do 4 Copies simultaneously. */
00174     k = srcALen >> 2U;
00175 
00176     /* First part of the processing with loop unrolling copies 4 data points at a time.
00177      ** a second loop below copies for the remaining 1 to 3 samples. */
00178     while (k > 0U)
00179     {
00180       /* copy second buffer in reversal manner */
00181       x4 = (q15_t) * pIn1++;
00182       *pScr1++ = x4;
00183       x4 = (q15_t) * pIn1++;
00184       *pScr1++ = x4;
00185       x4 = (q15_t) * pIn1++;
00186       *pScr1++ = x4;
00187       x4 = (q15_t) * pIn1++;
00188       *pScr1++ = x4;
00189 
00190       /* Decrement the loop counter */
00191       k--;
00192     }
00193 
00194     /* If the count is not a multiple of 4, copy remaining samples here.
00195      ** No loop unrolling is used. */
00196     k = srcALen % 0x4U;
00197 
00198     while (k > 0U)
00199     {
00200       /* copy second buffer in reversal manner for remaining samples */
00201       x4 = (q15_t) * pIn1++;
00202       *pScr1++ = x4;
00203 
00204       /* Decrement the loop counter */
00205       k--;
00206     }
00207 
00208     /* Fill (srcBLen - 1U) zeros at end of scratch buffer */
00209     arm_fill_q15(0, pScr1, (srcBLen - 1U));
00210 
00211     /* Update pointer */
00212     pScr1 += (srcBLen - 1U);
00213 
00214 
00215     /* Temporary pointer for scratch2 */
00216     py = pScratch2;
00217 
00218     /* Initialization of pIn2 pointer */
00219     pIn2 = (q7_t *) py;
00220 
00221     pScr2 = py;
00222 
00223     pOut = pDst + firstIndex;
00224 
00225     pScratch1 += firstIndex;
00226 
00227     /* Actual convolution process starts here */
00228     blkCnt = (numPoints) >> 2;
00229 
00230 
00231     while (blkCnt > 0)
00232     {
00233       /* Initialze temporary scratch pointer as scratch1 */
00234       pScr1 = pScratch1;
00235 
00236       /* Clear Accumlators */
00237       acc0 = 0;
00238       acc1 = 0;
00239       acc2 = 0;
00240       acc3 = 0;
00241 
00242       /* Read two samples from scratch1 buffer */
00243       x1 = *__SIMD32(pScr1)++;
00244 
00245       /* Read next two samples from scratch1 buffer */
00246       x2 = *__SIMD32(pScr1)++;
00247 
00248       tapCnt = (srcBLen) >> 2U;
00249 
00250       while (tapCnt > 0U)
00251       {
00252 
00253         /* Read four samples from smaller buffer */
00254         y1 = _SIMD32_OFFSET(pScr2);
00255 
00256         /* multiply and accumlate */
00257         acc0 = __SMLAD(x1, y1, acc0);
00258         acc2 = __SMLAD(x2, y1, acc2);
00259 
00260         /* pack input data */
00261 #ifndef ARM_MATH_BIG_ENDIAN
00262         x3 = __PKHBT(x2, x1, 0);
00263 #else
00264         x3 = __PKHBT(x1, x2, 0);
00265 #endif
00266 
00267         /* multiply and accumlate */
00268         acc1 = __SMLADX(x3, y1, acc1);
00269 
00270         /* Read next two samples from scratch1 buffer */
00271         x1 = *__SIMD32(pScr1)++;
00272 
00273         /* pack input data */
00274 #ifndef ARM_MATH_BIG_ENDIAN
00275         x3 = __PKHBT(x1, x2, 0);
00276 #else
00277         x3 = __PKHBT(x2, x1, 0);
00278 #endif
00279 
00280         acc3 = __SMLADX(x3, y1, acc3);
00281 
00282         /* Read four samples from smaller buffer */
00283         y1 = _SIMD32_OFFSET(pScr2 + 2U);
00284 
00285         acc0 = __SMLAD(x2, y1, acc0);
00286 
00287         acc2 = __SMLAD(x1, y1, acc2);
00288 
00289         acc1 = __SMLADX(x3, y1, acc1);
00290 
00291         x2 = *__SIMD32(pScr1)++;
00292 
00293 #ifndef ARM_MATH_BIG_ENDIAN
00294         x3 = __PKHBT(x2, x1, 0);
00295 #else
00296         x3 = __PKHBT(x1, x2, 0);
00297 #endif
00298 
00299         acc3 = __SMLADX(x3, y1, acc3);
00300 
00301         pScr2 += 4U;
00302 
00303 
00304         /* Decrement the loop counter */
00305         tapCnt--;
00306       }
00307 
00308 
00309 
00310       /* Update scratch pointer for remaining samples of smaller length sequence */
00311       pScr1 -= 4U;
00312 
00313 
00314       /* apply same above for remaining samples of smaller length sequence */
00315       tapCnt = (srcBLen) & 3U;
00316 
00317       while (tapCnt > 0U)
00318       {
00319 
00320         /* accumlate the results */
00321         acc0 += (*pScr1++ * *pScr2);
00322         acc1 += (*pScr1++ * *pScr2);
00323         acc2 += (*pScr1++ * *pScr2);
00324         acc3 += (*pScr1++ * *pScr2++);
00325 
00326         pScr1 -= 3U;
00327 
00328         /* Decrement the loop counter */
00329         tapCnt--;
00330       }
00331 
00332       blkCnt--;
00333 
00334       /* Store the result in the accumulator in the destination buffer. */
00335       out0 = (q7_t) (__SSAT(acc0 >> 7U, 8));
00336       out1 = (q7_t) (__SSAT(acc1 >> 7U, 8));
00337       out2 = (q7_t) (__SSAT(acc2 >> 7U, 8));
00338       out3 = (q7_t) (__SSAT(acc3 >> 7U, 8));
00339 
00340       *__SIMD32(pOut)++ = __PACKq7(out0, out1, out2, out3);
00341 
00342       /* Initialization of inputB pointer */
00343       pScr2 = py;
00344 
00345       pScratch1 += 4U;
00346 
00347     }
00348 
00349     blkCnt = (numPoints) & 0x3;
00350 
00351     /* Calculate convolution for remaining samples of Bigger length sequence */
00352     while (blkCnt > 0)
00353     {
00354       /* Initialze temporary scratch pointer as scratch1 */
00355       pScr1 = pScratch1;
00356 
00357       /* Clear Accumlators */
00358       acc0 = 0;
00359 
00360       tapCnt = (srcBLen) >> 1U;
00361 
00362       while (tapCnt > 0U)
00363       {
00364 
00365         /* Read next two samples from scratch1 buffer */
00366         x1 = *__SIMD32(pScr1)++;
00367 
00368         /* Read two samples from smaller buffer */
00369         y1 = *__SIMD32(pScr2)++;
00370 
00371         acc0 = __SMLAD(x1, y1, acc0);
00372 
00373         /* Decrement the loop counter */
00374         tapCnt--;
00375       }
00376 
00377       tapCnt = (srcBLen) & 1U;
00378 
00379       /* apply same above for remaining samples of smaller length sequence */
00380       while (tapCnt > 0U)
00381       {
00382 
00383         /* accumlate the results */
00384         acc0 += (*pScr1++ * *pScr2++);
00385 
00386         /* Decrement the loop counter */
00387         tapCnt--;
00388       }
00389 
00390       blkCnt--;
00391 
00392       /* Store the result in the accumulator in the destination buffer. */
00393       *pOut++ = (q7_t) (__SSAT(acc0 >> 7U, 8));
00394 
00395       /* Initialization of inputB pointer */
00396       pScr2 = py;
00397 
00398       pScratch1 += 1U;
00399 
00400     }
00401 
00402     /* set status as ARM_MATH_SUCCESS */
00403     status = ARM_MATH_SUCCESS;
00404 
00405 
00406   }
00407 
00408   return (status);
00409 
00410 }
00411 
00412 #else
00413 
00414 arm_status arm_conv_partial_opt_q7(
00415   q7_t * pSrcA,
00416   uint32_t srcALen,
00417   q7_t * pSrcB,
00418   uint32_t srcBLen,
00419   q7_t * pDst,
00420   uint32_t firstIndex,
00421   uint32_t numPoints,
00422   q15_t * pScratch1,
00423   q15_t * pScratch2)
00424 {
00425 
00426   q15_t *pScr2, *pScr1;                          /* Intermediate pointers for scratch pointers */
00427   q15_t x4;                                      /* Temporary input variable */
00428   q7_t *pIn1, *pIn2;                             /* inputA and inputB pointer */
00429   uint32_t j, k, blkCnt, tapCnt;                 /* loop counter */
00430   q7_t *px;                                      /* Temporary input1 pointer */
00431   q15_t *py;                                     /* Temporary input2 pointer */
00432   q31_t acc0, acc1, acc2, acc3;                  /* Accumulator */
00433   arm_status status;
00434   q7_t *pOut = pDst;                             /* output pointer */
00435   q15_t x10, x11, x20, x21;                      /* Temporary input variables */
00436   q15_t y10, y11;                                /* Temporary input variables */
00437 
00438   /* Check for range of output samples to be calculated */
00439   if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
00440   {
00441     /* Set status as ARM_MATH_ARGUMENT_ERROR */
00442     status = ARM_MATH_ARGUMENT_ERROR;
00443   }
00444   else
00445   {
00446 
00447     /* The algorithm implementation is based on the lengths of the inputs. */
00448     /* srcB is always made to slide across srcA. */
00449     /* So srcBLen is always considered as shorter or equal to srcALen */
00450     if (srcALen >= srcBLen)
00451     {
00452       /* Initialization of inputA pointer */
00453       pIn1 = pSrcA;
00454 
00455       /* Initialization of inputB pointer */
00456       pIn2 = pSrcB;
00457     }
00458     else
00459     {
00460       /* Initialization of inputA pointer */
00461       pIn1 = pSrcB;
00462 
00463       /* Initialization of inputB pointer */
00464       pIn2 = pSrcA;
00465 
00466       /* srcBLen is always considered as shorter or equal to srcALen */
00467       j = srcBLen;
00468       srcBLen = srcALen;
00469       srcALen = j;
00470     }
00471 
00472     /* pointer to take end of scratch2 buffer */
00473     pScr2 = pScratch2;
00474 
00475     /* points to smaller length sequence */
00476     px = pIn2 + srcBLen - 1;
00477 
00478     /* Apply loop unrolling and do 4 Copies simultaneously. */
00479     k = srcBLen >> 2U;
00480 
00481     /* First part of the processing with loop unrolling copies 4 data points at a time.
00482      ** a second loop below copies for the remaining 1 to 3 samples. */
00483     while (k > 0U)
00484     {
00485       /* copy second buffer in reversal manner */
00486       x4 = (q15_t) * px--;
00487       *pScr2++ = x4;
00488       x4 = (q15_t) * px--;
00489       *pScr2++ = x4;
00490       x4 = (q15_t) * px--;
00491       *pScr2++ = x4;
00492       x4 = (q15_t) * px--;
00493       *pScr2++ = x4;
00494 
00495       /* Decrement the loop counter */
00496       k--;
00497     }
00498 
00499     /* If the count is not a multiple of 4, copy remaining samples here.
00500      ** No loop unrolling is used. */
00501     k = srcBLen % 0x4U;
00502 
00503     while (k > 0U)
00504     {
00505       /* copy second buffer in reversal manner for remaining samples */
00506       x4 = (q15_t) * px--;
00507       *pScr2++ = x4;
00508 
00509       /* Decrement the loop counter */
00510       k--;
00511     }
00512 
00513     /* Initialze temporary scratch pointer */
00514     pScr1 = pScratch1;
00515 
00516     /* Fill (srcBLen - 1U) zeros in scratch buffer */
00517     arm_fill_q15(0, pScr1, (srcBLen - 1U));
00518 
00519     /* Update temporary scratch pointer */
00520     pScr1 += (srcBLen - 1U);
00521 
00522     /* Copy (srcALen) samples in scratch buffer */
00523     /* Apply loop unrolling and do 4 Copies simultaneously. */
00524     k = srcALen >> 2U;
00525 
00526     /* First part of the processing with loop unrolling copies 4 data points at a time.
00527      ** a second loop below copies for the remaining 1 to 3 samples. */
00528     while (k > 0U)
00529     {
00530       /* copy second buffer in reversal manner */
00531       x4 = (q15_t) * pIn1++;
00532       *pScr1++ = x4;
00533       x4 = (q15_t) * pIn1++;
00534       *pScr1++ = x4;
00535       x4 = (q15_t) * pIn1++;
00536       *pScr1++ = x4;
00537       x4 = (q15_t) * pIn1++;
00538       *pScr1++ = x4;
00539 
00540       /* Decrement the loop counter */
00541       k--;
00542     }
00543 
00544     /* If the count is not a multiple of 4, copy remaining samples here.
00545      ** No loop unrolling is used. */
00546     k = srcALen % 0x4U;
00547 
00548     while (k > 0U)
00549     {
00550       /* copy second buffer in reversal manner for remaining samples */
00551       x4 = (q15_t) * pIn1++;
00552       *pScr1++ = x4;
00553 
00554       /* Decrement the loop counter */
00555       k--;
00556     }
00557 
00558     /* Apply loop unrolling and do 4 Copies simultaneously. */
00559     k = (srcBLen - 1U) >> 2U;
00560 
00561     /* First part of the processing with loop unrolling copies 4 data points at a time.
00562      ** a second loop below copies for the remaining 1 to 3 samples. */
00563     while (k > 0U)
00564     {
00565       /* copy second buffer in reversal manner */
00566       *pScr1++ = 0;
00567       *pScr1++ = 0;
00568       *pScr1++ = 0;
00569       *pScr1++ = 0;
00570 
00571       /* Decrement the loop counter */
00572       k--;
00573     }
00574 
00575     /* If the count is not a multiple of 4, copy remaining samples here.
00576      ** No loop unrolling is used. */
00577     k = (srcBLen - 1U) % 0x4U;
00578 
00579     while (k > 0U)
00580     {
00581       /* copy second buffer in reversal manner for remaining samples */
00582       *pScr1++ = 0;
00583 
00584       /* Decrement the loop counter */
00585       k--;
00586     }
00587 
00588 
00589     /* Temporary pointer for scratch2 */
00590     py = pScratch2;
00591 
00592     /* Initialization of pIn2 pointer */
00593     pIn2 = (q7_t *) py;
00594 
00595     pScr2 = py;
00596 
00597     pOut = pDst + firstIndex;
00598 
00599     pScratch1 += firstIndex;
00600 
00601     /* Actual convolution process starts here */
00602     blkCnt = (numPoints) >> 2;
00603 
00604 
00605     while (blkCnt > 0)
00606     {
00607       /* Initialze temporary scratch pointer as scratch1 */
00608       pScr1 = pScratch1;
00609 
00610       /* Clear Accumlators */
00611       acc0 = 0;
00612       acc1 = 0;
00613       acc2 = 0;
00614       acc3 = 0;
00615 
00616       /* Read two samples from scratch1 buffer */
00617       x10 = *pScr1++;
00618       x11 = *pScr1++;
00619 
00620       /* Read next two samples from scratch1 buffer */
00621       x20 = *pScr1++;
00622       x21 = *pScr1++;
00623 
00624       tapCnt = (srcBLen) >> 2U;
00625 
00626       while (tapCnt > 0U)
00627       {
00628 
00629         /* Read four samples from smaller buffer */
00630         y10 = *pScr2;
00631         y11 = *(pScr2 + 1U);
00632 
00633         /* multiply and accumlate */
00634         acc0 += (q31_t) x10 *y10;
00635         acc0 += (q31_t) x11 *y11;
00636         acc2 += (q31_t) x20 *y10;
00637         acc2 += (q31_t) x21 *y11;
00638 
00639 
00640         acc1 += (q31_t) x11 *y10;
00641         acc1 += (q31_t) x20 *y11;
00642 
00643         /* Read next two samples from scratch1 buffer */
00644         x10 = *pScr1;
00645         x11 = *(pScr1 + 1U);
00646 
00647         /* multiply and accumlate */
00648         acc3 += (q31_t) x21 *y10;
00649         acc3 += (q31_t) x10 *y11;
00650 
00651         /* Read next two samples from scratch2 buffer */
00652         y10 = *(pScr2 + 2U);
00653         y11 = *(pScr2 + 3U);
00654 
00655         /* multiply and accumlate */
00656         acc0 += (q31_t) x20 *y10;
00657         acc0 += (q31_t) x21 *y11;
00658         acc2 += (q31_t) x10 *y10;
00659         acc2 += (q31_t) x11 *y11;
00660         acc1 += (q31_t) x21 *y10;
00661         acc1 += (q31_t) x10 *y11;
00662 
00663         /* Read next two samples from scratch1 buffer */
00664         x20 = *(pScr1 + 2);
00665         x21 = *(pScr1 + 3);
00666 
00667         /* multiply and accumlate */
00668         acc3 += (q31_t) x11 *y10;
00669         acc3 += (q31_t) x20 *y11;
00670 
00671         /* update scratch pointers */
00672 
00673         pScr1 += 4U;
00674         pScr2 += 4U;
00675 
00676         /* Decrement the loop counter */
00677         tapCnt--;
00678       }
00679 
00680 
00681 
00682       /* Update scratch pointer for remaining samples of smaller length sequence */
00683       pScr1 -= 4U;
00684 
00685 
00686       /* apply same above for remaining samples of smaller length sequence */
00687       tapCnt = (srcBLen) & 3U;
00688 
00689       while (tapCnt > 0U)
00690       {
00691 
00692         /* accumlate the results */
00693         acc0 += (*pScr1++ * *pScr2);
00694         acc1 += (*pScr1++ * *pScr2);
00695         acc2 += (*pScr1++ * *pScr2);
00696         acc3 += (*pScr1++ * *pScr2++);
00697 
00698         pScr1 -= 3U;
00699 
00700         /* Decrement the loop counter */
00701         tapCnt--;
00702       }
00703 
00704       blkCnt--;
00705 
00706       /* Store the result in the accumulator in the destination buffer. */
00707       *pOut++ = (q7_t) (__SSAT(acc0 >> 7U, 8));
00708       *pOut++ = (q7_t) (__SSAT(acc1 >> 7U, 8));
00709       *pOut++ = (q7_t) (__SSAT(acc2 >> 7U, 8));
00710       *pOut++ = (q7_t) (__SSAT(acc3 >> 7U, 8));
00711 
00712       /* Initialization of inputB pointer */
00713       pScr2 = py;
00714 
00715       pScratch1 += 4U;
00716 
00717     }
00718 
00719     blkCnt = (numPoints) & 0x3;
00720 
00721     /* Calculate convolution for remaining samples of Bigger length sequence */
00722     while (blkCnt > 0)
00723     {
00724       /* Initialze temporary scratch pointer as scratch1 */
00725       pScr1 = pScratch1;
00726 
00727       /* Clear Accumlators */
00728       acc0 = 0;
00729 
00730       tapCnt = (srcBLen) >> 1U;
00731 
00732       while (tapCnt > 0U)
00733       {
00734 
00735         /* Read next two samples from scratch1 buffer */
00736         x10 = *pScr1++;
00737         x11 = *pScr1++;
00738 
00739         /* Read two samples from smaller buffer */
00740         y10 = *pScr2++;
00741         y11 = *pScr2++;
00742 
00743         /* multiply and accumlate */
00744         acc0 += (q31_t) x10 *y10;
00745         acc0 += (q31_t) x11 *y11;
00746 
00747         /* Decrement the loop counter */
00748         tapCnt--;
00749       }
00750 
00751       tapCnt = (srcBLen) & 1U;
00752 
00753       /* apply same above for remaining samples of smaller length sequence */
00754       while (tapCnt > 0U)
00755       {
00756 
00757         /* accumlate the results */
00758         acc0 += (*pScr1++ * *pScr2++);
00759 
00760         /* Decrement the loop counter */
00761         tapCnt--;
00762       }
00763 
00764       blkCnt--;
00765 
00766       /* Store the result in the accumulator in the destination buffer. */
00767       *pOut++ = (q7_t) (__SSAT(acc0 >> 7U, 8));
00768 
00769       /* Initialization of inputB pointer */
00770       pScr2 = py;
00771 
00772       pScratch1 += 1U;
00773 
00774     }
00775 
00776     /* set status as ARM_MATH_SUCCESS */
00777     status = ARM_MATH_SUCCESS;
00778 
00779   }
00780 
00781   return (status);
00782 
00783 }
00784 
00785 #endif  /*  #ifndef UNALIGNED_SUPPORT_DISABLE   */
00786 
00787 
00788 
00789 /**
00790  * @} end of PartialConv group
00791  */
00792