CMSIS DSP library

Dependents:   performance_timer Surfboard_ gps2rtty Capstone ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_partial_opt_q7.c Source File

arm_conv_partial_opt_q7.c

00001 /* ----------------------------------------------------------------------    
00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
00003 *    
00004 * $Date:        19. March 2015
00005 * $Revision:    V.1.4.5
00006 *    
00007 * Project:      CMSIS DSP Library    
00008 * Title:        arm_conv_partial_opt_q7.c    
00009 *    
00010 * Description:  Partial convolution of Q7 sequences.    
00011 *    
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Redistribution and use in source and binary forms, with or without 
00015 * modification, are permitted provided that the following conditions
00016 * are met:
00017 *   - Redistributions of source code must retain the above copyright
00018 *     notice, this list of conditions and the following disclaimer.
00019 *   - Redistributions in binary form must reproduce the above copyright
00020 *     notice, this list of conditions and the following disclaimer in
00021 *     the documentation and/or other materials provided with the 
00022 *     distribution.
00023 *   - Neither the name of ARM LIMITED nor the names of its contributors
00024 *     may be used to endorse or promote products derived from this
00025 *     software without specific prior written permission.
00026 *
00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00038 * POSSIBILITY OF SUCH DAMAGE.  
00039 * -------------------------------------------------------------------- */
00040 
00041 #include "arm_math.h"
00042 
00043 /**    
00044  * @ingroup groupFilters    
00045  */
00046 
00047 /**    
00048  * @addtogroup PartialConv    
00049  * @{    
00050  */
00051 
00052 /**    
00053  * @brief Partial convolution of Q7 sequences.    
00054  * @param[in]       *pSrcA points to the first input sequence.    
00055  * @param[in]       srcALen length of the first input sequence.    
00056  * @param[in]       *pSrcB points to the second input sequence.    
00057  * @param[in]       srcBLen length of the second input sequence.    
00058  * @param[out]      *pDst points to the location where the output result is written.    
00059  * @param[in]       firstIndex is the first output sample to start with.    
00060  * @param[in]       numPoints is the number of output points to be computed.    
00061  * @param[in]      *pScratch1 points to scratch buffer (of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
00062  * @param[in]      *pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).    
00063  * @return  Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].    
00064  *    
00065  * \par Restrictions    
00066  *  If the silicon does not support unaligned memory access, enable the macro UNALIGNED_SUPPORT_DISABLE.
00067  *  In this case the input, output, scratch1 and scratch2 buffers must be 32-bit aligned.
00068  * 
00069  *
00070  * 
00071  */
00072 
00073 
00074 #ifndef UNALIGNED_SUPPORT_DISABLE
00075 
00076 arm_status arm_conv_partial_opt_q7(
00077   q7_t * pSrcA,
00078   uint32_t srcALen,
00079   q7_t * pSrcB,
00080   uint32_t srcBLen,
00081   q7_t * pDst,
00082   uint32_t firstIndex,
00083   uint32_t numPoints,
00084   q15_t * pScratch1,
00085   q15_t * pScratch2)
00086 {
00087 
00088   q15_t *pScr2, *pScr1;                          /* Intermediate pointers for scratch pointers */
00089   q15_t x4;                                      /* Temporary input variable */
00090   q7_t *pIn1, *pIn2;                             /* inputA and inputB pointer */
00091   uint32_t j, k, blkCnt, tapCnt;                 /* loop counter */
00092   q7_t *px;                                      /* Temporary input1 pointer */
00093   q15_t *py;                                     /* Temporary input2 pointer */
00094   q31_t acc0, acc1, acc2, acc3;                  /* Accumulator */
00095   q31_t x1, x2, x3, y1;                          /* Temporary input variables */
00096   arm_status status;
00097   q7_t *pOut = pDst;                             /* output pointer */
00098   q7_t out0, out1, out2, out3;                   /* temporary variables */
00099 
00100   /* Check for range of output samples to be calculated */
00101   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
00102   {
00103     /* Set status as ARM_MATH_ARGUMENT_ERROR */
00104     status = ARM_MATH_ARGUMENT_ERROR;
00105   }
00106   else
00107   {
00108 
00109     /* The algorithm implementation is based on the lengths of the inputs. */
00110     /* srcB is always made to slide across srcA. */
00111     /* So srcBLen is always considered as shorter or equal to srcALen */
00112     if(srcALen >= srcBLen)
00113     {
00114       /* Initialization of inputA pointer */
00115       pIn1 = pSrcA;
00116 
00117       /* Initialization of inputB pointer */
00118       pIn2 = pSrcB;
00119     }
00120     else
00121     {
00122       /* Initialization of inputA pointer */
00123       pIn1 = pSrcB;
00124 
00125       /* Initialization of inputB pointer */
00126       pIn2 = pSrcA;
00127 
00128       /* srcBLen is always considered as shorter or equal to srcALen */
00129       j = srcBLen;
00130       srcBLen = srcALen;
00131       srcALen = j;
00132     }
00133 
00134     /* pointer to take end of scratch2 buffer */
00135     pScr2 = pScratch2;
00136 
00137     /* points to smaller length sequence */
00138     px = pIn2 + srcBLen - 1;
00139 
00140     /* Apply loop unrolling and do 4 Copies simultaneously. */
00141     k = srcBLen >> 2u;
00142 
00143     /* First part of the processing with loop unrolling copies 4 data points at a time.       
00144      ** a second loop below copies for the remaining 1 to 3 samples. */
00145     while(k > 0u)
00146     {
00147       /* copy second buffer in reversal manner */
00148       x4 = (q15_t) * px--;
00149       *pScr2++ = x4;
00150       x4 = (q15_t) * px--;
00151       *pScr2++ = x4;
00152       x4 = (q15_t) * px--;
00153       *pScr2++ = x4;
00154       x4 = (q15_t) * px--;
00155       *pScr2++ = x4;
00156 
00157       /* Decrement the loop counter */
00158       k--;
00159     }
00160 
00161     /* If the count is not a multiple of 4, copy remaining samples here.       
00162      ** No loop unrolling is used. */
00163     k = srcBLen % 0x4u;
00164 
00165     while(k > 0u)
00166     {
00167       /* copy second buffer in reversal manner for remaining samples */
00168       x4 = (q15_t) * px--;
00169       *pScr2++ = x4;
00170 
00171       /* Decrement the loop counter */
00172       k--;
00173     }
00174 
00175     /* Initialze temporary scratch pointer */
00176     pScr1 = pScratch1;
00177 
00178     /* Fill (srcBLen - 1u) zeros in scratch buffer */
00179     arm_fill_q15(0, pScr1, (srcBLen - 1u));
00180 
00181     /* Update temporary scratch pointer */
00182     pScr1 += (srcBLen - 1u);
00183 
00184     /* Copy (srcALen) samples in scratch buffer */
00185     /* Apply loop unrolling and do 4 Copies simultaneously. */
00186     k = srcALen >> 2u;
00187 
00188     /* First part of the processing with loop unrolling copies 4 data points at a time.       
00189      ** a second loop below copies for the remaining 1 to 3 samples. */
00190     while(k > 0u)
00191     {
00192       /* copy second buffer in reversal manner */
00193       x4 = (q15_t) * pIn1++;
00194       *pScr1++ = x4;
00195       x4 = (q15_t) * pIn1++;
00196       *pScr1++ = x4;
00197       x4 = (q15_t) * pIn1++;
00198       *pScr1++ = x4;
00199       x4 = (q15_t) * pIn1++;
00200       *pScr1++ = x4;
00201 
00202       /* Decrement the loop counter */
00203       k--;
00204     }
00205 
00206     /* If the count is not a multiple of 4, copy remaining samples here.       
00207      ** No loop unrolling is used. */
00208     k = srcALen % 0x4u;
00209 
00210     while(k > 0u)
00211     {
00212       /* copy second buffer in reversal manner for remaining samples */
00213       x4 = (q15_t) * pIn1++;
00214       *pScr1++ = x4;
00215 
00216       /* Decrement the loop counter */
00217       k--;
00218     }
00219 
00220     /* Fill (srcBLen - 1u) zeros at end of scratch buffer */
00221     arm_fill_q15(0, pScr1, (srcBLen - 1u));
00222 
00223     /* Update pointer */
00224     pScr1 += (srcBLen - 1u);
00225 
00226 
00227     /* Temporary pointer for scratch2 */
00228     py = pScratch2;
00229 
00230     /* Initialization of pIn2 pointer */
00231     pIn2 = (q7_t *) py;
00232 
00233     pScr2 = py;
00234 
00235     pOut = pDst + firstIndex;
00236 
00237     pScratch1 += firstIndex;
00238 
00239     /* Actual convolution process starts here */
00240     blkCnt = (numPoints) >> 2;
00241 
00242 
00243     while(blkCnt > 0)
00244     {
00245       /* Initialze temporary scratch pointer as scratch1 */
00246       pScr1 = pScratch1;
00247 
00248       /* Clear Accumlators */
00249       acc0 = 0;
00250       acc1 = 0;
00251       acc2 = 0;
00252       acc3 = 0;
00253 
00254       /* Read two samples from scratch1 buffer */
00255       x1 = *__SIMD32(pScr1)++;
00256 
00257       /* Read next two samples from scratch1 buffer */
00258       x2 = *__SIMD32(pScr1)++;
00259 
00260       tapCnt = (srcBLen) >> 2u;
00261 
00262       while(tapCnt > 0u)
00263       {
00264 
00265         /* Read four samples from smaller buffer */
00266         y1 = _SIMD32_OFFSET(pScr2);
00267 
00268         /* multiply and accumlate */
00269         acc0 = __SMLAD(x1, y1, acc0);
00270         acc2 = __SMLAD(x2, y1, acc2);
00271 
00272         /* pack input data */
00273 #ifndef ARM_MATH_BIG_ENDIAN
00274         x3 = __PKHBT(x2, x1, 0);
00275 #else
00276         x3 = __PKHBT(x1, x2, 0);
00277 #endif
00278 
00279         /* multiply and accumlate */
00280         acc1 = __SMLADX(x3, y1, acc1);
00281 
00282         /* Read next two samples from scratch1 buffer */
00283         x1 = *__SIMD32(pScr1)++;
00284 
00285         /* pack input data */
00286 #ifndef ARM_MATH_BIG_ENDIAN
00287         x3 = __PKHBT(x1, x2, 0);
00288 #else
00289         x3 = __PKHBT(x2, x1, 0);
00290 #endif
00291 
00292         acc3 = __SMLADX(x3, y1, acc3);
00293 
00294         /* Read four samples from smaller buffer */
00295         y1 = _SIMD32_OFFSET(pScr2 + 2u);
00296 
00297         acc0 = __SMLAD(x2, y1, acc0);
00298 
00299         acc2 = __SMLAD(x1, y1, acc2);
00300 
00301         acc1 = __SMLADX(x3, y1, acc1);
00302 
00303         x2 = *__SIMD32(pScr1)++;
00304 
00305 #ifndef ARM_MATH_BIG_ENDIAN
00306         x3 = __PKHBT(x2, x1, 0);
00307 #else
00308         x3 = __PKHBT(x1, x2, 0);
00309 #endif
00310 
00311         acc3 = __SMLADX(x3, y1, acc3);
00312 
00313         pScr2 += 4u;
00314 
00315 
00316         /* Decrement the loop counter */
00317         tapCnt--;
00318       }
00319 
00320 
00321 
00322       /* Update scratch pointer for remaining samples of smaller length sequence */
00323       pScr1 -= 4u;
00324 
00325 
00326       /* apply same above for remaining samples of smaller length sequence */
00327       tapCnt = (srcBLen) & 3u;
00328 
00329       while(tapCnt > 0u)
00330       {
00331 
00332         /* accumlate the results */
00333         acc0 += (*pScr1++ * *pScr2);
00334         acc1 += (*pScr1++ * *pScr2);
00335         acc2 += (*pScr1++ * *pScr2);
00336         acc3 += (*pScr1++ * *pScr2++);
00337 
00338         pScr1 -= 3u;
00339 
00340         /* Decrement the loop counter */
00341         tapCnt--;
00342       }
00343 
00344       blkCnt--;
00345 
00346       /* Store the result in the accumulator in the destination buffer. */
00347       out0 = (q7_t) (__SSAT(acc0 >> 7u, 8));
00348       out1 = (q7_t) (__SSAT(acc1 >> 7u, 8));
00349       out2 = (q7_t) (__SSAT(acc2 >> 7u, 8));
00350       out3 = (q7_t) (__SSAT(acc3 >> 7u, 8));
00351 
00352       *__SIMD32(pOut)++ = __PACKq7(out0, out1, out2, out3);
00353 
00354       /* Initialization of inputB pointer */
00355       pScr2 = py;
00356 
00357       pScratch1 += 4u;
00358 
00359     }
00360 
00361     blkCnt = (numPoints) & 0x3;
00362 
00363     /* Calculate convolution for remaining samples of Bigger length sequence */
00364     while(blkCnt > 0)
00365     {
00366       /* Initialze temporary scratch pointer as scratch1 */
00367       pScr1 = pScratch1;
00368 
00369       /* Clear Accumlators */
00370       acc0 = 0;
00371 
00372       tapCnt = (srcBLen) >> 1u;
00373 
00374       while(tapCnt > 0u)
00375       {
00376 
00377         /* Read next two samples from scratch1 buffer */
00378         x1 = *__SIMD32(pScr1)++;
00379 
00380         /* Read two samples from smaller buffer */
00381         y1 = *__SIMD32(pScr2)++;
00382 
00383         acc0 = __SMLAD(x1, y1, acc0);
00384 
00385         /* Decrement the loop counter */
00386         tapCnt--;
00387       }
00388 
00389       tapCnt = (srcBLen) & 1u;
00390 
00391       /* apply same above for remaining samples of smaller length sequence */
00392       while(tapCnt > 0u)
00393       {
00394 
00395         /* accumlate the results */
00396         acc0 += (*pScr1++ * *pScr2++);
00397 
00398         /* Decrement the loop counter */
00399         tapCnt--;
00400       }
00401 
00402       blkCnt--;
00403 
00404       /* Store the result in the accumulator in the destination buffer. */
00405       *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8));
00406 
00407       /* Initialization of inputB pointer */
00408       pScr2 = py;
00409 
00410       pScratch1 += 1u;
00411 
00412     }
00413 
00414     /* set status as ARM_MATH_SUCCESS */
00415     status = ARM_MATH_SUCCESS;
00416 
00417 
00418   }
00419 
00420   return (status);
00421 
00422 }
00423 
00424 #else
00425 
00426 arm_status arm_conv_partial_opt_q7(
00427   q7_t * pSrcA,
00428   uint32_t srcALen,
00429   q7_t * pSrcB,
00430   uint32_t srcBLen,
00431   q7_t * pDst,
00432   uint32_t firstIndex,
00433   uint32_t numPoints,
00434   q15_t * pScratch1,
00435   q15_t * pScratch2)
00436 {
00437 
00438   q15_t *pScr2, *pScr1;                          /* Intermediate pointers for scratch pointers */
00439   q15_t x4;                                      /* Temporary input variable */
00440   q7_t *pIn1, *pIn2;                             /* inputA and inputB pointer */
00441   uint32_t j, k, blkCnt, tapCnt;                 /* loop counter */
00442   q7_t *px;                                      /* Temporary input1 pointer */
00443   q15_t *py;                                     /* Temporary input2 pointer */
00444   q31_t acc0, acc1, acc2, acc3;                  /* Accumulator */
00445   arm_status status;
00446   q7_t *pOut = pDst;                             /* output pointer */
00447   q15_t x10, x11, x20, x21;                      /* Temporary input variables */
00448   q15_t y10, y11;                                /* Temporary input variables */
00449 
00450   /* Check for range of output samples to be calculated */
00451   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
00452   {
00453     /* Set status as ARM_MATH_ARGUMENT_ERROR */
00454     status = ARM_MATH_ARGUMENT_ERROR;
00455   }
00456   else
00457   {
00458 
00459     /* The algorithm implementation is based on the lengths of the inputs. */
00460     /* srcB is always made to slide across srcA. */
00461     /* So srcBLen is always considered as shorter or equal to srcALen */
00462     if(srcALen >= srcBLen)
00463     {
00464       /* Initialization of inputA pointer */
00465       pIn1 = pSrcA;
00466 
00467       /* Initialization of inputB pointer */
00468       pIn2 = pSrcB;
00469     }
00470     else
00471     {
00472       /* Initialization of inputA pointer */
00473       pIn1 = pSrcB;
00474 
00475       /* Initialization of inputB pointer */
00476       pIn2 = pSrcA;
00477 
00478       /* srcBLen is always considered as shorter or equal to srcALen */
00479       j = srcBLen;
00480       srcBLen = srcALen;
00481       srcALen = j;
00482     }
00483 
00484     /* pointer to take end of scratch2 buffer */
00485     pScr2 = pScratch2;
00486 
00487     /* points to smaller length sequence */
00488     px = pIn2 + srcBLen - 1;
00489 
00490     /* Apply loop unrolling and do 4 Copies simultaneously. */
00491     k = srcBLen >> 2u;
00492 
00493     /* First part of the processing with loop unrolling copies 4 data points at a time.       
00494      ** a second loop below copies for the remaining 1 to 3 samples. */
00495     while(k > 0u)
00496     {
00497       /* copy second buffer in reversal manner */
00498       x4 = (q15_t) * px--;
00499       *pScr2++ = x4;
00500       x4 = (q15_t) * px--;
00501       *pScr2++ = x4;
00502       x4 = (q15_t) * px--;
00503       *pScr2++ = x4;
00504       x4 = (q15_t) * px--;
00505       *pScr2++ = x4;
00506 
00507       /* Decrement the loop counter */
00508       k--;
00509     }
00510 
00511     /* If the count is not a multiple of 4, copy remaining samples here.       
00512      ** No loop unrolling is used. */
00513     k = srcBLen % 0x4u;
00514 
00515     while(k > 0u)
00516     {
00517       /* copy second buffer in reversal manner for remaining samples */
00518       x4 = (q15_t) * px--;
00519       *pScr2++ = x4;
00520 
00521       /* Decrement the loop counter */
00522       k--;
00523     }
00524 
00525     /* Initialze temporary scratch pointer */
00526     pScr1 = pScratch1;
00527 
00528     /* Fill (srcBLen - 1u) zeros in scratch buffer */
00529     arm_fill_q15(0, pScr1, (srcBLen - 1u));
00530 
00531     /* Update temporary scratch pointer */
00532     pScr1 += (srcBLen - 1u);
00533 
00534     /* Copy (srcALen) samples in scratch buffer */
00535     /* Apply loop unrolling and do 4 Copies simultaneously. */
00536     k = srcALen >> 2u;
00537 
00538     /* First part of the processing with loop unrolling copies 4 data points at a time.       
00539      ** a second loop below copies for the remaining 1 to 3 samples. */
00540     while(k > 0u)
00541     {
00542       /* copy second buffer in reversal manner */
00543       x4 = (q15_t) * pIn1++;
00544       *pScr1++ = x4;
00545       x4 = (q15_t) * pIn1++;
00546       *pScr1++ = x4;
00547       x4 = (q15_t) * pIn1++;
00548       *pScr1++ = x4;
00549       x4 = (q15_t) * pIn1++;
00550       *pScr1++ = x4;
00551 
00552       /* Decrement the loop counter */
00553       k--;
00554     }
00555 
00556     /* If the count is not a multiple of 4, copy remaining samples here.       
00557      ** No loop unrolling is used. */
00558     k = srcALen % 0x4u;
00559 
00560     while(k > 0u)
00561     {
00562       /* copy second buffer in reversal manner for remaining samples */
00563       x4 = (q15_t) * pIn1++;
00564       *pScr1++ = x4;
00565 
00566       /* Decrement the loop counter */
00567       k--;
00568     }
00569 
00570     /* Apply loop unrolling and do 4 Copies simultaneously. */
00571     k = (srcBLen - 1u) >> 2u;
00572 
00573     /* First part of the processing with loop unrolling copies 4 data points at a time.       
00574      ** a second loop below copies for the remaining 1 to 3 samples. */
00575     while(k > 0u)
00576     {
00577       /* copy second buffer in reversal manner */
00578       *pScr1++ = 0;
00579       *pScr1++ = 0;
00580       *pScr1++ = 0;
00581       *pScr1++ = 0;
00582 
00583       /* Decrement the loop counter */
00584       k--;
00585     }
00586 
00587     /* If the count is not a multiple of 4, copy remaining samples here.       
00588      ** No loop unrolling is used. */
00589     k = (srcBLen - 1u) % 0x4u;
00590 
00591     while(k > 0u)
00592     {
00593       /* copy second buffer in reversal manner for remaining samples */
00594       *pScr1++ = 0;
00595 
00596       /* Decrement the loop counter */
00597       k--;
00598     }
00599 
00600 
00601     /* Temporary pointer for scratch2 */
00602     py = pScratch2;
00603 
00604     /* Initialization of pIn2 pointer */
00605     pIn2 = (q7_t *) py;
00606 
00607     pScr2 = py;
00608 
00609     pOut = pDst + firstIndex;
00610 
00611     pScratch1 += firstIndex;
00612 
00613     /* Actual convolution process starts here */
00614     blkCnt = (numPoints) >> 2;
00615 
00616 
00617     while(blkCnt > 0)
00618     {
00619       /* Initialze temporary scratch pointer as scratch1 */
00620       pScr1 = pScratch1;
00621 
00622       /* Clear Accumlators */
00623       acc0 = 0;
00624       acc1 = 0;
00625       acc2 = 0;
00626       acc3 = 0;
00627 
00628       /* Read two samples from scratch1 buffer */
00629       x10 = *pScr1++;
00630       x11 = *pScr1++;
00631 
00632       /* Read next two samples from scratch1 buffer */
00633       x20 = *pScr1++;
00634       x21 = *pScr1++;
00635 
00636       tapCnt = (srcBLen) >> 2u;
00637 
00638       while(tapCnt > 0u)
00639       {
00640 
00641         /* Read four samples from smaller buffer */
00642         y10 = *pScr2;
00643         y11 = *(pScr2 + 1u);
00644 
00645         /* multiply and accumlate */
00646         acc0 += (q31_t) x10 *y10;
00647         acc0 += (q31_t) x11 *y11;
00648         acc2 += (q31_t) x20 *y10;
00649         acc2 += (q31_t) x21 *y11;
00650 
00651 
00652         acc1 += (q31_t) x11 *y10;
00653         acc1 += (q31_t) x20 *y11;
00654 
00655         /* Read next two samples from scratch1 buffer */
00656         x10 = *pScr1;
00657         x11 = *(pScr1 + 1u);
00658 
00659         /* multiply and accumlate */
00660         acc3 += (q31_t) x21 *y10;
00661         acc3 += (q31_t) x10 *y11;
00662 
00663         /* Read next two samples from scratch2 buffer */
00664         y10 = *(pScr2 + 2u);
00665         y11 = *(pScr2 + 3u);
00666 
00667         /* multiply and accumlate */
00668         acc0 += (q31_t) x20 *y10;
00669         acc0 += (q31_t) x21 *y11;
00670         acc2 += (q31_t) x10 *y10;
00671         acc2 += (q31_t) x11 *y11;
00672         acc1 += (q31_t) x21 *y10;
00673         acc1 += (q31_t) x10 *y11;
00674 
00675         /* Read next two samples from scratch1 buffer */
00676         x20 = *(pScr1 + 2);
00677         x21 = *(pScr1 + 3);
00678 
00679         /* multiply and accumlate */
00680         acc3 += (q31_t) x11 *y10;
00681         acc3 += (q31_t) x20 *y11;
00682 
00683         /* update scratch pointers */
00684 
00685         pScr1 += 4u;
00686         pScr2 += 4u;
00687 
00688         /* Decrement the loop counter */
00689         tapCnt--;
00690       }
00691 
00692 
00693 
00694       /* Update scratch pointer for remaining samples of smaller length sequence */
00695       pScr1 -= 4u;
00696 
00697 
00698       /* apply same above for remaining samples of smaller length sequence */
00699       tapCnt = (srcBLen) & 3u;
00700 
00701       while(tapCnt > 0u)
00702       {
00703 
00704         /* accumlate the results */
00705         acc0 += (*pScr1++ * *pScr2);
00706         acc1 += (*pScr1++ * *pScr2);
00707         acc2 += (*pScr1++ * *pScr2);
00708         acc3 += (*pScr1++ * *pScr2++);
00709 
00710         pScr1 -= 3u;
00711 
00712         /* Decrement the loop counter */
00713         tapCnt--;
00714       }
00715 
00716       blkCnt--;
00717 
00718       /* Store the result in the accumulator in the destination buffer. */
00719       *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8));
00720       *pOut++ = (q7_t) (__SSAT(acc1 >> 7u, 8));
00721       *pOut++ = (q7_t) (__SSAT(acc2 >> 7u, 8));
00722       *pOut++ = (q7_t) (__SSAT(acc3 >> 7u, 8));
00723 
00724       /* Initialization of inputB pointer */
00725       pScr2 = py;
00726 
00727       pScratch1 += 4u;
00728 
00729     }
00730 
00731     blkCnt = (numPoints) & 0x3;
00732 
00733     /* Calculate convolution for remaining samples of Bigger length sequence */
00734     while(blkCnt > 0)
00735     {
00736       /* Initialze temporary scratch pointer as scratch1 */
00737       pScr1 = pScratch1;
00738 
00739       /* Clear Accumlators */
00740       acc0 = 0;
00741 
00742       tapCnt = (srcBLen) >> 1u;
00743 
00744       while(tapCnt > 0u)
00745       {
00746 
00747         /* Read next two samples from scratch1 buffer */
00748         x10 = *pScr1++;
00749         x11 = *pScr1++;
00750 
00751         /* Read two samples from smaller buffer */
00752         y10 = *pScr2++;
00753         y11 = *pScr2++;
00754 
00755         /* multiply and accumlate */
00756         acc0 += (q31_t) x10 *y10;
00757         acc0 += (q31_t) x11 *y11;
00758 
00759         /* Decrement the loop counter */
00760         tapCnt--;
00761       }
00762 
00763       tapCnt = (srcBLen) & 1u;
00764 
00765       /* apply same above for remaining samples of smaller length sequence */
00766       while(tapCnt > 0u)
00767       {
00768 
00769         /* accumlate the results */
00770         acc0 += (*pScr1++ * *pScr2++);
00771 
00772         /* Decrement the loop counter */
00773         tapCnt--;
00774       }
00775 
00776       blkCnt--;
00777 
00778       /* Store the result in the accumulator in the destination buffer. */
00779       *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8));
00780 
00781       /* Initialization of inputB pointer */
00782       pScr2 = py;
00783 
00784       pScratch1 += 1u;
00785 
00786     }
00787 
00788     /* set status as ARM_MATH_SUCCESS */
00789     status = ARM_MATH_SUCCESS;
00790 
00791   }
00792 
00793   return (status);
00794 
00795 }
00796 
00797 #endif  /*  #ifndef UNALIGNED_SUPPORT_DISABLE   */
00798 
00799 
00800 
00801 /**    
00802  * @} end of PartialConv group    
00803  */