CMSIS DSP library

Dependents:   KL25Z_FFT_Demo Hat_Board_v5_1 KL25Z_FFT_Demo_tony KL25Z_FFT_Demo_tony ... more

Fork of mbed-dsp by mbed official

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_partial_fast_opt_q15.c Source File

arm_conv_partial_fast_opt_q15.c

00001 /* ----------------------------------------------------------------------    
00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.    
00003 *    
00004 * $Date:        17. January 2013
00005 * $Revision:    V1.4.1
00006 *    
00007 * Project:      CMSIS DSP Library    
00008 * Title:        arm_conv_partial_fast_opt_q15.c    
00009 *    
00010 * Description:  Fast Q15 Partial convolution.    
00011 *    
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Redistribution and use in source and binary forms, with or without 
00015 * modification, are permitted provided that the following conditions
00016 * are met:
00017 *   - Redistributions of source code must retain the above copyright
00018 *     notice, this list of conditions and the following disclaimer.
00019 *   - Redistributions in binary form must reproduce the above copyright
00020 *     notice, this list of conditions and the following disclaimer in
00021 *     the documentation and/or other materials provided with the 
00022 *     distribution.
00023 *   - Neither the name of ARM LIMITED nor the names of its contributors
00024 *     may be used to endorse or promote products derived from this
00025 *     software without specific prior written permission.
00026 *
00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00038 * POSSIBILITY OF SUCH DAMAGE.     
00039 * -------------------------------------------------------------------- */
00040 
00041 #include "arm_math.h"
00042 
00043 /**    
00044  * @ingroup groupFilters    
00045  */
00046 
00047 /**    
00048  * @addtogroup PartialConv    
00049  * @{    
00050  */
00051 
00052 /**    
00053  * @brief Partial convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.    
00054  * @param[in]       *pSrcA points to the first input sequence.    
00055  * @param[in]       srcALen length of the first input sequence.    
00056  * @param[in]       *pSrcB points to the second input sequence.    
00057  * @param[in]       srcBLen length of the second input sequence.    
00058  * @param[out]      *pDst points to the location where the output result is written.    
00059  * @param[in]       firstIndex is the first output sample to start with.    
00060  * @param[in]       numPoints is the number of output points to be computed.    
00061  * @param[in]       *pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.   
00062  * @param[in]       *pScratch2 points to scratch buffer of size min(srcALen, srcBLen).   
00063  * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].    
00064  *    
00065  * See <code>arm_conv_partial_q15()</code> for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.    
00066  *    
00067  * \par Restrictions    
00068  *  If the silicon does not support unaligned memory access, enable the macro UNALIGNED_SUPPORT_DISABLE.    
00069  *  In this case the input, output, scratch1 and scratch2 buffers should be 32-bit aligned.    
00070  *     
00071  */
00072 
00073 #ifndef UNALIGNED_SUPPORT_DISABLE
00074 
00075 arm_status arm_conv_partial_fast_opt_q15(
00076   q15_t * pSrcA,
00077   uint32_t srcALen,
00078   q15_t * pSrcB,
00079   uint32_t srcBLen,
00080   q15_t * pDst,
00081   uint32_t firstIndex,
00082   uint32_t numPoints,
00083   q15_t * pScratch1,
00084   q15_t * pScratch2)
00085 {
00086   /* Variant built when UNALIGNED_SUPPORT_DISABLE is not defined: computes numPoints convolution outputs starting at firstIndex, reading and writing pairs of q15 samples through the __SIMD32 / _SIMD32_OFFSET macros. */
00087   q15_t *pOut = pDst;                            /* Output write pointer (re-based to pDst + firstIndex below) */
00088   q15_t *pScr1 = pScratch1;                      /* Working pointer into scratch1 */
00089   q15_t *pScr2 = pScratch2;                      /* Working pointer into scratch2 */
00090   q31_t acc0, acc1, acc2, acc3;                  /* Accumulators for four consecutive output samples */
00091   q31_t x1, x2, x3;                              /* Packed pairs of scratch1 samples; x3 is re-packed from x1 and x2 */
00092   q31_t y1, y2;                                  /* Packed pairs of samples read through pIn2 (scratch2) */
00093   q15_t *pIn1;                                   /* Longer (or equal-length) input sequence */
00094   q15_t *pIn2;                                   /* Shorter input sequence; later re-pointed at scratch2 */
00095   q15_t *px;                                     /* Read cursor over the shorter sequence during the reversed copy */
00096   q15_t *py;                                     /* Start of scratch2, used to rewind pIn2 */
00097   uint32_t j, k, blkCnt;                         /* j: swap temporary, k: copy counter, blkCnt: output loop counter */
00098   arm_status status;                             /* Status returned to the caller */
00099 
00100   uint32_t tapCnt;                               /* Inner loop counter over the shorter sequence */
00101 
00102   /* Reject requests that extend past the last output sample (index srcALen + srcBLen - 2). NOTE(review): firstIndex + numPoints is computed in uint32_t and could wrap - confirm callers keep it in range. */
00103   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
00104   {
00105     /* Nothing is computed or written in this case */
00106     status = ARM_MATH_ARGUMENT_ERROR;
00107   }
00108   else
00109   {
00110 
00111     /* The algorithm implementation is based on the lengths of the inputs. */
00112     /* The shorter sequence is always made to slide across the longer one. */
00113     /* After this if/else, srcBLen is the shorter (or equal) length and srcALen the longer. */
00114     if(srcALen >= srcBLen)
00115     {
00116       /* pIn1 takes the longer sequence */
00117       pIn1 = pSrcA;
00118 
00119       /* pIn2 takes the shorter sequence */
00120       pIn2 = pSrcB;
00121     }
00122     else
00123     {
00124       /* pSrcB is the longer sequence here, so it goes to pIn1 */
00125       pIn1 = pSrcB;
00126 
00127       /* pSrcA is the shorter sequence here, so it goes to pIn2 */
00128       pIn2 = pSrcA;
00129 
00130       /* Swap the lengths so that srcBLen holds the shorter length */
00131       j = srcBLen;
00132       srcBLen = srcALen;
00133       srcALen = j;
00134     }
00135 
00136     /* Remember the start of scratch2 so pIn2 can be rewound after every output */
00137     py = pScratch2;
00138 
00139     /* Point at the last of the srcBLen slots used in scratch2 */
00140     pScr2 = pScratch2 + srcBLen - 1;
00141 
00142     /* px walks the shorter sequence forwards */
00143     px = pIn2;
00144 
00145     /* Number of groups of four samples to copy */
00146     k = srcBLen >> 2u;
00147 
00148     /* First part of the processing with loop unrolling copies 4 data points at a time.       
00149      ** A second loop below copies the remaining 1 to 3 samples. */
00150 
00151     /* Copy the shorter input sequence in reverse order into the second scratch buffer */
00152     while(k > 0u)
00153     {
00154       /* Source advances, destination retreats: four samples reversed per pass */
00155       *pScr2-- = *px++;
00156       *pScr2-- = *px++;
00157       *pScr2-- = *px++;
00158       *pScr2-- = *px++;
00159 
00160       /* Decrement the loop counter */
00161       k--;
00162     }
00163 
00164     /* If the count is not a multiple of 4, copy remaining samples here.       
00165      ** No loop unrolling is used. */
00166     k = srcBLen % 0x4u;
00167 
00168     while(k > 0u)
00169     {
00170       /* Reversed copy of one remaining sample */
00171       *pScr2-- = *px++;
00172 
00173       /* Decrement the loop counter */
00174       k--;
00175     }
00176 
00177     /* Initialize temporary scratch pointer */
00178     pScr1 = pScratch1;
00179     /* NOTE(review): (srcBLen - 1u) below wraps if the shorter length is 0 - confirm callers never pass an empty sequence. */
00180     /* Assuming scratch1 buffer is aligned by 32-bit */
00181     /* Fill (srcBLen - 1u) zeros at the start of scratch1 */
00182     arm_fill_q15(0, pScr1, (srcBLen - 1u));
00183 
00184     /* Skip past the leading zeros */
00185     pScr1 += (srcBLen - 1u);
00186 
00187     /* Copy the longer sequence (srcALen samples) into scratch1 after the leading zeros */
00188 
00189     /* Copy (srcALen) samples in scratch buffer */
00190     arm_copy_q15(pIn1, pScr1, srcALen);
00191 
00192     /* Skip past the copied samples */
00193     pScr1 += srcALen;
00194 
00195     /* Fill (srcBLen - 1u) zeros at end of scratch buffer */
00196     arm_fill_q15(0, pScr1, (srcBLen - 1u));
00197 
00198     /* Update pointer */
00199     pScr1 += (srcBLen - 1u);
00200 
00201     /* From here on pIn2 reads the reversed copy held in scratch2 */
00202     pIn2 = py;
00203     /* Start the sliding window firstIndex samples into scratch1 */
00204     pScratch1 += firstIndex;
00205     /* Results are written at pDst[firstIndex] onward, not at pDst[0] */
00206     pOut = pDst + firstIndex;
00207 
00208     /* First part of the processing with loop unrolling processes 4 output samples at a time.       
00209      ** A second loop below processes the remaining 1 to 3 output samples. */
00210 
00211     /* Actual convolution process starts here */
00212     blkCnt = (numPoints) >> 2;
00213 
00214     while(blkCnt > 0)
00215     {
00216       /* Initialize temporary scratch pointer as scratch1 */
00217       pScr1 = pScratch1;
00218 
00219       /* Clear accumulators */
00220       acc0 = 0;
00221       acc1 = 0;
00222       acc2 = 0;
00223       acc3 = 0;
00224 
00225       /* Read two samples from scratch1 buffer */
00226       x1 = *__SIMD32(pScr1)++;
00227 
00228       /* Read next two samples from scratch1 buffer */
00229       x2 = *__SIMD32(pScr1)++;
00230       /* Each pass of the loop below consumes four samples of the shorter sequence */
00231       tapCnt = (srcBLen) >> 2u;
00232 
00233       while(tapCnt > 0u)
00234       {
00235 
00236         /* Read four samples from smaller buffer */
00237         y1 = _SIMD32_OFFSET(pIn2);
00238         y2 = _SIMD32_OFFSET(pIn2 + 2u);
00239 
00240         /* multiply and accumulate */
00241         acc0 = __SMLAD(x1, y1, acc0);
00242         acc2 = __SMLAD(x2, y1, acc2);
00243 
00244         /* Pack halves of x1 and x2 into x3 (operand order depends on endianness) */
00245 #ifndef ARM_MATH_BIG_ENDIAN
00246         x3 = __PKHBT(x2, x1, 0);
00247 #else
00248         x3 = __PKHBT(x1, x2, 0);
00249 #endif
00250 
00251         /* multiply and accumulate */
00252         acc1 = __SMLADX(x3, y1, acc1);
00253 
00254         /* Read next two samples from scratch1 buffer */
00255         x1 = _SIMD32_OFFSET(pScr1);
00256 
00257         /* multiply and accumulate */
00258         acc0 = __SMLAD(x2, y2, acc0);
00259 
00260         acc2 = __SMLAD(x1, y2, acc2);
00261 
00262         /* Pack halves of x2 and the newly read x1 into x3 */
00263 #ifndef ARM_MATH_BIG_ENDIAN
00264         x3 = __PKHBT(x1, x2, 0);
00265 #else
00266         x3 = __PKHBT(x2, x1, 0);
00267 #endif
00268 
00269         acc3 = __SMLADX(x3, y1, acc3);
00270         acc1 = __SMLADX(x3, y2, acc1);
00271         /* Read the following two samples from scratch1 buffer */
00272         x2 = _SIMD32_OFFSET(pScr1 + 2u);
00273 
00274 #ifndef ARM_MATH_BIG_ENDIAN
00275         x3 = __PKHBT(x2, x1, 0);
00276 #else
00277         x3 = __PKHBT(x1, x2, 0);
00278 #endif
00279 
00280         acc3 = __SMLADX(x3, y2, acc3);
00281 
00282         /* Advance both buffers by the four samples just consumed */
00283         pIn2 += 4u;
00284         pScr1 += 4u;
00285 
00286 
00287         /* Decrement the loop counter */
00288         tapCnt--;
00289       }
00290 
00291       /* Step pScr1 back by four samples before the scalar tail loop (the x1/x2 reads have moved it ahead) */
00292       pScr1 -= 4u;
00293 
00294       /* Handle the remaining (srcBLen & 3) samples of the shorter sequence one at a time */
00295       tapCnt = (srcBLen) & 3u;
00296 
00297       while(tapCnt > 0u)
00298       {
00299 
00300         /* One sample of the shorter sequence contributes to all four accumulators */
00301         acc0 += (*pScr1++ * *pIn2);
00302         acc1 += (*pScr1++ * *pIn2);
00303         acc2 += (*pScr1++ * *pIn2);
00304         acc3 += (*pScr1++ * *pIn2++);
00305         /* Net advance of one scratch1 sample per pass */
00306         pScr1 -= 3u;
00307 
00308         /* Decrement the loop counter */
00309         tapCnt--;
00310       }
00311 
00312       blkCnt--;
00313 
00314 
00315       /* Shift each accumulator right by 15, saturate to 16 bits, pack in pairs and store with two 32-bit writes. */
00316 
00317 #ifndef  ARM_MATH_BIG_ENDIAN
00318 
00319       *__SIMD32(pOut)++ =
00320         __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
00321       *__SIMD32(pOut)++ =
00322         __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
00323 
00324 #else
00325 
00326       *__SIMD32(pOut)++ =
00327         __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
00328       *__SIMD32(pOut)++ =
00329         __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
00330 
00331 #endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */
00332 
00333       /* Rewind pIn2 to the start of scratch2 for the next group of outputs */
00334       pIn2 = py;
00335       /* Slide the scratch1 window past the four outputs just produced */
00336       pScratch1 += 4u;
00337 
00338     }
00339 
00340     /* Number of outputs left over after the groups of four */
00341     blkCnt = numPoints & 0x3;
00342 
00343     /* Calculate the remaining 1 to 3 output samples, one per pass */
00344     while(blkCnt > 0)
00345     {
00346       /* Initialize temporary scratch pointer as scratch1 */
00347       pScr1 = pScratch1;
00348 
00349       /* Clear accumulator */
00350       acc0 = 0;
00351       /* Two samples of the shorter sequence are consumed per pass */
00352       tapCnt = (srcBLen) >> 1u;
00353 
00354       while(tapCnt > 0u)
00355       {
00356 
00357         /* Read next two samples from scratch1 buffer */
00358         x1 = *__SIMD32(pScr1)++;
00359 
00360         /* Read two samples from smaller buffer */
00361         y1 = *__SIMD32(pIn2)++;
00362 
00363         acc0 = __SMLAD(x1, y1, acc0);
00364 
00365         /* Decrement the loop counter */
00366         tapCnt--;
00367       }
00368       /* At most one sample is left when srcBLen is odd */
00369       tapCnt = (srcBLen) & 1u;
00370 
00371       /* Handle that last sample of the shorter sequence, if any */
00372       while(tapCnt > 0u)
00373       {
00374 
00375         /* accumulate the result */
00376         acc0 += (*pScr1++ * *pIn2++);
00377 
00378         /* Decrement the loop counter */
00379         tapCnt--;
00380       }
00381 
00382       blkCnt--;
00383 
00384       /* The result is in 2.30 format.  Convert to 1.15 with saturation.       
00385        ** Then store the output in the destination buffer. */
00386       *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));
00387 
00388       /* Rewind pIn2 to the start of scratch2 */
00389       pIn2 = py;
00390       /* Slide the scratch1 window by one output sample */
00391       pScratch1 += 1u;
00392 
00393     }
00394     /* set status as ARM_MATH_SUCCESS */
00395     status = ARM_MATH_SUCCESS;
00396   }
00397   /* Return to application */
00398   return (status);
00399 }
00400 
00401 #else
00402 
00403 arm_status arm_conv_partial_fast_opt_q15(
00404   q15_t * pSrcA,
00405   uint32_t srcALen,
00406   q15_t * pSrcB,
00407   uint32_t srcBLen,
00408   q15_t * pDst,
00409   uint32_t firstIndex,
00410   uint32_t numPoints,
00411   q15_t * pScratch1,
00412   q15_t * pScratch2)
00413 {
00414   /* Variant built when UNALIGNED_SUPPORT_DISABLE is defined: computes numPoints convolution outputs starting at firstIndex using only individual q15_t loads/stores and plain multiply-accumulate. */
00415   q15_t *pOut = pDst;                            /* Output write pointer (re-based to pDst + firstIndex below) */
00416   q15_t *pScr1 = pScratch1;                      /* Working pointer into scratch1 */
00417   q15_t *pScr2 = pScratch2;                      /* Working pointer into scratch2 */
00418   q31_t acc0, acc1, acc2, acc3;                  /* Accumulators for four consecutive output samples */
00419   q15_t *pIn1;                                   /* Longer (or equal-length) input sequence */
00420   q15_t *pIn2;                                   /* Shorter input sequence; later re-pointed at scratch2 */
00421   q15_t *px;                                     /* Read cursor over the shorter sequence during the reversed copy */
00422   q15_t *py;                                     /* Start of scratch2, used to rewind pIn2 */
00423   uint32_t j, k, blkCnt;                         /* j: swap temporary, k: copy counter, blkCnt: output loop counter */
00424   arm_status status;                             /* Status returned to the caller */
00425   uint32_t tapCnt;                               /* Inner loop counter over the shorter sequence */
00426   q15_t x10, x11, x20, x21;                      /* Four consecutive scratch1 samples held as a moving window */
00427   q15_t y10, y11;                                /* Two samples read through pIn2 (scratch2) */
00428 
00429 
00430   /* Reject requests that extend past the last output sample (index srcALen + srcBLen - 2). NOTE(review): firstIndex + numPoints is computed in uint32_t and could wrap - confirm callers keep it in range. */
00431   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
00432   {
00433     /* Nothing is computed or written in this case */
00434     status = ARM_MATH_ARGUMENT_ERROR;
00435   }
00436   else
00437   {
00438 
00439     /* The algorithm implementation is based on the lengths of the inputs. */
00440     /* The shorter sequence is always made to slide across the longer one. */
00441     /* After this if/else, srcBLen is the shorter (or equal) length and srcALen the longer. */
00442     if(srcALen >= srcBLen)
00443     {
00444       /* pIn1 takes the longer sequence */
00445       pIn1 = pSrcA;
00446 
00447       /* pIn2 takes the shorter sequence */
00448       pIn2 = pSrcB;
00449     }
00450     else
00451     {
00452       /* pSrcB is the longer sequence here, so it goes to pIn1 */
00453       pIn1 = pSrcB;
00454 
00455       /* pSrcA is the shorter sequence here, so it goes to pIn2 */
00456       pIn2 = pSrcA;
00457 
00458       /* Swap the lengths so that srcBLen holds the shorter length */
00459       j = srcBLen;
00460       srcBLen = srcALen;
00461       srcALen = j;
00462     }
00463 
00464     /* Remember the start of scratch2 so pIn2 can be rewound after every output */
00465     py = pScratch2;
00466 
00467     /* Point at the last of the srcBLen slots used in scratch2 */
00468     pScr2 = pScratch2 + srcBLen - 1;
00469 
00470     /* px walks the shorter sequence forwards */
00471     px = pIn2;
00472 
00473     /* Number of groups of four samples to copy */
00474     k = srcBLen >> 2u;
00475 
00476     /* First part of the processing with loop unrolling copies 4 data points at a time.       
00477      ** A second loop below copies the remaining 1 to 3 samples. */
00478     while(k > 0u)
00479     {
00480       /* Source advances, destination retreats: four samples reversed per pass */
00481       *pScr2-- = *px++;
00482       *pScr2-- = *px++;
00483       *pScr2-- = *px++;
00484       *pScr2-- = *px++;
00485 
00486       /* Decrement the loop counter */
00487       k--;
00488     }
00489 
00490     /* If the count is not a multiple of 4, copy remaining samples here.       
00491      ** No loop unrolling is used. */
00492     k = srcBLen % 0x4u;
00493 
00494     while(k > 0u)
00495     {
00496       /* Reversed copy of one remaining sample */
00497       *pScr2-- = *px++;
00498 
00499       /* Decrement the loop counter */
00500       k--;
00501     }
00502 
00503     /* Initialize temporary scratch pointer */
00504     pScr1 = pScratch1;
00505     /* NOTE(review): (srcBLen - 1u) below wraps if the shorter length is 0 - confirm callers never pass an empty sequence. */
00506     /* Fill (srcBLen - 1u) zeros at the start of scratch1 */
00507     arm_fill_q15(0, pScr1, (srcBLen - 1u));
00508 
00509     /* Skip past the leading zeros */
00510     pScr1 += (srcBLen - 1u);
00511 
00512     /* Copy the longer sequence (srcALen samples) into scratch1 after the leading zeros */
00513 
00514 
00515     /* Number of groups of four samples to copy */
00516     k = srcALen >> 2u;
00517 
00518     /* First part of the processing with loop unrolling copies 4 data points at a time.       
00519      ** A second loop below copies the remaining 1 to 3 samples. */
00520     while(k > 0u)
00521     {
00522       /* Forward copy of four samples of the longer sequence */
00523       *pScr1++ = *pIn1++;
00524       *pScr1++ = *pIn1++;
00525       *pScr1++ = *pIn1++;
00526       *pScr1++ = *pIn1++;
00527 
00528       /* Decrement the loop counter */
00529       k--;
00530     }
00531 
00532     /* If the count is not a multiple of 4, copy remaining samples here.       
00533      ** No loop unrolling is used. */
00534     k = srcALen % 0x4u;
00535 
00536     while(k > 0u)
00537     {
00538       /* Forward copy of one remaining sample of the longer sequence */
00539       *pScr1++ = *pIn1++;
00540 
00541       /* Decrement the loop counter */
00542       k--;
00543     }
00544 
00545     /* Append (srcBLen - 1u) zeros after the copied samples */
00546     /* Number of groups of four zeros to write */
00547     k = (srcBLen - 1u) >> 2u;
00548 
00549     /* First part of the processing with loop unrolling writes 4 zeros at a time.       
00550      ** A second loop below writes the remaining 1 to 3 zeros. */
00551     while(k > 0u)
00552     {
00553       /* Write four zeros */
00554       *pScr1++ = 0;
00555       *pScr1++ = 0;
00556       *pScr1++ = 0;
00557       *pScr1++ = 0;
00558 
00559       /* Decrement the loop counter */
00560       k--;
00561     }
00562 
00563     /* If the count is not a multiple of 4, write the remaining zeros here.       
00564      ** No loop unrolling is used. */
00565     k = (srcBLen - 1u) % 0x4u;
00566 
00567     while(k > 0u)
00568     {
00569       /* Write one zero */
00570       *pScr1++ = 0;
00571 
00572       /* Decrement the loop counter */
00573       k--;
00574     }
00575 
00576 
00577     /* From here on pIn2 reads the reversed copy held in scratch2 */
00578     pIn2 = py;
00579     /* Start the sliding window firstIndex samples into scratch1 */
00580     pScratch1 += firstIndex;
00581     /* Results are written at pDst[firstIndex] onward, not at pDst[0] */
00582     pOut = pDst + firstIndex;
00583 
00584     /* Actual convolution process starts here: four output samples per pass */
00585     blkCnt = (numPoints) >> 2;
00586 
00587     while(blkCnt > 0)
00588     {
00589       /* Initialize temporary scratch pointer as scratch1 */
00590       pScr1 = pScratch1;
00591 
00592       /* Clear accumulators */
00593       acc0 = 0;
00594       acc1 = 0;
00595       acc2 = 0;
00596       acc3 = 0;
00597 
00598       /* Read two samples from scratch1 buffer */
00599       x10 = *pScr1++;
00600       x11 = *pScr1++;
00601 
00602       /* Read next two samples from scratch1 buffer */
00603       x20 = *pScr1++;
00604       x21 = *pScr1++;
00605       /* Each pass of the loop below consumes four samples of the shorter sequence */
00606       tapCnt = (srcBLen) >> 2u;
00607 
00608       while(tapCnt > 0u)
00609       {
00610 
00611         /* Read two samples from smaller buffer */
00612         y10 = *pIn2;
00613         y11 = *(pIn2 + 1u);
00614 
00615         /* multiply and accumulate */
00616         acc0 += (q31_t) x10 *y10;
00617         acc0 += (q31_t) x11 *y11;
00618         acc2 += (q31_t) x20 *y10;
00619         acc2 += (q31_t) x21 *y11;
00620 
00621         /* multiply and accumulate */
00622         acc1 += (q31_t) x11 *y10;
00623         acc1 += (q31_t) x20 *y11;
00624 
00625         /* Read next two samples from scratch1 buffer (old x10/x11 are no longer needed) */
00626         x10 = *pScr1;
00627         x11 = *(pScr1 + 1u);
00628 
00629         /* multiply and accumulate */
00630         acc3 += (q31_t) x21 *y10;
00631         acc3 += (q31_t) x10 *y11;
00632 
00633         /* Read next two samples from scratch2 buffer */
00634         y10 = *(pIn2 + 2u);
00635         y11 = *(pIn2 + 3u);
00636 
00637         /* multiply and accumulate */
00638         acc0 += (q31_t) x20 *y10;
00639         acc0 += (q31_t) x21 *y11;
00640         acc2 += (q31_t) x10 *y10;
00641         acc2 += (q31_t) x11 *y11;
00642         acc1 += (q31_t) x21 *y10;
00643         acc1 += (q31_t) x10 *y11;
00644 
00645         /* Read next two samples from scratch1 buffer (old x20/x21 are no longer needed) */
00646         x20 = *(pScr1 + 2);
00647         x21 = *(pScr1 + 3);
00648 
00649         /* multiply and accumulate */
00650         acc3 += (q31_t) x11 *y10;
00651         acc3 += (q31_t) x20 *y11;
00652 
00653         /* Advance both buffers by the four samples just consumed */
00654         pIn2 += 4u;
00655         pScr1 += 4u;
00656 
00657         /* Decrement the loop counter */
00658         tapCnt--;
00659       }
00660 
00661       /* Step pScr1 back by four samples: the pre-loop reads of x10..x21 advanced it by four */
00662       pScr1 -= 4u;
00663 
00664       /* Handle the remaining (srcBLen & 3) samples of the shorter sequence one at a time */
00665       tapCnt = (srcBLen) & 3u;
00666 
00667       while(tapCnt > 0u)
00668       {
00669         /* One sample of the shorter sequence contributes to all four accumulators */
00670         acc0 += (*pScr1++ * *pIn2);
00671         acc1 += (*pScr1++ * *pIn2);
00672         acc2 += (*pScr1++ * *pIn2);
00673         acc3 += (*pScr1++ * *pIn2++);
00674         /* Net advance of one scratch1 sample per pass */
00675         pScr1 -= 3u;
00676 
00677         /* Decrement the loop counter */
00678         tapCnt--;
00679       }
00680 
00681       blkCnt--;
00682 
00683 
00684       /* Shift each accumulator right by 15, saturate to 16 bits and store the four results one by one. */
00685       *pOut++ = __SSAT((acc0 >> 15), 16);
00686       *pOut++ = __SSAT((acc1 >> 15), 16);
00687       *pOut++ = __SSAT((acc2 >> 15), 16);
00688       *pOut++ = __SSAT((acc3 >> 15), 16);
00689 
00690       /* Rewind pIn2 to the start of scratch2 for the next group of outputs */
00691       pIn2 = py;
00692       /* Slide the scratch1 window past the four outputs just produced */
00693       pScratch1 += 4u;
00694 
00695     }
00696 
00697     /* Number of outputs left over after the groups of four */
00698     blkCnt = numPoints & 0x3;
00699 
00700     /* Calculate the remaining 1 to 3 output samples, one per pass */
00701     while(blkCnt > 0)
00702     {
00703       /* Initialize temporary scratch pointer as scratch1 */
00704       pScr1 = pScratch1;
00705 
00706       /* Clear accumulator */
00707       acc0 = 0;
00708       /* Two samples of the shorter sequence are consumed per pass */
00709       tapCnt = (srcBLen) >> 1u;
00710 
00711       while(tapCnt > 0u)
00712       {
00713 
00714         /* Read next two samples from scratch1 buffer */
00715         x10 = *pScr1++;
00716         x11 = *pScr1++;
00717 
00718         /* Read two samples from smaller buffer */
00719         y10 = *pIn2++;
00720         y11 = *pIn2++;
00721 
00722         /* multiply and accumulate */
00723         acc0 += (q31_t) x10 *y10;
00724         acc0 += (q31_t) x11 *y11;
00725 
00726         /* Decrement the loop counter */
00727         tapCnt--;
00728       }
00729       /* At most one sample is left when srcBLen is odd */
00730       tapCnt = (srcBLen) & 1u;
00731 
00732       /* Handle that last sample of the shorter sequence, if any */
00733       while(tapCnt > 0u)
00734       {
00735 
00736         /* accumulate the result */
00737         acc0 += (*pScr1++ * *pIn2++);
00738 
00739         /* Decrement the loop counter */
00740         tapCnt--;
00741       }
00742 
00743       blkCnt--;
00744 
00745       /* Shift right by 15, saturate to 16 bits and store the result in the destination buffer. */
00746       *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));
00747 
00748       /* Rewind pIn2 to the start of scratch2 */
00749       pIn2 = py;
00750       /* Slide the scratch1 window by one output sample */
00751       pScratch1 += 1u;
00752 
00753     }
00754 
00755     /* set status as ARM_MATH_SUCCESS */
00756     status = ARM_MATH_SUCCESS;
00757 
00758   }
00759 
00760   /* Return to application */
00761   return (status);
00762 }
00763 
00764 #endif  /*  #ifndef UNALIGNED_SUPPORT_DISABLE   */
00765 
00766 /**    
00767  * @} end of PartialConv group    
00768  */