CMSIS DSP library
Dependents: KL25Z_FFT_Demo Hat_Board_v5_1 KL25Z_FFT_Demo_tony KL25Z_FFT_Demo_tony ... more
Fork of mbed-dsp by
arm_conv_partial_fast_opt_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 17. January 2013 00005 * $Revision: V1.4.1 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_partial_fast_opt_q15.c 00009 * 00010 * Description: Fast Q15 Partial convolution. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupFilters 00045 */ 00046 00047 /** 00048 * @addtogroup PartialConv 00049 * @{ 00050 */ 00051 00052 /** 00053 * @brief Partial convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4. 00054 * @param[in] *pSrcA points to the first input sequence. 00055 * @param[in] srcALen length of the first input sequence. 00056 * @param[in] *pSrcB points to the second input sequence. 00057 * @param[in] srcBLen length of the second input sequence. 00058 * @param[out] *pDst points to the location where the output result is written. 00059 * @param[in] firstIndex is the first output sample to start with. 00060 * @param[in] numPoints is the number of output points to be computed. 00061 * @param[in] *pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. 00062 * @param[in] *pScratch2 points to scratch buffer of size min(srcALen, srcBLen). 00063 * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. 00064 * 00065 * See <code>arm_conv_partial_q15()</code> for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion. 00066 * 00067 * \par Restrictions 00068 * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE 00069 * In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit 00070 * 00071 */ 00072 00073 #ifndef UNALIGNED_SUPPORT_DISABLE 00074 00075 arm_status arm_conv_partial_fast_opt_q15( 00076 q15_t * pSrcA, 00077 uint32_t srcALen, 00078 q15_t * pSrcB, 00079 uint32_t srcBLen, 00080 q15_t * pDst, 00081 uint32_t firstIndex, 00082 uint32_t numPoints, 00083 q15_t * pScratch1, 00084 q15_t * pScratch2) 00085 { 00086 00087 q15_t *pOut = pDst; /* output pointer */ 00088 q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */ 00089 q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */ 00090 q31_t acc0, acc1, acc2, acc3; /* Accumulator */ 00091 q31_t x1, x2, x3; /* Temporary variables to hold state and coefficient values */ 00092 q31_t y1, y2; /* State variables */ 00093 q15_t *pIn1; /* inputA pointer */ 00094 q15_t *pIn2; /* inputB pointer */ 00095 q15_t *px; /* Intermediate inputA pointer */ 00096 q15_t *py; /* Intermediate inputB pointer */ 00097 uint32_t j, k, blkCnt; /* loop counter */ 00098 arm_status status; 00099 00100 uint32_t tapCnt; /* loop count */ 00101 00102 /* Check for range of output samples to be calculated */ 00103 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00104 { 00105 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00106 status = ARM_MATH_ARGUMENT_ERROR; 00107 } 00108 else 00109 { 00110 00111 /* The algorithm implementation is based on the lengths of the inputs. */ 00112 /* srcB is always made to slide across srcA. */ 00113 /* So srcBLen is always considered as shorter or equal to srcALen */ 00114 if(srcALen >= srcBLen) 00115 { 00116 /* Initialization of inputA pointer */ 00117 pIn1 = pSrcA; 00118 00119 /* Initialization of inputB pointer */ 00120 pIn2 = pSrcB; 00121 } 00122 else 00123 { 00124 /* Initialization of inputA pointer */ 00125 pIn1 = pSrcB; 00126 00127 /* Initialization of inputB pointer */ 00128 pIn2 = pSrcA; 00129 00130 /* srcBLen is always considered as shorter or equal to srcALen */ 00131 j = srcBLen; 00132 srcBLen = srcALen; 00133 srcALen = j; 00134 } 00135 00136 /* Temporary pointer for scratch2 */ 00137 py = pScratch2; 00138 00139 /* pointer to take end of scratch2 buffer */ 00140 pScr2 = pScratch2 + srcBLen - 1; 00141 00142 /* points to smaller length sequence */ 00143 px = pIn2; 00144 00145 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00146 k = srcBLen >> 2u; 00147 00148 /* First part of the processing with loop unrolling copies 4 data points at a time. 00149 ** a second loop below copies for the remaining 1 to 3 samples. */ 00150 00151 /* Copy smaller length input sequence in reverse order into second scratch buffer */ 00152 while(k > 0u) 00153 { 00154 /* copy second buffer in reversal manner */ 00155 *pScr2-- = *px++; 00156 *pScr2-- = *px++; 00157 *pScr2-- = *px++; 00158 *pScr2-- = *px++; 00159 00160 /* Decrement the loop counter */ 00161 k--; 00162 } 00163 00164 /* If the count is not a multiple of 4, copy remaining samples here. 00165 ** No loop unrolling is used. */ 00166 k = srcBLen % 0x4u; 00167 00168 while(k > 0u) 00169 { 00170 /* copy second buffer in reversal manner for remaining samples */ 00171 *pScr2-- = *px++; 00172 00173 /* Decrement the loop counter */ 00174 k--; 00175 } 00176 00177 /* Initialze temporary scratch pointer */ 00178 pScr1 = pScratch1; 00179 00180 /* Assuming scratch1 buffer is aligned by 32-bit */ 00181 /* Fill (srcBLen - 1u) zeros in scratch buffer */ 00182 arm_fill_q15(0, pScr1, (srcBLen - 1u)); 00183 00184 /* Update temporary scratch pointer */ 00185 pScr1 += (srcBLen - 1u); 00186 00187 /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */ 00188 00189 /* Copy (srcALen) samples in scratch buffer */ 00190 arm_copy_q15(pIn1, pScr1, srcALen); 00191 00192 /* Update pointers */ 00193 pScr1 += srcALen; 00194 00195 /* Fill (srcBLen - 1u) zeros at end of scratch buffer */ 00196 arm_fill_q15(0, pScr1, (srcBLen - 1u)); 00197 00198 /* Update pointer */ 00199 pScr1 += (srcBLen - 1u); 00200 00201 /* Initialization of pIn2 pointer */ 00202 pIn2 = py; 00203 00204 pScratch1 += firstIndex; 00205 00206 pOut = pDst + firstIndex; 00207 00208 /* First part of the processing with loop unrolling process 4 data points at a time. 00209 ** a second loop below process for the remaining 1 to 3 samples. */ 00210 00211 /* Actual convolution process starts here */ 00212 blkCnt = (numPoints) >> 2; 00213 00214 while(blkCnt > 0) 00215 { 00216 /* Initialze temporary scratch pointer as scratch1 */ 00217 pScr1 = pScratch1; 00218 00219 /* Clear Accumlators */ 00220 acc0 = 0; 00221 acc1 = 0; 00222 acc2 = 0; 00223 acc3 = 0; 00224 00225 /* Read two samples from scratch1 buffer */ 00226 x1 = *__SIMD32(pScr1)++; 00227 00228 /* Read next two samples from scratch1 buffer */ 00229 x2 = *__SIMD32(pScr1)++; 00230 00231 tapCnt = (srcBLen) >> 2u; 00232 00233 while(tapCnt > 0u) 00234 { 00235 00236 /* Read four samples from smaller buffer */ 00237 y1 = _SIMD32_OFFSET(pIn2); 00238 y2 = _SIMD32_OFFSET(pIn2 + 2u); 00239 00240 /* multiply and accumlate */ 00241 acc0 = __SMLAD(x1, y1, acc0); 00242 acc2 = __SMLAD(x2, y1, acc2); 00243 00244 /* pack input data */ 00245 #ifndef ARM_MATH_BIG_ENDIAN 00246 x3 = __PKHBT(x2, x1, 0); 00247 #else 00248 x3 = __PKHBT(x1, x2, 0); 00249 #endif 00250 00251 /* multiply and accumlate */ 00252 acc1 = __SMLADX(x3, y1, acc1); 00253 00254 /* Read next two samples from scratch1 buffer */ 00255 x1 = _SIMD32_OFFSET(pScr1); 00256 00257 /* multiply and accumlate */ 00258 acc0 = __SMLAD(x2, y2, acc0); 00259 00260 acc2 = __SMLAD(x1, y2, acc2); 00261 00262 /* pack input data */ 00263 #ifndef ARM_MATH_BIG_ENDIAN 00264 x3 = __PKHBT(x1, x2, 0); 00265 #else 00266 x3 = __PKHBT(x2, x1, 0); 00267 #endif 00268 00269 acc3 = __SMLADX(x3, y1, acc3); 00270 acc1 = __SMLADX(x3, y2, acc1); 00271 00272 x2 = _SIMD32_OFFSET(pScr1 + 2u); 00273 00274 #ifndef ARM_MATH_BIG_ENDIAN 00275 x3 = __PKHBT(x2, x1, 0); 00276 #else 00277 x3 = __PKHBT(x1, x2, 0); 00278 #endif 00279 00280 acc3 = __SMLADX(x3, y2, acc3); 00281 00282 /* update scratch pointers */ 00283 pIn2 += 4u; 00284 pScr1 += 4u; 00285 00286 00287 /* Decrement the loop counter */ 00288 tapCnt--; 00289 } 00290 00291 /* Update scratch pointer for remaining samples of smaller length sequence */ 00292 pScr1 -= 4u; 00293 00294 /* apply same above for remaining samples of smaller length sequence */ 00295 tapCnt = (srcBLen) & 3u; 00296 00297 while(tapCnt > 0u) 00298 { 00299 00300 /* accumlate the results */ 00301 acc0 += (*pScr1++ * *pIn2); 00302 acc1 += (*pScr1++ * *pIn2); 00303 acc2 += (*pScr1++ * *pIn2); 00304 acc3 += (*pScr1++ * *pIn2++); 00305 00306 pScr1 -= 3u; 00307 00308 /* Decrement the loop counter */ 00309 tapCnt--; 00310 } 00311 00312 blkCnt--; 00313 00314 00315 /* Store the results in the accumulators in the destination buffer. */ 00316 00317 #ifndef ARM_MATH_BIG_ENDIAN 00318 00319 *__SIMD32(pOut)++ = 00320 __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); 00321 *__SIMD32(pOut)++ = 00322 __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); 00323 00324 #else 00325 00326 *__SIMD32(pOut)++ = 00327 __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16); 00328 *__SIMD32(pOut)++ = 00329 __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16); 00330 00331 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00332 00333 /* Initialization of inputB pointer */ 00334 pIn2 = py; 00335 00336 pScratch1 += 4u; 00337 00338 } 00339 00340 00341 blkCnt = numPoints & 0x3; 00342 00343 /* Calculate convolution for remaining samples of Bigger length sequence */ 00344 while(blkCnt > 0) 00345 { 00346 /* Initialze temporary scratch pointer as scratch1 */ 00347 pScr1 = pScratch1; 00348 00349 /* Clear Accumlators */ 00350 acc0 = 0; 00351 00352 tapCnt = (srcBLen) >> 1u; 00353 00354 while(tapCnt > 0u) 00355 { 00356 00357 /* Read next two samples from scratch1 buffer */ 00358 x1 = *__SIMD32(pScr1)++; 00359 00360 /* Read two samples from smaller buffer */ 00361 y1 = *__SIMD32(pIn2)++; 00362 00363 acc0 = __SMLAD(x1, y1, acc0); 00364 00365 /* Decrement the loop counter */ 00366 tapCnt--; 00367 } 00368 00369 tapCnt = (srcBLen) & 1u; 00370 00371 /* apply same above for remaining samples of smaller length sequence */ 00372 while(tapCnt > 0u) 00373 { 00374 00375 /* accumlate the results */ 00376 acc0 += (*pScr1++ * *pIn2++); 00377 00378 /* Decrement the loop counter */ 00379 tapCnt--; 00380 } 00381 00382 blkCnt--; 00383 00384 /* The result is in 2.30 format. Convert to 1.15 with saturation. 00385 ** Then store the output in the destination buffer. */ 00386 *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16)); 00387 00388 /* Initialization of inputB pointer */ 00389 pIn2 = py; 00390 00391 pScratch1 += 1u; 00392 00393 } 00394 /* set status as ARM_MATH_SUCCESS */ 00395 status = ARM_MATH_SUCCESS; 00396 } 00397 /* Return to application */ 00398 return (status); 00399 } 00400 00401 #else 00402 00403 arm_status arm_conv_partial_fast_opt_q15( 00404 q15_t * pSrcA, 00405 uint32_t srcALen, 00406 q15_t * pSrcB, 00407 uint32_t srcBLen, 00408 q15_t * pDst, 00409 uint32_t firstIndex, 00410 uint32_t numPoints, 00411 q15_t * pScratch1, 00412 q15_t * pScratch2) 00413 { 00414 00415 q15_t *pOut = pDst; /* output pointer */ 00416 q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */ 00417 q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */ 00418 q31_t acc0, acc1, acc2, acc3; /* Accumulator */ 00419 q15_t *pIn1; /* inputA pointer */ 00420 q15_t *pIn2; /* inputB pointer */ 00421 q15_t *px; /* Intermediate inputA pointer */ 00422 q15_t *py; /* Intermediate inputB pointer */ 00423 uint32_t j, k, blkCnt; /* loop counter */ 00424 arm_status status; /* Status variable */ 00425 uint32_t tapCnt; /* loop count */ 00426 q15_t x10, x11, x20, x21; /* Temporary variables to hold srcA buffer */ 00427 q15_t y10, y11; /* Temporary variables to hold srcB buffer */ 00428 00429 00430 /* Check for range of output samples to be calculated */ 00431 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00432 { 00433 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00434 status = ARM_MATH_ARGUMENT_ERROR; 00435 } 00436 else 00437 { 00438 00439 /* The algorithm implementation is based on the lengths of the inputs. */ 00440 /* srcB is always made to slide across srcA. */ 00441 /* So srcBLen is always considered as shorter or equal to srcALen */ 00442 if(srcALen >= srcBLen) 00443 { 00444 /* Initialization of inputA pointer */ 00445 pIn1 = pSrcA; 00446 00447 /* Initialization of inputB pointer */ 00448 pIn2 = pSrcB; 00449 } 00450 else 00451 { 00452 /* Initialization of inputA pointer */ 00453 pIn1 = pSrcB; 00454 00455 /* Initialization of inputB pointer */ 00456 pIn2 = pSrcA; 00457 00458 /* srcBLen is always considered as shorter or equal to srcALen */ 00459 j = srcBLen; 00460 srcBLen = srcALen; 00461 srcALen = j; 00462 } 00463 00464 /* Temporary pointer for scratch2 */ 00465 py = pScratch2; 00466 00467 /* pointer to take end of scratch2 buffer */ 00468 pScr2 = pScratch2 + srcBLen - 1; 00469 00470 /* points to smaller length sequence */ 00471 px = pIn2; 00472 00473 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00474 k = srcBLen >> 2u; 00475 00476 /* First part of the processing with loop unrolling copies 4 data points at a time. 00477 ** a second loop below copies for the remaining 1 to 3 samples. */ 00478 while(k > 0u) 00479 { 00480 /* copy second buffer in reversal manner */ 00481 *pScr2-- = *px++; 00482 *pScr2-- = *px++; 00483 *pScr2-- = *px++; 00484 *pScr2-- = *px++; 00485 00486 /* Decrement the loop counter */ 00487 k--; 00488 } 00489 00490 /* If the count is not a multiple of 4, copy remaining samples here. 00491 ** No loop unrolling is used. */ 00492 k = srcBLen % 0x4u; 00493 00494 while(k > 0u) 00495 { 00496 /* copy second buffer in reversal manner for remaining samples */ 00497 *pScr2-- = *px++; 00498 00499 /* Decrement the loop counter */ 00500 k--; 00501 } 00502 00503 /* Initialze temporary scratch pointer */ 00504 pScr1 = pScratch1; 00505 00506 /* Fill (srcBLen - 1u) zeros in scratch buffer */ 00507 arm_fill_q15(0, pScr1, (srcBLen - 1u)); 00508 00509 /* Update temporary scratch pointer */ 00510 pScr1 += (srcBLen - 1u); 00511 00512 /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */ 00513 00514 00515 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00516 k = srcALen >> 2u; 00517 00518 /* First part of the processing with loop unrolling copies 4 data points at a time. 00519 ** a second loop below copies for the remaining 1 to 3 samples. */ 00520 while(k > 0u) 00521 { 00522 /* copy second buffer in reversal manner */ 00523 *pScr1++ = *pIn1++; 00524 *pScr1++ = *pIn1++; 00525 *pScr1++ = *pIn1++; 00526 *pScr1++ = *pIn1++; 00527 00528 /* Decrement the loop counter */ 00529 k--; 00530 } 00531 00532 /* If the count is not a multiple of 4, copy remaining samples here. 00533 ** No loop unrolling is used. */ 00534 k = srcALen % 0x4u; 00535 00536 while(k > 0u) 00537 { 00538 /* copy second buffer in reversal manner for remaining samples */ 00539 *pScr1++ = *pIn1++; 00540 00541 /* Decrement the loop counter */ 00542 k--; 00543 } 00544 00545 00546 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00547 k = (srcBLen - 1u) >> 2u; 00548 00549 /* First part of the processing with loop unrolling copies 4 data points at a time. 00550 ** a second loop below copies for the remaining 1 to 3 samples. */ 00551 while(k > 0u) 00552 { 00553 /* copy second buffer in reversal manner */ 00554 *pScr1++ = 0; 00555 *pScr1++ = 0; 00556 *pScr1++ = 0; 00557 *pScr1++ = 0; 00558 00559 /* Decrement the loop counter */ 00560 k--; 00561 } 00562 00563 /* If the count is not a multiple of 4, copy remaining samples here. 00564 ** No loop unrolling is used. */ 00565 k = (srcBLen - 1u) % 0x4u; 00566 00567 while(k > 0u) 00568 { 00569 /* copy second buffer in reversal manner for remaining samples */ 00570 *pScr1++ = 0; 00571 00572 /* Decrement the loop counter */ 00573 k--; 00574 } 00575 00576 00577 /* Initialization of pIn2 pointer */ 00578 pIn2 = py; 00579 00580 pScratch1 += firstIndex; 00581 00582 pOut = pDst + firstIndex; 00583 00584 /* Actual convolution process starts here */ 00585 blkCnt = (numPoints) >> 2; 00586 00587 while(blkCnt > 0) 00588 { 00589 /* Initialze temporary scratch pointer as scratch1 */ 00590 pScr1 = pScratch1; 00591 00592 /* Clear Accumlators */ 00593 acc0 = 0; 00594 acc1 = 0; 00595 acc2 = 0; 00596 acc3 = 0; 00597 00598 /* Read two samples from scratch1 buffer */ 00599 x10 = *pScr1++; 00600 x11 = *pScr1++; 00601 00602 /* Read next two samples from scratch1 buffer */ 00603 x20 = *pScr1++; 00604 x21 = *pScr1++; 00605 00606 tapCnt = (srcBLen) >> 2u; 00607 00608 while(tapCnt > 0u) 00609 { 00610 00611 /* Read two samples from smaller buffer */ 00612 y10 = *pIn2; 00613 y11 = *(pIn2 + 1u); 00614 00615 /* multiply and accumlate */ 00616 acc0 += (q31_t) x10 *y10; 00617 acc0 += (q31_t) x11 *y11; 00618 acc2 += (q31_t) x20 *y10; 00619 acc2 += (q31_t) x21 *y11; 00620 00621 /* multiply and accumlate */ 00622 acc1 += (q31_t) x11 *y10; 00623 acc1 += (q31_t) x20 *y11; 00624 00625 /* Read next two samples from scratch1 buffer */ 00626 x10 = *pScr1; 00627 x11 = *(pScr1 + 1u); 00628 00629 /* multiply and accumlate */ 00630 acc3 += (q31_t) x21 *y10; 00631 acc3 += (q31_t) x10 *y11; 00632 00633 /* Read next two samples from scratch2 buffer */ 00634 y10 = *(pIn2 + 2u); 00635 y11 = *(pIn2 + 3u); 00636 00637 /* multiply and accumlate */ 00638 acc0 += (q31_t) x20 *y10; 00639 acc0 += (q31_t) x21 *y11; 00640 acc2 += (q31_t) x10 *y10; 00641 acc2 += (q31_t) x11 *y11; 00642 acc1 += (q31_t) x21 *y10; 00643 acc1 += (q31_t) x10 *y11; 00644 00645 /* Read next two samples from scratch1 buffer */ 00646 x20 = *(pScr1 + 2); 00647 x21 = *(pScr1 + 3); 00648 00649 /* multiply and accumlate */ 00650 acc3 += (q31_t) x11 *y10; 00651 acc3 += (q31_t) x20 *y11; 00652 00653 /* update scratch pointers */ 00654 pIn2 += 4u; 00655 pScr1 += 4u; 00656 00657 /* Decrement the loop counter */ 00658 tapCnt--; 00659 } 00660 00661 /* Update scratch pointer for remaining samples of smaller length sequence */ 00662 pScr1 -= 4u; 00663 00664 /* apply same above for remaining samples of smaller length sequence */ 00665 tapCnt = (srcBLen) & 3u; 00666 00667 while(tapCnt > 0u) 00668 { 00669 /* accumlate the results */ 00670 acc0 += (*pScr1++ * *pIn2); 00671 acc1 += (*pScr1++ * *pIn2); 00672 acc2 += (*pScr1++ * *pIn2); 00673 acc3 += (*pScr1++ * *pIn2++); 00674 00675 pScr1 -= 3u; 00676 00677 /* Decrement the loop counter */ 00678 tapCnt--; 00679 } 00680 00681 blkCnt--; 00682 00683 00684 /* Store the results in the accumulators in the destination buffer. */ 00685 *pOut++ = __SSAT((acc0 >> 15), 16); 00686 *pOut++ = __SSAT((acc1 >> 15), 16); 00687 *pOut++ = __SSAT((acc2 >> 15), 16); 00688 *pOut++ = __SSAT((acc3 >> 15), 16); 00689 00690 /* Initialization of inputB pointer */ 00691 pIn2 = py; 00692 00693 pScratch1 += 4u; 00694 00695 } 00696 00697 00698 blkCnt = numPoints & 0x3; 00699 00700 /* Calculate convolution for remaining samples of Bigger length sequence */ 00701 while(blkCnt > 0) 00702 { 00703 /* Initialze temporary scratch pointer as scratch1 */ 00704 pScr1 = pScratch1; 00705 00706 /* Clear Accumlators */ 00707 acc0 = 0; 00708 00709 tapCnt = (srcBLen) >> 1u; 00710 00711 while(tapCnt > 0u) 00712 { 00713 00714 /* Read next two samples from scratch1 buffer */ 00715 x10 = *pScr1++; 00716 x11 = *pScr1++; 00717 00718 /* Read two samples from smaller buffer */ 00719 y10 = *pIn2++; 00720 y11 = *pIn2++; 00721 00722 /* multiply and accumlate */ 00723 acc0 += (q31_t) x10 *y10; 00724 acc0 += (q31_t) x11 *y11; 00725 00726 /* Decrement the loop counter */ 00727 tapCnt--; 00728 } 00729 00730 tapCnt = (srcBLen) & 1u; 00731 00732 /* apply same above for remaining samples of smaller length sequence */ 00733 while(tapCnt > 0u) 00734 { 00735 00736 /* accumlate the results */ 00737 acc0 += (*pScr1++ * *pIn2++); 00738 00739 /* Decrement the loop counter */ 00740 tapCnt--; 00741 } 00742 00743 blkCnt--; 00744 00745 /* Store the result in the accumulator in the destination buffer. */ 00746 *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16)); 00747 00748 /* Initialization of inputB pointer */ 00749 pIn2 = py; 00750 00751 pScratch1 += 1u; 00752 00753 } 00754 00755 /* set status as ARM_MATH_SUCCESS */ 00756 status = ARM_MATH_SUCCESS; 00757 00758 } 00759 00760 /* Return to application */ 00761 return (status); 00762 } 00763 00764 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00765 00766 /** 00767 * @} end of PartialConv group 00768 */
Generated on Tue Jul 12 2022 12:36:54 by 1.7.2