CMSIS DSP library
Dependents: KL25Z_FFT_Demo Hat_Board_v5_1 KL25Z_FFT_Demo_tony KL25Z_FFT_Demo_tony ... more
Fork of mbed-dsp by
arm_conv_partial_opt_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 17. January 2013 00005 * $Revision: V1.4.1 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_partial_opt_q15.c 00009 * 00010 * Description: Partial convolution of Q15 sequences. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupFilters 00045 */ 00046 00047 /** 00048 * @addtogroup PartialConv 00049 * @{ 00050 */ 00051 00052 /** 00053 * @brief Partial convolution of Q15 sequences. 00054 * @param[in] *pSrcA points to the first input sequence. 00055 * @param[in] srcALen length of the first input sequence. 00056 * @param[in] *pSrcB points to the second input sequence. 00057 * @param[in] srcBLen length of the second input sequence. 00058 * @param[out] *pDst points to the location where the output result is written. 00059 * @param[in] firstIndex is the first output sample to start with. 00060 * @param[in] numPoints is the number of output points to be computed. 00061 * @param[in] *pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. 00062 * @param[in] *pScratch2 points to scratch buffer of size min(srcALen, srcBLen). 00063 * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. 
00064 * 00065 * \par Restrictions 00066 * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE 00067 * In this case input, output, state buffers should be aligned by 32-bit 00068 * 00069 * Refer to <code>arm_conv_partial_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4. 00070 * 00071 * 00072 */ 00073 00074 #ifndef UNALIGNED_SUPPORT_DISABLE 00075 00076 arm_status arm_conv_partial_opt_q15( 00077 q15_t * pSrcA, 00078 uint32_t srcALen, 00079 q15_t * pSrcB, 00080 uint32_t srcBLen, 00081 q15_t * pDst, 00082 uint32_t firstIndex, 00083 uint32_t numPoints, 00084 q15_t * pScratch1, 00085 q15_t * pScratch2) 00086 { 00087 00088 q15_t *pOut = pDst; /* output pointer */ 00089 q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */ 00090 q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */ 00091 q63_t acc0, acc1, acc2, acc3; /* Accumulator */ 00092 q31_t x1, x2, x3; /* Temporary variables to hold state and coefficient values */ 00093 q31_t y1, y2; /* State variables */ 00094 q15_t *pIn1; /* inputA pointer */ 00095 q15_t *pIn2; /* inputB pointer */ 00096 q15_t *px; /* Intermediate inputA pointer */ 00097 q15_t *py; /* Intermediate inputB pointer */ 00098 uint32_t j, k, blkCnt; /* loop counter */ 00099 arm_status status; /* Status variable */ 00100 uint32_t tapCnt; /* loop count */ 00101 00102 /* Check for range of output samples to be calculated */ 00103 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00104 { 00105 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00106 status = ARM_MATH_ARGUMENT_ERROR; 00107 } 00108 else 00109 { 00110 00111 /* The algorithm implementation is based on the lengths of the inputs. */ 00112 /* srcB is always made to slide across srcA. 
*/ 00113 /* So srcBLen is always considered as shorter or equal to srcALen */ 00114 if(srcALen >= srcBLen) 00115 { 00116 /* Initialization of inputA pointer */ 00117 pIn1 = pSrcA; 00118 00119 /* Initialization of inputB pointer */ 00120 pIn2 = pSrcB; 00121 } 00122 else 00123 { 00124 /* Initialization of inputA pointer */ 00125 pIn1 = pSrcB; 00126 00127 /* Initialization of inputB pointer */ 00128 pIn2 = pSrcA; 00129 00130 /* srcBLen is always considered as shorter or equal to srcALen */ 00131 j = srcBLen; 00132 srcBLen = srcALen; 00133 srcALen = j; 00134 } 00135 00136 /* Temporary pointer for scratch2 */ 00137 py = pScratch2; 00138 00139 /* pointer to take end of scratch2 buffer */ 00140 pScr2 = pScratch2 + srcBLen - 1; 00141 00142 /* points to smaller length sequence */ 00143 px = pIn2; 00144 00145 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00146 k = srcBLen >> 2u; 00147 00148 /* First part of the processing with loop unrolling copies 4 data points at a time. 00149 ** a second loop below copies for the remaining 1 to 3 samples. */ 00150 while(k > 0u) 00151 { 00152 /* copy second buffer in reversal manner */ 00153 *pScr2-- = *px++; 00154 *pScr2-- = *px++; 00155 *pScr2-- = *px++; 00156 *pScr2-- = *px++; 00157 00158 /* Decrement the loop counter */ 00159 k--; 00160 } 00161 00162 /* If the count is not a multiple of 4, copy remaining samples here. 00163 ** No loop unrolling is used. 
*/ 00164 k = srcBLen % 0x4u; 00165 00166 while(k > 0u) 00167 { 00168 /* copy second buffer in reversal manner for remaining samples */ 00169 *pScr2-- = *px++; 00170 00171 /* Decrement the loop counter */ 00172 k--; 00173 } 00174 00175 /* Initialze temporary scratch pointer */ 00176 pScr1 = pScratch1; 00177 00178 /* Fill (srcBLen - 1u) zeros in scratch buffer */ 00179 arm_fill_q15(0, pScr1, (srcBLen - 1u)); 00180 00181 /* Update temporary scratch pointer */ 00182 pScr1 += (srcBLen - 1u); 00183 00184 /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */ 00185 00186 /* Copy (srcALen) samples in scratch buffer */ 00187 arm_copy_q15(pIn1, pScr1, srcALen); 00188 00189 /* Update pointers */ 00190 pScr1 += srcALen; 00191 00192 /* Fill (srcBLen - 1u) zeros at end of scratch buffer */ 00193 arm_fill_q15(0, pScr1, (srcBLen - 1u)); 00194 00195 /* Update pointer */ 00196 pScr1 += (srcBLen - 1u); 00197 00198 /* Initialization of pIn2 pointer */ 00199 pIn2 = py; 00200 00201 pScratch1 += firstIndex; 00202 00203 pOut = pDst + firstIndex; 00204 00205 /* Actual convolution process starts here */ 00206 blkCnt = (numPoints) >> 2; 00207 00208 while(blkCnt > 0) 00209 { 00210 /* Initialze temporary scratch pointer as scratch1 */ 00211 pScr1 = pScratch1; 00212 00213 /* Clear Accumlators */ 00214 acc0 = 0; 00215 acc1 = 0; 00216 acc2 = 0; 00217 acc3 = 0; 00218 00219 /* Read two samples from scratch1 buffer */ 00220 x1 = *__SIMD32(pScr1)++; 00221 00222 /* Read next two samples from scratch1 buffer */ 00223 x2 = *__SIMD32(pScr1)++; 00224 00225 tapCnt = (srcBLen) >> 2u; 00226 00227 while(tapCnt > 0u) 00228 { 00229 00230 /* Read four samples from smaller buffer */ 00231 y1 = _SIMD32_OFFSET(pIn2); 00232 y2 = _SIMD32_OFFSET(pIn2 + 2u); 00233 00234 /* multiply and accumlate */ 00235 acc0 = __SMLALD(x1, y1, acc0); 00236 acc2 = __SMLALD(x2, y1, acc2); 00237 00238 /* pack input data */ 00239 #ifndef ARM_MATH_BIG_ENDIAN 00240 x3 = __PKHBT(x2, x1, 0); 00241 #else 00242 x3 = __PKHBT(x1, 
x2, 0); 00243 #endif 00244 00245 /* multiply and accumlate */ 00246 acc1 = __SMLALDX(x3, y1, acc1); 00247 00248 /* Read next two samples from scratch1 buffer */ 00249 x1 = _SIMD32_OFFSET(pScr1); 00250 00251 /* multiply and accumlate */ 00252 acc0 = __SMLALD(x2, y2, acc0); 00253 acc2 = __SMLALD(x1, y2, acc2); 00254 00255 /* pack input data */ 00256 #ifndef ARM_MATH_BIG_ENDIAN 00257 x3 = __PKHBT(x1, x2, 0); 00258 #else 00259 x3 = __PKHBT(x2, x1, 0); 00260 #endif 00261 00262 acc3 = __SMLALDX(x3, y1, acc3); 00263 acc1 = __SMLALDX(x3, y2, acc1); 00264 00265 x2 = _SIMD32_OFFSET(pScr1 + 2u); 00266 00267 #ifndef ARM_MATH_BIG_ENDIAN 00268 x3 = __PKHBT(x2, x1, 0); 00269 #else 00270 x3 = __PKHBT(x1, x2, 0); 00271 #endif 00272 00273 acc3 = __SMLALDX(x3, y2, acc3); 00274 00275 /* update scratch pointers */ 00276 pIn2 += 4u; 00277 pScr1 += 4u; 00278 00279 00280 /* Decrement the loop counter */ 00281 tapCnt--; 00282 } 00283 00284 /* Update scratch pointer for remaining samples of smaller length sequence */ 00285 pScr1 -= 4u; 00286 00287 /* apply same above for remaining samples of smaller length sequence */ 00288 tapCnt = (srcBLen) & 3u; 00289 00290 while(tapCnt > 0u) 00291 { 00292 /* accumlate the results */ 00293 acc0 += (*pScr1++ * *pIn2); 00294 acc1 += (*pScr1++ * *pIn2); 00295 acc2 += (*pScr1++ * *pIn2); 00296 acc3 += (*pScr1++ * *pIn2++); 00297 00298 pScr1 -= 3u; 00299 00300 /* Decrement the loop counter */ 00301 tapCnt--; 00302 } 00303 00304 blkCnt--; 00305 00306 00307 /* Store the results in the accumulators in the destination buffer. 
*/ 00308 00309 #ifndef ARM_MATH_BIG_ENDIAN 00310 00311 *__SIMD32(pOut)++ = 00312 __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); 00313 *__SIMD32(pOut)++ = 00314 __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); 00315 00316 #else 00317 00318 *__SIMD32(pOut)++ = 00319 __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16); 00320 *__SIMD32(pOut)++ = 00321 __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16); 00322 00323 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00324 00325 /* Initialization of inputB pointer */ 00326 pIn2 = py; 00327 00328 pScratch1 += 4u; 00329 00330 } 00331 00332 00333 blkCnt = numPoints & 0x3; 00334 00335 /* Calculate convolution for remaining samples of Bigger length sequence */ 00336 while(blkCnt > 0) 00337 { 00338 /* Initialze temporary scratch pointer as scratch1 */ 00339 pScr1 = pScratch1; 00340 00341 /* Clear Accumlators */ 00342 acc0 = 0; 00343 00344 tapCnt = (srcBLen) >> 1u; 00345 00346 while(tapCnt > 0u) 00347 { 00348 00349 /* Read next two samples from scratch1 buffer */ 00350 x1 = *__SIMD32(pScr1)++; 00351 00352 /* Read two samples from smaller buffer */ 00353 y1 = *__SIMD32(pIn2)++; 00354 00355 acc0 = __SMLALD(x1, y1, acc0); 00356 00357 /* Decrement the loop counter */ 00358 tapCnt--; 00359 } 00360 00361 tapCnt = (srcBLen) & 1u; 00362 00363 /* apply same above for remaining samples of smaller length sequence */ 00364 while(tapCnt > 0u) 00365 { 00366 00367 /* accumlate the results */ 00368 acc0 += (*pScr1++ * *pIn2++); 00369 00370 /* Decrement the loop counter */ 00371 tapCnt--; 00372 } 00373 00374 blkCnt--; 00375 00376 /* Store the result in the accumulator in the destination buffer. 
*/ 00377 *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16)); 00378 00379 /* Initialization of inputB pointer */ 00380 pIn2 = py; 00381 00382 pScratch1 += 1u; 00383 00384 } 00385 00386 /* set status as ARM_MATH_SUCCESS */ 00387 status = ARM_MATH_SUCCESS; 00388 00389 } 00390 00391 /* Return to application */ 00392 return (status); 00393 } 00394 00395 #else 00396 00397 arm_status arm_conv_partial_opt_q15( 00398 q15_t * pSrcA, 00399 uint32_t srcALen, 00400 q15_t * pSrcB, 00401 uint32_t srcBLen, 00402 q15_t * pDst, 00403 uint32_t firstIndex, 00404 uint32_t numPoints, 00405 q15_t * pScratch1, 00406 q15_t * pScratch2) 00407 { 00408 00409 q15_t *pOut = pDst; /* output pointer */ 00410 q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */ 00411 q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */ 00412 q63_t acc0, acc1, acc2, acc3; /* Accumulator */ 00413 q15_t *pIn1; /* inputA pointer */ 00414 q15_t *pIn2; /* inputB pointer */ 00415 q15_t *px; /* Intermediate inputA pointer */ 00416 q15_t *py; /* Intermediate inputB pointer */ 00417 uint32_t j, k, blkCnt; /* loop counter */ 00418 arm_status status; /* Status variable */ 00419 uint32_t tapCnt; /* loop count */ 00420 q15_t x10, x11, x20, x21; /* Temporary variables to hold srcA buffer */ 00421 q15_t y10, y11; /* Temporary variables to hold srcB buffer */ 00422 00423 00424 /* Check for range of output samples to be calculated */ 00425 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00426 { 00427 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00428 status = ARM_MATH_ARGUMENT_ERROR; 00429 } 00430 else 00431 { 00432 00433 /* The algorithm implementation is based on the lengths of the inputs. */ 00434 /* srcB is always made to slide across srcA. 
*/ 00435 /* So srcBLen is always considered as shorter or equal to srcALen */ 00436 if(srcALen >= srcBLen) 00437 { 00438 /* Initialization of inputA pointer */ 00439 pIn1 = pSrcA; 00440 00441 /* Initialization of inputB pointer */ 00442 pIn2 = pSrcB; 00443 } 00444 else 00445 { 00446 /* Initialization of inputA pointer */ 00447 pIn1 = pSrcB; 00448 00449 /* Initialization of inputB pointer */ 00450 pIn2 = pSrcA; 00451 00452 /* srcBLen is always considered as shorter or equal to srcALen */ 00453 j = srcBLen; 00454 srcBLen = srcALen; 00455 srcALen = j; 00456 } 00457 00458 /* Temporary pointer for scratch2 */ 00459 py = pScratch2; 00460 00461 /* pointer to take end of scratch2 buffer */ 00462 pScr2 = pScratch2 + srcBLen - 1; 00463 00464 /* points to smaller length sequence */ 00465 px = pIn2; 00466 00467 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00468 k = srcBLen >> 2u; 00469 00470 /* First part of the processing with loop unrolling copies 4 data points at a time. 00471 ** a second loop below copies for the remaining 1 to 3 samples. */ 00472 while(k > 0u) 00473 { 00474 /* copy second buffer in reversal manner */ 00475 *pScr2-- = *px++; 00476 *pScr2-- = *px++; 00477 *pScr2-- = *px++; 00478 *pScr2-- = *px++; 00479 00480 /* Decrement the loop counter */ 00481 k--; 00482 } 00483 00484 /* If the count is not a multiple of 4, copy remaining samples here. 00485 ** No loop unrolling is used. 
*/ 00486 k = srcBLen % 0x4u; 00487 00488 while(k > 0u) 00489 { 00490 /* copy second buffer in reversal manner for remaining samples */ 00491 *pScr2-- = *px++; 00492 00493 /* Decrement the loop counter */ 00494 k--; 00495 } 00496 00497 /* Initialze temporary scratch pointer */ 00498 pScr1 = pScratch1; 00499 00500 /* Fill (srcBLen - 1u) zeros in scratch buffer */ 00501 arm_fill_q15(0, pScr1, (srcBLen - 1u)); 00502 00503 /* Update temporary scratch pointer */ 00504 pScr1 += (srcBLen - 1u); 00505 00506 /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */ 00507 00508 00509 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00510 k = srcALen >> 2u; 00511 00512 /* First part of the processing with loop unrolling copies 4 data points at a time. 00513 ** a second loop below copies for the remaining 1 to 3 samples. */ 00514 while(k > 0u) 00515 { 00516 /* copy second buffer in reversal manner */ 00517 *pScr1++ = *pIn1++; 00518 *pScr1++ = *pIn1++; 00519 *pScr1++ = *pIn1++; 00520 *pScr1++ = *pIn1++; 00521 00522 /* Decrement the loop counter */ 00523 k--; 00524 } 00525 00526 /* If the count is not a multiple of 4, copy remaining samples here. 00527 ** No loop unrolling is used. */ 00528 k = srcALen % 0x4u; 00529 00530 while(k > 0u) 00531 { 00532 /* copy second buffer in reversal manner for remaining samples */ 00533 *pScr1++ = *pIn1++; 00534 00535 /* Decrement the loop counter */ 00536 k--; 00537 } 00538 00539 00540 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00541 k = (srcBLen - 1u) >> 2u; 00542 00543 /* First part of the processing with loop unrolling copies 4 data points at a time. 00544 ** a second loop below copies for the remaining 1 to 3 samples. 
*/ 00545 while(k > 0u) 00546 { 00547 /* copy second buffer in reversal manner */ 00548 *pScr1++ = 0; 00549 *pScr1++ = 0; 00550 *pScr1++ = 0; 00551 *pScr1++ = 0; 00552 00553 /* Decrement the loop counter */ 00554 k--; 00555 } 00556 00557 /* If the count is not a multiple of 4, copy remaining samples here. 00558 ** No loop unrolling is used. */ 00559 k = (srcBLen - 1u) % 0x4u; 00560 00561 while(k > 0u) 00562 { 00563 /* copy second buffer in reversal manner for remaining samples */ 00564 *pScr1++ = 0; 00565 00566 /* Decrement the loop counter */ 00567 k--; 00568 } 00569 00570 00571 /* Initialization of pIn2 pointer */ 00572 pIn2 = py; 00573 00574 pScratch1 += firstIndex; 00575 00576 pOut = pDst + firstIndex; 00577 00578 /* Actual convolution process starts here */ 00579 blkCnt = (numPoints) >> 2; 00580 00581 while(blkCnt > 0) 00582 { 00583 /* Initialze temporary scratch pointer as scratch1 */ 00584 pScr1 = pScratch1; 00585 00586 /* Clear Accumlators */ 00587 acc0 = 0; 00588 acc1 = 0; 00589 acc2 = 0; 00590 acc3 = 0; 00591 00592 /* Read two samples from scratch1 buffer */ 00593 x10 = *pScr1++; 00594 x11 = *pScr1++; 00595 00596 /* Read next two samples from scratch1 buffer */ 00597 x20 = *pScr1++; 00598 x21 = *pScr1++; 00599 00600 tapCnt = (srcBLen) >> 2u; 00601 00602 while(tapCnt > 0u) 00603 { 00604 00605 /* Read two samples from smaller buffer */ 00606 y10 = *pIn2; 00607 y11 = *(pIn2 + 1u); 00608 00609 /* multiply and accumlate */ 00610 acc0 += (q63_t) x10 *y10; 00611 acc0 += (q63_t) x11 *y11; 00612 acc2 += (q63_t) x20 *y10; 00613 acc2 += (q63_t) x21 *y11; 00614 00615 /* multiply and accumlate */ 00616 acc1 += (q63_t) x11 *y10; 00617 acc1 += (q63_t) x20 *y11; 00618 00619 /* Read next two samples from scratch1 buffer */ 00620 x10 = *pScr1; 00621 x11 = *(pScr1 + 1u); 00622 00623 /* multiply and accumlate */ 00624 acc3 += (q63_t) x21 *y10; 00625 acc3 += (q63_t) x10 *y11; 00626 00627 /* Read next two samples from scratch2 buffer */ 00628 y10 = *(pIn2 + 2u); 00629 y11 = 
*(pIn2 + 3u); 00630 00631 /* multiply and accumlate */ 00632 acc0 += (q63_t) x20 *y10; 00633 acc0 += (q63_t) x21 *y11; 00634 acc2 += (q63_t) x10 *y10; 00635 acc2 += (q63_t) x11 *y11; 00636 acc1 += (q63_t) x21 *y10; 00637 acc1 += (q63_t) x10 *y11; 00638 00639 /* Read next two samples from scratch1 buffer */ 00640 x20 = *(pScr1 + 2); 00641 x21 = *(pScr1 + 3); 00642 00643 /* multiply and accumlate */ 00644 acc3 += (q63_t) x11 *y10; 00645 acc3 += (q63_t) x20 *y11; 00646 00647 /* update scratch pointers */ 00648 pIn2 += 4u; 00649 pScr1 += 4u; 00650 00651 /* Decrement the loop counter */ 00652 tapCnt--; 00653 } 00654 00655 /* Update scratch pointer for remaining samples of smaller length sequence */ 00656 pScr1 -= 4u; 00657 00658 /* apply same above for remaining samples of smaller length sequence */ 00659 tapCnt = (srcBLen) & 3u; 00660 00661 while(tapCnt > 0u) 00662 { 00663 /* accumlate the results */ 00664 acc0 += (*pScr1++ * *pIn2); 00665 acc1 += (*pScr1++ * *pIn2); 00666 acc2 += (*pScr1++ * *pIn2); 00667 acc3 += (*pScr1++ * *pIn2++); 00668 00669 pScr1 -= 3u; 00670 00671 /* Decrement the loop counter */ 00672 tapCnt--; 00673 } 00674 00675 blkCnt--; 00676 00677 00678 /* Store the results in the accumulators in the destination buffer. 
*/ 00679 *pOut++ = __SSAT((acc0 >> 15), 16); 00680 *pOut++ = __SSAT((acc1 >> 15), 16); 00681 *pOut++ = __SSAT((acc2 >> 15), 16); 00682 *pOut++ = __SSAT((acc3 >> 15), 16); 00683 00684 00685 /* Initialization of inputB pointer */ 00686 pIn2 = py; 00687 00688 pScratch1 += 4u; 00689 00690 } 00691 00692 00693 blkCnt = numPoints & 0x3; 00694 00695 /* Calculate convolution for remaining samples of Bigger length sequence */ 00696 while(blkCnt > 0) 00697 { 00698 /* Initialze temporary scratch pointer as scratch1 */ 00699 pScr1 = pScratch1; 00700 00701 /* Clear Accumlators */ 00702 acc0 = 0; 00703 00704 tapCnt = (srcBLen) >> 1u; 00705 00706 while(tapCnt > 0u) 00707 { 00708 00709 /* Read next two samples from scratch1 buffer */ 00710 x10 = *pScr1++; 00711 x11 = *pScr1++; 00712 00713 /* Read two samples from smaller buffer */ 00714 y10 = *pIn2++; 00715 y11 = *pIn2++; 00716 00717 /* multiply and accumlate */ 00718 acc0 += (q63_t) x10 *y10; 00719 acc0 += (q63_t) x11 *y11; 00720 00721 /* Decrement the loop counter */ 00722 tapCnt--; 00723 } 00724 00725 tapCnt = (srcBLen) & 1u; 00726 00727 /* apply same above for remaining samples of smaller length sequence */ 00728 while(tapCnt > 0u) 00729 { 00730 00731 /* accumlate the results */ 00732 acc0 += (*pScr1++ * *pIn2++); 00733 00734 /* Decrement the loop counter */ 00735 tapCnt--; 00736 } 00737 00738 blkCnt--; 00739 00740 /* Store the result in the accumulator in the destination buffer. */ 00741 *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16)); 00742 00743 00744 /* Initialization of inputB pointer */ 00745 pIn2 = py; 00746 00747 pScratch1 += 1u; 00748 00749 } 00750 00751 /* set status as ARM_MATH_SUCCESS */ 00752 status = ARM_MATH_SUCCESS; 00753 00754 } 00755 00756 /* Return to application */ 00757 return (status); 00758 } 00759 00760 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00761 00762 00763 /** 00764 * @} end of PartialConv group 00765 */
Generated on Tue Jul 12 2022 12:36:54 by Doxygen 1.7.2