Important changes to repositories hosted on mbed.com
Mbed hosted mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software, download the repository Zip archive or clone it locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
Fork of mbed-dsp by
arm_conv_partial_opt_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 17. January 2013 00005 * $Revision: V1.4.1 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_partial_opt_q15.c 00009 * 00010 * Description: Partial convolution of Q15 sequences. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupFilters 00045 */ 00046 00047 /** 00048 * @addtogroup PartialConv 00049 * @{ 00050 */ 00051 00052 /** 00053 * @brief Partial convolution of Q15 sequences. 00054 * @param[in] *pSrcA points to the first input sequence. 00055 * @param[in] srcALen length of the first input sequence. 00056 * @param[in] *pSrcB points to the second input sequence. 00057 * @param[in] srcBLen length of the second input sequence. 00058 * @param[out] *pDst points to the location where the output result is written. 00059 * @param[in] firstIndex is the first output sample to start with. 00060 * @param[in] numPoints is the number of output points to be computed. 00061 * @param[in] *pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. 00062 * @param[in] *pScratch2 points to scratch buffer of size min(srcALen, srcBLen). 00063 * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. 
00064 * 00065 * \par Restrictions 00066 * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE 00067 * In this case input, output, state buffers should be aligned by 32-bit 00068 * 00069 * Refer to <code>arm_conv_partial_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4. 00070 * 00071 * 00072 */ 00073 00074 #ifndef UNALIGNED_SUPPORT_DISABLE 00075 00076 arm_status arm_conv_partial_opt_q15( 00077 q15_t * pSrcA, 00078 uint32_t srcALen, 00079 q15_t * pSrcB, 00080 uint32_t srcBLen, 00081 q15_t * pDst, 00082 uint32_t firstIndex, 00083 uint32_t numPoints, 00084 q15_t * pScratch1, 00085 q15_t * pScratch2) 00086 { 00087 00088 q15_t *pOut = pDst; /* output pointer */ 00089 q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */ 00090 q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */ 00091 q63_t acc0, acc1, acc2, acc3; /* Accumulator */ 00092 q31_t x1, x2, x3; /* Temporary variables to hold state and coefficient values */ 00093 q31_t y1, y2; /* State variables */ 00094 q15_t *pIn1; /* inputA pointer */ 00095 q15_t *pIn2; /* inputB pointer */ 00096 q15_t *px; /* Intermediate inputA pointer */ 00097 q15_t *py; /* Intermediate inputB pointer */ 00098 uint32_t j, k, blkCnt; /* loop counter */ 00099 arm_status status; /* Status variable */ 00100 uint32_t tapCnt; /* loop count */ 00101 00102 /* Check for range of output samples to be calculated */ 00103 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00104 { 00105 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00106 status = ARM_MATH_ARGUMENT_ERROR; 00107 } 00108 else 00109 { 00110 00111 /* The algorithm implementation is based on the lengths of the inputs. */ 00112 /* srcB is always made to slide across srcA. 
*/ 00113 /* So srcBLen is always considered as shorter or equal to srcALen */ 00114 if(srcALen >= srcBLen) 00115 { 00116 /* Initialization of inputA pointer */ 00117 pIn1 = pSrcA; 00118 00119 /* Initialization of inputB pointer */ 00120 pIn2 = pSrcB; 00121 } 00122 else 00123 { 00124 /* Initialization of inputA pointer */ 00125 pIn1 = pSrcB; 00126 00127 /* Initialization of inputB pointer */ 00128 pIn2 = pSrcA; 00129 00130 /* srcBLen is always considered as shorter or equal to srcALen */ 00131 j = srcBLen; 00132 srcBLen = srcALen; 00133 srcALen = j; 00134 } 00135 00136 /* Temporary pointer for scratch2 */ 00137 py = pScratch2; 00138 00139 /* pointer to take end of scratch2 buffer */ 00140 pScr2 = pScratch2 + srcBLen - 1; 00141 00142 /* points to smaller length sequence */ 00143 px = pIn2; 00144 00145 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00146 k = srcBLen >> 2u; 00147 00148 /* First part of the processing with loop unrolling copies 4 data points at a time. 00149 ** a second loop below copies for the remaining 1 to 3 samples. */ 00150 while(k > 0u) 00151 { 00152 /* copy second buffer in reversal manner */ 00153 *pScr2-- = *px++; 00154 *pScr2-- = *px++; 00155 *pScr2-- = *px++; 00156 *pScr2-- = *px++; 00157 00158 /* Decrement the loop counter */ 00159 k--; 00160 } 00161 00162 /* If the count is not a multiple of 4, copy remaining samples here. 00163 ** No loop unrolling is used. 
*/ 00164 k = srcBLen % 0x4u; 00165 00166 while(k > 0u) 00167 { 00168 /* copy second buffer in reversal manner for remaining samples */ 00169 *pScr2-- = *px++; 00170 00171 /* Decrement the loop counter */ 00172 k--; 00173 } 00174 00175 /* Initialze temporary scratch pointer */ 00176 pScr1 = pScratch1; 00177 00178 /* Fill (srcBLen - 1u) zeros in scratch buffer */ 00179 arm_fill_q15(0, pScr1, (srcBLen - 1u)); 00180 00181 /* Update temporary scratch pointer */ 00182 pScr1 += (srcBLen - 1u); 00183 00184 /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */ 00185 00186 /* Copy (srcALen) samples in scratch buffer */ 00187 arm_copy_q15(pIn1, pScr1, srcALen); 00188 00189 /* Update pointers */ 00190 pScr1 += srcALen; 00191 00192 /* Fill (srcBLen - 1u) zeros at end of scratch buffer */ 00193 arm_fill_q15(0, pScr1, (srcBLen - 1u)); 00194 00195 /* Update pointer */ 00196 pScr1 += (srcBLen - 1u); 00197 00198 /* Initialization of pIn2 pointer */ 00199 pIn2 = py; 00200 00201 pScratch1 += firstIndex; 00202 00203 pOut = pDst + firstIndex; 00204 00205 /* Actual convolution process starts here */ 00206 blkCnt = (numPoints) >> 2; 00207 00208 while(blkCnt > 0) 00209 { 00210 /* Initialze temporary scratch pointer as scratch1 */ 00211 pScr1 = pScratch1; 00212 00213 /* Clear Accumlators */ 00214 acc0 = 0; 00215 acc1 = 0; 00216 acc2 = 0; 00217 acc3 = 0; 00218 00219 /* Read two samples from scratch1 buffer */ 00220 x1 = *__SIMD32(pScr1)++; 00221 00222 /* Read next two samples from scratch1 buffer */ 00223 x2 = *__SIMD32(pScr1)++; 00224 00225 tapCnt = (srcBLen) >> 2u; 00226 00227 while(tapCnt > 0u) 00228 { 00229 00230 /* Read four samples from smaller buffer */ 00231 y1 = _SIMD32_OFFSET(pIn2); 00232 y2 = _SIMD32_OFFSET(pIn2 + 2u); 00233 00234 /* multiply and accumlate */ 00235 acc0 = __SMLALD(x1, y1, acc0); 00236 acc2 = __SMLALD(x2, y1, acc2); 00237 00238 /* pack input data */ 00239 #ifndef ARM_MATH_BIG_ENDIAN 00240 x3 = __PKHBT(x2, x1, 0); 00241 #else 00242 x3 = __PKHBT(x1, 
x2, 0); 00243 #endif 00244 00245 /* multiply and accumlate */ 00246 acc1 = __SMLALDX(x3, y1, acc1); 00247 00248 /* Read next two samples from scratch1 buffer */ 00249 x1 = _SIMD32_OFFSET(pScr1); 00250 00251 /* multiply and accumlate */ 00252 acc0 = __SMLALD(x2, y2, acc0); 00253 acc2 = __SMLALD(x1, y2, acc2); 00254 00255 /* pack input data */ 00256 #ifndef ARM_MATH_BIG_ENDIAN 00257 x3 = __PKHBT(x1, x2, 0); 00258 #else 00259 x3 = __PKHBT(x2, x1, 0); 00260 #endif 00261 00262 acc3 = __SMLALDX(x3, y1, acc3); 00263 acc1 = __SMLALDX(x3, y2, acc1); 00264 00265 x2 = _SIMD32_OFFSET(pScr1 + 2u); 00266 00267 #ifndef ARM_MATH_BIG_ENDIAN 00268 x3 = __PKHBT(x2, x1, 0); 00269 #else 00270 x3 = __PKHBT(x1, x2, 0); 00271 #endif 00272 00273 acc3 = __SMLALDX(x3, y2, acc3); 00274 00275 /* update scratch pointers */ 00276 pIn2 += 4u; 00277 pScr1 += 4u; 00278 00279 00280 /* Decrement the loop counter */ 00281 tapCnt--; 00282 } 00283 00284 /* Update scratch pointer for remaining samples of smaller length sequence */ 00285 pScr1 -= 4u; 00286 00287 /* apply same above for remaining samples of smaller length sequence */ 00288 tapCnt = (srcBLen) & 3u; 00289 00290 while(tapCnt > 0u) 00291 { 00292 /* accumlate the results */ 00293 acc0 += (*pScr1++ * *pIn2); 00294 acc1 += (*pScr1++ * *pIn2); 00295 acc2 += (*pScr1++ * *pIn2); 00296 acc3 += (*pScr1++ * *pIn2++); 00297 00298 pScr1 -= 3u; 00299 00300 /* Decrement the loop counter */ 00301 tapCnt--; 00302 } 00303 00304 blkCnt--; 00305 00306 00307 /* Store the results in the accumulators in the destination buffer. 
*/ 00308 00309 #ifndef ARM_MATH_BIG_ENDIAN 00310 00311 *__SIMD32(pOut)++ = 00312 __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); 00313 *__SIMD32(pOut)++ = 00314 __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); 00315 00316 #else 00317 00318 *__SIMD32(pOut)++ = 00319 __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16); 00320 *__SIMD32(pOut)++ = 00321 __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16); 00322 00323 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00324 00325 /* Initialization of inputB pointer */ 00326 pIn2 = py; 00327 00328 pScratch1 += 4u; 00329 00330 } 00331 00332 00333 blkCnt = numPoints & 0x3; 00334 00335 /* Calculate convolution for remaining samples of Bigger length sequence */ 00336 while(blkCnt > 0) 00337 { 00338 /* Initialze temporary scratch pointer as scratch1 */ 00339 pScr1 = pScratch1; 00340 00341 /* Clear Accumlators */ 00342 acc0 = 0; 00343 00344 tapCnt = (srcBLen) >> 1u; 00345 00346 while(tapCnt > 0u) 00347 { 00348 00349 /* Read next two samples from scratch1 buffer */ 00350 x1 = *__SIMD32(pScr1)++; 00351 00352 /* Read two samples from smaller buffer */ 00353 y1 = *__SIMD32(pIn2)++; 00354 00355 acc0 = __SMLALD(x1, y1, acc0); 00356 00357 /* Decrement the loop counter */ 00358 tapCnt--; 00359 } 00360 00361 tapCnt = (srcBLen) & 1u; 00362 00363 /* apply same above for remaining samples of smaller length sequence */ 00364 while(tapCnt > 0u) 00365 { 00366 00367 /* accumlate the results */ 00368 acc0 += (*pScr1++ * *pIn2++); 00369 00370 /* Decrement the loop counter */ 00371 tapCnt--; 00372 } 00373 00374 blkCnt--; 00375 00376 /* Store the result in the accumulator in the destination buffer. 
*/ 00377 *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16)); 00378 00379 /* Initialization of inputB pointer */ 00380 pIn2 = py; 00381 00382 pScratch1 += 1u; 00383 00384 } 00385 00386 /* set status as ARM_MATH_SUCCESS */ 00387 status = ARM_MATH_SUCCESS; 00388 00389 } 00390 00391 /* Return to application */ 00392 return (status); 00393 } 00394 00395 #else 00396 00397 arm_status arm_conv_partial_opt_q15( 00398 q15_t * pSrcA, 00399 uint32_t srcALen, 00400 q15_t * pSrcB, 00401 uint32_t srcBLen, 00402 q15_t * pDst, 00403 uint32_t firstIndex, 00404 uint32_t numPoints, 00405 q15_t * pScratch1, 00406 q15_t * pScratch2) 00407 { 00408 00409 q15_t *pOut = pDst; /* output pointer */ 00410 q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */ 00411 q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */ 00412 q63_t acc0, acc1, acc2, acc3; /* Accumulator */ 00413 q15_t *pIn1; /* inputA pointer */ 00414 q15_t *pIn2; /* inputB pointer */ 00415 q15_t *px; /* Intermediate inputA pointer */ 00416 q15_t *py; /* Intermediate inputB pointer */ 00417 uint32_t j, k, blkCnt; /* loop counter */ 00418 arm_status status; /* Status variable */ 00419 uint32_t tapCnt; /* loop count */ 00420 q15_t x10, x11, x20, x21; /* Temporary variables to hold srcA buffer */ 00421 q15_t y10, y11; /* Temporary variables to hold srcB buffer */ 00422 00423 00424 /* Check for range of output samples to be calculated */ 00425 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00426 { 00427 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00428 status = ARM_MATH_ARGUMENT_ERROR; 00429 } 00430 else 00431 { 00432 00433 /* The algorithm implementation is based on the lengths of the inputs. */ 00434 /* srcB is always made to slide across srcA. 
*/ 00435 /* So srcBLen is always considered as shorter or equal to srcALen */ 00436 if(srcALen >= srcBLen) 00437 { 00438 /* Initialization of inputA pointer */ 00439 pIn1 = pSrcA; 00440 00441 /* Initialization of inputB pointer */ 00442 pIn2 = pSrcB; 00443 } 00444 else 00445 { 00446 /* Initialization of inputA pointer */ 00447 pIn1 = pSrcB; 00448 00449 /* Initialization of inputB pointer */ 00450 pIn2 = pSrcA; 00451 00452 /* srcBLen is always considered as shorter or equal to srcALen */ 00453 j = srcBLen; 00454 srcBLen = srcALen; 00455 srcALen = j; 00456 } 00457 00458 /* Temporary pointer for scratch2 */ 00459 py = pScratch2; 00460 00461 /* pointer to take end of scratch2 buffer */ 00462 pScr2 = pScratch2 + srcBLen - 1; 00463 00464 /* points to smaller length sequence */ 00465 px = pIn2; 00466 00467 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00468 k = srcBLen >> 2u; 00469 00470 /* First part of the processing with loop unrolling copies 4 data points at a time. 00471 ** a second loop below copies for the remaining 1 to 3 samples. */ 00472 while(k > 0u) 00473 { 00474 /* copy second buffer in reversal manner */ 00475 *pScr2-- = *px++; 00476 *pScr2-- = *px++; 00477 *pScr2-- = *px++; 00478 *pScr2-- = *px++; 00479 00480 /* Decrement the loop counter */ 00481 k--; 00482 } 00483 00484 /* If the count is not a multiple of 4, copy remaining samples here. 00485 ** No loop unrolling is used. 
*/ 00486 k = srcBLen % 0x4u; 00487 00488 while(k > 0u) 00489 { 00490 /* copy second buffer in reversal manner for remaining samples */ 00491 *pScr2-- = *px++; 00492 00493 /* Decrement the loop counter */ 00494 k--; 00495 } 00496 00497 /* Initialze temporary scratch pointer */ 00498 pScr1 = pScratch1; 00499 00500 /* Fill (srcBLen - 1u) zeros in scratch buffer */ 00501 arm_fill_q15(0, pScr1, (srcBLen - 1u)); 00502 00503 /* Update temporary scratch pointer */ 00504 pScr1 += (srcBLen - 1u); 00505 00506 /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */ 00507 00508 00509 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00510 k = srcALen >> 2u; 00511 00512 /* First part of the processing with loop unrolling copies 4 data points at a time. 00513 ** a second loop below copies for the remaining 1 to 3 samples. */ 00514 while(k > 0u) 00515 { 00516 /* copy second buffer in reversal manner */ 00517 *pScr1++ = *pIn1++; 00518 *pScr1++ = *pIn1++; 00519 *pScr1++ = *pIn1++; 00520 *pScr1++ = *pIn1++; 00521 00522 /* Decrement the loop counter */ 00523 k--; 00524 } 00525 00526 /* If the count is not a multiple of 4, copy remaining samples here. 00527 ** No loop unrolling is used. */ 00528 k = srcALen % 0x4u; 00529 00530 while(k > 0u) 00531 { 00532 /* copy second buffer in reversal manner for remaining samples */ 00533 *pScr1++ = *pIn1++; 00534 00535 /* Decrement the loop counter */ 00536 k--; 00537 } 00538 00539 00540 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00541 k = (srcBLen - 1u) >> 2u; 00542 00543 /* First part of the processing with loop unrolling copies 4 data points at a time. 00544 ** a second loop below copies for the remaining 1 to 3 samples. 
*/ 00545 while(k > 0u) 00546 { 00547 /* copy second buffer in reversal manner */ 00548 *pScr1++ = 0; 00549 *pScr1++ = 0; 00550 *pScr1++ = 0; 00551 *pScr1++ = 0; 00552 00553 /* Decrement the loop counter */ 00554 k--; 00555 } 00556 00557 /* If the count is not a multiple of 4, copy remaining samples here. 00558 ** No loop unrolling is used. */ 00559 k = (srcBLen - 1u) % 0x4u; 00560 00561 while(k > 0u) 00562 { 00563 /* copy second buffer in reversal manner for remaining samples */ 00564 *pScr1++ = 0; 00565 00566 /* Decrement the loop counter */ 00567 k--; 00568 } 00569 00570 00571 /* Initialization of pIn2 pointer */ 00572 pIn2 = py; 00573 00574 pScratch1 += firstIndex; 00575 00576 pOut = pDst + firstIndex; 00577 00578 /* Actual convolution process starts here */ 00579 blkCnt = (numPoints) >> 2; 00580 00581 while(blkCnt > 0) 00582 { 00583 /* Initialze temporary scratch pointer as scratch1 */ 00584 pScr1 = pScratch1; 00585 00586 /* Clear Accumlators */ 00587 acc0 = 0; 00588 acc1 = 0; 00589 acc2 = 0; 00590 acc3 = 0; 00591 00592 /* Read two samples from scratch1 buffer */ 00593 x10 = *pScr1++; 00594 x11 = *pScr1++; 00595 00596 /* Read next two samples from scratch1 buffer */ 00597 x20 = *pScr1++; 00598 x21 = *pScr1++; 00599 00600 tapCnt = (srcBLen) >> 2u; 00601 00602 while(tapCnt > 0u) 00603 { 00604 00605 /* Read two samples from smaller buffer */ 00606 y10 = *pIn2; 00607 y11 = *(pIn2 + 1u); 00608 00609 /* multiply and accumlate */ 00610 acc0 += (q63_t) x10 *y10; 00611 acc0 += (q63_t) x11 *y11; 00612 acc2 += (q63_t) x20 *y10; 00613 acc2 += (q63_t) x21 *y11; 00614 00615 /* multiply and accumlate */ 00616 acc1 += (q63_t) x11 *y10; 00617 acc1 += (q63_t) x20 *y11; 00618 00619 /* Read next two samples from scratch1 buffer */ 00620 x10 = *pScr1; 00621 x11 = *(pScr1 + 1u); 00622 00623 /* multiply and accumlate */ 00624 acc3 += (q63_t) x21 *y10; 00625 acc3 += (q63_t) x10 *y11; 00626 00627 /* Read next two samples from scratch2 buffer */ 00628 y10 = *(pIn2 + 2u); 00629 y11 = 
*(pIn2 + 3u); 00630 00631 /* multiply and accumlate */ 00632 acc0 += (q63_t) x20 *y10; 00633 acc0 += (q63_t) x21 *y11; 00634 acc2 += (q63_t) x10 *y10; 00635 acc2 += (q63_t) x11 *y11; 00636 acc1 += (q63_t) x21 *y10; 00637 acc1 += (q63_t) x10 *y11; 00638 00639 /* Read next two samples from scratch1 buffer */ 00640 x20 = *(pScr1 + 2); 00641 x21 = *(pScr1 + 3); 00642 00643 /* multiply and accumlate */ 00644 acc3 += (q63_t) x11 *y10; 00645 acc3 += (q63_t) x20 *y11; 00646 00647 /* update scratch pointers */ 00648 pIn2 += 4u; 00649 pScr1 += 4u; 00650 00651 /* Decrement the loop counter */ 00652 tapCnt--; 00653 } 00654 00655 /* Update scratch pointer for remaining samples of smaller length sequence */ 00656 pScr1 -= 4u; 00657 00658 /* apply same above for remaining samples of smaller length sequence */ 00659 tapCnt = (srcBLen) & 3u; 00660 00661 while(tapCnt > 0u) 00662 { 00663 /* accumlate the results */ 00664 acc0 += (*pScr1++ * *pIn2); 00665 acc1 += (*pScr1++ * *pIn2); 00666 acc2 += (*pScr1++ * *pIn2); 00667 acc3 += (*pScr1++ * *pIn2++); 00668 00669 pScr1 -= 3u; 00670 00671 /* Decrement the loop counter */ 00672 tapCnt--; 00673 } 00674 00675 blkCnt--; 00676 00677 00678 /* Store the results in the accumulators in the destination buffer. 
*/ 00679 *pOut++ = __SSAT((acc0 >> 15), 16); 00680 *pOut++ = __SSAT((acc1 >> 15), 16); 00681 *pOut++ = __SSAT((acc2 >> 15), 16); 00682 *pOut++ = __SSAT((acc3 >> 15), 16); 00683 00684 00685 /* Initialization of inputB pointer */ 00686 pIn2 = py; 00687 00688 pScratch1 += 4u; 00689 00690 } 00691 00692 00693 blkCnt = numPoints & 0x3; 00694 00695 /* Calculate convolution for remaining samples of Bigger length sequence */ 00696 while(blkCnt > 0) 00697 { 00698 /* Initialze temporary scratch pointer as scratch1 */ 00699 pScr1 = pScratch1; 00700 00701 /* Clear Accumlators */ 00702 acc0 = 0; 00703 00704 tapCnt = (srcBLen) >> 1u; 00705 00706 while(tapCnt > 0u) 00707 { 00708 00709 /* Read next two samples from scratch1 buffer */ 00710 x10 = *pScr1++; 00711 x11 = *pScr1++; 00712 00713 /* Read two samples from smaller buffer */ 00714 y10 = *pIn2++; 00715 y11 = *pIn2++; 00716 00717 /* multiply and accumlate */ 00718 acc0 += (q63_t) x10 *y10; 00719 acc0 += (q63_t) x11 *y11; 00720 00721 /* Decrement the loop counter */ 00722 tapCnt--; 00723 } 00724 00725 tapCnt = (srcBLen) & 1u; 00726 00727 /* apply same above for remaining samples of smaller length sequence */ 00728 while(tapCnt > 0u) 00729 { 00730 00731 /* accumlate the results */ 00732 acc0 += (*pScr1++ * *pIn2++); 00733 00734 /* Decrement the loop counter */ 00735 tapCnt--; 00736 } 00737 00738 blkCnt--; 00739 00740 /* Store the result in the accumulator in the destination buffer. */ 00741 *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16)); 00742 00743 00744 /* Initialization of inputB pointer */ 00745 pIn2 = py; 00746 00747 pScratch1 += 1u; 00748 00749 } 00750 00751 /* set status as ARM_MATH_SUCCESS */ 00752 status = ARM_MATH_SUCCESS; 00753 00754 } 00755 00756 /* Return to application */ 00757 return (status); 00758 } 00759 00760 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00761 00762 00763 /** 00764 * @} end of PartialConv group 00765 */
Generated on Tue Jul 12 2022 18:44:08 by
