Important changes to repositories hosted on mbed.com
Mbed hosted mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software download the repository Zip archive or clone locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
Fork of mbed-dsp by
arm_conv_partial_fast_opt_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 17. January 2013 00005 * $Revision: V1.4.1 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_partial_fast_opt_q15.c 00009 * 00010 * Description: Fast Q15 Partial convolution. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupFilters 00045 */ 00046 00047 /** 00048 * @addtogroup PartialConv 00049 * @{ 00050 */ 00051 00052 /** 00053 * @brief Partial convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4. 00054 * @param[in] *pSrcA points to the first input sequence. 00055 * @param[in] srcALen length of the first input sequence. 00056 * @param[in] *pSrcB points to the second input sequence. 00057 * @param[in] srcBLen length of the second input sequence. 00058 * @param[out] *pDst points to the location where the output result is written. 00059 * @param[in] firstIndex is the first output sample to start with. 00060 * @param[in] numPoints is the number of output points to be computed. 00061 * @param[in] *pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. 00062 * @param[in] *pScratch2 points to scratch buffer of size min(srcALen, srcBLen). 00063 * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. 00064 * 00065 * See <code>arm_conv_partial_q15()</code> for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion. 00066 * 00067 * \par Restrictions 00068 * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE 00069 * In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit 00070 * 00071 */ 00072 00073 #ifndef UNALIGNED_SUPPORT_DISABLE 00074 00075 arm_status arm_conv_partial_fast_opt_q15( 00076 q15_t * pSrcA, 00077 uint32_t srcALen, 00078 q15_t * pSrcB, 00079 uint32_t srcBLen, 00080 q15_t * pDst, 00081 uint32_t firstIndex, 00082 uint32_t numPoints, 00083 q15_t * pScratch1, 00084 q15_t * pScratch2) 00085 { 00086 00087 q15_t *pOut = pDst; /* output pointer */ 00088 q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */ 00089 q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */ 00090 q31_t acc0, acc1, acc2, acc3; /* Accumulator */ 00091 q31_t x1, x2, x3; /* Temporary variables to hold state and coefficient values */ 00092 q31_t y1, y2; /* State variables */ 00093 q15_t *pIn1; /* inputA pointer */ 00094 q15_t *pIn2; /* inputB pointer */ 00095 q15_t *px; /* Intermediate inputA pointer */ 00096 q15_t *py; /* Intermediate inputB pointer */ 00097 uint32_t j, k, blkCnt; /* loop counter */ 00098 arm_status status; 00099 00100 uint32_t tapCnt; /* loop count */ 00101 00102 /* Check for range of output samples to be calculated */ 00103 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00104 { 00105 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00106 status = ARM_MATH_ARGUMENT_ERROR; 00107 } 00108 else 00109 { 00110 00111 /* The algorithm implementation is based on the lengths of the inputs. */ 00112 /* srcB is always made to slide across srcA. */ 00113 /* So srcBLen is always considered as shorter or equal to srcALen */ 00114 if(srcALen >= srcBLen) 00115 { 00116 /* Initialization of inputA pointer */ 00117 pIn1 = pSrcA; 00118 00119 /* Initialization of inputB pointer */ 00120 pIn2 = pSrcB; 00121 } 00122 else 00123 { 00124 /* Initialization of inputA pointer */ 00125 pIn1 = pSrcB; 00126 00127 /* Initialization of inputB pointer */ 00128 pIn2 = pSrcA; 00129 00130 /* srcBLen is always considered as shorter or equal to srcALen */ 00131 j = srcBLen; 00132 srcBLen = srcALen; 00133 srcALen = j; 00134 } 00135 00136 /* Temporary pointer for scratch2 */ 00137 py = pScratch2; 00138 00139 /* pointer to take end of scratch2 buffer */ 00140 pScr2 = pScratch2 + srcBLen - 1; 00141 00142 /* points to smaller length sequence */ 00143 px = pIn2; 00144 00145 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00146 k = srcBLen >> 2u; 00147 00148 /* First part of the processing with loop unrolling copies 4 data points at a time. 00149 ** a second loop below copies for the remaining 1 to 3 samples. */ 00150 00151 /* Copy smaller length input sequence in reverse order into second scratch buffer */ 00152 while(k > 0u) 00153 { 00154 /* copy second buffer in reversal manner */ 00155 *pScr2-- = *px++; 00156 *pScr2-- = *px++; 00157 *pScr2-- = *px++; 00158 *pScr2-- = *px++; 00159 00160 /* Decrement the loop counter */ 00161 k--; 00162 } 00163 00164 /* If the count is not a multiple of 4, copy remaining samples here. 00165 ** No loop unrolling is used. */ 00166 k = srcBLen % 0x4u; 00167 00168 while(k > 0u) 00169 { 00170 /* copy second buffer in reversal manner for remaining samples */ 00171 *pScr2-- = *px++; 00172 00173 /* Decrement the loop counter */ 00174 k--; 00175 } 00176 00177 /* Initialze temporary scratch pointer */ 00178 pScr1 = pScratch1; 00179 00180 /* Assuming scratch1 buffer is aligned by 32-bit */ 00181 /* Fill (srcBLen - 1u) zeros in scratch buffer */ 00182 arm_fill_q15(0, pScr1, (srcBLen - 1u)); 00183 00184 /* Update temporary scratch pointer */ 00185 pScr1 += (srcBLen - 1u); 00186 00187 /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */ 00188 00189 /* Copy (srcALen) samples in scratch buffer */ 00190 arm_copy_q15(pIn1, pScr1, srcALen); 00191 00192 /* Update pointers */ 00193 pScr1 += srcALen; 00194 00195 /* Fill (srcBLen - 1u) zeros at end of scratch buffer */ 00196 arm_fill_q15(0, pScr1, (srcBLen - 1u)); 00197 00198 /* Update pointer */ 00199 pScr1 += (srcBLen - 1u); 00200 00201 /* Initialization of pIn2 pointer */ 00202 pIn2 = py; 00203 00204 pScratch1 += firstIndex; 00205 00206 pOut = pDst + firstIndex; 00207 00208 /* First part of the processing with loop unrolling process 4 data points at a time. 00209 ** a second loop below process for the remaining 1 to 3 samples. */ 00210 00211 /* Actual convolution process starts here */ 00212 blkCnt = (numPoints) >> 2; 00213 00214 while(blkCnt > 0) 00215 { 00216 /* Initialze temporary scratch pointer as scratch1 */ 00217 pScr1 = pScratch1; 00218 00219 /* Clear Accumlators */ 00220 acc0 = 0; 00221 acc1 = 0; 00222 acc2 = 0; 00223 acc3 = 0; 00224 00225 /* Read two samples from scratch1 buffer */ 00226 x1 = *__SIMD32(pScr1)++; 00227 00228 /* Read next two samples from scratch1 buffer */ 00229 x2 = *__SIMD32(pScr1)++; 00230 00231 tapCnt = (srcBLen) >> 2u; 00232 00233 while(tapCnt > 0u) 00234 { 00235 00236 /* Read four samples from smaller buffer */ 00237 y1 = _SIMD32_OFFSET(pIn2); 00238 y2 = _SIMD32_OFFSET(pIn2 + 2u); 00239 00240 /* multiply and accumlate */ 00241 acc0 = __SMLAD(x1, y1, acc0); 00242 acc2 = __SMLAD(x2, y1, acc2); 00243 00244 /* pack input data */ 00245 #ifndef ARM_MATH_BIG_ENDIAN 00246 x3 = __PKHBT(x2, x1, 0); 00247 #else 00248 x3 = __PKHBT(x1, x2, 0); 00249 #endif 00250 00251 /* multiply and accumlate */ 00252 acc1 = __SMLADX(x3, y1, acc1); 00253 00254 /* Read next two samples from scratch1 buffer */ 00255 x1 = _SIMD32_OFFSET(pScr1); 00256 00257 /* multiply and accumlate */ 00258 acc0 = __SMLAD(x2, y2, acc0); 00259 00260 acc2 = __SMLAD(x1, y2, acc2); 00261 00262 /* pack input data */ 00263 #ifndef ARM_MATH_BIG_ENDIAN 00264 x3 = __PKHBT(x1, x2, 0); 00265 #else 00266 x3 = __PKHBT(x2, x1, 0); 00267 #endif 00268 00269 acc3 = __SMLADX(x3, y1, acc3); 00270 acc1 = __SMLADX(x3, y2, acc1); 00271 00272 x2 = _SIMD32_OFFSET(pScr1 + 2u); 00273 00274 #ifndef ARM_MATH_BIG_ENDIAN 00275 x3 = __PKHBT(x2, x1, 0); 00276 #else 00277 x3 = __PKHBT(x1, x2, 0); 00278 #endif 00279 00280 acc3 = __SMLADX(x3, y2, acc3); 00281 00282 /* update scratch pointers */ 00283 pIn2 += 4u; 00284 pScr1 += 4u; 00285 00286 00287 /* Decrement the loop counter */ 00288 tapCnt--; 00289 } 00290 00291 /* Update scratch pointer for remaining samples of smaller length sequence */ 00292 pScr1 -= 4u; 00293 00294 /* apply same above for remaining samples of smaller length sequence */ 00295 tapCnt = (srcBLen) & 3u; 00296 00297 while(tapCnt > 0u) 00298 { 00299 00300 /* accumlate the results */ 00301 acc0 += (*pScr1++ * *pIn2); 00302 acc1 += (*pScr1++ * *pIn2); 00303 acc2 += (*pScr1++ * *pIn2); 00304 acc3 += (*pScr1++ * *pIn2++); 00305 00306 pScr1 -= 3u; 00307 00308 /* Decrement the loop counter */ 00309 tapCnt--; 00310 } 00311 00312 blkCnt--; 00313 00314 00315 /* Store the results in the accumulators in the destination buffer. */ 00316 00317 #ifndef ARM_MATH_BIG_ENDIAN 00318 00319 *__SIMD32(pOut)++ = 00320 __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); 00321 *__SIMD32(pOut)++ = 00322 __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); 00323 00324 #else 00325 00326 *__SIMD32(pOut)++ = 00327 __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16); 00328 *__SIMD32(pOut)++ = 00329 __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16); 00330 00331 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00332 00333 /* Initialization of inputB pointer */ 00334 pIn2 = py; 00335 00336 pScratch1 += 4u; 00337 00338 } 00339 00340 00341 blkCnt = numPoints & 0x3; 00342 00343 /* Calculate convolution for remaining samples of Bigger length sequence */ 00344 while(blkCnt > 0) 00345 { 00346 /* Initialze temporary scratch pointer as scratch1 */ 00347 pScr1 = pScratch1; 00348 00349 /* Clear Accumlators */ 00350 acc0 = 0; 00351 00352 tapCnt = (srcBLen) >> 1u; 00353 00354 while(tapCnt > 0u) 00355 { 00356 00357 /* Read next two samples from scratch1 buffer */ 00358 x1 = *__SIMD32(pScr1)++; 00359 00360 /* Read two samples from smaller buffer */ 00361 y1 = *__SIMD32(pIn2)++; 00362 00363 acc0 = __SMLAD(x1, y1, acc0); 00364 00365 /* Decrement the loop counter */ 00366 tapCnt--; 00367 } 00368 00369 tapCnt = (srcBLen) & 1u; 00370 00371 /* apply same above for remaining samples of smaller length sequence */ 00372 while(tapCnt > 0u) 00373 { 00374 00375 /* accumlate the results */ 00376 acc0 += (*pScr1++ * *pIn2++); 00377 00378 /* Decrement the loop counter */ 00379 tapCnt--; 00380 } 00381 00382 blkCnt--; 00383 00384 /* The result is in 2.30 format. Convert to 1.15 with saturation. 00385 ** Then store the output in the destination buffer. */ 00386 *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16)); 00387 00388 /* Initialization of inputB pointer */ 00389 pIn2 = py; 00390 00391 pScratch1 += 1u; 00392 00393 } 00394 /* set status as ARM_MATH_SUCCESS */ 00395 status = ARM_MATH_SUCCESS; 00396 } 00397 /* Return to application */ 00398 return (status); 00399 } 00400 00401 #else 00402 00403 arm_status arm_conv_partial_fast_opt_q15( 00404 q15_t * pSrcA, 00405 uint32_t srcALen, 00406 q15_t * pSrcB, 00407 uint32_t srcBLen, 00408 q15_t * pDst, 00409 uint32_t firstIndex, 00410 uint32_t numPoints, 00411 q15_t * pScratch1, 00412 q15_t * pScratch2) 00413 { 00414 00415 q15_t *pOut = pDst; /* output pointer */ 00416 q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */ 00417 q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */ 00418 q31_t acc0, acc1, acc2, acc3; /* Accumulator */ 00419 q15_t *pIn1; /* inputA pointer */ 00420 q15_t *pIn2; /* inputB pointer */ 00421 q15_t *px; /* Intermediate inputA pointer */ 00422 q15_t *py; /* Intermediate inputB pointer */ 00423 uint32_t j, k, blkCnt; /* loop counter */ 00424 arm_status status; /* Status variable */ 00425 uint32_t tapCnt; /* loop count */ 00426 q15_t x10, x11, x20, x21; /* Temporary variables to hold srcA buffer */ 00427 q15_t y10, y11; /* Temporary variables to hold srcB buffer */ 00428 00429 00430 /* Check for range of output samples to be calculated */ 00431 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00432 { 00433 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00434 status = ARM_MATH_ARGUMENT_ERROR; 00435 } 00436 else 00437 { 00438 00439 /* The algorithm implementation is based on the lengths of the inputs. */ 00440 /* srcB is always made to slide across srcA. */ 00441 /* So srcBLen is always considered as shorter or equal to srcALen */ 00442 if(srcALen >= srcBLen) 00443 { 00444 /* Initialization of inputA pointer */ 00445 pIn1 = pSrcA; 00446 00447 /* Initialization of inputB pointer */ 00448 pIn2 = pSrcB; 00449 } 00450 else 00451 { 00452 /* Initialization of inputA pointer */ 00453 pIn1 = pSrcB; 00454 00455 /* Initialization of inputB pointer */ 00456 pIn2 = pSrcA; 00457 00458 /* srcBLen is always considered as shorter or equal to srcALen */ 00459 j = srcBLen; 00460 srcBLen = srcALen; 00461 srcALen = j; 00462 } 00463 00464 /* Temporary pointer for scratch2 */ 00465 py = pScratch2; 00466 00467 /* pointer to take end of scratch2 buffer */ 00468 pScr2 = pScratch2 + srcBLen - 1; 00469 00470 /* points to smaller length sequence */ 00471 px = pIn2; 00472 00473 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00474 k = srcBLen >> 2u; 00475 00476 /* First part of the processing with loop unrolling copies 4 data points at a time. 00477 ** a second loop below copies for the remaining 1 to 3 samples. */ 00478 while(k > 0u) 00479 { 00480 /* copy second buffer in reversal manner */ 00481 *pScr2-- = *px++; 00482 *pScr2-- = *px++; 00483 *pScr2-- = *px++; 00484 *pScr2-- = *px++; 00485 00486 /* Decrement the loop counter */ 00487 k--; 00488 } 00489 00490 /* If the count is not a multiple of 4, copy remaining samples here. 00491 ** No loop unrolling is used. */ 00492 k = srcBLen % 0x4u; 00493 00494 while(k > 0u) 00495 { 00496 /* copy second buffer in reversal manner for remaining samples */ 00497 *pScr2-- = *px++; 00498 00499 /* Decrement the loop counter */ 00500 k--; 00501 } 00502 00503 /* Initialze temporary scratch pointer */ 00504 pScr1 = pScratch1; 00505 00506 /* Fill (srcBLen - 1u) zeros in scratch buffer */ 00507 arm_fill_q15(0, pScr1, (srcBLen - 1u)); 00508 00509 /* Update temporary scratch pointer */ 00510 pScr1 += (srcBLen - 1u); 00511 00512 /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */ 00513 00514 00515 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00516 k = srcALen >> 2u; 00517 00518 /* First part of the processing with loop unrolling copies 4 data points at a time. 00519 ** a second loop below copies for the remaining 1 to 3 samples. */ 00520 while(k > 0u) 00521 { 00522 /* copy second buffer in reversal manner */ 00523 *pScr1++ = *pIn1++; 00524 *pScr1++ = *pIn1++; 00525 *pScr1++ = *pIn1++; 00526 *pScr1++ = *pIn1++; 00527 00528 /* Decrement the loop counter */ 00529 k--; 00530 } 00531 00532 /* If the count is not a multiple of 4, copy remaining samples here. 00533 ** No loop unrolling is used. */ 00534 k = srcALen % 0x4u; 00535 00536 while(k > 0u) 00537 { 00538 /* copy second buffer in reversal manner for remaining samples */ 00539 *pScr1++ = *pIn1++; 00540 00541 /* Decrement the loop counter */ 00542 k--; 00543 } 00544 00545 00546 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00547 k = (srcBLen - 1u) >> 2u; 00548 00549 /* First part of the processing with loop unrolling copies 4 data points at a time. 00550 ** a second loop below copies for the remaining 1 to 3 samples. */ 00551 while(k > 0u) 00552 { 00553 /* copy second buffer in reversal manner */ 00554 *pScr1++ = 0; 00555 *pScr1++ = 0; 00556 *pScr1++ = 0; 00557 *pScr1++ = 0; 00558 00559 /* Decrement the loop counter */ 00560 k--; 00561 } 00562 00563 /* If the count is not a multiple of 4, copy remaining samples here. 00564 ** No loop unrolling is used. */ 00565 k = (srcBLen - 1u) % 0x4u; 00566 00567 while(k > 0u) 00568 { 00569 /* copy second buffer in reversal manner for remaining samples */ 00570 *pScr1++ = 0; 00571 00572 /* Decrement the loop counter */ 00573 k--; 00574 } 00575 00576 00577 /* Initialization of pIn2 pointer */ 00578 pIn2 = py; 00579 00580 pScratch1 += firstIndex; 00581 00582 pOut = pDst + firstIndex; 00583 00584 /* Actual convolution process starts here */ 00585 blkCnt = (numPoints) >> 2; 00586 00587 while(blkCnt > 0) 00588 { 00589 /* Initialze temporary scratch pointer as scratch1 */ 00590 pScr1 = pScratch1; 00591 00592 /* Clear Accumlators */ 00593 acc0 = 0; 00594 acc1 = 0; 00595 acc2 = 0; 00596 acc3 = 0; 00597 00598 /* Read two samples from scratch1 buffer */ 00599 x10 = *pScr1++; 00600 x11 = *pScr1++; 00601 00602 /* Read next two samples from scratch1 buffer */ 00603 x20 = *pScr1++; 00604 x21 = *pScr1++; 00605 00606 tapCnt = (srcBLen) >> 2u; 00607 00608 while(tapCnt > 0u) 00609 { 00610 00611 /* Read two samples from smaller buffer */ 00612 y10 = *pIn2; 00613 y11 = *(pIn2 + 1u); 00614 00615 /* multiply and accumlate */ 00616 acc0 += (q31_t) x10 *y10; 00617 acc0 += (q31_t) x11 *y11; 00618 acc2 += (q31_t) x20 *y10; 00619 acc2 += (q31_t) x21 *y11; 00620 00621 /* multiply and accumlate */ 00622 acc1 += (q31_t) x11 *y10; 00623 acc1 += (q31_t) x20 *y11; 00624 00625 /* Read next two samples from scratch1 buffer */ 00626 x10 = *pScr1; 00627 x11 = *(pScr1 + 1u); 00628 00629 /* multiply and accumlate */ 00630 acc3 += (q31_t) x21 *y10; 00631 acc3 += (q31_t) x10 *y11; 00632 00633 /* Read next two samples from scratch2 buffer */ 00634 y10 = *(pIn2 + 2u); 00635 y11 = *(pIn2 + 3u); 00636 00637 /* multiply and accumlate */ 00638 acc0 += (q31_t) x20 *y10; 00639 acc0 += (q31_t) x21 *y11; 00640 acc2 += (q31_t) x10 *y10; 00641 acc2 += (q31_t) x11 *y11; 00642 acc1 += (q31_t) x21 *y10; 00643 acc1 += (q31_t) x10 *y11; 00644 00645 /* Read next two samples from scratch1 buffer */ 00646 x20 = *(pScr1 + 2); 00647 x21 = *(pScr1 + 3); 00648 00649 /* multiply and accumlate */ 00650 acc3 += (q31_t) x11 *y10; 00651 acc3 += (q31_t) x20 *y11; 00652 00653 /* update scratch pointers */ 00654 pIn2 += 4u; 00655 pScr1 += 4u; 00656 00657 /* Decrement the loop counter */ 00658 tapCnt--; 00659 } 00660 00661 /* Update scratch pointer for remaining samples of smaller length sequence */ 00662 pScr1 -= 4u; 00663 00664 /* apply same above for remaining samples of smaller length sequence */ 00665 tapCnt = (srcBLen) & 3u; 00666 00667 while(tapCnt > 0u) 00668 { 00669 /* accumlate the results */ 00670 acc0 += (*pScr1++ * *pIn2); 00671 acc1 += (*pScr1++ * *pIn2); 00672 acc2 += (*pScr1++ * *pIn2); 00673 acc3 += (*pScr1++ * *pIn2++); 00674 00675 pScr1 -= 3u; 00676 00677 /* Decrement the loop counter */ 00678 tapCnt--; 00679 } 00680 00681 blkCnt--; 00682 00683 00684 /* Store the results in the accumulators in the destination buffer. */ 00685 *pOut++ = __SSAT((acc0 >> 15), 16); 00686 *pOut++ = __SSAT((acc1 >> 15), 16); 00687 *pOut++ = __SSAT((acc2 >> 15), 16); 00688 *pOut++ = __SSAT((acc3 >> 15), 16); 00689 00690 /* Initialization of inputB pointer */ 00691 pIn2 = py; 00692 00693 pScratch1 += 4u; 00694 00695 } 00696 00697 00698 blkCnt = numPoints & 0x3; 00699 00700 /* Calculate convolution for remaining samples of Bigger length sequence */ 00701 while(blkCnt > 0) 00702 { 00703 /* Initialze temporary scratch pointer as scratch1 */ 00704 pScr1 = pScratch1; 00705 00706 /* Clear Accumlators */ 00707 acc0 = 0; 00708 00709 tapCnt = (srcBLen) >> 1u; 00710 00711 while(tapCnt > 0u) 00712 { 00713 00714 /* Read next two samples from scratch1 buffer */ 00715 x10 = *pScr1++; 00716 x11 = *pScr1++; 00717 00718 /* Read two samples from smaller buffer */ 00719 y10 = *pIn2++; 00720 y11 = *pIn2++; 00721 00722 /* multiply and accumlate */ 00723 acc0 += (q31_t) x10 *y10; 00724 acc0 += (q31_t) x11 *y11; 00725 00726 /* Decrement the loop counter */ 00727 tapCnt--; 00728 } 00729 00730 tapCnt = (srcBLen) & 1u; 00731 00732 /* apply same above for remaining samples of smaller length sequence */ 00733 while(tapCnt > 0u) 00734 { 00735 00736 /* accumlate the results */ 00737 acc0 += (*pScr1++ * *pIn2++); 00738 00739 /* Decrement the loop counter */ 00740 tapCnt--; 00741 } 00742 00743 blkCnt--; 00744 00745 /* Store the result in the accumulator in the destination buffer. */ 00746 *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16)); 00747 00748 /* Initialization of inputB pointer */ 00749 pIn2 = py; 00750 00751 pScratch1 += 1u; 00752 00753 } 00754 00755 /* set status as ARM_MATH_SUCCESS */ 00756 status = ARM_MATH_SUCCESS; 00757 00758 } 00759 00760 /* Return to application */ 00761 return (status); 00762 } 00763 00764 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00765 00766 /** 00767 * @} end of PartialConv group 00768 */
Generated on Tue Jul 12 2022 18:44:08 by
