CMSIS DSP library
Dependents: performance_timer Surfboard_ gps2rtty Capstone ... more
arm_conv_partial_opt_q7.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 19. March 2015 00005 * $Revision: V.1.4.5 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_partial_opt_q7.c 00009 * 00010 * Description: Partial convolution of Q7 sequences. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupFilters 00045 */ 00046 00047 /** 00048 * @addtogroup PartialConv 00049 * @{ 00050 */ 00051 00052 /** 00053 * @brief Partial convolution of Q7 sequences. 00054 * @param[in] *pSrcA points to the first input sequence. 00055 * @param[in] srcALen length of the first input sequence. 00056 * @param[in] *pSrcB points to the second input sequence. 00057 * @param[in] srcBLen length of the second input sequence. 00058 * @param[out] *pDst points to the location where the output result is written. 00059 * @param[in] firstIndex is the first output sample to start with. 00060 * @param[in] numPoints is the number of output points to be computed. 00061 * @param[in] *pScratch1 points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. 00062 * @param[in] *pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen). 00063 * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. 00064 * 00065 * \par Restrictions 00066 * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE 00067 * In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit 00068 * 00069 * 00070 * 00071 */ 00072 00073 00074 #ifndef UNALIGNED_SUPPORT_DISABLE 00075 00076 arm_status arm_conv_partial_opt_q7( 00077 q7_t * pSrcA, 00078 uint32_t srcALen, 00079 q7_t * pSrcB, 00080 uint32_t srcBLen, 00081 q7_t * pDst, 00082 uint32_t firstIndex, 00083 uint32_t numPoints, 00084 q15_t * pScratch1, 00085 q15_t * pScratch2) 00086 { 00087 00088 q15_t *pScr2, *pScr1; /* Intermediate pointers for scratch pointers */ 00089 q15_t x4; /* Temporary input variable */ 00090 q7_t *pIn1, *pIn2; /* inputA and inputB pointer */ 00091 uint32_t j, k, blkCnt, tapCnt; /* loop counter */ 00092 q7_t *px; /* Temporary input1 pointer */ 00093 q15_t *py; /* Temporary input2 pointer */ 00094 q31_t acc0, acc1, acc2, acc3; /* Accumulator */ 00095 q31_t x1, x2, x3, y1; /* Temporary input variables */ 00096 arm_status status; 00097 q7_t *pOut = pDst; /* output pointer */ 00098 q7_t out0, out1, out2, out3; /* temporary variables */ 00099 00100 /* Check for range of output samples to be calculated */ 00101 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00102 { 00103 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00104 status = ARM_MATH_ARGUMENT_ERROR; 00105 } 00106 else 00107 { 00108 00109 /* The algorithm implementation is based on the lengths of the inputs. */ 00110 /* srcB is always made to slide across srcA. */ 00111 /* So srcBLen is always considered as shorter or equal to srcALen */ 00112 if(srcALen >= srcBLen) 00113 { 00114 /* Initialization of inputA pointer */ 00115 pIn1 = pSrcA; 00116 00117 /* Initialization of inputB pointer */ 00118 pIn2 = pSrcB; 00119 } 00120 else 00121 { 00122 /* Initialization of inputA pointer */ 00123 pIn1 = pSrcB; 00124 00125 /* Initialization of inputB pointer */ 00126 pIn2 = pSrcA; 00127 00128 /* srcBLen is always considered as shorter or equal to srcALen */ 00129 j = srcBLen; 00130 srcBLen = srcALen; 00131 srcALen = j; 00132 } 00133 00134 /* pointer to take end of scratch2 buffer */ 00135 pScr2 = pScratch2; 00136 00137 /* points to smaller length sequence */ 00138 px = pIn2 + srcBLen - 1; 00139 00140 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00141 k = srcBLen >> 2u; 00142 00143 /* First part of the processing with loop unrolling copies 4 data points at a time. 00144 ** a second loop below copies for the remaining 1 to 3 samples. */ 00145 while(k > 0u) 00146 { 00147 /* copy second buffer in reversal manner */ 00148 x4 = (q15_t) * px--; 00149 *pScr2++ = x4; 00150 x4 = (q15_t) * px--; 00151 *pScr2++ = x4; 00152 x4 = (q15_t) * px--; 00153 *pScr2++ = x4; 00154 x4 = (q15_t) * px--; 00155 *pScr2++ = x4; 00156 00157 /* Decrement the loop counter */ 00158 k--; 00159 } 00160 00161 /* If the count is not a multiple of 4, copy remaining samples here. 00162 ** No loop unrolling is used. */ 00163 k = srcBLen % 0x4u; 00164 00165 while(k > 0u) 00166 { 00167 /* copy second buffer in reversal manner for remaining samples */ 00168 x4 = (q15_t) * px--; 00169 *pScr2++ = x4; 00170 00171 /* Decrement the loop counter */ 00172 k--; 00173 } 00174 00175 /* Initialze temporary scratch pointer */ 00176 pScr1 = pScratch1; 00177 00178 /* Fill (srcBLen - 1u) zeros in scratch buffer */ 00179 arm_fill_q15(0, pScr1, (srcBLen - 1u)); 00180 00181 /* Update temporary scratch pointer */ 00182 pScr1 += (srcBLen - 1u); 00183 00184 /* Copy (srcALen) samples in scratch buffer */ 00185 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00186 k = srcALen >> 2u; 00187 00188 /* First part of the processing with loop unrolling copies 4 data points at a time. 00189 ** a second loop below copies for the remaining 1 to 3 samples. */ 00190 while(k > 0u) 00191 { 00192 /* copy second buffer in reversal manner */ 00193 x4 = (q15_t) * pIn1++; 00194 *pScr1++ = x4; 00195 x4 = (q15_t) * pIn1++; 00196 *pScr1++ = x4; 00197 x4 = (q15_t) * pIn1++; 00198 *pScr1++ = x4; 00199 x4 = (q15_t) * pIn1++; 00200 *pScr1++ = x4; 00201 00202 /* Decrement the loop counter */ 00203 k--; 00204 } 00205 00206 /* If the count is not a multiple of 4, copy remaining samples here. 00207 ** No loop unrolling is used. */ 00208 k = srcALen % 0x4u; 00209 00210 while(k > 0u) 00211 { 00212 /* copy second buffer in reversal manner for remaining samples */ 00213 x4 = (q15_t) * pIn1++; 00214 *pScr1++ = x4; 00215 00216 /* Decrement the loop counter */ 00217 k--; 00218 } 00219 00220 /* Fill (srcBLen - 1u) zeros at end of scratch buffer */ 00221 arm_fill_q15(0, pScr1, (srcBLen - 1u)); 00222 00223 /* Update pointer */ 00224 pScr1 += (srcBLen - 1u); 00225 00226 00227 /* Temporary pointer for scratch2 */ 00228 py = pScratch2; 00229 00230 /* Initialization of pIn2 pointer */ 00231 pIn2 = (q7_t *) py; 00232 00233 pScr2 = py; 00234 00235 pOut = pDst + firstIndex; 00236 00237 pScratch1 += firstIndex; 00238 00239 /* Actual convolution process starts here */ 00240 blkCnt = (numPoints) >> 2; 00241 00242 00243 while(blkCnt > 0) 00244 { 00245 /* Initialze temporary scratch pointer as scratch1 */ 00246 pScr1 = pScratch1; 00247 00248 /* Clear Accumlators */ 00249 acc0 = 0; 00250 acc1 = 0; 00251 acc2 = 0; 00252 acc3 = 0; 00253 00254 /* Read two samples from scratch1 buffer */ 00255 x1 = *__SIMD32(pScr1)++; 00256 00257 /* Read next two samples from scratch1 buffer */ 00258 x2 = *__SIMD32(pScr1)++; 00259 00260 tapCnt = (srcBLen) >> 2u; 00261 00262 while(tapCnt > 0u) 00263 { 00264 00265 /* Read four samples from smaller buffer */ 00266 y1 = _SIMD32_OFFSET(pScr2); 00267 00268 /* multiply and accumlate */ 00269 acc0 = __SMLAD(x1, y1, acc0); 00270 acc2 = __SMLAD(x2, y1, acc2); 00271 00272 /* pack input data */ 00273 #ifndef ARM_MATH_BIG_ENDIAN 00274 x3 = __PKHBT(x2, x1, 0); 00275 #else 00276 x3 = __PKHBT(x1, x2, 0); 00277 #endif 00278 00279 /* multiply and accumlate */ 00280 acc1 = __SMLADX(x3, y1, acc1); 00281 00282 /* Read next two samples from scratch1 buffer */ 00283 x1 = *__SIMD32(pScr1)++; 00284 00285 /* pack input data */ 00286 #ifndef ARM_MATH_BIG_ENDIAN 00287 x3 = __PKHBT(x1, x2, 0); 00288 #else 00289 x3 = __PKHBT(x2, x1, 0); 00290 #endif 00291 00292 acc3 = __SMLADX(x3, y1, acc3); 00293 00294 /* Read four samples from smaller buffer */ 00295 y1 = _SIMD32_OFFSET(pScr2 + 2u); 00296 00297 acc0 = __SMLAD(x2, y1, acc0); 00298 00299 acc2 = __SMLAD(x1, y1, acc2); 00300 00301 acc1 = __SMLADX(x3, y1, acc1); 00302 00303 x2 = *__SIMD32(pScr1)++; 00304 00305 #ifndef ARM_MATH_BIG_ENDIAN 00306 x3 = __PKHBT(x2, x1, 0); 00307 #else 00308 x3 = __PKHBT(x1, x2, 0); 00309 #endif 00310 00311 acc3 = __SMLADX(x3, y1, acc3); 00312 00313 pScr2 += 4u; 00314 00315 00316 /* Decrement the loop counter */ 00317 tapCnt--; 00318 } 00319 00320 00321 00322 /* Update scratch pointer for remaining samples of smaller length sequence */ 00323 pScr1 -= 4u; 00324 00325 00326 /* apply same above for remaining samples of smaller length sequence */ 00327 tapCnt = (srcBLen) & 3u; 00328 00329 while(tapCnt > 0u) 00330 { 00331 00332 /* accumlate the results */ 00333 acc0 += (*pScr1++ * *pScr2); 00334 acc1 += (*pScr1++ * *pScr2); 00335 acc2 += (*pScr1++ * *pScr2); 00336 acc3 += (*pScr1++ * *pScr2++); 00337 00338 pScr1 -= 3u; 00339 00340 /* Decrement the loop counter */ 00341 tapCnt--; 00342 } 00343 00344 blkCnt--; 00345 00346 /* Store the result in the accumulator in the destination buffer. */ 00347 out0 = (q7_t) (__SSAT(acc0 >> 7u, 8)); 00348 out1 = (q7_t) (__SSAT(acc1 >> 7u, 8)); 00349 out2 = (q7_t) (__SSAT(acc2 >> 7u, 8)); 00350 out3 = (q7_t) (__SSAT(acc3 >> 7u, 8)); 00351 00352 *__SIMD32(pOut)++ = __PACKq7(out0, out1, out2, out3); 00353 00354 /* Initialization of inputB pointer */ 00355 pScr2 = py; 00356 00357 pScratch1 += 4u; 00358 00359 } 00360 00361 blkCnt = (numPoints) & 0x3; 00362 00363 /* Calculate convolution for remaining samples of Bigger length sequence */ 00364 while(blkCnt > 0) 00365 { 00366 /* Initialze temporary scratch pointer as scratch1 */ 00367 pScr1 = pScratch1; 00368 00369 /* Clear Accumlators */ 00370 acc0 = 0; 00371 00372 tapCnt = (srcBLen) >> 1u; 00373 00374 while(tapCnt > 0u) 00375 { 00376 00377 /* Read next two samples from scratch1 buffer */ 00378 x1 = *__SIMD32(pScr1)++; 00379 00380 /* Read two samples from smaller buffer */ 00381 y1 = *__SIMD32(pScr2)++; 00382 00383 acc0 = __SMLAD(x1, y1, acc0); 00384 00385 /* Decrement the loop counter */ 00386 tapCnt--; 00387 } 00388 00389 tapCnt = (srcBLen) & 1u; 00390 00391 /* apply same above for remaining samples of smaller length sequence */ 00392 while(tapCnt > 0u) 00393 { 00394 00395 /* accumlate the results */ 00396 acc0 += (*pScr1++ * *pScr2++); 00397 00398 /* Decrement the loop counter */ 00399 tapCnt--; 00400 } 00401 00402 blkCnt--; 00403 00404 /* Store the result in the accumulator in the destination buffer. */ 00405 *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8)); 00406 00407 /* Initialization of inputB pointer */ 00408 pScr2 = py; 00409 00410 pScratch1 += 1u; 00411 00412 } 00413 00414 /* set status as ARM_MATH_SUCCESS */ 00415 status = ARM_MATH_SUCCESS; 00416 00417 00418 } 00419 00420 return (status); 00421 00422 } 00423 00424 #else 00425 00426 arm_status arm_conv_partial_opt_q7( 00427 q7_t * pSrcA, 00428 uint32_t srcALen, 00429 q7_t * pSrcB, 00430 uint32_t srcBLen, 00431 q7_t * pDst, 00432 uint32_t firstIndex, 00433 uint32_t numPoints, 00434 q15_t * pScratch1, 00435 q15_t * pScratch2) 00436 { 00437 00438 q15_t *pScr2, *pScr1; /* Intermediate pointers for scratch pointers */ 00439 q15_t x4; /* Temporary input variable */ 00440 q7_t *pIn1, *pIn2; /* inputA and inputB pointer */ 00441 uint32_t j, k, blkCnt, tapCnt; /* loop counter */ 00442 q7_t *px; /* Temporary input1 pointer */ 00443 q15_t *py; /* Temporary input2 pointer */ 00444 q31_t acc0, acc1, acc2, acc3; /* Accumulator */ 00445 arm_status status; 00446 q7_t *pOut = pDst; /* output pointer */ 00447 q15_t x10, x11, x20, x21; /* Temporary input variables */ 00448 q15_t y10, y11; /* Temporary input variables */ 00449 00450 /* Check for range of output samples to be calculated */ 00451 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00452 { 00453 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00454 status = ARM_MATH_ARGUMENT_ERROR; 00455 } 00456 else 00457 { 00458 00459 /* The algorithm implementation is based on the lengths of the inputs. */ 00460 /* srcB is always made to slide across srcA. */ 00461 /* So srcBLen is always considered as shorter or equal to srcALen */ 00462 if(srcALen >= srcBLen) 00463 { 00464 /* Initialization of inputA pointer */ 00465 pIn1 = pSrcA; 00466 00467 /* Initialization of inputB pointer */ 00468 pIn2 = pSrcB; 00469 } 00470 else 00471 { 00472 /* Initialization of inputA pointer */ 00473 pIn1 = pSrcB; 00474 00475 /* Initialization of inputB pointer */ 00476 pIn2 = pSrcA; 00477 00478 /* srcBLen is always considered as shorter or equal to srcALen */ 00479 j = srcBLen; 00480 srcBLen = srcALen; 00481 srcALen = j; 00482 } 00483 00484 /* pointer to take end of scratch2 buffer */ 00485 pScr2 = pScratch2; 00486 00487 /* points to smaller length sequence */ 00488 px = pIn2 + srcBLen - 1; 00489 00490 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00491 k = srcBLen >> 2u; 00492 00493 /* First part of the processing with loop unrolling copies 4 data points at a time. 00494 ** a second loop below copies for the remaining 1 to 3 samples. */ 00495 while(k > 0u) 00496 { 00497 /* copy second buffer in reversal manner */ 00498 x4 = (q15_t) * px--; 00499 *pScr2++ = x4; 00500 x4 = (q15_t) * px--; 00501 *pScr2++ = x4; 00502 x4 = (q15_t) * px--; 00503 *pScr2++ = x4; 00504 x4 = (q15_t) * px--; 00505 *pScr2++ = x4; 00506 00507 /* Decrement the loop counter */ 00508 k--; 00509 } 00510 00511 /* If the count is not a multiple of 4, copy remaining samples here. 00512 ** No loop unrolling is used. */ 00513 k = srcBLen % 0x4u; 00514 00515 while(k > 0u) 00516 { 00517 /* copy second buffer in reversal manner for remaining samples */ 00518 x4 = (q15_t) * px--; 00519 *pScr2++ = x4; 00520 00521 /* Decrement the loop counter */ 00522 k--; 00523 } 00524 00525 /* Initialze temporary scratch pointer */ 00526 pScr1 = pScratch1; 00527 00528 /* Fill (srcBLen - 1u) zeros in scratch buffer */ 00529 arm_fill_q15(0, pScr1, (srcBLen - 1u)); 00530 00531 /* Update temporary scratch pointer */ 00532 pScr1 += (srcBLen - 1u); 00533 00534 /* Copy (srcALen) samples in scratch buffer */ 00535 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00536 k = srcALen >> 2u; 00537 00538 /* First part of the processing with loop unrolling copies 4 data points at a time. 00539 ** a second loop below copies for the remaining 1 to 3 samples. */ 00540 while(k > 0u) 00541 { 00542 /* copy second buffer in reversal manner */ 00543 x4 = (q15_t) * pIn1++; 00544 *pScr1++ = x4; 00545 x4 = (q15_t) * pIn1++; 00546 *pScr1++ = x4; 00547 x4 = (q15_t) * pIn1++; 00548 *pScr1++ = x4; 00549 x4 = (q15_t) * pIn1++; 00550 *pScr1++ = x4; 00551 00552 /* Decrement the loop counter */ 00553 k--; 00554 } 00555 00556 /* If the count is not a multiple of 4, copy remaining samples here. 00557 ** No loop unrolling is used. */ 00558 k = srcALen % 0x4u; 00559 00560 while(k > 0u) 00561 { 00562 /* copy second buffer in reversal manner for remaining samples */ 00563 x4 = (q15_t) * pIn1++; 00564 *pScr1++ = x4; 00565 00566 /* Decrement the loop counter */ 00567 k--; 00568 } 00569 00570 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00571 k = (srcBLen - 1u) >> 2u; 00572 00573 /* First part of the processing with loop unrolling copies 4 data points at a time. 00574 ** a second loop below copies for the remaining 1 to 3 samples. */ 00575 while(k > 0u) 00576 { 00577 /* copy second buffer in reversal manner */ 00578 *pScr1++ = 0; 00579 *pScr1++ = 0; 00580 *pScr1++ = 0; 00581 *pScr1++ = 0; 00582 00583 /* Decrement the loop counter */ 00584 k--; 00585 } 00586 00587 /* If the count is not a multiple of 4, copy remaining samples here. 00588 ** No loop unrolling is used. */ 00589 k = (srcBLen - 1u) % 0x4u; 00590 00591 while(k > 0u) 00592 { 00593 /* copy second buffer in reversal manner for remaining samples */ 00594 *pScr1++ = 0; 00595 00596 /* Decrement the loop counter */ 00597 k--; 00598 } 00599 00600 00601 /* Temporary pointer for scratch2 */ 00602 py = pScratch2; 00603 00604 /* Initialization of pIn2 pointer */ 00605 pIn2 = (q7_t *) py; 00606 00607 pScr2 = py; 00608 00609 pOut = pDst + firstIndex; 00610 00611 pScratch1 += firstIndex; 00612 00613 /* Actual convolution process starts here */ 00614 blkCnt = (numPoints) >> 2; 00615 00616 00617 while(blkCnt > 0) 00618 { 00619 /* Initialze temporary scratch pointer as scratch1 */ 00620 pScr1 = pScratch1; 00621 00622 /* Clear Accumlators */ 00623 acc0 = 0; 00624 acc1 = 0; 00625 acc2 = 0; 00626 acc3 = 0; 00627 00628 /* Read two samples from scratch1 buffer */ 00629 x10 = *pScr1++; 00630 x11 = *pScr1++; 00631 00632 /* Read next two samples from scratch1 buffer */ 00633 x20 = *pScr1++; 00634 x21 = *pScr1++; 00635 00636 tapCnt = (srcBLen) >> 2u; 00637 00638 while(tapCnt > 0u) 00639 { 00640 00641 /* Read four samples from smaller buffer */ 00642 y10 = *pScr2; 00643 y11 = *(pScr2 + 1u); 00644 00645 /* multiply and accumlate */ 00646 acc0 += (q31_t) x10 *y10; 00647 acc0 += (q31_t) x11 *y11; 00648 acc2 += (q31_t) x20 *y10; 00649 acc2 += (q31_t) x21 *y11; 00650 00651 00652 acc1 += (q31_t) x11 *y10; 00653 acc1 += (q31_t) x20 *y11; 00654 00655 /* Read next two samples from scratch1 buffer */ 00656 x10 = *pScr1; 00657 x11 = *(pScr1 + 1u); 00658 00659 /* multiply and accumlate */ 00660 acc3 += (q31_t) x21 *y10; 00661 acc3 += (q31_t) x10 *y11; 00662 00663 /* Read next two samples from scratch2 buffer */ 00664 y10 = *(pScr2 + 2u); 00665 y11 = *(pScr2 + 3u); 00666 00667 /* multiply and accumlate */ 00668 acc0 += (q31_t) x20 *y10; 00669 acc0 += (q31_t) x21 *y11; 00670 acc2 += (q31_t) x10 *y10; 00671 acc2 += (q31_t) x11 *y11; 00672 acc1 += (q31_t) x21 *y10; 00673 acc1 += (q31_t) x10 *y11; 00674 00675 /* Read next two samples from scratch1 buffer */ 00676 x20 = *(pScr1 + 2); 00677 x21 = *(pScr1 + 3); 00678 00679 /* multiply and accumlate */ 00680 acc3 += (q31_t) x11 *y10; 00681 acc3 += (q31_t) x20 *y11; 00682 00683 /* update scratch pointers */ 00684 00685 pScr1 += 4u; 00686 pScr2 += 4u; 00687 00688 /* Decrement the loop counter */ 00689 tapCnt--; 00690 } 00691 00692 00693 00694 /* Update scratch pointer for remaining samples of smaller length sequence */ 00695 pScr1 -= 4u; 00696 00697 00698 /* apply same above for remaining samples of smaller length sequence */ 00699 tapCnt = (srcBLen) & 3u; 00700 00701 while(tapCnt > 0u) 00702 { 00703 00704 /* accumlate the results */ 00705 acc0 += (*pScr1++ * *pScr2); 00706 acc1 += (*pScr1++ * *pScr2); 00707 acc2 += (*pScr1++ * *pScr2); 00708 acc3 += (*pScr1++ * *pScr2++); 00709 00710 pScr1 -= 3u; 00711 00712 /* Decrement the loop counter */ 00713 tapCnt--; 00714 } 00715 00716 blkCnt--; 00717 00718 /* Store the result in the accumulator in the destination buffer. */ 00719 *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8)); 00720 *pOut++ = (q7_t) (__SSAT(acc1 >> 7u, 8)); 00721 *pOut++ = (q7_t) (__SSAT(acc2 >> 7u, 8)); 00722 *pOut++ = (q7_t) (__SSAT(acc3 >> 7u, 8)); 00723 00724 /* Initialization of inputB pointer */ 00725 pScr2 = py; 00726 00727 pScratch1 += 4u; 00728 00729 } 00730 00731 blkCnt = (numPoints) & 0x3; 00732 00733 /* Calculate convolution for remaining samples of Bigger length sequence */ 00734 while(blkCnt > 0) 00735 { 00736 /* Initialze temporary scratch pointer as scratch1 */ 00737 pScr1 = pScratch1; 00738 00739 /* Clear Accumlators */ 00740 acc0 = 0; 00741 00742 tapCnt = (srcBLen) >> 1u; 00743 00744 while(tapCnt > 0u) 00745 { 00746 00747 /* Read next two samples from scratch1 buffer */ 00748 x10 = *pScr1++; 00749 x11 = *pScr1++; 00750 00751 /* Read two samples from smaller buffer */ 00752 y10 = *pScr2++; 00753 y11 = *pScr2++; 00754 00755 /* multiply and accumlate */ 00756 acc0 += (q31_t) x10 *y10; 00757 acc0 += (q31_t) x11 *y11; 00758 00759 /* Decrement the loop counter */ 00760 tapCnt--; 00761 } 00762 00763 tapCnt = (srcBLen) & 1u; 00764 00765 /* apply same above for remaining samples of smaller length sequence */ 00766 while(tapCnt > 0u) 00767 { 00768 00769 /* accumlate the results */ 00770 acc0 += (*pScr1++ * *pScr2++); 00771 00772 /* Decrement the loop counter */ 00773 tapCnt--; 00774 } 00775 00776 blkCnt--; 00777 00778 /* Store the result in the accumulator in the destination buffer. */ 00779 *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8)); 00780 00781 /* Initialization of inputB pointer */ 00782 pScr2 = py; 00783 00784 pScratch1 += 1u; 00785 00786 } 00787 00788 /* set status as ARM_MATH_SUCCESS */ 00789 status = ARM_MATH_SUCCESS; 00790 00791 } 00792 00793 return (status); 00794 00795 } 00796 00797 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00798 00799 00800 00801 /** 00802 * @} end of PartialConv group 00803 */
Generated on Tue Jul 12 2022 11:59:16 by 1.7.2