CMSIS DSP library
Dependents: KL25Z_FFT_Demo Hat_Board_v5_1 KL25Z_FFT_Demo_tony KL25Z_FFT_Demo_tony ... more
Fork of mbed-dsp by
arm_conv_partial_opt_q7.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 17. January 2013 00005 * $Revision: V1.4.1 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_partial_opt_q7.c 00009 * 00010 * Description: Partial convolution of Q7 sequences. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupFilters 00045 */ 00046 00047 /** 00048 * @addtogroup PartialConv 00049 * @{ 00050 */ 00051 00052 /** 00053 * @brief Partial convolution of Q7 sequences. 00054 * @param[in] *pSrcA points to the first input sequence. 00055 * @param[in] srcALen length of the first input sequence. 00056 * @param[in] *pSrcB points to the second input sequence. 00057 * @param[in] srcBLen length of the second input sequence. 00058 * @param[out] *pDst points to the location where the output result is written. 00059 * @param[in] firstIndex is the first output sample to start with. 00060 * @param[in] numPoints is the number of output points to be computed. 00061 * @param[in] *pScratch1 points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. 00062 * @param[in] *pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen). 00063 * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. 
00064 * 00065 * \par Restrictions 00066 * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE 00067 * In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit 00068 * 00069 * 00070 * 00071 */ 00072 00073 00074 #ifndef UNALIGNED_SUPPORT_DISABLE 00075 00076 arm_status arm_conv_partial_opt_q7( 00077 q7_t * pSrcA, 00078 uint32_t srcALen, 00079 q7_t * pSrcB, 00080 uint32_t srcBLen, 00081 q7_t * pDst, 00082 uint32_t firstIndex, 00083 uint32_t numPoints, 00084 q15_t * pScratch1, 00085 q15_t * pScratch2) 00086 { 00087 00088 q15_t *pScr2, *pScr1; /* Intermediate pointers for scratch pointers */ 00089 q15_t x4; /* Temporary input variable */ 00090 q7_t *pIn1, *pIn2; /* inputA and inputB pointer */ 00091 uint32_t j, k, blkCnt, tapCnt; /* loop counter */ 00092 q7_t *px; /* Temporary input1 pointer */ 00093 q15_t *py; /* Temporary input2 pointer */ 00094 q31_t acc0, acc1, acc2, acc3; /* Accumulator */ 00095 q31_t x1, x2, x3, y1; /* Temporary input variables */ 00096 arm_status status; 00097 q7_t *pOut = pDst; /* output pointer */ 00098 q7_t out0, out1, out2, out3; /* temporary variables */ 00099 00100 /* Check for range of output samples to be calculated */ 00101 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00102 { 00103 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00104 status = ARM_MATH_ARGUMENT_ERROR; 00105 } 00106 else 00107 { 00108 00109 /* The algorithm implementation is based on the lengths of the inputs. */ 00110 /* srcB is always made to slide across srcA. 
*/ 00111 /* So srcBLen is always considered as shorter or equal to srcALen */ 00112 if(srcALen >= srcBLen) 00113 { 00114 /* Initialization of inputA pointer */ 00115 pIn1 = pSrcA; 00116 00117 /* Initialization of inputB pointer */ 00118 pIn2 = pSrcB; 00119 } 00120 else 00121 { 00122 /* Initialization of inputA pointer */ 00123 pIn1 = pSrcB; 00124 00125 /* Initialization of inputB pointer */ 00126 pIn2 = pSrcA; 00127 00128 /* srcBLen is always considered as shorter or equal to srcALen */ 00129 j = srcBLen; 00130 srcBLen = srcALen; 00131 srcALen = j; 00132 } 00133 00134 /* pointer to take end of scratch2 buffer */ 00135 pScr2 = pScratch2; 00136 00137 /* points to smaller length sequence */ 00138 px = pIn2 + srcBLen - 1; 00139 00140 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00141 k = srcBLen >> 2u; 00142 00143 /* First part of the processing with loop unrolling copies 4 data points at a time. 00144 ** a second loop below copies for the remaining 1 to 3 samples. */ 00145 while(k > 0u) 00146 { 00147 /* copy second buffer in reversal manner */ 00148 x4 = (q15_t) * px--; 00149 *pScr2++ = x4; 00150 x4 = (q15_t) * px--; 00151 *pScr2++ = x4; 00152 x4 = (q15_t) * px--; 00153 *pScr2++ = x4; 00154 x4 = (q15_t) * px--; 00155 *pScr2++ = x4; 00156 00157 /* Decrement the loop counter */ 00158 k--; 00159 } 00160 00161 /* If the count is not a multiple of 4, copy remaining samples here. 00162 ** No loop unrolling is used. 
*/ 00163 k = srcBLen % 0x4u; 00164 00165 while(k > 0u) 00166 { 00167 /* copy second buffer in reversal manner for remaining samples */ 00168 x4 = (q15_t) * px--; 00169 *pScr2++ = x4; 00170 00171 /* Decrement the loop counter */ 00172 k--; 00173 } 00174 00175 /* Initialze temporary scratch pointer */ 00176 pScr1 = pScratch1; 00177 00178 /* Fill (srcBLen - 1u) zeros in scratch buffer */ 00179 arm_fill_q15(0, pScr1, (srcBLen - 1u)); 00180 00181 /* Update temporary scratch pointer */ 00182 pScr1 += (srcBLen - 1u); 00183 00184 /* Copy (srcALen) samples in scratch buffer */ 00185 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00186 k = srcALen >> 2u; 00187 00188 /* First part of the processing with loop unrolling copies 4 data points at a time. 00189 ** a second loop below copies for the remaining 1 to 3 samples. */ 00190 while(k > 0u) 00191 { 00192 /* copy second buffer in reversal manner */ 00193 x4 = (q15_t) * pIn1++; 00194 *pScr1++ = x4; 00195 x4 = (q15_t) * pIn1++; 00196 *pScr1++ = x4; 00197 x4 = (q15_t) * pIn1++; 00198 *pScr1++ = x4; 00199 x4 = (q15_t) * pIn1++; 00200 *pScr1++ = x4; 00201 00202 /* Decrement the loop counter */ 00203 k--; 00204 } 00205 00206 /* If the count is not a multiple of 4, copy remaining samples here. 00207 ** No loop unrolling is used. 
*/ 00208 k = srcALen % 0x4u; 00209 00210 while(k > 0u) 00211 { 00212 /* copy second buffer in reversal manner for remaining samples */ 00213 x4 = (q15_t) * pIn1++; 00214 *pScr1++ = x4; 00215 00216 /* Decrement the loop counter */ 00217 k--; 00218 } 00219 00220 /* Fill (srcBLen - 1u) zeros at end of scratch buffer */ 00221 arm_fill_q15(0, pScr1, (srcBLen - 1u)); 00222 00223 /* Update pointer */ 00224 pScr1 += (srcBLen - 1u); 00225 00226 00227 /* Temporary pointer for scratch2 */ 00228 py = pScratch2; 00229 00230 /* Initialization of pIn2 pointer */ 00231 pIn2 = (q7_t *) py; 00232 00233 pScr2 = py; 00234 00235 pOut = pDst + firstIndex; 00236 00237 pScratch1 += firstIndex; 00238 00239 /* Actual convolution process starts here */ 00240 blkCnt = (numPoints) >> 2; 00241 00242 00243 while(blkCnt > 0) 00244 { 00245 /* Initialze temporary scratch pointer as scratch1 */ 00246 pScr1 = pScratch1; 00247 00248 /* Clear Accumlators */ 00249 acc0 = 0; 00250 acc1 = 0; 00251 acc2 = 0; 00252 acc3 = 0; 00253 00254 /* Read two samples from scratch1 buffer */ 00255 x1 = *__SIMD32(pScr1)++; 00256 00257 /* Read next two samples from scratch1 buffer */ 00258 x2 = *__SIMD32(pScr1)++; 00259 00260 tapCnt = (srcBLen) >> 2u; 00261 00262 while(tapCnt > 0u) 00263 { 00264 00265 /* Read four samples from smaller buffer */ 00266 y1 = _SIMD32_OFFSET(pScr2); 00267 00268 /* multiply and accumlate */ 00269 acc0 = __SMLAD(x1, y1, acc0); 00270 acc2 = __SMLAD(x2, y1, acc2); 00271 00272 /* pack input data */ 00273 #ifndef ARM_MATH_BIG_ENDIAN 00274 x3 = __PKHBT(x2, x1, 0); 00275 #else 00276 x3 = __PKHBT(x1, x2, 0); 00277 #endif 00278 00279 /* multiply and accumlate */ 00280 acc1 = __SMLADX(x3, y1, acc1); 00281 00282 /* Read next two samples from scratch1 buffer */ 00283 x1 = *__SIMD32(pScr1)++; 00284 00285 /* pack input data */ 00286 #ifndef ARM_MATH_BIG_ENDIAN 00287 x3 = __PKHBT(x1, x2, 0); 00288 #else 00289 x3 = __PKHBT(x2, x1, 0); 00290 #endif 00291 00292 acc3 = __SMLADX(x3, y1, acc3); 00293 00294 /* Read 
four samples from smaller buffer */ 00295 y1 = _SIMD32_OFFSET(pScr2 + 2u); 00296 00297 acc0 = __SMLAD(x2, y1, acc0); 00298 00299 acc2 = __SMLAD(x1, y1, acc2); 00300 00301 acc1 = __SMLADX(x3, y1, acc1); 00302 00303 x2 = *__SIMD32(pScr1)++; 00304 00305 #ifndef ARM_MATH_BIG_ENDIAN 00306 x3 = __PKHBT(x2, x1, 0); 00307 #else 00308 x3 = __PKHBT(x1, x2, 0); 00309 #endif 00310 00311 acc3 = __SMLADX(x3, y1, acc3); 00312 00313 pScr2 += 4u; 00314 00315 00316 /* Decrement the loop counter */ 00317 tapCnt--; 00318 } 00319 00320 00321 00322 /* Update scratch pointer for remaining samples of smaller length sequence */ 00323 pScr1 -= 4u; 00324 00325 00326 /* apply same above for remaining samples of smaller length sequence */ 00327 tapCnt = (srcBLen) & 3u; 00328 00329 while(tapCnt > 0u) 00330 { 00331 00332 /* accumlate the results */ 00333 acc0 += (*pScr1++ * *pScr2); 00334 acc1 += (*pScr1++ * *pScr2); 00335 acc2 += (*pScr1++ * *pScr2); 00336 acc3 += (*pScr1++ * *pScr2++); 00337 00338 pScr1 -= 3u; 00339 00340 /* Decrement the loop counter */ 00341 tapCnt--; 00342 } 00343 00344 blkCnt--; 00345 00346 /* Store the result in the accumulator in the destination buffer. 
*/ 00347 out0 = (q7_t) (__SSAT(acc0 >> 7u, 8)); 00348 out1 = (q7_t) (__SSAT(acc1 >> 7u, 8)); 00349 out2 = (q7_t) (__SSAT(acc2 >> 7u, 8)); 00350 out3 = (q7_t) (__SSAT(acc3 >> 7u, 8)); 00351 00352 *__SIMD32(pOut)++ = __PACKq7(out0, out1, out2, out3); 00353 00354 /* Initialization of inputB pointer */ 00355 pScr2 = py; 00356 00357 pScratch1 += 4u; 00358 00359 } 00360 00361 blkCnt = (numPoints) & 0x3; 00362 00363 /* Calculate convolution for remaining samples of Bigger length sequence */ 00364 while(blkCnt > 0) 00365 { 00366 /* Initialze temporary scratch pointer as scratch1 */ 00367 pScr1 = pScratch1; 00368 00369 /* Clear Accumlators */ 00370 acc0 = 0; 00371 00372 tapCnt = (srcBLen) >> 1u; 00373 00374 while(tapCnt > 0u) 00375 { 00376 00377 /* Read next two samples from scratch1 buffer */ 00378 x1 = *__SIMD32(pScr1)++; 00379 00380 /* Read two samples from smaller buffer */ 00381 y1 = *__SIMD32(pScr2)++; 00382 00383 acc0 = __SMLAD(x1, y1, acc0); 00384 00385 /* Decrement the loop counter */ 00386 tapCnt--; 00387 } 00388 00389 tapCnt = (srcBLen) & 1u; 00390 00391 /* apply same above for remaining samples of smaller length sequence */ 00392 while(tapCnt > 0u) 00393 { 00394 00395 /* accumlate the results */ 00396 acc0 += (*pScr1++ * *pScr2++); 00397 00398 /* Decrement the loop counter */ 00399 tapCnt--; 00400 } 00401 00402 blkCnt--; 00403 00404 /* Store the result in the accumulator in the destination buffer. 
*/ 00405 *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8)); 00406 00407 /* Initialization of inputB pointer */ 00408 pScr2 = py; 00409 00410 pScratch1 += 1u; 00411 00412 } 00413 00414 /* set status as ARM_MATH_SUCCESS */ 00415 status = ARM_MATH_SUCCESS; 00416 00417 00418 } 00419 00420 return (status); 00421 00422 } 00423 00424 #else 00425 00426 arm_status arm_conv_partial_opt_q7( 00427 q7_t * pSrcA, 00428 uint32_t srcALen, 00429 q7_t * pSrcB, 00430 uint32_t srcBLen, 00431 q7_t * pDst, 00432 uint32_t firstIndex, 00433 uint32_t numPoints, 00434 q15_t * pScratch1, 00435 q15_t * pScratch2) 00436 { 00437 00438 q15_t *pScr2, *pScr1; /* Intermediate pointers for scratch pointers */ 00439 q15_t x4; /* Temporary input variable */ 00440 q7_t *pIn1, *pIn2; /* inputA and inputB pointer */ 00441 uint32_t j, k, blkCnt, tapCnt; /* loop counter */ 00442 q7_t *px; /* Temporary input1 pointer */ 00443 q15_t *py; /* Temporary input2 pointer */ 00444 q31_t acc0, acc1, acc2, acc3; /* Accumulator */ 00445 arm_status status; 00446 q7_t *pOut = pDst; /* output pointer */ 00447 q15_t x10, x11, x20, x21; /* Temporary input variables */ 00448 q15_t y10, y11; /* Temporary input variables */ 00449 q7_t out0, out1, out2, out3; /* temporary variables */ 00450 00451 /* Check for range of output samples to be calculated */ 00452 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00453 { 00454 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00455 status = ARM_MATH_ARGUMENT_ERROR; 00456 } 00457 else 00458 { 00459 00460 /* The algorithm implementation is based on the lengths of the inputs. */ 00461 /* srcB is always made to slide across srcA. 
*/ 00462 /* So srcBLen is always considered as shorter or equal to srcALen */ 00463 if(srcALen >= srcBLen) 00464 { 00465 /* Initialization of inputA pointer */ 00466 pIn1 = pSrcA; 00467 00468 /* Initialization of inputB pointer */ 00469 pIn2 = pSrcB; 00470 } 00471 else 00472 { 00473 /* Initialization of inputA pointer */ 00474 pIn1 = pSrcB; 00475 00476 /* Initialization of inputB pointer */ 00477 pIn2 = pSrcA; 00478 00479 /* srcBLen is always considered as shorter or equal to srcALen */ 00480 j = srcBLen; 00481 srcBLen = srcALen; 00482 srcALen = j; 00483 } 00484 00485 /* pointer to take end of scratch2 buffer */ 00486 pScr2 = pScratch2; 00487 00488 /* points to smaller length sequence */ 00489 px = pIn2 + srcBLen - 1; 00490 00491 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00492 k = srcBLen >> 2u; 00493 00494 /* First part of the processing with loop unrolling copies 4 data points at a time. 00495 ** a second loop below copies for the remaining 1 to 3 samples. */ 00496 while(k > 0u) 00497 { 00498 /* copy second buffer in reversal manner */ 00499 x4 = (q15_t) * px--; 00500 *pScr2++ = x4; 00501 x4 = (q15_t) * px--; 00502 *pScr2++ = x4; 00503 x4 = (q15_t) * px--; 00504 *pScr2++ = x4; 00505 x4 = (q15_t) * px--; 00506 *pScr2++ = x4; 00507 00508 /* Decrement the loop counter */ 00509 k--; 00510 } 00511 00512 /* If the count is not a multiple of 4, copy remaining samples here. 00513 ** No loop unrolling is used. 
*/ 00514 k = srcBLen % 0x4u; 00515 00516 while(k > 0u) 00517 { 00518 /* copy second buffer in reversal manner for remaining samples */ 00519 x4 = (q15_t) * px--; 00520 *pScr2++ = x4; 00521 00522 /* Decrement the loop counter */ 00523 k--; 00524 } 00525 00526 /* Initialze temporary scratch pointer */ 00527 pScr1 = pScratch1; 00528 00529 /* Fill (srcBLen - 1u) zeros in scratch buffer */ 00530 arm_fill_q15(0, pScr1, (srcBLen - 1u)); 00531 00532 /* Update temporary scratch pointer */ 00533 pScr1 += (srcBLen - 1u); 00534 00535 /* Copy (srcALen) samples in scratch buffer */ 00536 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00537 k = srcALen >> 2u; 00538 00539 /* First part of the processing with loop unrolling copies 4 data points at a time. 00540 ** a second loop below copies for the remaining 1 to 3 samples. */ 00541 while(k > 0u) 00542 { 00543 /* copy second buffer in reversal manner */ 00544 x4 = (q15_t) * pIn1++; 00545 *pScr1++ = x4; 00546 x4 = (q15_t) * pIn1++; 00547 *pScr1++ = x4; 00548 x4 = (q15_t) * pIn1++; 00549 *pScr1++ = x4; 00550 x4 = (q15_t) * pIn1++; 00551 *pScr1++ = x4; 00552 00553 /* Decrement the loop counter */ 00554 k--; 00555 } 00556 00557 /* If the count is not a multiple of 4, copy remaining samples here. 00558 ** No loop unrolling is used. */ 00559 k = srcALen % 0x4u; 00560 00561 while(k > 0u) 00562 { 00563 /* copy second buffer in reversal manner for remaining samples */ 00564 x4 = (q15_t) * pIn1++; 00565 *pScr1++ = x4; 00566 00567 /* Decrement the loop counter */ 00568 k--; 00569 } 00570 00571 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00572 k = (srcBLen - 1u) >> 2u; 00573 00574 /* First part of the processing with loop unrolling copies 4 data points at a time. 00575 ** a second loop below copies for the remaining 1 to 3 samples. 
*/ 00576 while(k > 0u) 00577 { 00578 /* copy second buffer in reversal manner */ 00579 *pScr1++ = 0; 00580 *pScr1++ = 0; 00581 *pScr1++ = 0; 00582 *pScr1++ = 0; 00583 00584 /* Decrement the loop counter */ 00585 k--; 00586 } 00587 00588 /* If the count is not a multiple of 4, copy remaining samples here. 00589 ** No loop unrolling is used. */ 00590 k = (srcBLen - 1u) % 0x4u; 00591 00592 while(k > 0u) 00593 { 00594 /* copy second buffer in reversal manner for remaining samples */ 00595 *pScr1++ = 0; 00596 00597 /* Decrement the loop counter */ 00598 k--; 00599 } 00600 00601 00602 /* Temporary pointer for scratch2 */ 00603 py = pScratch2; 00604 00605 /* Initialization of pIn2 pointer */ 00606 pIn2 = (q7_t *) py; 00607 00608 pScr2 = py; 00609 00610 pOut = pDst + firstIndex; 00611 00612 pScratch1 += firstIndex; 00613 00614 /* Actual convolution process starts here */ 00615 blkCnt = (numPoints) >> 2; 00616 00617 00618 while(blkCnt > 0) 00619 { 00620 /* Initialze temporary scratch pointer as scratch1 */ 00621 pScr1 = pScratch1; 00622 00623 /* Clear Accumlators */ 00624 acc0 = 0; 00625 acc1 = 0; 00626 acc2 = 0; 00627 acc3 = 0; 00628 00629 /* Read two samples from scratch1 buffer */ 00630 x10 = *pScr1++; 00631 x11 = *pScr1++; 00632 00633 /* Read next two samples from scratch1 buffer */ 00634 x20 = *pScr1++; 00635 x21 = *pScr1++; 00636 00637 tapCnt = (srcBLen) >> 2u; 00638 00639 while(tapCnt > 0u) 00640 { 00641 00642 /* Read four samples from smaller buffer */ 00643 y10 = *pScr2; 00644 y11 = *(pScr2 + 1u); 00645 00646 /* multiply and accumlate */ 00647 acc0 += (q31_t) x10 *y10; 00648 acc0 += (q31_t) x11 *y11; 00649 acc2 += (q31_t) x20 *y10; 00650 acc2 += (q31_t) x21 *y11; 00651 00652 00653 acc1 += (q31_t) x11 *y10; 00654 acc1 += (q31_t) x20 *y11; 00655 00656 /* Read next two samples from scratch1 buffer */ 00657 x10 = *pScr1; 00658 x11 = *(pScr1 + 1u); 00659 00660 /* multiply and accumlate */ 00661 acc3 += (q31_t) x21 *y10; 00662 acc3 += (q31_t) x10 *y11; 00663 00664 /* 
Read next two samples from scratch2 buffer */ 00665 y10 = *(pScr2 + 2u); 00666 y11 = *(pScr2 + 3u); 00667 00668 /* multiply and accumlate */ 00669 acc0 += (q31_t) x20 *y10; 00670 acc0 += (q31_t) x21 *y11; 00671 acc2 += (q31_t) x10 *y10; 00672 acc2 += (q31_t) x11 *y11; 00673 acc1 += (q31_t) x21 *y10; 00674 acc1 += (q31_t) x10 *y11; 00675 00676 /* Read next two samples from scratch1 buffer */ 00677 x20 = *(pScr1 + 2); 00678 x21 = *(pScr1 + 3); 00679 00680 /* multiply and accumlate */ 00681 acc3 += (q31_t) x11 *y10; 00682 acc3 += (q31_t) x20 *y11; 00683 00684 /* update scratch pointers */ 00685 00686 pScr1 += 4u; 00687 pScr2 += 4u; 00688 00689 /* Decrement the loop counter */ 00690 tapCnt--; 00691 } 00692 00693 00694 00695 /* Update scratch pointer for remaining samples of smaller length sequence */ 00696 pScr1 -= 4u; 00697 00698 00699 /* apply same above for remaining samples of smaller length sequence */ 00700 tapCnt = (srcBLen) & 3u; 00701 00702 while(tapCnt > 0u) 00703 { 00704 00705 /* accumlate the results */ 00706 acc0 += (*pScr1++ * *pScr2); 00707 acc1 += (*pScr1++ * *pScr2); 00708 acc2 += (*pScr1++ * *pScr2); 00709 acc3 += (*pScr1++ * *pScr2++); 00710 00711 pScr1 -= 3u; 00712 00713 /* Decrement the loop counter */ 00714 tapCnt--; 00715 } 00716 00717 blkCnt--; 00718 00719 /* Store the result in the accumulator in the destination buffer. 
*/ 00720 out0 = (q7_t) (__SSAT(acc0 >> 7u, 8)); 00721 out1 = (q7_t) (__SSAT(acc1 >> 7u, 8)); 00722 out2 = (q7_t) (__SSAT(acc2 >> 7u, 8)); 00723 out3 = (q7_t) (__SSAT(acc3 >> 7u, 8)); 00724 00725 00726 *__SIMD32(pOut)++ = __PACKq7(out0, out1, out2, out3); 00727 00728 /* Initialization of inputB pointer */ 00729 pScr2 = py; 00730 00731 pScratch1 += 4u; 00732 00733 } 00734 00735 blkCnt = (numPoints) & 0x3; 00736 00737 /* Calculate convolution for remaining samples of Bigger length sequence */ 00738 while(blkCnt > 0) 00739 { 00740 /* Initialze temporary scratch pointer as scratch1 */ 00741 pScr1 = pScratch1; 00742 00743 /* Clear Accumlators */ 00744 acc0 = 0; 00745 00746 tapCnt = (srcBLen) >> 1u; 00747 00748 while(tapCnt > 0u) 00749 { 00750 00751 /* Read next two samples from scratch1 buffer */ 00752 x10 = *pScr1++; 00753 x11 = *pScr1++; 00754 00755 /* Read two samples from smaller buffer */ 00756 y10 = *pScr2++; 00757 y11 = *pScr2++; 00758 00759 /* multiply and accumlate */ 00760 acc0 += (q31_t) x10 *y10; 00761 acc0 += (q31_t) x11 *y11; 00762 00763 /* Decrement the loop counter */ 00764 tapCnt--; 00765 } 00766 00767 tapCnt = (srcBLen) & 1u; 00768 00769 /* apply same above for remaining samples of smaller length sequence */ 00770 while(tapCnt > 0u) 00771 { 00772 00773 /* accumlate the results */ 00774 acc0 += (*pScr1++ * *pScr2++); 00775 00776 /* Decrement the loop counter */ 00777 tapCnt--; 00778 } 00779 00780 blkCnt--; 00781 00782 /* Store the result in the accumulator in the destination buffer. */ 00783 *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8)); 00784 00785 /* Initialization of inputB pointer */ 00786 pScr2 = py; 00787 00788 pScratch1 += 1u; 00789 00790 } 00791 00792 /* set status as ARM_MATH_SUCCESS */ 00793 status = ARM_MATH_SUCCESS; 00794 00795 } 00796 00797 return (status); 00798 00799 } 00800 00801 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00802 00803 00804 00805 /** 00806 * @} end of PartialConv group 00807 */
Generated on Tue Jul 12 2022 12:36:54 by Doxygen 1.7.2