Added CMSIS5 DSP and NN folder. Needs some work.
Embed:
(wiki syntax)
Show/hide line numbers
arm_conv_partial_opt_q7.c
00001 /* ---------------------------------------------------------------------- 00002 * Project: CMSIS DSP Library 00003 * Title: arm_conv_partial_opt_q7.c 00004 * Description: Partial convolution of Q7 sequences 00005 * 00006 * $Date: 27. January 2017 00007 * $Revision: V.1.5.1 00008 * 00009 * Target Processor: Cortex-M cores 00010 * -------------------------------------------------------------------- */ 00011 /* 00012 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. 00013 * 00014 * SPDX-License-Identifier: Apache-2.0 00015 * 00016 * Licensed under the Apache License, Version 2.0 (the License); you may 00017 * not use this file except in compliance with the License. 00018 * You may obtain a copy of the License at 00019 * 00020 * www.apache.org/licenses/LICENSE-2.0 00021 * 00022 * Unless required by applicable law or agreed to in writing, software 00023 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 00024 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00025 * See the License for the specific language governing permissions and 00026 * limitations under the License. 00027 */ 00028 00029 #include "arm_math.h" 00030 00031 /** 00032 * @ingroup groupFilters 00033 */ 00034 00035 /** 00036 * @addtogroup PartialConv 00037 * @{ 00038 */ 00039 00040 /** 00041 * @brief Partial convolution of Q7 sequences. 00042 * @param[in] *pSrcA points to the first input sequence. 00043 * @param[in] srcALen length of the first input sequence. 00044 * @param[in] *pSrcB points to the second input sequence. 00045 * @param[in] srcBLen length of the second input sequence. 00046 * @param[out] *pDst points to the location where the output result is written. 00047 * @param[in] firstIndex is the first output sample to start with. 00048 * @param[in] numPoints is the number of output points to be computed. 
00049 * @param[in] *pScratch1 points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. 00050 * @param[in] *pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen). 00051 * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. 00052 * 00053 * \par Restrictions 00054 * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE 00055 * In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit 00056 * 00057 * 00058 * 00059 */ 00060 00061 00062 #ifndef UNALIGNED_SUPPORT_DISABLE 00063 00064 arm_status arm_conv_partial_opt_q7( 00065 q7_t * pSrcA, 00066 uint32_t srcALen, 00067 q7_t * pSrcB, 00068 uint32_t srcBLen, 00069 q7_t * pDst, 00070 uint32_t firstIndex, 00071 uint32_t numPoints, 00072 q15_t * pScratch1, 00073 q15_t * pScratch2) 00074 { 00075 00076 q15_t *pScr2, *pScr1; /* Intermediate pointers for scratch pointers */ 00077 q15_t x4; /* Temporary input variable */ 00078 q7_t *pIn1, *pIn2; /* inputA and inputB pointer */ 00079 uint32_t j, k, blkCnt, tapCnt; /* loop counter */ 00080 q7_t *px; /* Temporary input1 pointer */ 00081 q15_t *py; /* Temporary input2 pointer */ 00082 q31_t acc0, acc1, acc2, acc3; /* Accumulator */ 00083 q31_t x1, x2, x3, y1; /* Temporary input variables */ 00084 arm_status status; 00085 q7_t *pOut = pDst; /* output pointer */ 00086 q7_t out0, out1, out2, out3; /* temporary variables */ 00087 00088 /* Check for range of output samples to be calculated */ 00089 if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U)))) 00090 { 00091 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00092 status = ARM_MATH_ARGUMENT_ERROR; 00093 } 00094 else 00095 { 00096 00097 /* The algorithm implementation is based on the lengths of the inputs. */ 00098 /* srcB is always made to slide across srcA. 
*/ 00099 /* So srcBLen is always considered as shorter or equal to srcALen */ 00100 if (srcALen >= srcBLen) 00101 { 00102 /* Initialization of inputA pointer */ 00103 pIn1 = pSrcA; 00104 00105 /* Initialization of inputB pointer */ 00106 pIn2 = pSrcB; 00107 } 00108 else 00109 { 00110 /* Initialization of inputA pointer */ 00111 pIn1 = pSrcB; 00112 00113 /* Initialization of inputB pointer */ 00114 pIn2 = pSrcA; 00115 00116 /* srcBLen is always considered as shorter or equal to srcALen */ 00117 j = srcBLen; 00118 srcBLen = srcALen; 00119 srcALen = j; 00120 } 00121 00122 /* pointer to take end of scratch2 buffer */ 00123 pScr2 = pScratch2; 00124 00125 /* points to smaller length sequence */ 00126 px = pIn2 + srcBLen - 1; 00127 00128 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00129 k = srcBLen >> 2U; 00130 00131 /* First part of the processing with loop unrolling copies 4 data points at a time. 00132 ** a second loop below copies for the remaining 1 to 3 samples. */ 00133 while (k > 0U) 00134 { 00135 /* copy second buffer in reversal manner */ 00136 x4 = (q15_t) * px--; 00137 *pScr2++ = x4; 00138 x4 = (q15_t) * px--; 00139 *pScr2++ = x4; 00140 x4 = (q15_t) * px--; 00141 *pScr2++ = x4; 00142 x4 = (q15_t) * px--; 00143 *pScr2++ = x4; 00144 00145 /* Decrement the loop counter */ 00146 k--; 00147 } 00148 00149 /* If the count is not a multiple of 4, copy remaining samples here. 00150 ** No loop unrolling is used. 
*/ 00151 k = srcBLen % 0x4U; 00152 00153 while (k > 0U) 00154 { 00155 /* copy second buffer in reversal manner for remaining samples */ 00156 x4 = (q15_t) * px--; 00157 *pScr2++ = x4; 00158 00159 /* Decrement the loop counter */ 00160 k--; 00161 } 00162 00163 /* Initialze temporary scratch pointer */ 00164 pScr1 = pScratch1; 00165 00166 /* Fill (srcBLen - 1U) zeros in scratch buffer */ 00167 arm_fill_q15(0, pScr1, (srcBLen - 1U)); 00168 00169 /* Update temporary scratch pointer */ 00170 pScr1 += (srcBLen - 1U); 00171 00172 /* Copy (srcALen) samples in scratch buffer */ 00173 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00174 k = srcALen >> 2U; 00175 00176 /* First part of the processing with loop unrolling copies 4 data points at a time. 00177 ** a second loop below copies for the remaining 1 to 3 samples. */ 00178 while (k > 0U) 00179 { 00180 /* copy second buffer in reversal manner */ 00181 x4 = (q15_t) * pIn1++; 00182 *pScr1++ = x4; 00183 x4 = (q15_t) * pIn1++; 00184 *pScr1++ = x4; 00185 x4 = (q15_t) * pIn1++; 00186 *pScr1++ = x4; 00187 x4 = (q15_t) * pIn1++; 00188 *pScr1++ = x4; 00189 00190 /* Decrement the loop counter */ 00191 k--; 00192 } 00193 00194 /* If the count is not a multiple of 4, copy remaining samples here. 00195 ** No loop unrolling is used. 
*/ 00196 k = srcALen % 0x4U; 00197 00198 while (k > 0U) 00199 { 00200 /* copy second buffer in reversal manner for remaining samples */ 00201 x4 = (q15_t) * pIn1++; 00202 *pScr1++ = x4; 00203 00204 /* Decrement the loop counter */ 00205 k--; 00206 } 00207 00208 /* Fill (srcBLen - 1U) zeros at end of scratch buffer */ 00209 arm_fill_q15(0, pScr1, (srcBLen - 1U)); 00210 00211 /* Update pointer */ 00212 pScr1 += (srcBLen - 1U); 00213 00214 00215 /* Temporary pointer for scratch2 */ 00216 py = pScratch2; 00217 00218 /* Initialization of pIn2 pointer */ 00219 pIn2 = (q7_t *) py; 00220 00221 pScr2 = py; 00222 00223 pOut = pDst + firstIndex; 00224 00225 pScratch1 += firstIndex; 00226 00227 /* Actual convolution process starts here */ 00228 blkCnt = (numPoints) >> 2; 00229 00230 00231 while (blkCnt > 0) 00232 { 00233 /* Initialze temporary scratch pointer as scratch1 */ 00234 pScr1 = pScratch1; 00235 00236 /* Clear Accumlators */ 00237 acc0 = 0; 00238 acc1 = 0; 00239 acc2 = 0; 00240 acc3 = 0; 00241 00242 /* Read two samples from scratch1 buffer */ 00243 x1 = *__SIMD32(pScr1)++; 00244 00245 /* Read next two samples from scratch1 buffer */ 00246 x2 = *__SIMD32(pScr1)++; 00247 00248 tapCnt = (srcBLen) >> 2U; 00249 00250 while (tapCnt > 0U) 00251 { 00252 00253 /* Read four samples from smaller buffer */ 00254 y1 = _SIMD32_OFFSET(pScr2); 00255 00256 /* multiply and accumlate */ 00257 acc0 = __SMLAD(x1, y1, acc0); 00258 acc2 = __SMLAD(x2, y1, acc2); 00259 00260 /* pack input data */ 00261 #ifndef ARM_MATH_BIG_ENDIAN 00262 x3 = __PKHBT(x2, x1, 0); 00263 #else 00264 x3 = __PKHBT(x1, x2, 0); 00265 #endif 00266 00267 /* multiply and accumlate */ 00268 acc1 = __SMLADX(x3, y1, acc1); 00269 00270 /* Read next two samples from scratch1 buffer */ 00271 x1 = *__SIMD32(pScr1)++; 00272 00273 /* pack input data */ 00274 #ifndef ARM_MATH_BIG_ENDIAN 00275 x3 = __PKHBT(x1, x2, 0); 00276 #else 00277 x3 = __PKHBT(x2, x1, 0); 00278 #endif 00279 00280 acc3 = __SMLADX(x3, y1, acc3); 00281 00282 /* 
Read four samples from smaller buffer */ 00283 y1 = _SIMD32_OFFSET(pScr2 + 2U); 00284 00285 acc0 = __SMLAD(x2, y1, acc0); 00286 00287 acc2 = __SMLAD(x1, y1, acc2); 00288 00289 acc1 = __SMLADX(x3, y1, acc1); 00290 00291 x2 = *__SIMD32(pScr1)++; 00292 00293 #ifndef ARM_MATH_BIG_ENDIAN 00294 x3 = __PKHBT(x2, x1, 0); 00295 #else 00296 x3 = __PKHBT(x1, x2, 0); 00297 #endif 00298 00299 acc3 = __SMLADX(x3, y1, acc3); 00300 00301 pScr2 += 4U; 00302 00303 00304 /* Decrement the loop counter */ 00305 tapCnt--; 00306 } 00307 00308 00309 00310 /* Update scratch pointer for remaining samples of smaller length sequence */ 00311 pScr1 -= 4U; 00312 00313 00314 /* apply same above for remaining samples of smaller length sequence */ 00315 tapCnt = (srcBLen) & 3U; 00316 00317 while (tapCnt > 0U) 00318 { 00319 00320 /* accumlate the results */ 00321 acc0 += (*pScr1++ * *pScr2); 00322 acc1 += (*pScr1++ * *pScr2); 00323 acc2 += (*pScr1++ * *pScr2); 00324 acc3 += (*pScr1++ * *pScr2++); 00325 00326 pScr1 -= 3U; 00327 00328 /* Decrement the loop counter */ 00329 tapCnt--; 00330 } 00331 00332 blkCnt--; 00333 00334 /* Store the result in the accumulator in the destination buffer. 
*/ 00335 out0 = (q7_t) (__SSAT(acc0 >> 7U, 8)); 00336 out1 = (q7_t) (__SSAT(acc1 >> 7U, 8)); 00337 out2 = (q7_t) (__SSAT(acc2 >> 7U, 8)); 00338 out3 = (q7_t) (__SSAT(acc3 >> 7U, 8)); 00339 00340 *__SIMD32(pOut)++ = __PACKq7(out0, out1, out2, out3); 00341 00342 /* Initialization of inputB pointer */ 00343 pScr2 = py; 00344 00345 pScratch1 += 4U; 00346 00347 } 00348 00349 blkCnt = (numPoints) & 0x3; 00350 00351 /* Calculate convolution for remaining samples of Bigger length sequence */ 00352 while (blkCnt > 0) 00353 { 00354 /* Initialze temporary scratch pointer as scratch1 */ 00355 pScr1 = pScratch1; 00356 00357 /* Clear Accumlators */ 00358 acc0 = 0; 00359 00360 tapCnt = (srcBLen) >> 1U; 00361 00362 while (tapCnt > 0U) 00363 { 00364 00365 /* Read next two samples from scratch1 buffer */ 00366 x1 = *__SIMD32(pScr1)++; 00367 00368 /* Read two samples from smaller buffer */ 00369 y1 = *__SIMD32(pScr2)++; 00370 00371 acc0 = __SMLAD(x1, y1, acc0); 00372 00373 /* Decrement the loop counter */ 00374 tapCnt--; 00375 } 00376 00377 tapCnt = (srcBLen) & 1U; 00378 00379 /* apply same above for remaining samples of smaller length sequence */ 00380 while (tapCnt > 0U) 00381 { 00382 00383 /* accumlate the results */ 00384 acc0 += (*pScr1++ * *pScr2++); 00385 00386 /* Decrement the loop counter */ 00387 tapCnt--; 00388 } 00389 00390 blkCnt--; 00391 00392 /* Store the result in the accumulator in the destination buffer. 
*/ 00393 *pOut++ = (q7_t) (__SSAT(acc0 >> 7U, 8)); 00394 00395 /* Initialization of inputB pointer */ 00396 pScr2 = py; 00397 00398 pScratch1 += 1U; 00399 00400 } 00401 00402 /* set status as ARM_MATH_SUCCESS */ 00403 status = ARM_MATH_SUCCESS; 00404 00405 00406 } 00407 00408 return (status); 00409 00410 } 00411 00412 #else 00413 00414 arm_status arm_conv_partial_opt_q7( 00415 q7_t * pSrcA, 00416 uint32_t srcALen, 00417 q7_t * pSrcB, 00418 uint32_t srcBLen, 00419 q7_t * pDst, 00420 uint32_t firstIndex, 00421 uint32_t numPoints, 00422 q15_t * pScratch1, 00423 q15_t * pScratch2) 00424 { 00425 00426 q15_t *pScr2, *pScr1; /* Intermediate pointers for scratch pointers */ 00427 q15_t x4; /* Temporary input variable */ 00428 q7_t *pIn1, *pIn2; /* inputA and inputB pointer */ 00429 uint32_t j, k, blkCnt, tapCnt; /* loop counter */ 00430 q7_t *px; /* Temporary input1 pointer */ 00431 q15_t *py; /* Temporary input2 pointer */ 00432 q31_t acc0, acc1, acc2, acc3; /* Accumulator */ 00433 arm_status status; 00434 q7_t *pOut = pDst; /* output pointer */ 00435 q15_t x10, x11, x20, x21; /* Temporary input variables */ 00436 q15_t y10, y11; /* Temporary input variables */ 00437 00438 /* Check for range of output samples to be calculated */ 00439 if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U)))) 00440 { 00441 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00442 status = ARM_MATH_ARGUMENT_ERROR; 00443 } 00444 else 00445 { 00446 00447 /* The algorithm implementation is based on the lengths of the inputs. */ 00448 /* srcB is always made to slide across srcA. 
*/ 00449 /* So srcBLen is always considered as shorter or equal to srcALen */ 00450 if (srcALen >= srcBLen) 00451 { 00452 /* Initialization of inputA pointer */ 00453 pIn1 = pSrcA; 00454 00455 /* Initialization of inputB pointer */ 00456 pIn2 = pSrcB; 00457 } 00458 else 00459 { 00460 /* Initialization of inputA pointer */ 00461 pIn1 = pSrcB; 00462 00463 /* Initialization of inputB pointer */ 00464 pIn2 = pSrcA; 00465 00466 /* srcBLen is always considered as shorter or equal to srcALen */ 00467 j = srcBLen; 00468 srcBLen = srcALen; 00469 srcALen = j; 00470 } 00471 00472 /* pointer to take end of scratch2 buffer */ 00473 pScr2 = pScratch2; 00474 00475 /* points to smaller length sequence */ 00476 px = pIn2 + srcBLen - 1; 00477 00478 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00479 k = srcBLen >> 2U; 00480 00481 /* First part of the processing with loop unrolling copies 4 data points at a time. 00482 ** a second loop below copies for the remaining 1 to 3 samples. */ 00483 while (k > 0U) 00484 { 00485 /* copy second buffer in reversal manner */ 00486 x4 = (q15_t) * px--; 00487 *pScr2++ = x4; 00488 x4 = (q15_t) * px--; 00489 *pScr2++ = x4; 00490 x4 = (q15_t) * px--; 00491 *pScr2++ = x4; 00492 x4 = (q15_t) * px--; 00493 *pScr2++ = x4; 00494 00495 /* Decrement the loop counter */ 00496 k--; 00497 } 00498 00499 /* If the count is not a multiple of 4, copy remaining samples here. 00500 ** No loop unrolling is used. 
*/ 00501 k = srcBLen % 0x4U; 00502 00503 while (k > 0U) 00504 { 00505 /* copy second buffer in reversal manner for remaining samples */ 00506 x4 = (q15_t) * px--; 00507 *pScr2++ = x4; 00508 00509 /* Decrement the loop counter */ 00510 k--; 00511 } 00512 00513 /* Initialze temporary scratch pointer */ 00514 pScr1 = pScratch1; 00515 00516 /* Fill (srcBLen - 1U) zeros in scratch buffer */ 00517 arm_fill_q15(0, pScr1, (srcBLen - 1U)); 00518 00519 /* Update temporary scratch pointer */ 00520 pScr1 += (srcBLen - 1U); 00521 00522 /* Copy (srcALen) samples in scratch buffer */ 00523 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00524 k = srcALen >> 2U; 00525 00526 /* First part of the processing with loop unrolling copies 4 data points at a time. 00527 ** a second loop below copies for the remaining 1 to 3 samples. */ 00528 while (k > 0U) 00529 { 00530 /* copy second buffer in reversal manner */ 00531 x4 = (q15_t) * pIn1++; 00532 *pScr1++ = x4; 00533 x4 = (q15_t) * pIn1++; 00534 *pScr1++ = x4; 00535 x4 = (q15_t) * pIn1++; 00536 *pScr1++ = x4; 00537 x4 = (q15_t) * pIn1++; 00538 *pScr1++ = x4; 00539 00540 /* Decrement the loop counter */ 00541 k--; 00542 } 00543 00544 /* If the count is not a multiple of 4, copy remaining samples here. 00545 ** No loop unrolling is used. */ 00546 k = srcALen % 0x4U; 00547 00548 while (k > 0U) 00549 { 00550 /* copy second buffer in reversal manner for remaining samples */ 00551 x4 = (q15_t) * pIn1++; 00552 *pScr1++ = x4; 00553 00554 /* Decrement the loop counter */ 00555 k--; 00556 } 00557 00558 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00559 k = (srcBLen - 1U) >> 2U; 00560 00561 /* First part of the processing with loop unrolling copies 4 data points at a time. 00562 ** a second loop below copies for the remaining 1 to 3 samples. 
*/ 00563 while (k > 0U) 00564 { 00565 /* copy second buffer in reversal manner */ 00566 *pScr1++ = 0; 00567 *pScr1++ = 0; 00568 *pScr1++ = 0; 00569 *pScr1++ = 0; 00570 00571 /* Decrement the loop counter */ 00572 k--; 00573 } 00574 00575 /* If the count is not a multiple of 4, copy remaining samples here. 00576 ** No loop unrolling is used. */ 00577 k = (srcBLen - 1U) % 0x4U; 00578 00579 while (k > 0U) 00580 { 00581 /* copy second buffer in reversal manner for remaining samples */ 00582 *pScr1++ = 0; 00583 00584 /* Decrement the loop counter */ 00585 k--; 00586 } 00587 00588 00589 /* Temporary pointer for scratch2 */ 00590 py = pScratch2; 00591 00592 /* Initialization of pIn2 pointer */ 00593 pIn2 = (q7_t *) py; 00594 00595 pScr2 = py; 00596 00597 pOut = pDst + firstIndex; 00598 00599 pScratch1 += firstIndex; 00600 00601 /* Actual convolution process starts here */ 00602 blkCnt = (numPoints) >> 2; 00603 00604 00605 while (blkCnt > 0) 00606 { 00607 /* Initialze temporary scratch pointer as scratch1 */ 00608 pScr1 = pScratch1; 00609 00610 /* Clear Accumlators */ 00611 acc0 = 0; 00612 acc1 = 0; 00613 acc2 = 0; 00614 acc3 = 0; 00615 00616 /* Read two samples from scratch1 buffer */ 00617 x10 = *pScr1++; 00618 x11 = *pScr1++; 00619 00620 /* Read next two samples from scratch1 buffer */ 00621 x20 = *pScr1++; 00622 x21 = *pScr1++; 00623 00624 tapCnt = (srcBLen) >> 2U; 00625 00626 while (tapCnt > 0U) 00627 { 00628 00629 /* Read four samples from smaller buffer */ 00630 y10 = *pScr2; 00631 y11 = *(pScr2 + 1U); 00632 00633 /* multiply and accumlate */ 00634 acc0 += (q31_t) x10 *y10; 00635 acc0 += (q31_t) x11 *y11; 00636 acc2 += (q31_t) x20 *y10; 00637 acc2 += (q31_t) x21 *y11; 00638 00639 00640 acc1 += (q31_t) x11 *y10; 00641 acc1 += (q31_t) x20 *y11; 00642 00643 /* Read next two samples from scratch1 buffer */ 00644 x10 = *pScr1; 00645 x11 = *(pScr1 + 1U); 00646 00647 /* multiply and accumlate */ 00648 acc3 += (q31_t) x21 *y10; 00649 acc3 += (q31_t) x10 *y11; 00650 00651 /* 
Read next two samples from scratch2 buffer */ 00652 y10 = *(pScr2 + 2U); 00653 y11 = *(pScr2 + 3U); 00654 00655 /* multiply and accumlate */ 00656 acc0 += (q31_t) x20 *y10; 00657 acc0 += (q31_t) x21 *y11; 00658 acc2 += (q31_t) x10 *y10; 00659 acc2 += (q31_t) x11 *y11; 00660 acc1 += (q31_t) x21 *y10; 00661 acc1 += (q31_t) x10 *y11; 00662 00663 /* Read next two samples from scratch1 buffer */ 00664 x20 = *(pScr1 + 2); 00665 x21 = *(pScr1 + 3); 00666 00667 /* multiply and accumlate */ 00668 acc3 += (q31_t) x11 *y10; 00669 acc3 += (q31_t) x20 *y11; 00670 00671 /* update scratch pointers */ 00672 00673 pScr1 += 4U; 00674 pScr2 += 4U; 00675 00676 /* Decrement the loop counter */ 00677 tapCnt--; 00678 } 00679 00680 00681 00682 /* Update scratch pointer for remaining samples of smaller length sequence */ 00683 pScr1 -= 4U; 00684 00685 00686 /* apply same above for remaining samples of smaller length sequence */ 00687 tapCnt = (srcBLen) & 3U; 00688 00689 while (tapCnt > 0U) 00690 { 00691 00692 /* accumlate the results */ 00693 acc0 += (*pScr1++ * *pScr2); 00694 acc1 += (*pScr1++ * *pScr2); 00695 acc2 += (*pScr1++ * *pScr2); 00696 acc3 += (*pScr1++ * *pScr2++); 00697 00698 pScr1 -= 3U; 00699 00700 /* Decrement the loop counter */ 00701 tapCnt--; 00702 } 00703 00704 blkCnt--; 00705 00706 /* Store the result in the accumulator in the destination buffer. 
*/ 00707 *pOut++ = (q7_t) (__SSAT(acc0 >> 7U, 8)); 00708 *pOut++ = (q7_t) (__SSAT(acc1 >> 7U, 8)); 00709 *pOut++ = (q7_t) (__SSAT(acc2 >> 7U, 8)); 00710 *pOut++ = (q7_t) (__SSAT(acc3 >> 7U, 8)); 00711 00712 /* Initialization of inputB pointer */ 00713 pScr2 = py; 00714 00715 pScratch1 += 4U; 00716 00717 } 00718 00719 blkCnt = (numPoints) & 0x3; 00720 00721 /* Calculate convolution for remaining samples of Bigger length sequence */ 00722 while (blkCnt > 0) 00723 { 00724 /* Initialze temporary scratch pointer as scratch1 */ 00725 pScr1 = pScratch1; 00726 00727 /* Clear Accumlators */ 00728 acc0 = 0; 00729 00730 tapCnt = (srcBLen) >> 1U; 00731 00732 while (tapCnt > 0U) 00733 { 00734 00735 /* Read next two samples from scratch1 buffer */ 00736 x10 = *pScr1++; 00737 x11 = *pScr1++; 00738 00739 /* Read two samples from smaller buffer */ 00740 y10 = *pScr2++; 00741 y11 = *pScr2++; 00742 00743 /* multiply and accumlate */ 00744 acc0 += (q31_t) x10 *y10; 00745 acc0 += (q31_t) x11 *y11; 00746 00747 /* Decrement the loop counter */ 00748 tapCnt--; 00749 } 00750 00751 tapCnt = (srcBLen) & 1U; 00752 00753 /* apply same above for remaining samples of smaller length sequence */ 00754 while (tapCnt > 0U) 00755 { 00756 00757 /* accumlate the results */ 00758 acc0 += (*pScr1++ * *pScr2++); 00759 00760 /* Decrement the loop counter */ 00761 tapCnt--; 00762 } 00763 00764 blkCnt--; 00765 00766 /* Store the result in the accumulator in the destination buffer. */ 00767 *pOut++ = (q7_t) (__SSAT(acc0 >> 7U, 8)); 00768 00769 /* Initialization of inputB pointer */ 00770 pScr2 = py; 00771 00772 pScratch1 += 1U; 00773 00774 } 00775 00776 /* set status as ARM_MATH_SUCCESS */ 00777 status = ARM_MATH_SUCCESS; 00778 00779 } 00780 00781 return (status); 00782 00783 } 00784 00785 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00786 00787 00788 00789 /** 00790 * @} end of PartialConv group 00791 */ 00792
Generated on Tue Jul 12 2022 16:46:23 by 1.7.2