Important changes to repositories hosted on mbed.com
Mbed hosted mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software, download the repository Zip archive or clone it locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
Fork of mbed-dsp by
arm_conv_partial_opt_q7.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 17. January 2013 00005 * $Revision: V1.4.1 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_partial_opt_q7.c 00009 * 00010 * Description: Partial convolution of Q7 sequences. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupFilters 00045 */ 00046 00047 /** 00048 * @addtogroup PartialConv 00049 * @{ 00050 */ 00051 00052 /** 00053 * @brief Partial convolution of Q7 sequences. 00054 * @param[in] *pSrcA points to the first input sequence. 00055 * @param[in] srcALen length of the first input sequence. 00056 * @param[in] *pSrcB points to the second input sequence. 00057 * @param[in] srcBLen length of the second input sequence. 00058 * @param[out] *pDst points to the location where the output result is written. 00059 * @param[in] firstIndex is the first output sample to start with. 00060 * @param[in] numPoints is the number of output points to be computed. 00061 * @param[in] *pScratch1 points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. 00062 * @param[in] *pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen). 00063 * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. 
00064 * 00065 * \par Restrictions 00066 * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE 00067 * In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit 00068 * 00069 * 00070 * 00071 */ 00072 00073 00074 #ifndef UNALIGNED_SUPPORT_DISABLE 00075 00076 arm_status arm_conv_partial_opt_q7( 00077 q7_t * pSrcA, 00078 uint32_t srcALen, 00079 q7_t * pSrcB, 00080 uint32_t srcBLen, 00081 q7_t * pDst, 00082 uint32_t firstIndex, 00083 uint32_t numPoints, 00084 q15_t * pScratch1, 00085 q15_t * pScratch2) 00086 { 00087 00088 q15_t *pScr2, *pScr1; /* Intermediate pointers for scratch pointers */ 00089 q15_t x4; /* Temporary input variable */ 00090 q7_t *pIn1, *pIn2; /* inputA and inputB pointer */ 00091 uint32_t j, k, blkCnt, tapCnt; /* loop counter */ 00092 q7_t *px; /* Temporary input1 pointer */ 00093 q15_t *py; /* Temporary input2 pointer */ 00094 q31_t acc0, acc1, acc2, acc3; /* Accumulator */ 00095 q31_t x1, x2, x3, y1; /* Temporary input variables */ 00096 arm_status status; 00097 q7_t *pOut = pDst; /* output pointer */ 00098 q7_t out0, out1, out2, out3; /* temporary variables */ 00099 00100 /* Check for range of output samples to be calculated */ 00101 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00102 { 00103 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00104 status = ARM_MATH_ARGUMENT_ERROR; 00105 } 00106 else 00107 { 00108 00109 /* The algorithm implementation is based on the lengths of the inputs. */ 00110 /* srcB is always made to slide across srcA. 
*/ 00111 /* So srcBLen is always considered as shorter or equal to srcALen */ 00112 if(srcALen >= srcBLen) 00113 { 00114 /* Initialization of inputA pointer */ 00115 pIn1 = pSrcA; 00116 00117 /* Initialization of inputB pointer */ 00118 pIn2 = pSrcB; 00119 } 00120 else 00121 { 00122 /* Initialization of inputA pointer */ 00123 pIn1 = pSrcB; 00124 00125 /* Initialization of inputB pointer */ 00126 pIn2 = pSrcA; 00127 00128 /* srcBLen is always considered as shorter or equal to srcALen */ 00129 j = srcBLen; 00130 srcBLen = srcALen; 00131 srcALen = j; 00132 } 00133 00134 /* pointer to take end of scratch2 buffer */ 00135 pScr2 = pScratch2; 00136 00137 /* points to smaller length sequence */ 00138 px = pIn2 + srcBLen - 1; 00139 00140 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00141 k = srcBLen >> 2u; 00142 00143 /* First part of the processing with loop unrolling copies 4 data points at a time. 00144 ** a second loop below copies for the remaining 1 to 3 samples. */ 00145 while(k > 0u) 00146 { 00147 /* copy second buffer in reversal manner */ 00148 x4 = (q15_t) * px--; 00149 *pScr2++ = x4; 00150 x4 = (q15_t) * px--; 00151 *pScr2++ = x4; 00152 x4 = (q15_t) * px--; 00153 *pScr2++ = x4; 00154 x4 = (q15_t) * px--; 00155 *pScr2++ = x4; 00156 00157 /* Decrement the loop counter */ 00158 k--; 00159 } 00160 00161 /* If the count is not a multiple of 4, copy remaining samples here. 00162 ** No loop unrolling is used. 
*/ 00163 k = srcBLen % 0x4u; 00164 00165 while(k > 0u) 00166 { 00167 /* copy second buffer in reversal manner for remaining samples */ 00168 x4 = (q15_t) * px--; 00169 *pScr2++ = x4; 00170 00171 /* Decrement the loop counter */ 00172 k--; 00173 } 00174 00175 /* Initialze temporary scratch pointer */ 00176 pScr1 = pScratch1; 00177 00178 /* Fill (srcBLen - 1u) zeros in scratch buffer */ 00179 arm_fill_q15(0, pScr1, (srcBLen - 1u)); 00180 00181 /* Update temporary scratch pointer */ 00182 pScr1 += (srcBLen - 1u); 00183 00184 /* Copy (srcALen) samples in scratch buffer */ 00185 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00186 k = srcALen >> 2u; 00187 00188 /* First part of the processing with loop unrolling copies 4 data points at a time. 00189 ** a second loop below copies for the remaining 1 to 3 samples. */ 00190 while(k > 0u) 00191 { 00192 /* copy second buffer in reversal manner */ 00193 x4 = (q15_t) * pIn1++; 00194 *pScr1++ = x4; 00195 x4 = (q15_t) * pIn1++; 00196 *pScr1++ = x4; 00197 x4 = (q15_t) * pIn1++; 00198 *pScr1++ = x4; 00199 x4 = (q15_t) * pIn1++; 00200 *pScr1++ = x4; 00201 00202 /* Decrement the loop counter */ 00203 k--; 00204 } 00205 00206 /* If the count is not a multiple of 4, copy remaining samples here. 00207 ** No loop unrolling is used. 
*/ 00208 k = srcALen % 0x4u; 00209 00210 while(k > 0u) 00211 { 00212 /* copy second buffer in reversal manner for remaining samples */ 00213 x4 = (q15_t) * pIn1++; 00214 *pScr1++ = x4; 00215 00216 /* Decrement the loop counter */ 00217 k--; 00218 } 00219 00220 /* Fill (srcBLen - 1u) zeros at end of scratch buffer */ 00221 arm_fill_q15(0, pScr1, (srcBLen - 1u)); 00222 00223 /* Update pointer */ 00224 pScr1 += (srcBLen - 1u); 00225 00226 00227 /* Temporary pointer for scratch2 */ 00228 py = pScratch2; 00229 00230 /* Initialization of pIn2 pointer */ 00231 pIn2 = (q7_t *) py; 00232 00233 pScr2 = py; 00234 00235 pOut = pDst + firstIndex; 00236 00237 pScratch1 += firstIndex; 00238 00239 /* Actual convolution process starts here */ 00240 blkCnt = (numPoints) >> 2; 00241 00242 00243 while(blkCnt > 0) 00244 { 00245 /* Initialze temporary scratch pointer as scratch1 */ 00246 pScr1 = pScratch1; 00247 00248 /* Clear Accumlators */ 00249 acc0 = 0; 00250 acc1 = 0; 00251 acc2 = 0; 00252 acc3 = 0; 00253 00254 /* Read two samples from scratch1 buffer */ 00255 x1 = *__SIMD32(pScr1)++; 00256 00257 /* Read next two samples from scratch1 buffer */ 00258 x2 = *__SIMD32(pScr1)++; 00259 00260 tapCnt = (srcBLen) >> 2u; 00261 00262 while(tapCnt > 0u) 00263 { 00264 00265 /* Read four samples from smaller buffer */ 00266 y1 = _SIMD32_OFFSET(pScr2); 00267 00268 /* multiply and accumlate */ 00269 acc0 = __SMLAD(x1, y1, acc0); 00270 acc2 = __SMLAD(x2, y1, acc2); 00271 00272 /* pack input data */ 00273 #ifndef ARM_MATH_BIG_ENDIAN 00274 x3 = __PKHBT(x2, x1, 0); 00275 #else 00276 x3 = __PKHBT(x1, x2, 0); 00277 #endif 00278 00279 /* multiply and accumlate */ 00280 acc1 = __SMLADX(x3, y1, acc1); 00281 00282 /* Read next two samples from scratch1 buffer */ 00283 x1 = *__SIMD32(pScr1)++; 00284 00285 /* pack input data */ 00286 #ifndef ARM_MATH_BIG_ENDIAN 00287 x3 = __PKHBT(x1, x2, 0); 00288 #else 00289 x3 = __PKHBT(x2, x1, 0); 00290 #endif 00291 00292 acc3 = __SMLADX(x3, y1, acc3); 00293 00294 /* Read 
four samples from smaller buffer */ 00295 y1 = _SIMD32_OFFSET(pScr2 + 2u); 00296 00297 acc0 = __SMLAD(x2, y1, acc0); 00298 00299 acc2 = __SMLAD(x1, y1, acc2); 00300 00301 acc1 = __SMLADX(x3, y1, acc1); 00302 00303 x2 = *__SIMD32(pScr1)++; 00304 00305 #ifndef ARM_MATH_BIG_ENDIAN 00306 x3 = __PKHBT(x2, x1, 0); 00307 #else 00308 x3 = __PKHBT(x1, x2, 0); 00309 #endif 00310 00311 acc3 = __SMLADX(x3, y1, acc3); 00312 00313 pScr2 += 4u; 00314 00315 00316 /* Decrement the loop counter */ 00317 tapCnt--; 00318 } 00319 00320 00321 00322 /* Update scratch pointer for remaining samples of smaller length sequence */ 00323 pScr1 -= 4u; 00324 00325 00326 /* apply same above for remaining samples of smaller length sequence */ 00327 tapCnt = (srcBLen) & 3u; 00328 00329 while(tapCnt > 0u) 00330 { 00331 00332 /* accumlate the results */ 00333 acc0 += (*pScr1++ * *pScr2); 00334 acc1 += (*pScr1++ * *pScr2); 00335 acc2 += (*pScr1++ * *pScr2); 00336 acc3 += (*pScr1++ * *pScr2++); 00337 00338 pScr1 -= 3u; 00339 00340 /* Decrement the loop counter */ 00341 tapCnt--; 00342 } 00343 00344 blkCnt--; 00345 00346 /* Store the result in the accumulator in the destination buffer. 
*/ 00347 out0 = (q7_t) (__SSAT(acc0 >> 7u, 8)); 00348 out1 = (q7_t) (__SSAT(acc1 >> 7u, 8)); 00349 out2 = (q7_t) (__SSAT(acc2 >> 7u, 8)); 00350 out3 = (q7_t) (__SSAT(acc3 >> 7u, 8)); 00351 00352 *__SIMD32(pOut)++ = __PACKq7(out0, out1, out2, out3); 00353 00354 /* Initialization of inputB pointer */ 00355 pScr2 = py; 00356 00357 pScratch1 += 4u; 00358 00359 } 00360 00361 blkCnt = (numPoints) & 0x3; 00362 00363 /* Calculate convolution for remaining samples of Bigger length sequence */ 00364 while(blkCnt > 0) 00365 { 00366 /* Initialze temporary scratch pointer as scratch1 */ 00367 pScr1 = pScratch1; 00368 00369 /* Clear Accumlators */ 00370 acc0 = 0; 00371 00372 tapCnt = (srcBLen) >> 1u; 00373 00374 while(tapCnt > 0u) 00375 { 00376 00377 /* Read next two samples from scratch1 buffer */ 00378 x1 = *__SIMD32(pScr1)++; 00379 00380 /* Read two samples from smaller buffer */ 00381 y1 = *__SIMD32(pScr2)++; 00382 00383 acc0 = __SMLAD(x1, y1, acc0); 00384 00385 /* Decrement the loop counter */ 00386 tapCnt--; 00387 } 00388 00389 tapCnt = (srcBLen) & 1u; 00390 00391 /* apply same above for remaining samples of smaller length sequence */ 00392 while(tapCnt > 0u) 00393 { 00394 00395 /* accumlate the results */ 00396 acc0 += (*pScr1++ * *pScr2++); 00397 00398 /* Decrement the loop counter */ 00399 tapCnt--; 00400 } 00401 00402 blkCnt--; 00403 00404 /* Store the result in the accumulator in the destination buffer. 
*/ 00405 *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8)); 00406 00407 /* Initialization of inputB pointer */ 00408 pScr2 = py; 00409 00410 pScratch1 += 1u; 00411 00412 } 00413 00414 /* set status as ARM_MATH_SUCCESS */ 00415 status = ARM_MATH_SUCCESS; 00416 00417 00418 } 00419 00420 return (status); 00421 00422 } 00423 00424 #else 00425 00426 arm_status arm_conv_partial_opt_q7( 00427 q7_t * pSrcA, 00428 uint32_t srcALen, 00429 q7_t * pSrcB, 00430 uint32_t srcBLen, 00431 q7_t * pDst, 00432 uint32_t firstIndex, 00433 uint32_t numPoints, 00434 q15_t * pScratch1, 00435 q15_t * pScratch2) 00436 { 00437 00438 q15_t *pScr2, *pScr1; /* Intermediate pointers for scratch pointers */ 00439 q15_t x4; /* Temporary input variable */ 00440 q7_t *pIn1, *pIn2; /* inputA and inputB pointer */ 00441 uint32_t j, k, blkCnt, tapCnt; /* loop counter */ 00442 q7_t *px; /* Temporary input1 pointer */ 00443 q15_t *py; /* Temporary input2 pointer */ 00444 q31_t acc0, acc1, acc2, acc3; /* Accumulator */ 00445 arm_status status; 00446 q7_t *pOut = pDst; /* output pointer */ 00447 q15_t x10, x11, x20, x21; /* Temporary input variables */ 00448 q15_t y10, y11; /* Temporary input variables */ 00449 q7_t out0, out1, out2, out3; /* temporary variables */ 00450 00451 /* Check for range of output samples to be calculated */ 00452 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00453 { 00454 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00455 status = ARM_MATH_ARGUMENT_ERROR; 00456 } 00457 else 00458 { 00459 00460 /* The algorithm implementation is based on the lengths of the inputs. */ 00461 /* srcB is always made to slide across srcA. 
*/ 00462 /* So srcBLen is always considered as shorter or equal to srcALen */ 00463 if(srcALen >= srcBLen) 00464 { 00465 /* Initialization of inputA pointer */ 00466 pIn1 = pSrcA; 00467 00468 /* Initialization of inputB pointer */ 00469 pIn2 = pSrcB; 00470 } 00471 else 00472 { 00473 /* Initialization of inputA pointer */ 00474 pIn1 = pSrcB; 00475 00476 /* Initialization of inputB pointer */ 00477 pIn2 = pSrcA; 00478 00479 /* srcBLen is always considered as shorter or equal to srcALen */ 00480 j = srcBLen; 00481 srcBLen = srcALen; 00482 srcALen = j; 00483 } 00484 00485 /* pointer to take end of scratch2 buffer */ 00486 pScr2 = pScratch2; 00487 00488 /* points to smaller length sequence */ 00489 px = pIn2 + srcBLen - 1; 00490 00491 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00492 k = srcBLen >> 2u; 00493 00494 /* First part of the processing with loop unrolling copies 4 data points at a time. 00495 ** a second loop below copies for the remaining 1 to 3 samples. */ 00496 while(k > 0u) 00497 { 00498 /* copy second buffer in reversal manner */ 00499 x4 = (q15_t) * px--; 00500 *pScr2++ = x4; 00501 x4 = (q15_t) * px--; 00502 *pScr2++ = x4; 00503 x4 = (q15_t) * px--; 00504 *pScr2++ = x4; 00505 x4 = (q15_t) * px--; 00506 *pScr2++ = x4; 00507 00508 /* Decrement the loop counter */ 00509 k--; 00510 } 00511 00512 /* If the count is not a multiple of 4, copy remaining samples here. 00513 ** No loop unrolling is used. 
*/ 00514 k = srcBLen % 0x4u; 00515 00516 while(k > 0u) 00517 { 00518 /* copy second buffer in reversal manner for remaining samples */ 00519 x4 = (q15_t) * px--; 00520 *pScr2++ = x4; 00521 00522 /* Decrement the loop counter */ 00523 k--; 00524 } 00525 00526 /* Initialze temporary scratch pointer */ 00527 pScr1 = pScratch1; 00528 00529 /* Fill (srcBLen - 1u) zeros in scratch buffer */ 00530 arm_fill_q15(0, pScr1, (srcBLen - 1u)); 00531 00532 /* Update temporary scratch pointer */ 00533 pScr1 += (srcBLen - 1u); 00534 00535 /* Copy (srcALen) samples in scratch buffer */ 00536 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00537 k = srcALen >> 2u; 00538 00539 /* First part of the processing with loop unrolling copies 4 data points at a time. 00540 ** a second loop below copies for the remaining 1 to 3 samples. */ 00541 while(k > 0u) 00542 { 00543 /* copy second buffer in reversal manner */ 00544 x4 = (q15_t) * pIn1++; 00545 *pScr1++ = x4; 00546 x4 = (q15_t) * pIn1++; 00547 *pScr1++ = x4; 00548 x4 = (q15_t) * pIn1++; 00549 *pScr1++ = x4; 00550 x4 = (q15_t) * pIn1++; 00551 *pScr1++ = x4; 00552 00553 /* Decrement the loop counter */ 00554 k--; 00555 } 00556 00557 /* If the count is not a multiple of 4, copy remaining samples here. 00558 ** No loop unrolling is used. */ 00559 k = srcALen % 0x4u; 00560 00561 while(k > 0u) 00562 { 00563 /* copy second buffer in reversal manner for remaining samples */ 00564 x4 = (q15_t) * pIn1++; 00565 *pScr1++ = x4; 00566 00567 /* Decrement the loop counter */ 00568 k--; 00569 } 00570 00571 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00572 k = (srcBLen - 1u) >> 2u; 00573 00574 /* First part of the processing with loop unrolling copies 4 data points at a time. 00575 ** a second loop below copies for the remaining 1 to 3 samples. 
*/ 00576 while(k > 0u) 00577 { 00578 /* copy second buffer in reversal manner */ 00579 *pScr1++ = 0; 00580 *pScr1++ = 0; 00581 *pScr1++ = 0; 00582 *pScr1++ = 0; 00583 00584 /* Decrement the loop counter */ 00585 k--; 00586 } 00587 00588 /* If the count is not a multiple of 4, copy remaining samples here. 00589 ** No loop unrolling is used. */ 00590 k = (srcBLen - 1u) % 0x4u; 00591 00592 while(k > 0u) 00593 { 00594 /* copy second buffer in reversal manner for remaining samples */ 00595 *pScr1++ = 0; 00596 00597 /* Decrement the loop counter */ 00598 k--; 00599 } 00600 00601 00602 /* Temporary pointer for scratch2 */ 00603 py = pScratch2; 00604 00605 /* Initialization of pIn2 pointer */ 00606 pIn2 = (q7_t *) py; 00607 00608 pScr2 = py; 00609 00610 pOut = pDst + firstIndex; 00611 00612 pScratch1 += firstIndex; 00613 00614 /* Actual convolution process starts here */ 00615 blkCnt = (numPoints) >> 2; 00616 00617 00618 while(blkCnt > 0) 00619 { 00620 /* Initialze temporary scratch pointer as scratch1 */ 00621 pScr1 = pScratch1; 00622 00623 /* Clear Accumlators */ 00624 acc0 = 0; 00625 acc1 = 0; 00626 acc2 = 0; 00627 acc3 = 0; 00628 00629 /* Read two samples from scratch1 buffer */ 00630 x10 = *pScr1++; 00631 x11 = *pScr1++; 00632 00633 /* Read next two samples from scratch1 buffer */ 00634 x20 = *pScr1++; 00635 x21 = *pScr1++; 00636 00637 tapCnt = (srcBLen) >> 2u; 00638 00639 while(tapCnt > 0u) 00640 { 00641 00642 /* Read four samples from smaller buffer */ 00643 y10 = *pScr2; 00644 y11 = *(pScr2 + 1u); 00645 00646 /* multiply and accumlate */ 00647 acc0 += (q31_t) x10 *y10; 00648 acc0 += (q31_t) x11 *y11; 00649 acc2 += (q31_t) x20 *y10; 00650 acc2 += (q31_t) x21 *y11; 00651 00652 00653 acc1 += (q31_t) x11 *y10; 00654 acc1 += (q31_t) x20 *y11; 00655 00656 /* Read next two samples from scratch1 buffer */ 00657 x10 = *pScr1; 00658 x11 = *(pScr1 + 1u); 00659 00660 /* multiply and accumlate */ 00661 acc3 += (q31_t) x21 *y10; 00662 acc3 += (q31_t) x10 *y11; 00663 00664 /* 
Read next two samples from scratch2 buffer */ 00665 y10 = *(pScr2 + 2u); 00666 y11 = *(pScr2 + 3u); 00667 00668 /* multiply and accumlate */ 00669 acc0 += (q31_t) x20 *y10; 00670 acc0 += (q31_t) x21 *y11; 00671 acc2 += (q31_t) x10 *y10; 00672 acc2 += (q31_t) x11 *y11; 00673 acc1 += (q31_t) x21 *y10; 00674 acc1 += (q31_t) x10 *y11; 00675 00676 /* Read next two samples from scratch1 buffer */ 00677 x20 = *(pScr1 + 2); 00678 x21 = *(pScr1 + 3); 00679 00680 /* multiply and accumlate */ 00681 acc3 += (q31_t) x11 *y10; 00682 acc3 += (q31_t) x20 *y11; 00683 00684 /* update scratch pointers */ 00685 00686 pScr1 += 4u; 00687 pScr2 += 4u; 00688 00689 /* Decrement the loop counter */ 00690 tapCnt--; 00691 } 00692 00693 00694 00695 /* Update scratch pointer for remaining samples of smaller length sequence */ 00696 pScr1 -= 4u; 00697 00698 00699 /* apply same above for remaining samples of smaller length sequence */ 00700 tapCnt = (srcBLen) & 3u; 00701 00702 while(tapCnt > 0u) 00703 { 00704 00705 /* accumlate the results */ 00706 acc0 += (*pScr1++ * *pScr2); 00707 acc1 += (*pScr1++ * *pScr2); 00708 acc2 += (*pScr1++ * *pScr2); 00709 acc3 += (*pScr1++ * *pScr2++); 00710 00711 pScr1 -= 3u; 00712 00713 /* Decrement the loop counter */ 00714 tapCnt--; 00715 } 00716 00717 blkCnt--; 00718 00719 /* Store the result in the accumulator in the destination buffer. 
*/ 00720 out0 = (q7_t) (__SSAT(acc0 >> 7u, 8)); 00721 out1 = (q7_t) (__SSAT(acc1 >> 7u, 8)); 00722 out2 = (q7_t) (__SSAT(acc2 >> 7u, 8)); 00723 out3 = (q7_t) (__SSAT(acc3 >> 7u, 8)); 00724 00725 00726 *__SIMD32(pOut)++ = __PACKq7(out0, out1, out2, out3); 00727 00728 /* Initialization of inputB pointer */ 00729 pScr2 = py; 00730 00731 pScratch1 += 4u; 00732 00733 } 00734 00735 blkCnt = (numPoints) & 0x3; 00736 00737 /* Calculate convolution for remaining samples of Bigger length sequence */ 00738 while(blkCnt > 0) 00739 { 00740 /* Initialze temporary scratch pointer as scratch1 */ 00741 pScr1 = pScratch1; 00742 00743 /* Clear Accumlators */ 00744 acc0 = 0; 00745 00746 tapCnt = (srcBLen) >> 1u; 00747 00748 while(tapCnt > 0u) 00749 { 00750 00751 /* Read next two samples from scratch1 buffer */ 00752 x10 = *pScr1++; 00753 x11 = *pScr1++; 00754 00755 /* Read two samples from smaller buffer */ 00756 y10 = *pScr2++; 00757 y11 = *pScr2++; 00758 00759 /* multiply and accumlate */ 00760 acc0 += (q31_t) x10 *y10; 00761 acc0 += (q31_t) x11 *y11; 00762 00763 /* Decrement the loop counter */ 00764 tapCnt--; 00765 } 00766 00767 tapCnt = (srcBLen) & 1u; 00768 00769 /* apply same above for remaining samples of smaller length sequence */ 00770 while(tapCnt > 0u) 00771 { 00772 00773 /* accumlate the results */ 00774 acc0 += (*pScr1++ * *pScr2++); 00775 00776 /* Decrement the loop counter */ 00777 tapCnt--; 00778 } 00779 00780 blkCnt--; 00781 00782 /* Store the result in the accumulator in the destination buffer. */ 00783 *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8)); 00784 00785 /* Initialization of inputB pointer */ 00786 pScr2 = py; 00787 00788 pScratch1 += 1u; 00789 00790 } 00791 00792 /* set status as ARM_MATH_SUCCESS */ 00793 status = ARM_MATH_SUCCESS; 00794 00795 } 00796 00797 return (status); 00798 00799 } 00800 00801 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00802 00803 00804 00805 /** 00806 * @} end of PartialConv group 00807 */
Generated on Tue Jul 12 2022 18:44:08 by
