Important changes to repositories hosted on mbed.com
Mbed hosted mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software, download the repository Zip archive or clone it locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
Fork of mbed-os by
arm_conv_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 19. March 2015 00005 * $Revision: V.1.4.5 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_q15.c 00009 * 00010 * Description: Convolution of Q15 sequences. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupFilters 00045 */ 00046 00047 /** 00048 * @addtogroup Conv 00049 * @{ 00050 */ 00051 00052 /** 00053 * @brief Convolution of Q15 sequences. 00054 * @param[in] *pSrcA points to the first input sequence. 00055 * @param[in] srcALen length of the first input sequence. 00056 * @param[in] *pSrcB points to the second input sequence. 00057 * @param[in] srcBLen length of the second input sequence. 00058 * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. 00059 * @return none. 00060 * 00061 * @details 00062 * <b>Scaling and Overflow Behavior:</b> 00063 * 00064 * \par 00065 * The function is implemented using a 64-bit internal accumulator. 00066 * Both inputs are in 1.15 format and multiplications yield a 2.30 result. 00067 * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format. 00068 * This approach provides 33 guard bits and there is no risk of overflow. 00069 * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format. 00070 * 00071 * \par 00072 * Refer to <code>arm_conv_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4. 
00073 * 00074 * \par 00075 * Refer the function <code>arm_conv_opt_q15()</code> for a faster implementation of this function using scratch buffers. 00076 * 00077 */ 00078 00079 void arm_conv_q15( 00080 q15_t * pSrcA, 00081 uint32_t srcALen, 00082 q15_t * pSrcB, 00083 uint32_t srcBLen, 00084 q15_t * pDst) 00085 { 00086 00087 #if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) 00088 00089 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00090 00091 q15_t *pIn1; /* inputA pointer */ 00092 q15_t *pIn2; /* inputB pointer */ 00093 q15_t *pOut = pDst; /* output pointer */ 00094 q63_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00095 q15_t *px; /* Intermediate inputA pointer */ 00096 q15_t *py; /* Intermediate inputB pointer */ 00097 q15_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00098 q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */ 00099 uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt; /* loop counter */ 00100 00101 /* The algorithm implementation is based on the lengths of the inputs. */ 00102 /* srcB is always made to slide across srcA. 
*/ 00103 /* So srcBLen is always considered as shorter or equal to srcALen */ 00104 if(srcALen >= srcBLen) 00105 { 00106 /* Initialization of inputA pointer */ 00107 pIn1 = pSrcA; 00108 00109 /* Initialization of inputB pointer */ 00110 pIn2 = pSrcB; 00111 } 00112 else 00113 { 00114 /* Initialization of inputA pointer */ 00115 pIn1 = pSrcB; 00116 00117 /* Initialization of inputB pointer */ 00118 pIn2 = pSrcA; 00119 00120 /* srcBLen is always considered as shorter or equal to srcALen */ 00121 j = srcBLen; 00122 srcBLen = srcALen; 00123 srcALen = j; 00124 } 00125 00126 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00127 /* The function is internally 00128 * divided into three stages according to the number of multiplications that has to be 00129 * taken place between inputA samples and inputB samples. In the first stage of the 00130 * algorithm, the multiplications increase by one for every iteration. 00131 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00132 * In the third stage of the algorithm, the multiplications decrease by one 00133 * for every iteration. */ 00134 00135 /* The algorithm is implemented in three stages. 00136 The loop counters of each stage is initiated here. */ 00137 blockSize1 = srcBLen - 1u; 00138 blockSize2 = srcALen - (srcBLen - 1u); 00139 00140 /* -------------------------- 00141 * Initializations of stage1 00142 * -------------------------*/ 00143 00144 /* sum = x[0] * y[0] 00145 * sum = x[0] * y[1] + x[1] * y[0] 00146 * .... 00147 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00148 */ 00149 00150 /* In this stage the MAC operations are increased by 1 for every iteration. 
00151 The count variable holds the number of MAC operations performed */ 00152 count = 1u; 00153 00154 /* Working pointer of inputA */ 00155 px = pIn1; 00156 00157 /* Working pointer of inputB */ 00158 py = pIn2; 00159 00160 00161 /* ------------------------ 00162 * Stage1 process 00163 * ----------------------*/ 00164 00165 /* For loop unrolling by 4, this stage is divided into two. */ 00166 /* First part of this stage computes the MAC operations less than 4 */ 00167 /* Second part of this stage computes the MAC operations greater than or equal to 4 */ 00168 00169 /* The first part of the stage starts here */ 00170 while((count < 4u) && (blockSize1 > 0u)) 00171 { 00172 /* Accumulator is made zero for every iteration */ 00173 sum = 0; 00174 00175 /* Loop over number of MAC operations between 00176 * inputA samples and inputB samples */ 00177 k = count; 00178 00179 while(k > 0u) 00180 { 00181 /* Perform the multiply-accumulates */ 00182 sum = __SMLALD(*px++, *py--, sum); 00183 00184 /* Decrement the loop counter */ 00185 k--; 00186 } 00187 00188 /* Store the result in the accumulator in the destination buffer. */ 00189 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 00190 00191 /* Update the inputA and inputB pointers for next MAC calculation */ 00192 py = pIn2 + count; 00193 px = pIn1; 00194 00195 /* Increment the MAC count */ 00196 count++; 00197 00198 /* Decrement the loop counter */ 00199 blockSize1--; 00200 } 00201 00202 /* The second part of the stage starts here */ 00203 /* The internal loop, over count, is unrolled by 4 */ 00204 /* To, read the last two inputB samples using SIMD: 00205 * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ 00206 py = py - 1; 00207 00208 while(blockSize1 > 0u) 00209 { 00210 /* Accumulator is made zero for every iteration */ 00211 sum = 0; 00212 00213 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00214 k = count >> 2u; 00215 00216 /* First part of the processing with loop unrolling. 
Compute 4 MACs at a time. 00217 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00218 while(k > 0u) 00219 { 00220 /* Perform the multiply-accumulates */ 00221 /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */ 00222 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00223 /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */ 00224 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00225 00226 /* Decrement the loop counter */ 00227 k--; 00228 } 00229 00230 /* For the next MAC operations, the pointer py is used without SIMD 00231 * So, py is incremented by 1 */ 00232 py = py + 1u; 00233 00234 /* If the count is not a multiple of 4, compute any remaining MACs here. 00235 ** No loop unrolling is used. */ 00236 k = count % 0x4u; 00237 00238 while(k > 0u) 00239 { 00240 /* Perform the multiply-accumulates */ 00241 sum = __SMLALD(*px++, *py--, sum); 00242 00243 /* Decrement the loop counter */ 00244 k--; 00245 } 00246 00247 /* Store the result in the accumulator in the destination buffer. */ 00248 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 00249 00250 /* Update the inputA and inputB pointers for next MAC calculation */ 00251 py = pIn2 + (count - 1u); 00252 px = pIn1; 00253 00254 /* Increment the MAC count */ 00255 count++; 00256 00257 /* Decrement the loop counter */ 00258 blockSize1--; 00259 } 00260 00261 /* -------------------------- 00262 * Initializations of stage2 00263 * ------------------------*/ 00264 00265 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00266 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00267 * .... 
00268 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00269 */ 00270 00271 /* Working pointer of inputA */ 00272 px = pIn1; 00273 00274 /* Working pointer of inputB */ 00275 pSrc2 = pIn2 + (srcBLen - 1u); 00276 py = pSrc2; 00277 00278 /* count is the index by which the pointer pIn1 to be incremented */ 00279 count = 0u; 00280 00281 00282 /* -------------------- 00283 * Stage2 process 00284 * -------------------*/ 00285 00286 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00287 * So, to loop unroll over blockSize2, 00288 * srcBLen should be greater than or equal to 4 */ 00289 if(srcBLen >= 4u) 00290 { 00291 /* Loop unroll over blockSize2, by 4 */ 00292 blkCnt = blockSize2 >> 2u; 00293 00294 while(blkCnt > 0u) 00295 { 00296 py = py - 1u; 00297 00298 /* Set all accumulators to zero */ 00299 acc0 = 0; 00300 acc1 = 0; 00301 acc2 = 0; 00302 acc3 = 0; 00303 00304 00305 /* read x[0], x[1] samples */ 00306 x0 = *__SIMD32(px); 00307 /* read x[1], x[2] samples */ 00308 x1 = _SIMD32_OFFSET(px+1); 00309 px+= 2u; 00310 00311 00312 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00313 k = srcBLen >> 2u; 00314 00315 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00316 ** a second loop below computes MACs for the remaining 1 to 3 samples. 
*/ 00317 do 00318 { 00319 /* Read the last two inputB samples using SIMD: 00320 * y[srcBLen - 1] and y[srcBLen - 2] */ 00321 c0 = *__SIMD32(py)--; 00322 00323 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ 00324 acc0 = __SMLALDX(x0, c0, acc0); 00325 00326 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ 00327 acc1 = __SMLALDX(x1, c0, acc1); 00328 00329 /* Read x[2], x[3] */ 00330 x2 = *__SIMD32(px); 00331 00332 /* Read x[3], x[4] */ 00333 x3 = _SIMD32_OFFSET(px+1); 00334 00335 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ 00336 acc2 = __SMLALDX(x2, c0, acc2); 00337 00338 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ 00339 acc3 = __SMLALDX(x3, c0, acc3); 00340 00341 /* Read y[srcBLen - 3] and y[srcBLen - 4] */ 00342 c0 = *__SIMD32(py)--; 00343 00344 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ 00345 acc0 = __SMLALDX(x2, c0, acc0); 00346 00347 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ 00348 acc1 = __SMLALDX(x3, c0, acc1); 00349 00350 /* Read x[4], x[5] */ 00351 x0 = _SIMD32_OFFSET(px+2); 00352 00353 /* Read x[5], x[6] */ 00354 x1 = _SIMD32_OFFSET(px+3); 00355 px += 4u; 00356 00357 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ 00358 acc2 = __SMLALDX(x0, c0, acc2); 00359 00360 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ 00361 acc3 = __SMLALDX(x1, c0, acc3); 00362 00363 } while(--k); 00364 00365 /* For the next MAC operations, SIMD is not used 00366 * So, the 16 bit pointer if inputB, py is updated */ 00367 00368 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00369 ** No loop unrolling is used. 
*/ 00370 k = srcBLen % 0x4u; 00371 00372 if(k == 1u) 00373 { 00374 /* Read y[srcBLen - 5] */ 00375 c0 = *(py+1); 00376 00377 #ifdef ARM_MATH_BIG_ENDIAN 00378 00379 c0 = c0 << 16u; 00380 00381 #else 00382 00383 c0 = c0 & 0x0000FFFF; 00384 00385 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 00386 /* Read x[7] */ 00387 x3 = *__SIMD32(px); 00388 px++; 00389 00390 /* Perform the multiply-accumulates */ 00391 acc0 = __SMLALD(x0, c0, acc0); 00392 acc1 = __SMLALD(x1, c0, acc1); 00393 acc2 = __SMLALDX(x1, c0, acc2); 00394 acc3 = __SMLALDX(x3, c0, acc3); 00395 } 00396 00397 if(k == 2u) 00398 { 00399 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 00400 c0 = _SIMD32_OFFSET(py); 00401 00402 /* Read x[7], x[8] */ 00403 x3 = *__SIMD32(px); 00404 00405 /* Read x[9] */ 00406 x2 = _SIMD32_OFFSET(px+1); 00407 px += 2u; 00408 00409 /* Perform the multiply-accumulates */ 00410 acc0 = __SMLALDX(x0, c0, acc0); 00411 acc1 = __SMLALDX(x1, c0, acc1); 00412 acc2 = __SMLALDX(x3, c0, acc2); 00413 acc3 = __SMLALDX(x2, c0, acc3); 00414 } 00415 00416 if(k == 3u) 00417 { 00418 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 00419 c0 = _SIMD32_OFFSET(py); 00420 00421 /* Read x[7], x[8] */ 00422 x3 = *__SIMD32(px); 00423 00424 /* Read x[9] */ 00425 x2 = _SIMD32_OFFSET(px+1); 00426 00427 /* Perform the multiply-accumulates */ 00428 acc0 = __SMLALDX(x0, c0, acc0); 00429 acc1 = __SMLALDX(x1, c0, acc1); 00430 acc2 = __SMLALDX(x3, c0, acc2); 00431 acc3 = __SMLALDX(x2, c0, acc3); 00432 00433 c0 = *(py-1); 00434 00435 #ifdef ARM_MATH_BIG_ENDIAN 00436 00437 c0 = c0 << 16u; 00438 #else 00439 00440 c0 = c0 & 0x0000FFFF; 00441 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 00442 /* Read x[10] */ 00443 x3 = _SIMD32_OFFSET(px+2); 00444 px += 3u; 00445 00446 /* Perform the multiply-accumulates */ 00447 acc0 = __SMLALDX(x1, c0, acc0); 00448 acc1 = __SMLALD(x2, c0, acc1); 00449 acc2 = __SMLALDX(x2, c0, acc2); 00450 acc3 = __SMLALDX(x3, c0, acc3); 00451 } 00452 00453 00454 /* Store the results in the accumulators in the destination buffer. 
*/ 00455 00456 #ifndef ARM_MATH_BIG_ENDIAN 00457 00458 *__SIMD32(pOut)++ = 00459 __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); 00460 *__SIMD32(pOut)++ = 00461 __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); 00462 00463 #else 00464 00465 *__SIMD32(pOut)++ = 00466 __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16); 00467 *__SIMD32(pOut)++ = 00468 __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16); 00469 00470 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00471 00472 /* Increment the pointer pIn1 index, count by 4 */ 00473 count += 4u; 00474 00475 /* Update the inputA and inputB pointers for next MAC calculation */ 00476 px = pIn1 + count; 00477 py = pSrc2; 00478 00479 /* Decrement the loop counter */ 00480 blkCnt--; 00481 } 00482 00483 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00484 ** No loop unrolling is used. */ 00485 blkCnt = blockSize2 % 0x4u; 00486 00487 while(blkCnt > 0u) 00488 { 00489 /* Accumulator is made zero for every iteration */ 00490 sum = 0; 00491 00492 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00493 k = srcBLen >> 2u; 00494 00495 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00496 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00497 while(k > 0u) 00498 { 00499 /* Perform the multiply-accumulates */ 00500 sum += (q63_t) ((q31_t) * px++ * *py--); 00501 sum += (q63_t) ((q31_t) * px++ * *py--); 00502 sum += (q63_t) ((q31_t) * px++ * *py--); 00503 sum += (q63_t) ((q31_t) * px++ * *py--); 00504 00505 /* Decrement the loop counter */ 00506 k--; 00507 } 00508 00509 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00510 ** No loop unrolling is used. 
*/ 00511 k = srcBLen % 0x4u; 00512 00513 while(k > 0u) 00514 { 00515 /* Perform the multiply-accumulates */ 00516 sum += (q63_t) ((q31_t) * px++ * *py--); 00517 00518 /* Decrement the loop counter */ 00519 k--; 00520 } 00521 00522 /* Store the result in the accumulator in the destination buffer. */ 00523 *pOut++ = (q15_t) (__SSAT(sum >> 15, 16)); 00524 00525 /* Increment the pointer pIn1 index, count by 1 */ 00526 count++; 00527 00528 /* Update the inputA and inputB pointers for next MAC calculation */ 00529 px = pIn1 + count; 00530 py = pSrc2; 00531 00532 /* Decrement the loop counter */ 00533 blkCnt--; 00534 } 00535 } 00536 else 00537 { 00538 /* If the srcBLen is not a multiple of 4, 00539 * the blockSize2 loop cannot be unrolled by 4 */ 00540 blkCnt = blockSize2; 00541 00542 while(blkCnt > 0u) 00543 { 00544 /* Accumulator is made zero for every iteration */ 00545 sum = 0; 00546 00547 /* srcBLen number of MACS should be performed */ 00548 k = srcBLen; 00549 00550 while(k > 0u) 00551 { 00552 /* Perform the multiply-accumulate */ 00553 sum += (q63_t) ((q31_t) * px++ * *py--); 00554 00555 /* Decrement the loop counter */ 00556 k--; 00557 } 00558 00559 /* Store the result in the accumulator in the destination buffer. */ 00560 *pOut++ = (q15_t) (__SSAT(sum >> 15, 16)); 00561 00562 /* Increment the MAC count */ 00563 count++; 00564 00565 /* Update the inputA and inputB pointers for next MAC calculation */ 00566 px = pIn1 + count; 00567 py = pSrc2; 00568 00569 /* Decrement the loop counter */ 00570 blkCnt--; 00571 } 00572 } 00573 00574 00575 /* -------------------------- 00576 * Initializations of stage3 00577 * -------------------------*/ 00578 00579 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00580 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00581 * .... 
00582 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00583 * sum += x[srcALen-1] * y[srcBLen-1] 00584 */ 00585 00586 /* In this stage the MAC operations are decreased by 1 for every iteration. 00587 The blockSize3 variable holds the number of MAC operations performed */ 00588 00589 blockSize3 = srcBLen - 1u; 00590 00591 /* Working pointer of inputA */ 00592 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00593 px = pSrc1; 00594 00595 /* Working pointer of inputB */ 00596 pSrc2 = pIn2 + (srcBLen - 1u); 00597 pIn2 = pSrc2 - 1u; 00598 py = pIn2; 00599 00600 /* ------------------- 00601 * Stage3 process 00602 * ------------------*/ 00603 00604 /* For loop unrolling by 4, this stage is divided into two. */ 00605 /* First part of this stage computes the MAC operations greater than 4 */ 00606 /* Second part of this stage computes the MAC operations less than or equal to 4 */ 00607 00608 /* The first part of the stage starts here */ 00609 j = blockSize3 >> 2u; 00610 00611 while((j > 0u) && (blockSize3 > 0u)) 00612 { 00613 /* Accumulator is made zero for every iteration */ 00614 sum = 0; 00615 00616 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00617 k = blockSize3 >> 2u; 00618 00619 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00620 ** a second loop below computes MACs for the remaining 1 to 3 samples. 
*/ 00621 while(k > 0u) 00622 { 00623 /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied 00624 * with y[srcBLen - 1], y[srcBLen - 2] respectively */ 00625 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00626 /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied 00627 * with y[srcBLen - 3], y[srcBLen - 4] respectively */ 00628 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00629 00630 /* Decrement the loop counter */ 00631 k--; 00632 } 00633 00634 /* For the next MAC operations, the pointer py is used without SIMD 00635 * So, py is incremented by 1 */ 00636 py = py + 1u; 00637 00638 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. 00639 ** No loop unrolling is used. */ 00640 k = blockSize3 % 0x4u; 00641 00642 while(k > 0u) 00643 { 00644 /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */ 00645 sum = __SMLALD(*px++, *py--, sum); 00646 00647 /* Decrement the loop counter */ 00648 k--; 00649 } 00650 00651 /* Store the result in the accumulator in the destination buffer. */ 00652 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 00653 00654 /* Update the inputA and inputB pointers for next MAC calculation */ 00655 px = ++pSrc1; 00656 py = pIn2; 00657 00658 /* Decrement the loop counter */ 00659 blockSize3--; 00660 00661 j--; 00662 } 00663 00664 /* The second part of the stage starts here */ 00665 /* SIMD is not used for the next MAC operations, 00666 * so pointer py is updated to read only one sample at a time */ 00667 py = py + 1u; 00668 00669 while(blockSize3 > 0u) 00670 { 00671 /* Accumulator is made zero for every iteration */ 00672 sum = 0; 00673 00674 /* Apply loop unrolling and compute 4 MACs simultaneously. 
*/ 00675 k = blockSize3; 00676 00677 while(k > 0u) 00678 { 00679 /* Perform the multiply-accumulates */ 00680 /* sum += x[srcALen-1] * y[srcBLen-1] */ 00681 sum = __SMLALD(*px++, *py--, sum); 00682 00683 /* Decrement the loop counter */ 00684 k--; 00685 } 00686 00687 /* Store the result in the accumulator in the destination buffer. */ 00688 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 00689 00690 /* Update the inputA and inputB pointers for next MAC calculation */ 00691 px = ++pSrc1; 00692 py = pSrc2; 00693 00694 /* Decrement the loop counter */ 00695 blockSize3--; 00696 } 00697 00698 #else 00699 00700 /* Run the below code for Cortex-M0 */ 00701 00702 q15_t *pIn1 = pSrcA; /* input pointer */ 00703 q15_t *pIn2 = pSrcB; /* coefficient pointer */ 00704 q63_t sum; /* Accumulator */ 00705 uint32_t i, j; /* loop counter */ 00706 00707 /* Loop to calculate output of convolution for output length number of times */ 00708 for (i = 0; i < (srcALen + srcBLen - 1); i++) 00709 { 00710 /* Initialize sum with zero to carry on MAC operations */ 00711 sum = 0; 00712 00713 /* Loop to perform MAC operations according to convolution equation */ 00714 for (j = 0; j <= i; j++) 00715 { 00716 /* Check the array limitations */ 00717 if(((i - j) < srcBLen) && (j < srcALen)) 00718 { 00719 /* z[i] += x[i-j] * y[j] */ 00720 sum += (q31_t) pIn1[j] * (pIn2[i - j]); 00721 } 00722 } 00723 00724 /* Store the output in the destination buffer */ 00725 pDst[i] = (q15_t) __SSAT((sum >> 15u), 16u); 00726 } 00727 00728 #endif /* #if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE)*/ 00729 00730 } 00731 00732 /** 00733 * @} end of Conv group 00734 */
Generated on Tue Jul 12 2022 13:15:23 by
