Added CMSIS5 DSP and NN folder. Needs some work.
Embed:
(wiki syntax)
Show/hide line numbers
arm_conv_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Project: CMSIS DSP Library 00003 * Title: arm_conv_q15.c 00004 * Description: Convolution of Q15 sequences 00005 * 00006 * $Date: 27. January 2017 00007 * $Revision: V.1.5.1 00008 * 00009 * Target Processor: Cortex-M cores 00010 * -------------------------------------------------------------------- */ 00011 /* 00012 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. 00013 * 00014 * SPDX-License-Identifier: Apache-2.0 00015 * 00016 * Licensed under the Apache License, Version 2.0 (the License); you may 00017 * not use this file except in compliance with the License. 00018 * You may obtain a copy of the License at 00019 * 00020 * www.apache.org/licenses/LICENSE-2.0 00021 * 00022 * Unless required by applicable law or agreed to in writing, software 00023 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 00024 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00025 * See the License for the specific language governing permissions and 00026 * limitations under the License. 00027 */ 00028 00029 #include "arm_math.h" 00030 00031 /** 00032 * @ingroup groupFilters 00033 */ 00034 00035 /** 00036 * @addtogroup Conv 00037 * @{ 00038 */ 00039 00040 /** 00041 * @brief Convolution of Q15 sequences. 00042 * @param[in] *pSrcA points to the first input sequence. 00043 * @param[in] srcALen length of the first input sequence. 00044 * @param[in] *pSrcB points to the second input sequence. 00045 * @param[in] srcBLen length of the second input sequence. 00046 * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. 00047 * @return none. 00048 * 00049 * @details 00050 * <b>Scaling and Overflow Behavior:</b> 00051 * 00052 * \par 00053 * The function is implemented using a 64-bit internal accumulator. 00054 * Both inputs are in 1.15 format and multiplications yield a 2.30 result. 
00055 * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format. 00056 * This approach provides 33 guard bits and there is no risk of overflow. 00057 * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format. 00058 * 00059 * \par 00060 * Refer to <code>arm_conv_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4. 00061 * 00062 * \par 00063 * Refer the function <code>arm_conv_opt_q15()</code> for a faster implementation of this function using scratch buffers. 00064 * 00065 */ 00066 00067 void arm_conv_q15( 00068 q15_t * pSrcA, 00069 uint32_t srcALen, 00070 q15_t * pSrcB, 00071 uint32_t srcBLen, 00072 q15_t * pDst) 00073 { 00074 00075 #if (defined(ARM_MATH_CM7) || defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) 00076 00077 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00078 00079 q15_t *pIn1; /* inputA pointer */ 00080 q15_t *pIn2; /* inputB pointer */ 00081 q15_t *pOut = pDst; /* output pointer */ 00082 q63_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00083 q15_t *px; /* Intermediate inputA pointer */ 00084 q15_t *py; /* Intermediate inputB pointer */ 00085 q15_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00086 q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */ 00087 uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt; /* loop counter */ 00088 00089 /* The algorithm implementation is based on the lengths of the inputs. */ 00090 /* srcB is always made to slide across srcA. 
*/ 00091 /* So srcBLen is always considered as shorter or equal to srcALen */ 00092 if (srcALen >= srcBLen) 00093 { 00094 /* Initialization of inputA pointer */ 00095 pIn1 = pSrcA; 00096 00097 /* Initialization of inputB pointer */ 00098 pIn2 = pSrcB; 00099 } 00100 else 00101 { 00102 /* Initialization of inputA pointer */ 00103 pIn1 = pSrcB; 00104 00105 /* Initialization of inputB pointer */ 00106 pIn2 = pSrcA; 00107 00108 /* srcBLen is always considered as shorter or equal to srcALen */ 00109 j = srcBLen; 00110 srcBLen = srcALen; 00111 srcALen = j; 00112 } 00113 00114 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00115 /* The function is internally 00116 * divided into three stages according to the number of multiplications that has to be 00117 * taken place between inputA samples and inputB samples. In the first stage of the 00118 * algorithm, the multiplications increase by one for every iteration. 00119 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00120 * In the third stage of the algorithm, the multiplications decrease by one 00121 * for every iteration. */ 00122 00123 /* The algorithm is implemented in three stages. 00124 The loop counters of each stage is initiated here. */ 00125 blockSize1 = srcBLen - 1U; 00126 blockSize2 = srcALen - (srcBLen - 1U); 00127 00128 /* -------------------------- 00129 * Initializations of stage1 00130 * -------------------------*/ 00131 00132 /* sum = x[0] * y[0] 00133 * sum = x[0] * y[1] + x[1] * y[0] 00134 * .... 00135 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00136 */ 00137 00138 /* In this stage the MAC operations are increased by 1 for every iteration. 
00139 The count variable holds the number of MAC operations performed */ 00140 count = 1U; 00141 00142 /* Working pointer of inputA */ 00143 px = pIn1; 00144 00145 /* Working pointer of inputB */ 00146 py = pIn2; 00147 00148 00149 /* ------------------------ 00150 * Stage1 process 00151 * ----------------------*/ 00152 00153 /* For loop unrolling by 4, this stage is divided into two. */ 00154 /* First part of this stage computes the MAC operations less than 4 */ 00155 /* Second part of this stage computes the MAC operations greater than or equal to 4 */ 00156 00157 /* The first part of the stage starts here */ 00158 while ((count < 4U) && (blockSize1 > 0U)) 00159 { 00160 /* Accumulator is made zero for every iteration */ 00161 sum = 0; 00162 00163 /* Loop over number of MAC operations between 00164 * inputA samples and inputB samples */ 00165 k = count; 00166 00167 while (k > 0U) 00168 { 00169 /* Perform the multiply-accumulates */ 00170 sum = __SMLALD(*px++, *py--, sum); 00171 00172 /* Decrement the loop counter */ 00173 k--; 00174 } 00175 00176 /* Store the result in the accumulator in the destination buffer. */ 00177 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 00178 00179 /* Update the inputA and inputB pointers for next MAC calculation */ 00180 py = pIn2 + count; 00181 px = pIn1; 00182 00183 /* Increment the MAC count */ 00184 count++; 00185 00186 /* Decrement the loop counter */ 00187 blockSize1--; 00188 } 00189 00190 /* The second part of the stage starts here */ 00191 /* The internal loop, over count, is unrolled by 4 */ 00192 /* To, read the last two inputB samples using SIMD: 00193 * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ 00194 py = py - 1; 00195 00196 while (blockSize1 > 0U) 00197 { 00198 /* Accumulator is made zero for every iteration */ 00199 sum = 0; 00200 00201 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00202 k = count >> 2U; 00203 00204 /* First part of the processing with loop unrolling. 
Compute 4 MACs at a time. 00205 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00206 while (k > 0U) 00207 { 00208 /* Perform the multiply-accumulates */ 00209 /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */ 00210 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00211 /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */ 00212 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00213 00214 /* Decrement the loop counter */ 00215 k--; 00216 } 00217 00218 /* For the next MAC operations, the pointer py is used without SIMD 00219 * So, py is incremented by 1 */ 00220 py = py + 1U; 00221 00222 /* If the count is not a multiple of 4, compute any remaining MACs here. 00223 ** No loop unrolling is used. */ 00224 k = count % 0x4U; 00225 00226 while (k > 0U) 00227 { 00228 /* Perform the multiply-accumulates */ 00229 sum = __SMLALD(*px++, *py--, sum); 00230 00231 /* Decrement the loop counter */ 00232 k--; 00233 } 00234 00235 /* Store the result in the accumulator in the destination buffer. */ 00236 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 00237 00238 /* Update the inputA and inputB pointers for next MAC calculation */ 00239 py = pIn2 + (count - 1U); 00240 px = pIn1; 00241 00242 /* Increment the MAC count */ 00243 count++; 00244 00245 /* Decrement the loop counter */ 00246 blockSize1--; 00247 } 00248 00249 /* -------------------------- 00250 * Initializations of stage2 00251 * ------------------------*/ 00252 00253 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00254 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00255 * .... 
00256 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00257 */ 00258 00259 /* Working pointer of inputA */ 00260 px = pIn1; 00261 00262 /* Working pointer of inputB */ 00263 pSrc2 = pIn2 + (srcBLen - 1U); 00264 py = pSrc2; 00265 00266 /* count is the index by which the pointer pIn1 to be incremented */ 00267 count = 0U; 00268 00269 00270 /* -------------------- 00271 * Stage2 process 00272 * -------------------*/ 00273 00274 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00275 * So, to loop unroll over blockSize2, 00276 * srcBLen should be greater than or equal to 4 */ 00277 if (srcBLen >= 4U) 00278 { 00279 /* Loop unroll over blockSize2, by 4 */ 00280 blkCnt = blockSize2 >> 2U; 00281 00282 while (blkCnt > 0U) 00283 { 00284 py = py - 1U; 00285 00286 /* Set all accumulators to zero */ 00287 acc0 = 0; 00288 acc1 = 0; 00289 acc2 = 0; 00290 acc3 = 0; 00291 00292 00293 /* read x[0], x[1] samples */ 00294 x0 = *__SIMD32(px); 00295 /* read x[1], x[2] samples */ 00296 x1 = _SIMD32_OFFSET(px+1); 00297 px+= 2U; 00298 00299 00300 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00301 k = srcBLen >> 2U; 00302 00303 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00304 ** a second loop below computes MACs for the remaining 1 to 3 samples. 
*/ 00305 do 00306 { 00307 /* Read the last two inputB samples using SIMD: 00308 * y[srcBLen - 1] and y[srcBLen - 2] */ 00309 c0 = *__SIMD32(py)--; 00310 00311 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ 00312 acc0 = __SMLALDX(x0, c0, acc0); 00313 00314 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ 00315 acc1 = __SMLALDX(x1, c0, acc1); 00316 00317 /* Read x[2], x[3] */ 00318 x2 = *__SIMD32(px); 00319 00320 /* Read x[3], x[4] */ 00321 x3 = _SIMD32_OFFSET(px+1); 00322 00323 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ 00324 acc2 = __SMLALDX(x2, c0, acc2); 00325 00326 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ 00327 acc3 = __SMLALDX(x3, c0, acc3); 00328 00329 /* Read y[srcBLen - 3] and y[srcBLen - 4] */ 00330 c0 = *__SIMD32(py)--; 00331 00332 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ 00333 acc0 = __SMLALDX(x2, c0, acc0); 00334 00335 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ 00336 acc1 = __SMLALDX(x3, c0, acc1); 00337 00338 /* Read x[4], x[5] */ 00339 x0 = _SIMD32_OFFSET(px+2); 00340 00341 /* Read x[5], x[6] */ 00342 x1 = _SIMD32_OFFSET(px+3); 00343 px += 4U; 00344 00345 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ 00346 acc2 = __SMLALDX(x0, c0, acc2); 00347 00348 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ 00349 acc3 = __SMLALDX(x1, c0, acc3); 00350 00351 } while (--k); 00352 00353 /* For the next MAC operations, SIMD is not used 00354 * So, the 16 bit pointer if inputB, py is updated */ 00355 00356 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00357 ** No loop unrolling is used. 
*/ 00358 k = srcBLen % 0x4U; 00359 00360 if (k == 1U) 00361 { 00362 /* Read y[srcBLen - 5] */ 00363 c0 = *(py+1); 00364 00365 #ifdef ARM_MATH_BIG_ENDIAN 00366 00367 c0 = c0 << 16U; 00368 00369 #else 00370 00371 c0 = c0 & 0x0000FFFF; 00372 00373 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 00374 /* Read x[7] */ 00375 x3 = *__SIMD32(px); 00376 px++; 00377 00378 /* Perform the multiply-accumulates */ 00379 acc0 = __SMLALD(x0, c0, acc0); 00380 acc1 = __SMLALD(x1, c0, acc1); 00381 acc2 = __SMLALDX(x1, c0, acc2); 00382 acc3 = __SMLALDX(x3, c0, acc3); 00383 } 00384 00385 if (k == 2U) 00386 { 00387 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 00388 c0 = _SIMD32_OFFSET(py); 00389 00390 /* Read x[7], x[8] */ 00391 x3 = *__SIMD32(px); 00392 00393 /* Read x[9] */ 00394 x2 = _SIMD32_OFFSET(px+1); 00395 px += 2U; 00396 00397 /* Perform the multiply-accumulates */ 00398 acc0 = __SMLALDX(x0, c0, acc0); 00399 acc1 = __SMLALDX(x1, c0, acc1); 00400 acc2 = __SMLALDX(x3, c0, acc2); 00401 acc3 = __SMLALDX(x2, c0, acc3); 00402 } 00403 00404 if (k == 3U) 00405 { 00406 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 00407 c0 = _SIMD32_OFFSET(py); 00408 00409 /* Read x[7], x[8] */ 00410 x3 = *__SIMD32(px); 00411 00412 /* Read x[9] */ 00413 x2 = _SIMD32_OFFSET(px+1); 00414 00415 /* Perform the multiply-accumulates */ 00416 acc0 = __SMLALDX(x0, c0, acc0); 00417 acc1 = __SMLALDX(x1, c0, acc1); 00418 acc2 = __SMLALDX(x3, c0, acc2); 00419 acc3 = __SMLALDX(x2, c0, acc3); 00420 00421 c0 = *(py-1); 00422 00423 #ifdef ARM_MATH_BIG_ENDIAN 00424 00425 c0 = c0 << 16U; 00426 #else 00427 00428 c0 = c0 & 0x0000FFFF; 00429 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 00430 /* Read x[10] */ 00431 x3 = _SIMD32_OFFSET(px+2); 00432 px += 3U; 00433 00434 /* Perform the multiply-accumulates */ 00435 acc0 = __SMLALDX(x1, c0, acc0); 00436 acc1 = __SMLALD(x2, c0, acc1); 00437 acc2 = __SMLALDX(x2, c0, acc2); 00438 acc3 = __SMLALDX(x3, c0, acc3); 00439 } 00440 00441 00442 /* Store the results in the accumulators in the destination 
buffer. */ 00443 00444 #ifndef ARM_MATH_BIG_ENDIAN 00445 00446 *__SIMD32(pOut)++ = 00447 __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); 00448 *__SIMD32(pOut)++ = 00449 __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); 00450 00451 #else 00452 00453 *__SIMD32(pOut)++ = 00454 __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16); 00455 *__SIMD32(pOut)++ = 00456 __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16); 00457 00458 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00459 00460 /* Increment the pointer pIn1 index, count by 4 */ 00461 count += 4U; 00462 00463 /* Update the inputA and inputB pointers for next MAC calculation */ 00464 px = pIn1 + count; 00465 py = pSrc2; 00466 00467 /* Decrement the loop counter */ 00468 blkCnt--; 00469 } 00470 00471 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00472 ** No loop unrolling is used. */ 00473 blkCnt = blockSize2 % 0x4U; 00474 00475 while (blkCnt > 0U) 00476 { 00477 /* Accumulator is made zero for every iteration */ 00478 sum = 0; 00479 00480 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00481 k = srcBLen >> 2U; 00482 00483 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00484 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00485 while (k > 0U) 00486 { 00487 /* Perform the multiply-accumulates */ 00488 sum += (q63_t) ((q31_t) * px++ * *py--); 00489 sum += (q63_t) ((q31_t) * px++ * *py--); 00490 sum += (q63_t) ((q31_t) * px++ * *py--); 00491 sum += (q63_t) ((q31_t) * px++ * *py--); 00492 00493 /* Decrement the loop counter */ 00494 k--; 00495 } 00496 00497 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00498 ** No loop unrolling is used. 
*/ 00499 k = srcBLen % 0x4U; 00500 00501 while (k > 0U) 00502 { 00503 /* Perform the multiply-accumulates */ 00504 sum += (q63_t) ((q31_t) * px++ * *py--); 00505 00506 /* Decrement the loop counter */ 00507 k--; 00508 } 00509 00510 /* Store the result in the accumulator in the destination buffer. */ 00511 *pOut++ = (q15_t) (__SSAT(sum >> 15, 16)); 00512 00513 /* Increment the pointer pIn1 index, count by 1 */ 00514 count++; 00515 00516 /* Update the inputA and inputB pointers for next MAC calculation */ 00517 px = pIn1 + count; 00518 py = pSrc2; 00519 00520 /* Decrement the loop counter */ 00521 blkCnt--; 00522 } 00523 } 00524 else 00525 { 00526 /* If the srcBLen is not a multiple of 4, 00527 * the blockSize2 loop cannot be unrolled by 4 */ 00528 blkCnt = blockSize2; 00529 00530 while (blkCnt > 0U) 00531 { 00532 /* Accumulator is made zero for every iteration */ 00533 sum = 0; 00534 00535 /* srcBLen number of MACS should be performed */ 00536 k = srcBLen; 00537 00538 while (k > 0U) 00539 { 00540 /* Perform the multiply-accumulate */ 00541 sum += (q63_t) ((q31_t) * px++ * *py--); 00542 00543 /* Decrement the loop counter */ 00544 k--; 00545 } 00546 00547 /* Store the result in the accumulator in the destination buffer. */ 00548 *pOut++ = (q15_t) (__SSAT(sum >> 15, 16)); 00549 00550 /* Increment the MAC count */ 00551 count++; 00552 00553 /* Update the inputA and inputB pointers for next MAC calculation */ 00554 px = pIn1 + count; 00555 py = pSrc2; 00556 00557 /* Decrement the loop counter */ 00558 blkCnt--; 00559 } 00560 } 00561 00562 00563 /* -------------------------- 00564 * Initializations of stage3 00565 * -------------------------*/ 00566 00567 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00568 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00569 * .... 
00570 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00571 * sum += x[srcALen-1] * y[srcBLen-1] 00572 */ 00573 00574 /* In this stage the MAC operations are decreased by 1 for every iteration. 00575 The blockSize3 variable holds the number of MAC operations performed */ 00576 00577 blockSize3 = srcBLen - 1U; 00578 00579 /* Working pointer of inputA */ 00580 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U); 00581 px = pSrc1; 00582 00583 /* Working pointer of inputB */ 00584 pSrc2 = pIn2 + (srcBLen - 1U); 00585 pIn2 = pSrc2 - 1U; 00586 py = pIn2; 00587 00588 /* ------------------- 00589 * Stage3 process 00590 * ------------------*/ 00591 00592 /* For loop unrolling by 4, this stage is divided into two. */ 00593 /* First part of this stage computes the MAC operations greater than 4 */ 00594 /* Second part of this stage computes the MAC operations less than or equal to 4 */ 00595 00596 /* The first part of the stage starts here */ 00597 j = blockSize3 >> 2U; 00598 00599 while ((j > 0U) && (blockSize3 > 0U)) 00600 { 00601 /* Accumulator is made zero for every iteration */ 00602 sum = 0; 00603 00604 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00605 k = blockSize3 >> 2U; 00606 00607 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00608 ** a second loop below computes MACs for the remaining 1 to 3 samples. 
*/ 00609 while (k > 0U) 00610 { 00611 /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied 00612 * with y[srcBLen - 1], y[srcBLen - 2] respectively */ 00613 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00614 /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied 00615 * with y[srcBLen - 3], y[srcBLen - 4] respectively */ 00616 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00617 00618 /* Decrement the loop counter */ 00619 k--; 00620 } 00621 00622 /* For the next MAC operations, the pointer py is used without SIMD 00623 * So, py is incremented by 1 */ 00624 py = py + 1U; 00625 00626 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. 00627 ** No loop unrolling is used. */ 00628 k = blockSize3 % 0x4U; 00629 00630 while (k > 0U) 00631 { 00632 /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */ 00633 sum = __SMLALD(*px++, *py--, sum); 00634 00635 /* Decrement the loop counter */ 00636 k--; 00637 } 00638 00639 /* Store the result in the accumulator in the destination buffer. */ 00640 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 00641 00642 /* Update the inputA and inputB pointers for next MAC calculation */ 00643 px = ++pSrc1; 00644 py = pIn2; 00645 00646 /* Decrement the loop counter */ 00647 blockSize3--; 00648 00649 j--; 00650 } 00651 00652 /* The second part of the stage starts here */ 00653 /* SIMD is not used for the next MAC operations, 00654 * so pointer py is updated to read only one sample at a time */ 00655 py = py + 1U; 00656 00657 while (blockSize3 > 0U) 00658 { 00659 /* Accumulator is made zero for every iteration */ 00660 sum = 0; 00661 00662 /* Apply loop unrolling and compute 4 MACs simultaneously. 
*/ 00663 k = blockSize3; 00664 00665 while (k > 0U) 00666 { 00667 /* Perform the multiply-accumulates */ 00668 /* sum += x[srcALen-1] * y[srcBLen-1] */ 00669 sum = __SMLALD(*px++, *py--, sum); 00670 00671 /* Decrement the loop counter */ 00672 k--; 00673 } 00674 00675 /* Store the result in the accumulator in the destination buffer. */ 00676 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 00677 00678 /* Update the inputA and inputB pointers for next MAC calculation */ 00679 px = ++pSrc1; 00680 py = pSrc2; 00681 00682 /* Decrement the loop counter */ 00683 blockSize3--; 00684 } 00685 00686 #else 00687 00688 /* Run the below code for Cortex-M0 */ 00689 00690 q15_t *pIn1 = pSrcA; /* input pointer */ 00691 q15_t *pIn2 = pSrcB; /* coefficient pointer */ 00692 q63_t sum; /* Accumulator */ 00693 uint32_t i, j; /* loop counter */ 00694 00695 /* Loop to calculate output of convolution for output length number of times */ 00696 for (i = 0; i < (srcALen + srcBLen - 1); i++) 00697 { 00698 /* Initialize sum with zero to carry on MAC operations */ 00699 sum = 0; 00700 00701 /* Loop to perform MAC operations according to convolution equation */ 00702 for (j = 0; j <= i; j++) 00703 { 00704 /* Check the array limitations */ 00705 if (((i - j) < srcBLen) && (j < srcALen)) 00706 { 00707 /* z[i] += x[i-j] * y[j] */ 00708 sum += (q31_t) pIn1[j] * (pIn2[i - j]); 00709 } 00710 } 00711 00712 /* Store the output in the destination buffer */ 00713 pDst[i] = (q15_t) __SSAT((sum >> 15U), 16U); 00714 } 00715 00716 #endif /* #if (defined(ARM_MATH_CM7) || defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) */ 00717 00718 } 00719 00720 /** 00721 * @} end of Conv group 00722 */ 00723
Generated on Tue Jul 12 2022 16:46:23 by 1.7.2