Important changes to repositories hosted on mbed.com
Mbed-hosted Mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software, download the repository ZIP archive or clone it locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
arm_conv_fast_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Project: CMSIS DSP Library 00003 * Title: arm_conv_fast_q15.c 00004 * Description: Fast Q15 Convolution 00005 * 00006 * $Date: 27. January 2017 00007 * $Revision: V.1.5.1 00008 * 00009 * Target Processor: Cortex-M cores 00010 * -------------------------------------------------------------------- */ 00011 /* 00012 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. 00013 * 00014 * SPDX-License-Identifier: Apache-2.0 00015 * 00016 * Licensed under the Apache License, Version 2.0 (the License); you may 00017 * not use this file except in compliance with the License. 00018 * You may obtain a copy of the License at 00019 * 00020 * www.apache.org/licenses/LICENSE-2.0 00021 * 00022 * Unless required by applicable law or agreed to in writing, software 00023 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 00024 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00025 * See the License for the specific language governing permissions and 00026 * limitations under the License. 00027 */ 00028 00029 #include "arm_math.h" 00030 00031 /** 00032 * @ingroup groupFilters 00033 */ 00034 00035 /** 00036 * @addtogroup Conv 00037 * @{ 00038 */ 00039 00040 /** 00041 * @brief Convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4. 00042 * @param[in] *pSrcA points to the first input sequence. 00043 * @param[in] srcALen length of the first input sequence. 00044 * @param[in] *pSrcB points to the second input sequence. 00045 * @param[in] srcBLen length of the second input sequence. 00046 * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. 00047 * @return none. 00048 * 00049 * <b>Scaling and Overflow Behavior:</b> 00050 * 00051 * \par 00052 * This fast version uses a 32-bit accumulator with 2.30 format. 
00053 * The accumulator maintains full precision of the intermediate multiplication results 00054 * but provides only a single guard bit. There is no saturation on intermediate additions. 00055 * Thus, if the accumulator overflows it wraps around and distorts the result. 00056 * The input signals should be scaled down to avoid intermediate overflows. 00057 * Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows, 00058 * as maximum of min(srcALen, srcBLen) number of additions are carried internally. 00059 * The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result. 00060 * 00061 * \par 00062 * See <code>arm_conv_q15()</code> for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion. 00063 */ 00064 00065 void arm_conv_fast_q15( 00066 q15_t * pSrcA, 00067 uint32_t srcALen, 00068 q15_t * pSrcB, 00069 uint32_t srcBLen, 00070 q15_t * pDst) 00071 { 00072 #ifndef UNALIGNED_SUPPORT_DISABLE 00073 q15_t *pIn1; /* inputA pointer */ 00074 q15_t *pIn2; /* inputB pointer */ 00075 q15_t *pOut = pDst; /* output pointer */ 00076 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00077 q15_t *px; /* Intermediate inputA pointer */ 00078 q15_t *py; /* Intermediate inputB pointer */ 00079 q15_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00080 q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */ 00081 uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt; /* loop counter */ 00082 00083 /* The algorithm implementation is based on the lengths of the inputs. */ 00084 /* srcB is always made to slide across srcA. 
*/ 00085 /* So srcBLen is always considered as shorter or equal to srcALen */ 00086 if (srcALen >= srcBLen) 00087 { 00088 /* Initialization of inputA pointer */ 00089 pIn1 = pSrcA; 00090 00091 /* Initialization of inputB pointer */ 00092 pIn2 = pSrcB; 00093 } 00094 else 00095 { 00096 /* Initialization of inputA pointer */ 00097 pIn1 = pSrcB; 00098 00099 /* Initialization of inputB pointer */ 00100 pIn2 = pSrcA; 00101 00102 /* srcBLen is always considered as shorter or equal to srcALen */ 00103 j = srcBLen; 00104 srcBLen = srcALen; 00105 srcALen = j; 00106 } 00107 00108 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00109 /* The function is internally 00110 * divided into three stages according to the number of multiplications that has to be 00111 * taken place between inputA samples and inputB samples. In the first stage of the 00112 * algorithm, the multiplications increase by one for every iteration. 00113 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00114 * In the third stage of the algorithm, the multiplications decrease by one 00115 * for every iteration. */ 00116 00117 /* The algorithm is implemented in three stages. 00118 The loop counters of each stage is initiated here. */ 00119 blockSize1 = srcBLen - 1U; 00120 blockSize2 = srcALen - (srcBLen - 1U); 00121 blockSize3 = blockSize1; 00122 00123 /* -------------------------- 00124 * Initializations of stage1 00125 * -------------------------*/ 00126 00127 /* sum = x[0] * y[0] 00128 * sum = x[0] * y[1] + x[1] * y[0] 00129 * .... 00130 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00131 */ 00132 00133 /* In this stage the MAC operations are increased by 1 for every iteration. 
00134 The count variable holds the number of MAC operations performed */ 00135 count = 1U; 00136 00137 /* Working pointer of inputA */ 00138 px = pIn1; 00139 00140 /* Working pointer of inputB */ 00141 py = pIn2; 00142 00143 00144 /* ------------------------ 00145 * Stage1 process 00146 * ----------------------*/ 00147 00148 /* For loop unrolling by 4, this stage is divided into two. */ 00149 /* First part of this stage computes the MAC operations less than 4 */ 00150 /* Second part of this stage computes the MAC operations greater than or equal to 4 */ 00151 00152 /* The first part of the stage starts here */ 00153 while ((count < 4U) && (blockSize1 > 0U)) 00154 { 00155 /* Accumulator is made zero for every iteration */ 00156 sum = 0; 00157 00158 /* Loop over number of MAC operations between 00159 * inputA samples and inputB samples */ 00160 k = count; 00161 00162 while (k > 0U) 00163 { 00164 /* Perform the multiply-accumulates */ 00165 sum = __SMLAD(*px++, *py--, sum); 00166 00167 /* Decrement the loop counter */ 00168 k--; 00169 } 00170 00171 /* Store the result in the accumulator in the destination buffer. */ 00172 *pOut++ = (q15_t) (sum >> 15); 00173 00174 /* Update the inputA and inputB pointers for next MAC calculation */ 00175 py = pIn2 + count; 00176 px = pIn1; 00177 00178 /* Increment the MAC count */ 00179 count++; 00180 00181 /* Decrement the loop counter */ 00182 blockSize1--; 00183 } 00184 00185 /* The second part of the stage starts here */ 00186 /* The internal loop, over count, is unrolled by 4 */ 00187 /* To, read the last two inputB samples using SIMD: 00188 * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ 00189 py = py - 1; 00190 00191 while (blockSize1 > 0U) 00192 { 00193 /* Accumulator is made zero for every iteration */ 00194 sum = 0; 00195 00196 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00197 k = count >> 2U; 00198 00199 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 
00200 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00201 while (k > 0U) 00202 { 00203 /* Perform the multiply-accumulates */ 00204 /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */ 00205 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00206 /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */ 00207 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00208 00209 /* Decrement the loop counter */ 00210 k--; 00211 } 00212 00213 /* For the next MAC operations, the pointer py is used without SIMD 00214 * So, py is incremented by 1 */ 00215 py = py + 1U; 00216 00217 /* If the count is not a multiple of 4, compute any remaining MACs here. 00218 ** No loop unrolling is used. */ 00219 k = count % 0x4U; 00220 00221 while (k > 0U) 00222 { 00223 /* Perform the multiply-accumulates */ 00224 sum = __SMLAD(*px++, *py--, sum); 00225 00226 /* Decrement the loop counter */ 00227 k--; 00228 } 00229 00230 /* Store the result in the accumulator in the destination buffer. */ 00231 *pOut++ = (q15_t) (sum >> 15); 00232 00233 /* Update the inputA and inputB pointers for next MAC calculation */ 00234 py = pIn2 + (count - 1U); 00235 px = pIn1; 00236 00237 /* Increment the MAC count */ 00238 count++; 00239 00240 /* Decrement the loop counter */ 00241 blockSize1--; 00242 } 00243 00244 /* -------------------------- 00245 * Initializations of stage2 00246 * ------------------------*/ 00247 00248 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00249 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00250 * .... 
00251 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00252 */ 00253 00254 /* Working pointer of inputA */ 00255 px = pIn1; 00256 00257 /* Working pointer of inputB */ 00258 pSrc2 = pIn2 + (srcBLen - 1U); 00259 py = pSrc2; 00260 00261 /* count is the index by which the pointer pIn1 to be incremented */ 00262 count = 0U; 00263 00264 00265 /* -------------------- 00266 * Stage2 process 00267 * -------------------*/ 00268 00269 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00270 * So, to loop unroll over blockSize2, 00271 * srcBLen should be greater than or equal to 4 */ 00272 if (srcBLen >= 4U) 00273 { 00274 /* Loop unroll over blockSize2, by 4 */ 00275 blkCnt = blockSize2 >> 2U; 00276 00277 while (blkCnt > 0U) 00278 { 00279 py = py - 1U; 00280 00281 /* Set all accumulators to zero */ 00282 acc0 = 0; 00283 acc1 = 0; 00284 acc2 = 0; 00285 acc3 = 0; 00286 00287 00288 /* read x[0], x[1] samples */ 00289 x0 = *__SIMD32(px); 00290 /* read x[1], x[2] samples */ 00291 x1 = _SIMD32_OFFSET(px+1); 00292 px+= 2U; 00293 00294 00295 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00296 k = srcBLen >> 2U; 00297 00298 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00299 ** a second loop below computes MACs for the remaining 1 to 3 samples. 
*/ 00300 do 00301 { 00302 /* Read the last two inputB samples using SIMD: 00303 * y[srcBLen - 1] and y[srcBLen - 2] */ 00304 c0 = *__SIMD32(py)--; 00305 00306 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ 00307 acc0 = __SMLADX(x0, c0, acc0); 00308 00309 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ 00310 acc1 = __SMLADX(x1, c0, acc1); 00311 00312 /* Read x[2], x[3] */ 00313 x2 = *__SIMD32(px); 00314 00315 /* Read x[3], x[4] */ 00316 x3 = _SIMD32_OFFSET(px+1); 00317 00318 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ 00319 acc2 = __SMLADX(x2, c0, acc2); 00320 00321 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ 00322 acc3 = __SMLADX(x3, c0, acc3); 00323 00324 /* Read y[srcBLen - 3] and y[srcBLen - 4] */ 00325 c0 = *__SIMD32(py)--; 00326 00327 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ 00328 acc0 = __SMLADX(x2, c0, acc0); 00329 00330 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ 00331 acc1 = __SMLADX(x3, c0, acc1); 00332 00333 /* Read x[4], x[5] */ 00334 x0 = _SIMD32_OFFSET(px+2); 00335 00336 /* Read x[5], x[6] */ 00337 x1 = _SIMD32_OFFSET(px+3); 00338 px += 4U; 00339 00340 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ 00341 acc2 = __SMLADX(x0, c0, acc2); 00342 00343 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ 00344 acc3 = __SMLADX(x1, c0, acc3); 00345 00346 } while (--k); 00347 00348 /* For the next MAC operations, SIMD is not used 00349 * So, the 16 bit pointer if inputB, py is updated */ 00350 00351 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00352 ** No loop unrolling is used. 
*/ 00353 k = srcBLen % 0x4U; 00354 00355 if (k == 1U) 00356 { 00357 /* Read y[srcBLen - 5] */ 00358 c0 = *(py+1); 00359 00360 #ifdef ARM_MATH_BIG_ENDIAN 00361 00362 c0 = c0 << 16U; 00363 00364 #else 00365 00366 c0 = c0 & 0x0000FFFF; 00367 00368 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 00369 00370 /* Read x[7] */ 00371 x3 = *__SIMD32(px); 00372 px++; 00373 00374 /* Perform the multiply-accumulates */ 00375 acc0 = __SMLAD(x0, c0, acc0); 00376 acc1 = __SMLAD(x1, c0, acc1); 00377 acc2 = __SMLADX(x1, c0, acc2); 00378 acc3 = __SMLADX(x3, c0, acc3); 00379 } 00380 00381 if (k == 2U) 00382 { 00383 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 00384 c0 = _SIMD32_OFFSET(py); 00385 00386 /* Read x[7], x[8] */ 00387 x3 = *__SIMD32(px); 00388 00389 /* Read x[9] */ 00390 x2 = _SIMD32_OFFSET(px+1); 00391 px += 2U; 00392 00393 /* Perform the multiply-accumulates */ 00394 acc0 = __SMLADX(x0, c0, acc0); 00395 acc1 = __SMLADX(x1, c0, acc1); 00396 acc2 = __SMLADX(x3, c0, acc2); 00397 acc3 = __SMLADX(x2, c0, acc3); 00398 } 00399 00400 if (k == 3U) 00401 { 00402 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 00403 c0 = _SIMD32_OFFSET(py); 00404 00405 /* Read x[7], x[8] */ 00406 x3 = *__SIMD32(px); 00407 00408 /* Read x[9] */ 00409 x2 = _SIMD32_OFFSET(px+1); 00410 00411 /* Perform the multiply-accumulates */ 00412 acc0 = __SMLADX(x0, c0, acc0); 00413 acc1 = __SMLADX(x1, c0, acc1); 00414 acc2 = __SMLADX(x3, c0, acc2); 00415 acc3 = __SMLADX(x2, c0, acc3); 00416 00417 /* Read y[srcBLen - 7] */ 00418 c0 = *(py-1); 00419 #ifdef ARM_MATH_BIG_ENDIAN 00420 00421 c0 = c0 << 16U; 00422 #else 00423 00424 c0 = c0 & 0x0000FFFF; 00425 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 00426 00427 /* Read x[10] */ 00428 x3 = _SIMD32_OFFSET(px+2); 00429 px += 3U; 00430 00431 /* Perform the multiply-accumulates */ 00432 acc0 = __SMLADX(x1, c0, acc0); 00433 acc1 = __SMLAD(x2, c0, acc1); 00434 acc2 = __SMLADX(x2, c0, acc2); 00435 acc3 = __SMLADX(x3, c0, acc3); 00436 } 00437 00438 /* Store the results in the accumulators in the 
destination buffer. */ 00439 #ifndef ARM_MATH_BIG_ENDIAN 00440 00441 *__SIMD32(pOut)++ = __PKHBT((acc0 >> 15), (acc1 >> 15), 16); 00442 *__SIMD32(pOut)++ = __PKHBT((acc2 >> 15), (acc3 >> 15), 16); 00443 00444 #else 00445 00446 *__SIMD32(pOut)++ = __PKHBT((acc1 >> 15), (acc0 >> 15), 16); 00447 *__SIMD32(pOut)++ = __PKHBT((acc3 >> 15), (acc2 >> 15), 16); 00448 00449 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00450 00451 /* Increment the pointer pIn1 index, count by 4 */ 00452 count += 4U; 00453 00454 /* Update the inputA and inputB pointers for next MAC calculation */ 00455 px = pIn1 + count; 00456 py = pSrc2; 00457 00458 /* Decrement the loop counter */ 00459 blkCnt--; 00460 } 00461 00462 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00463 ** No loop unrolling is used. */ 00464 blkCnt = blockSize2 % 0x4U; 00465 00466 while (blkCnt > 0U) 00467 { 00468 /* Accumulator is made zero for every iteration */ 00469 sum = 0; 00470 00471 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00472 k = srcBLen >> 2U; 00473 00474 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00475 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00476 while (k > 0U) 00477 { 00478 /* Perform the multiply-accumulates */ 00479 sum += ((q31_t) * px++ * *py--); 00480 sum += ((q31_t) * px++ * *py--); 00481 sum += ((q31_t) * px++ * *py--); 00482 sum += ((q31_t) * px++ * *py--); 00483 00484 /* Decrement the loop counter */ 00485 k--; 00486 } 00487 00488 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00489 ** No loop unrolling is used. */ 00490 k = srcBLen % 0x4U; 00491 00492 while (k > 0U) 00493 { 00494 /* Perform the multiply-accumulates */ 00495 sum += ((q31_t) * px++ * *py--); 00496 00497 /* Decrement the loop counter */ 00498 k--; 00499 } 00500 00501 /* Store the result in the accumulator in the destination buffer. 
*/ 00502 *pOut++ = (q15_t) (sum >> 15); 00503 00504 /* Increment the pointer pIn1 index, count by 1 */ 00505 count++; 00506 00507 /* Update the inputA and inputB pointers for next MAC calculation */ 00508 px = pIn1 + count; 00509 py = pSrc2; 00510 00511 /* Decrement the loop counter */ 00512 blkCnt--; 00513 } 00514 } 00515 else 00516 { 00517 /* If the srcBLen is not a multiple of 4, 00518 * the blockSize2 loop cannot be unrolled by 4 */ 00519 blkCnt = blockSize2; 00520 00521 while (blkCnt > 0U) 00522 { 00523 /* Accumulator is made zero for every iteration */ 00524 sum = 0; 00525 00526 /* srcBLen number of MACS should be performed */ 00527 k = srcBLen; 00528 00529 while (k > 0U) 00530 { 00531 /* Perform the multiply-accumulate */ 00532 sum += ((q31_t) * px++ * *py--); 00533 00534 /* Decrement the loop counter */ 00535 k--; 00536 } 00537 00538 /* Store the result in the accumulator in the destination buffer. */ 00539 *pOut++ = (q15_t) (sum >> 15); 00540 00541 /* Increment the MAC count */ 00542 count++; 00543 00544 /* Update the inputA and inputB pointers for next MAC calculation */ 00545 px = pIn1 + count; 00546 py = pSrc2; 00547 00548 /* Decrement the loop counter */ 00549 blkCnt--; 00550 } 00551 } 00552 00553 00554 /* -------------------------- 00555 * Initializations of stage3 00556 * -------------------------*/ 00557 00558 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00559 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00560 * .... 00561 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00562 * sum += x[srcALen-1] * y[srcBLen-1] 00563 */ 00564 00565 /* In this stage the MAC operations are decreased by 1 for every iteration. 
00566 The blockSize3 variable holds the number of MAC operations performed */ 00567 00568 /* Working pointer of inputA */ 00569 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U); 00570 px = pSrc1; 00571 00572 /* Working pointer of inputB */ 00573 pSrc2 = pIn2 + (srcBLen - 1U); 00574 pIn2 = pSrc2 - 1U; 00575 py = pIn2; 00576 00577 /* ------------------- 00578 * Stage3 process 00579 * ------------------*/ 00580 00581 /* For loop unrolling by 4, this stage is divided into two. */ 00582 /* First part of this stage computes the MAC operations greater than 4 */ 00583 /* Second part of this stage computes the MAC operations less than or equal to 4 */ 00584 00585 /* The first part of the stage starts here */ 00586 j = blockSize3 >> 2U; 00587 00588 while ((j > 0U) && (blockSize3 > 0U)) 00589 { 00590 /* Accumulator is made zero for every iteration */ 00591 sum = 0; 00592 00593 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00594 k = blockSize3 >> 2U; 00595 00596 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00597 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00598 while (k > 0U) 00599 { 00600 /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied 00601 * with y[srcBLen - 1], y[srcBLen - 2] respectively */ 00602 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00603 /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied 00604 * with y[srcBLen - 3], y[srcBLen - 4] respectively */ 00605 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00606 00607 /* Decrement the loop counter */ 00608 k--; 00609 } 00610 00611 /* For the next MAC operations, the pointer py is used without SIMD 00612 * So, py is incremented by 1 */ 00613 py = py + 1U; 00614 00615 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. 00616 ** No loop unrolling is used. 
*/ 00617 k = blockSize3 % 0x4U; 00618 00619 while (k > 0U) 00620 { 00621 /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */ 00622 sum = __SMLAD(*px++, *py--, sum); 00623 00624 /* Decrement the loop counter */ 00625 k--; 00626 } 00627 00628 /* Store the result in the accumulator in the destination buffer. */ 00629 *pOut++ = (q15_t) (sum >> 15); 00630 00631 /* Update the inputA and inputB pointers for next MAC calculation */ 00632 px = ++pSrc1; 00633 py = pIn2; 00634 00635 /* Decrement the loop counter */ 00636 blockSize3--; 00637 00638 j--; 00639 } 00640 00641 /* The second part of the stage starts here */ 00642 /* SIMD is not used for the next MAC operations, 00643 * so pointer py is updated to read only one sample at a time */ 00644 py = py + 1U; 00645 00646 while (blockSize3 > 0U) 00647 { 00648 /* Accumulator is made zero for every iteration */ 00649 sum = 0; 00650 00651 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00652 k = blockSize3; 00653 00654 while (k > 0U) 00655 { 00656 /* Perform the multiply-accumulates */ 00657 /* sum += x[srcALen-1] * y[srcBLen-1] */ 00658 sum = __SMLAD(*px++, *py--, sum); 00659 00660 /* Decrement the loop counter */ 00661 k--; 00662 } 00663 00664 /* Store the result in the accumulator in the destination buffer. 
*/ 00665 *pOut++ = (q15_t) (sum >> 15); 00666 00667 /* Update the inputA and inputB pointers for next MAC calculation */ 00668 px = ++pSrc1; 00669 py = pSrc2; 00670 00671 /* Decrement the loop counter */ 00672 blockSize3--; 00673 } 00674 00675 #else 00676 q15_t *pIn1; /* inputA pointer */ 00677 q15_t *pIn2; /* inputB pointer */ 00678 q15_t *pOut = pDst; /* output pointer */ 00679 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00680 q15_t *px; /* Intermediate inputA pointer */ 00681 q15_t *py; /* Intermediate inputB pointer */ 00682 q15_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00683 q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */ 00684 uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt; /* loop counter */ 00685 q15_t a, b; 00686 00687 /* The algorithm implementation is based on the lengths of the inputs. */ 00688 /* srcB is always made to slide across srcA. */ 00689 /* So srcBLen is always considered as shorter or equal to srcALen */ 00690 if (srcALen >= srcBLen) 00691 { 00692 /* Initialization of inputA pointer */ 00693 pIn1 = pSrcA; 00694 00695 /* Initialization of inputB pointer */ 00696 pIn2 = pSrcB; 00697 } 00698 else 00699 { 00700 /* Initialization of inputA pointer */ 00701 pIn1 = pSrcB; 00702 00703 /* Initialization of inputB pointer */ 00704 pIn2 = pSrcA; 00705 00706 /* srcBLen is always considered as shorter or equal to srcALen */ 00707 j = srcBLen; 00708 srcBLen = srcALen; 00709 srcALen = j; 00710 } 00711 00712 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00713 /* The function is internally 00714 * divided into three stages according to the number of multiplications that has to be 00715 * taken place between inputA samples and inputB samples. In the first stage of the 00716 * algorithm, the multiplications increase by one for every iteration. 00717 * In the second stage of the algorithm, srcBLen number of multiplications are done. 
00718 * In the third stage of the algorithm, the multiplications decrease by one 00719 * for every iteration. */ 00720 00721 /* The algorithm is implemented in three stages. 00722 The loop counters of each stage is initiated here. */ 00723 blockSize1 = srcBLen - 1U; 00724 blockSize2 = srcALen - (srcBLen - 1U); 00725 blockSize3 = blockSize1; 00726 00727 /* -------------------------- 00728 * Initializations of stage1 00729 * -------------------------*/ 00730 00731 /* sum = x[0] * y[0] 00732 * sum = x[0] * y[1] + x[1] * y[0] 00733 * .... 00734 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00735 */ 00736 00737 /* In this stage the MAC operations are increased by 1 for every iteration. 00738 The count variable holds the number of MAC operations performed */ 00739 count = 1U; 00740 00741 /* Working pointer of inputA */ 00742 px = pIn1; 00743 00744 /* Working pointer of inputB */ 00745 py = pIn2; 00746 00747 00748 /* ------------------------ 00749 * Stage1 process 00750 * ----------------------*/ 00751 00752 /* For loop unrolling by 4, this stage is divided into two. */ 00753 /* First part of this stage computes the MAC operations less than 4 */ 00754 /* Second part of this stage computes the MAC operations greater than or equal to 4 */ 00755 00756 /* The first part of the stage starts here */ 00757 while ((count < 4U) && (blockSize1 > 0U)) 00758 { 00759 /* Accumulator is made zero for every iteration */ 00760 sum = 0; 00761 00762 /* Loop over number of MAC operations between 00763 * inputA samples and inputB samples */ 00764 k = count; 00765 00766 while (k > 0U) 00767 { 00768 /* Perform the multiply-accumulates */ 00769 sum += ((q31_t) * px++ * *py--); 00770 00771 /* Decrement the loop counter */ 00772 k--; 00773 } 00774 00775 /* Store the result in the accumulator in the destination buffer. 
*/ 00776 *pOut++ = (q15_t) (sum >> 15); 00777 00778 /* Update the inputA and inputB pointers for next MAC calculation */ 00779 py = pIn2 + count; 00780 px = pIn1; 00781 00782 /* Increment the MAC count */ 00783 count++; 00784 00785 /* Decrement the loop counter */ 00786 blockSize1--; 00787 } 00788 00789 /* The second part of the stage starts here */ 00790 /* The internal loop, over count, is unrolled by 4 */ 00791 /* To, read the last two inputB samples using SIMD: 00792 * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ 00793 py = py - 1; 00794 00795 while (blockSize1 > 0U) 00796 { 00797 /* Accumulator is made zero for every iteration */ 00798 sum = 0; 00799 00800 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00801 k = count >> 2U; 00802 00803 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00804 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00805 py++; 00806 00807 while (k > 0U) 00808 { 00809 /* Perform the multiply-accumulates */ 00810 sum += ((q31_t) * px++ * *py--); 00811 sum += ((q31_t) * px++ * *py--); 00812 sum += ((q31_t) * px++ * *py--); 00813 sum += ((q31_t) * px++ * *py--); 00814 00815 /* Decrement the loop counter */ 00816 k--; 00817 } 00818 00819 /* If the count is not a multiple of 4, compute any remaining MACs here. 00820 ** No loop unrolling is used. */ 00821 k = count % 0x4U; 00822 00823 while (k > 0U) 00824 { 00825 /* Perform the multiply-accumulates */ 00826 sum += ((q31_t) * px++ * *py--); 00827 00828 /* Decrement the loop counter */ 00829 k--; 00830 } 00831 00832 /* Store the result in the accumulator in the destination buffer. 
*/ 00833 *pOut++ = (q15_t) (sum >> 15); 00834 00835 /* Update the inputA and inputB pointers for next MAC calculation */ 00836 py = pIn2 + (count - 1U); 00837 px = pIn1; 00838 00839 /* Increment the MAC count */ 00840 count++; 00841 00842 /* Decrement the loop counter */ 00843 blockSize1--; 00844 } 00845 00846 /* -------------------------- 00847 * Initializations of stage2 00848 * ------------------------*/ 00849 00850 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00851 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00852 * .... 00853 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00854 */ 00855 00856 /* Working pointer of inputA */ 00857 px = pIn1; 00858 00859 /* Working pointer of inputB */ 00860 pSrc2 = pIn2 + (srcBLen - 1U); 00861 py = pSrc2; 00862 00863 /* count is the index by which the pointer pIn1 to be incremented */ 00864 count = 0U; 00865 00866 00867 /* -------------------- 00868 * Stage2 process 00869 * -------------------*/ 00870 00871 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00872 * So, to loop unroll over blockSize2, 00873 * srcBLen should be greater than or equal to 4 */ 00874 if (srcBLen >= 4U) 00875 { 00876 /* Loop unroll over blockSize2, by 4 */ 00877 blkCnt = blockSize2 >> 2U; 00878 00879 while (blkCnt > 0U) 00880 { 00881 py = py - 1U; 00882 00883 /* Set all accumulators to zero */ 00884 acc0 = 0; 00885 acc1 = 0; 00886 acc2 = 0; 00887 acc3 = 0; 00888 00889 /* read x[0], x[1] samples */ 00890 a = *px++; 00891 b = *px++; 00892 00893 #ifndef ARM_MATH_BIG_ENDIAN 00894 00895 x0 = __PKHBT(a, b, 16); 00896 a = *px; 00897 x1 = __PKHBT(b, a, 16); 00898 00899 #else 00900 00901 x0 = __PKHBT(b, a, 16); 00902 a = *px; 00903 x1 = __PKHBT(a, b, 16); 00904 00905 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00906 00907 /* Apply loop unrolling and compute 4 MACs simultaneously. 
*/ 00908 k = srcBLen >> 2U; 00909 00910 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00911 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00912 do 00913 { 00914 /* Read the last two inputB samples using SIMD: 00915 * y[srcBLen - 1] and y[srcBLen - 2] */ 00916 a = *py; 00917 b = *(py+1); 00918 py -= 2; 00919 00920 #ifndef ARM_MATH_BIG_ENDIAN 00921 00922 c0 = __PKHBT(a, b, 16); 00923 00924 #else 00925 00926 c0 = __PKHBT(b, a, 16);; 00927 00928 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00929 00930 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ 00931 acc0 = __SMLADX(x0, c0, acc0); 00932 00933 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ 00934 acc1 = __SMLADX(x1, c0, acc1); 00935 00936 a = *px; 00937 b = *(px + 1); 00938 00939 #ifndef ARM_MATH_BIG_ENDIAN 00940 00941 x2 = __PKHBT(a, b, 16); 00942 a = *(px + 2); 00943 x3 = __PKHBT(b, a, 16); 00944 00945 #else 00946 00947 x2 = __PKHBT(b, a, 16); 00948 a = *(px + 2); 00949 x3 = __PKHBT(a, b, 16); 00950 00951 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00952 00953 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ 00954 acc2 = __SMLADX(x2, c0, acc2); 00955 00956 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ 00957 acc3 = __SMLADX(x3, c0, acc3); 00958 00959 /* Read y[srcBLen - 3] and y[srcBLen - 4] */ 00960 a = *py; 00961 b = *(py+1); 00962 py -= 2; 00963 00964 #ifndef ARM_MATH_BIG_ENDIAN 00965 00966 c0 = __PKHBT(a, b, 16); 00967 00968 #else 00969 00970 c0 = __PKHBT(b, a, 16);; 00971 00972 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00973 00974 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ 00975 acc0 = __SMLADX(x2, c0, acc0); 00976 00977 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ 00978 acc1 = __SMLADX(x3, c0, acc1); 00979 00980 /* Read x[4], x[5], x[6] */ 00981 a = *(px + 2); 00982 b = *(px + 3); 00983 00984 #ifndef ARM_MATH_BIG_ENDIAN 00985 00986 x0 = __PKHBT(a, b, 16); 00987 a = *(px + 4); 
00988 x1 = __PKHBT(b, a, 16); 00989 00990 #else 00991 00992 x0 = __PKHBT(b, a, 16); 00993 a = *(px + 4); 00994 x1 = __PKHBT(a, b, 16); 00995 00996 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00997 00998 px += 4U; 00999 01000 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ 01001 acc2 = __SMLADX(x0, c0, acc2); 01002 01003 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ 01004 acc3 = __SMLADX(x1, c0, acc3); 01005 01006 } while (--k); 01007 01008 /* For the next MAC operations, SIMD is not used 01009 * So, the 16 bit pointer if inputB, py is updated */ 01010 01011 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 01012 ** No loop unrolling is used. */ 01013 k = srcBLen % 0x4U; 01014 01015 if (k == 1U) 01016 { 01017 /* Read y[srcBLen - 5] */ 01018 c0 = *(py+1); 01019 01020 #ifdef ARM_MATH_BIG_ENDIAN 01021 01022 c0 = c0 << 16U; 01023 01024 #else 01025 01026 c0 = c0 & 0x0000FFFF; 01027 01028 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 01029 01030 /* Read x[7] */ 01031 a = *px; 01032 b = *(px+1); 01033 px++; 01034 01035 #ifndef ARM_MATH_BIG_ENDIAN 01036 01037 x3 = __PKHBT(a, b, 16); 01038 01039 #else 01040 01041 x3 = __PKHBT(b, a, 16);; 01042 01043 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01044 01045 01046 /* Perform the multiply-accumulates */ 01047 acc0 = __SMLAD(x0, c0, acc0); 01048 acc1 = __SMLAD(x1, c0, acc1); 01049 acc2 = __SMLADX(x1, c0, acc2); 01050 acc3 = __SMLADX(x3, c0, acc3); 01051 } 01052 01053 if (k == 2U) 01054 { 01055 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 01056 a = *py; 01057 b = *(py+1); 01058 01059 #ifndef ARM_MATH_BIG_ENDIAN 01060 01061 c0 = __PKHBT(a, b, 16); 01062 01063 #else 01064 01065 c0 = __PKHBT(b, a, 16);; 01066 01067 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01068 01069 /* Read x[7], x[8], x[9] */ 01070 a = *px; 01071 b = *(px + 1); 01072 01073 #ifndef ARM_MATH_BIG_ENDIAN 01074 01075 x3 = __PKHBT(a, b, 16); 01076 a = *(px + 2); 01077 x2 = __PKHBT(b, a, 16); 01078 01079 #else 01080 01081 x3 = __PKHBT(b, 
a, 16); 01082 a = *(px + 2); 01083 x2 = __PKHBT(a, b, 16); 01084 01085 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01086 px += 2U; 01087 01088 /* Perform the multiply-accumulates */ 01089 acc0 = __SMLADX(x0, c0, acc0); 01090 acc1 = __SMLADX(x1, c0, acc1); 01091 acc2 = __SMLADX(x3, c0, acc2); 01092 acc3 = __SMLADX(x2, c0, acc3); 01093 } 01094 01095 if (k == 3U) 01096 { 01097 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 01098 a = *py; 01099 b = *(py+1); 01100 01101 #ifndef ARM_MATH_BIG_ENDIAN 01102 01103 c0 = __PKHBT(a, b, 16); 01104 01105 #else 01106 01107 c0 = __PKHBT(b, a, 16);; 01108 01109 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01110 01111 /* Read x[7], x[8], x[9] */ 01112 a = *px; 01113 b = *(px + 1); 01114 01115 #ifndef ARM_MATH_BIG_ENDIAN 01116 01117 x3 = __PKHBT(a, b, 16); 01118 a = *(px + 2); 01119 x2 = __PKHBT(b, a, 16); 01120 01121 #else 01122 01123 x3 = __PKHBT(b, a, 16); 01124 a = *(px + 2); 01125 x2 = __PKHBT(a, b, 16); 01126 01127 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01128 01129 /* Perform the multiply-accumulates */ 01130 acc0 = __SMLADX(x0, c0, acc0); 01131 acc1 = __SMLADX(x1, c0, acc1); 01132 acc2 = __SMLADX(x3, c0, acc2); 01133 acc3 = __SMLADX(x2, c0, acc3); 01134 01135 /* Read y[srcBLen - 7] */ 01136 c0 = *(py-1); 01137 #ifdef ARM_MATH_BIG_ENDIAN 01138 01139 c0 = c0 << 16U; 01140 #else 01141 01142 c0 = c0 & 0x0000FFFF; 01143 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 01144 01145 /* Read x[10] */ 01146 a = *(px+2); 01147 b = *(px+3); 01148 01149 #ifndef ARM_MATH_BIG_ENDIAN 01150 01151 x3 = __PKHBT(a, b, 16); 01152 01153 #else 01154 01155 x3 = __PKHBT(b, a, 16);; 01156 01157 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01158 01159 px += 3U; 01160 01161 /* Perform the multiply-accumulates */ 01162 acc0 = __SMLADX(x1, c0, acc0); 01163 acc1 = __SMLAD(x2, c0, acc1); 01164 acc2 = __SMLADX(x2, c0, acc2); 01165 acc3 = __SMLADX(x3, c0, acc3); 01166 } 01167 01168 /* Store the results in the accumulators in the destination buffer. 
*/ 01169 *pOut++ = (q15_t)(acc0 >> 15); 01170 *pOut++ = (q15_t)(acc1 >> 15); 01171 *pOut++ = (q15_t)(acc2 >> 15); 01172 *pOut++ = (q15_t)(acc3 >> 15); 01173 01174 /* Increment the pointer pIn1 index, count by 4 */ 01175 count += 4U; 01176 01177 /* Update the inputA and inputB pointers for next MAC calculation */ 01178 px = pIn1 + count; 01179 py = pSrc2; 01180 01181 /* Decrement the loop counter */ 01182 blkCnt--; 01183 } 01184 01185 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 01186 ** No loop unrolling is used. */ 01187 blkCnt = blockSize2 % 0x4U; 01188 01189 while (blkCnt > 0U) 01190 { 01191 /* Accumulator is made zero for every iteration */ 01192 sum = 0; 01193 01194 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 01195 k = srcBLen >> 2U; 01196 01197 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 01198 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 01199 while (k > 0U) 01200 { 01201 /* Perform the multiply-accumulates */ 01202 sum += ((q31_t) * px++ * *py--); 01203 sum += ((q31_t) * px++ * *py--); 01204 sum += ((q31_t) * px++ * *py--); 01205 sum += ((q31_t) * px++ * *py--); 01206 01207 /* Decrement the loop counter */ 01208 k--; 01209 } 01210 01211 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 01212 ** No loop unrolling is used. */ 01213 k = srcBLen % 0x4U; 01214 01215 while (k > 0U) 01216 { 01217 /* Perform the multiply-accumulates */ 01218 sum += ((q31_t) * px++ * *py--); 01219 01220 /* Decrement the loop counter */ 01221 k--; 01222 } 01223 01224 /* Store the result in the accumulator in the destination buffer. 
*/ 01225 *pOut++ = (q15_t) (sum >> 15); 01226 01227 /* Increment the pointer pIn1 index, count by 1 */ 01228 count++; 01229 01230 /* Update the inputA and inputB pointers for next MAC calculation */ 01231 px = pIn1 + count; 01232 py = pSrc2; 01233 01234 /* Decrement the loop counter */ 01235 blkCnt--; 01236 } 01237 } 01238 else 01239 { 01240 /* If the srcBLen is not a multiple of 4, 01241 * the blockSize2 loop cannot be unrolled by 4 */ 01242 blkCnt = blockSize2; 01243 01244 while (blkCnt > 0U) 01245 { 01246 /* Accumulator is made zero for every iteration */ 01247 sum = 0; 01248 01249 /* srcBLen number of MACS should be performed */ 01250 k = srcBLen; 01251 01252 while (k > 0U) 01253 { 01254 /* Perform the multiply-accumulate */ 01255 sum += ((q31_t) * px++ * *py--); 01256 01257 /* Decrement the loop counter */ 01258 k--; 01259 } 01260 01261 /* Store the result in the accumulator in the destination buffer. */ 01262 *pOut++ = (q15_t) (sum >> 15); 01263 01264 /* Increment the MAC count */ 01265 count++; 01266 01267 /* Update the inputA and inputB pointers for next MAC calculation */ 01268 px = pIn1 + count; 01269 py = pSrc2; 01270 01271 /* Decrement the loop counter */ 01272 blkCnt--; 01273 } 01274 } 01275 01276 01277 /* -------------------------- 01278 * Initializations of stage3 01279 * -------------------------*/ 01280 01281 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 01282 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 01283 * .... 01284 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 01285 * sum += x[srcALen-1] * y[srcBLen-1] 01286 */ 01287 01288 /* In this stage the MAC operations are decreased by 1 for every iteration. 
01289 The blockSize3 variable holds the number of MAC operations performed */ 01290 01291 /* Working pointer of inputA */ 01292 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U); 01293 px = pSrc1; 01294 01295 /* Working pointer of inputB */ 01296 pSrc2 = pIn2 + (srcBLen - 1U); 01297 pIn2 = pSrc2 - 1U; 01298 py = pIn2; 01299 01300 /* ------------------- 01301 * Stage3 process 01302 * ------------------*/ 01303 01304 /* For loop unrolling by 4, this stage is divided into two. */ 01305 /* First part of this stage computes the MAC operations greater than 4 */ 01306 /* Second part of this stage computes the MAC operations less than or equal to 4 */ 01307 01308 /* The first part of the stage starts here */ 01309 j = blockSize3 >> 2U; 01310 01311 while ((j > 0U) && (blockSize3 > 0U)) 01312 { 01313 /* Accumulator is made zero for every iteration */ 01314 sum = 0; 01315 01316 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 01317 k = blockSize3 >> 2U; 01318 01319 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 01320 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 01321 py++; 01322 01323 while (k > 0U) 01324 { 01325 sum += ((q31_t) * px++ * *py--); 01326 sum += ((q31_t) * px++ * *py--); 01327 sum += ((q31_t) * px++ * *py--); 01328 sum += ((q31_t) * px++ * *py--); 01329 /* Decrement the loop counter */ 01330 k--; 01331 } 01332 01333 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. 01334 ** No loop unrolling is used. */ 01335 k = blockSize3 % 0x4U; 01336 01337 while (k > 0U) 01338 { 01339 /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */ 01340 sum += ((q31_t) * px++ * *py--); 01341 01342 /* Decrement the loop counter */ 01343 k--; 01344 } 01345 01346 /* Store the result in the accumulator in the destination buffer. 
*/ 01347 *pOut++ = (q15_t) (sum >> 15); 01348 01349 /* Update the inputA and inputB pointers for next MAC calculation */ 01350 px = ++pSrc1; 01351 py = pIn2; 01352 01353 /* Decrement the loop counter */ 01354 blockSize3--; 01355 01356 j--; 01357 } 01358 01359 /* The second part of the stage starts here */ 01360 /* SIMD is not used for the next MAC operations, 01361 * so pointer py is updated to read only one sample at a time */ 01362 py = py + 1U; 01363 01364 while (blockSize3 > 0U) 01365 { 01366 /* Accumulator is made zero for every iteration */ 01367 sum = 0; 01368 01369 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 01370 k = blockSize3; 01371 01372 while (k > 0U) 01373 { 01374 /* Perform the multiply-accumulates */ 01375 /* sum += x[srcALen-1] * y[srcBLen-1] */ 01376 sum += ((q31_t) * px++ * *py--); 01377 01378 /* Decrement the loop counter */ 01379 k--; 01380 } 01381 01382 /* Store the result in the accumulator in the destination buffer. */ 01383 *pOut++ = (q15_t) (sum >> 15); 01384 01385 /* Update the inputA and inputB pointers for next MAC calculation */ 01386 px = ++pSrc1; 01387 py = pSrc2; 01388 01389 /* Decrement the loop counter */ 01390 blockSize3--; 01391 } 01392 01393 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 01394 } 01395 01396 /** 01397 * @} end of Conv group 01398 */ 01399
Generated on Tue Jul 12 2022 16:46:23 by Doxygen 1.7.2