CMSIS DSP library
Dependents: performance_timer Surfboard_ gps2rtty Capstone ... more
arm_conv_fast_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 19. March 2015 00005 * $Revision: V.1.4.5 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_fast_q15.c 00009 * 00010 * Description: Fast Q15 Convolution. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupFilters 00045 */ 00046 00047 /** 00048 * @addtogroup Conv 00049 * @{ 00050 */ 00051 00052 /** 00053 * @brief Convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4. 00054 * @param[in] *pSrcA points to the first input sequence. 00055 * @param[in] srcALen length of the first input sequence. 00056 * @param[in] *pSrcB points to the second input sequence. 00057 * @param[in] srcBLen length of the second input sequence. 00058 * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. 00059 * @return none. 00060 * 00061 * <b>Scaling and Overflow Behavior:</b> 00062 * 00063 * \par 00064 * This fast version uses a 32-bit accumulator with 2.30 format. 00065 * The accumulator maintains full precision of the intermediate multiplication results 00066 * but provides only a single guard bit. There is no saturation on intermediate additions. 00067 * Thus, if the accumulator overflows it wraps around and distorts the result. 00068 * The input signals should be scaled down to avoid intermediate overflows. 
00069 * Scale down the inputs by log2(min(srcALen, srcBLen)) bits (log2 denotes the logarithm to base 2) to avoid overflows, 00070 * as at most min(srcALen, srcBLen) additions are carried out internally. 00071 * The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result. 00072 * 00073 * \par 00074 * See <code>arm_conv_q15()</code> for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion. 00075 */ 00076 00077 void arm_conv_fast_q15( 00078 q15_t * pSrcA, 00079 uint32_t srcALen, 00080 q15_t * pSrcB, 00081 uint32_t srcBLen, 00082 q15_t * pDst) 00083 { 00084 #ifndef UNALIGNED_SUPPORT_DISABLE 00085 q15_t *pIn1; /* inputA pointer */ 00086 q15_t *pIn2; /* inputB pointer */ 00087 q15_t *pOut = pDst; /* output pointer */ 00088 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00089 q15_t *px; /* Intermediate inputA pointer */ 00090 q15_t *py; /* Intermediate inputB pointer */ 00091 q15_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00092 q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */ 00093 uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt; /* loop counter */ 00094 00095 /* The algorithm implementation is based on the lengths of the inputs. */ 00096 /* srcB is always made to slide across srcA. 
*/ 00097 /* So srcBLen is always considered as shorter or equal to srcALen */ 00098 if(srcALen >= srcBLen) 00099 { 00100 /* Initialization of inputA pointer */ 00101 pIn1 = pSrcA; 00102 00103 /* Initialization of inputB pointer */ 00104 pIn2 = pSrcB; 00105 } 00106 else 00107 { 00108 /* Initialization of inputA pointer */ 00109 pIn1 = pSrcB; 00110 00111 /* Initialization of inputB pointer */ 00112 pIn2 = pSrcA; 00113 00114 /* srcBLen is always considered as shorter or equal to srcALen */ 00115 j = srcBLen; 00116 srcBLen = srcALen; 00117 srcALen = j; 00118 } 00119 00120 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00121 /* The function is internally 00122 * divided into three stages according to the number of multiplications that has to be 00123 * taken place between inputA samples and inputB samples. In the first stage of the 00124 * algorithm, the multiplications increase by one for every iteration. 00125 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00126 * In the third stage of the algorithm, the multiplications decrease by one 00127 * for every iteration. */ 00128 00129 /* The algorithm is implemented in three stages. 00130 The loop counters of each stage is initiated here. */ 00131 blockSize1 = srcBLen - 1u; 00132 blockSize2 = srcALen - (srcBLen - 1u); 00133 blockSize3 = blockSize1; 00134 00135 /* -------------------------- 00136 * Initializations of stage1 00137 * -------------------------*/ 00138 00139 /* sum = x[0] * y[0] 00140 * sum = x[0] * y[1] + x[1] * y[0] 00141 * .... 00142 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00143 */ 00144 00145 /* In this stage the MAC operations are increased by 1 for every iteration. 
00146 The count variable holds the number of MAC operations performed */ 00147 count = 1u; 00148 00149 /* Working pointer of inputA */ 00150 px = pIn1; 00151 00152 /* Working pointer of inputB */ 00153 py = pIn2; 00154 00155 00156 /* ------------------------ 00157 * Stage1 process 00158 * ----------------------*/ 00159 00160 /* For loop unrolling by 4, this stage is divided into two. */ 00161 /* First part of this stage computes the MAC operations less than 4 */ 00162 /* Second part of this stage computes the MAC operations greater than or equal to 4 */ 00163 00164 /* The first part of the stage starts here */ 00165 while((count < 4u) && (blockSize1 > 0u)) 00166 { 00167 /* Accumulator is made zero for every iteration */ 00168 sum = 0; 00169 00170 /* Loop over number of MAC operations between 00171 * inputA samples and inputB samples */ 00172 k = count; 00173 00174 while(k > 0u) 00175 { 00176 /* Perform the multiply-accumulates */ 00177 sum = __SMLAD(*px++, *py--, sum); 00178 00179 /* Decrement the loop counter */ 00180 k--; 00181 } 00182 00183 /* Store the result in the accumulator in the destination buffer. */ 00184 *pOut++ = (q15_t) (sum >> 15); 00185 00186 /* Update the inputA and inputB pointers for next MAC calculation */ 00187 py = pIn2 + count; 00188 px = pIn1; 00189 00190 /* Increment the MAC count */ 00191 count++; 00192 00193 /* Decrement the loop counter */ 00194 blockSize1--; 00195 } 00196 00197 /* The second part of the stage starts here */ 00198 /* The internal loop, over count, is unrolled by 4 */ 00199 /* To, read the last two inputB samples using SIMD: 00200 * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ 00201 py = py - 1; 00202 00203 while(blockSize1 > 0u) 00204 { 00205 /* Accumulator is made zero for every iteration */ 00206 sum = 0; 00207 00208 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00209 k = count >> 2u; 00210 00211 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 
00212 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00213 while(k > 0u) 00214 { 00215 /* Perform the multiply-accumulates */ 00216 /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */ 00217 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00218 /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */ 00219 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00220 00221 /* Decrement the loop counter */ 00222 k--; 00223 } 00224 00225 /* For the next MAC operations, the pointer py is used without SIMD 00226 * So, py is incremented by 1 */ 00227 py = py + 1u; 00228 00229 /* If the count is not a multiple of 4, compute any remaining MACs here. 00230 ** No loop unrolling is used. */ 00231 k = count % 0x4u; 00232 00233 while(k > 0u) 00234 { 00235 /* Perform the multiply-accumulates */ 00236 sum = __SMLAD(*px++, *py--, sum); 00237 00238 /* Decrement the loop counter */ 00239 k--; 00240 } 00241 00242 /* Store the result in the accumulator in the destination buffer. */ 00243 *pOut++ = (q15_t) (sum >> 15); 00244 00245 /* Update the inputA and inputB pointers for next MAC calculation */ 00246 py = pIn2 + (count - 1u); 00247 px = pIn1; 00248 00249 /* Increment the MAC count */ 00250 count++; 00251 00252 /* Decrement the loop counter */ 00253 blockSize1--; 00254 } 00255 00256 /* -------------------------- 00257 * Initializations of stage2 00258 * ------------------------*/ 00259 00260 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00261 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00262 * .... 
00263 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00264 */ 00265 00266 /* Working pointer of inputA */ 00267 px = pIn1; 00268 00269 /* Working pointer of inputB */ 00270 pSrc2 = pIn2 + (srcBLen - 1u); 00271 py = pSrc2; 00272 00273 /* count is the index by which the pointer pIn1 to be incremented */ 00274 count = 0u; 00275 00276 00277 /* -------------------- 00278 * Stage2 process 00279 * -------------------*/ 00280 00281 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00282 * So, to loop unroll over blockSize2, 00283 * srcBLen should be greater than or equal to 4 */ 00284 if(srcBLen >= 4u) 00285 { 00286 /* Loop unroll over blockSize2, by 4 */ 00287 blkCnt = blockSize2 >> 2u; 00288 00289 while(blkCnt > 0u) 00290 { 00291 py = py - 1u; 00292 00293 /* Set all accumulators to zero */ 00294 acc0 = 0; 00295 acc1 = 0; 00296 acc2 = 0; 00297 acc3 = 0; 00298 00299 00300 /* read x[0], x[1] samples */ 00301 x0 = *__SIMD32(px); 00302 /* read x[1], x[2] samples */ 00303 x1 = _SIMD32_OFFSET(px+1); 00304 px+= 2u; 00305 00306 00307 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00308 k = srcBLen >> 2u; 00309 00310 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00311 ** a second loop below computes MACs for the remaining 1 to 3 samples. 
*/ 00312 do 00313 { 00314 /* Read the last two inputB samples using SIMD: 00315 * y[srcBLen - 1] and y[srcBLen - 2] */ 00316 c0 = *__SIMD32(py)--; 00317 00318 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ 00319 acc0 = __SMLADX(x0, c0, acc0); 00320 00321 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ 00322 acc1 = __SMLADX(x1, c0, acc1); 00323 00324 /* Read x[2], x[3] */ 00325 x2 = *__SIMD32(px); 00326 00327 /* Read x[3], x[4] */ 00328 x3 = _SIMD32_OFFSET(px+1); 00329 00330 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ 00331 acc2 = __SMLADX(x2, c0, acc2); 00332 00333 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ 00334 acc3 = __SMLADX(x3, c0, acc3); 00335 00336 /* Read y[srcBLen - 3] and y[srcBLen - 4] */ 00337 c0 = *__SIMD32(py)--; 00338 00339 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ 00340 acc0 = __SMLADX(x2, c0, acc0); 00341 00342 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ 00343 acc1 = __SMLADX(x3, c0, acc1); 00344 00345 /* Read x[4], x[5] */ 00346 x0 = _SIMD32_OFFSET(px+2); 00347 00348 /* Read x[5], x[6] */ 00349 x1 = _SIMD32_OFFSET(px+3); 00350 px += 4u; 00351 00352 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ 00353 acc2 = __SMLADX(x0, c0, acc2); 00354 00355 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ 00356 acc3 = __SMLADX(x1, c0, acc3); 00357 00358 } while(--k); 00359 00360 /* For the next MAC operations, SIMD is not used 00361 * So, the 16 bit pointer if inputB, py is updated */ 00362 00363 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00364 ** No loop unrolling is used. 
*/ 00365 k = srcBLen % 0x4u; 00366 00367 if(k == 1u) 00368 { 00369 /* Read y[srcBLen - 5] */ 00370 c0 = *(py+1); 00371 00372 #ifdef ARM_MATH_BIG_ENDIAN 00373 00374 c0 = c0 << 16u; 00375 00376 #else 00377 00378 c0 = c0 & 0x0000FFFF; 00379 00380 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 00381 00382 /* Read x[7] */ 00383 x3 = *__SIMD32(px); 00384 px++; 00385 00386 /* Perform the multiply-accumulates */ 00387 acc0 = __SMLAD(x0, c0, acc0); 00388 acc1 = __SMLAD(x1, c0, acc1); 00389 acc2 = __SMLADX(x1, c0, acc2); 00390 acc3 = __SMLADX(x3, c0, acc3); 00391 } 00392 00393 if(k == 2u) 00394 { 00395 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 00396 c0 = _SIMD32_OFFSET(py); 00397 00398 /* Read x[7], x[8] */ 00399 x3 = *__SIMD32(px); 00400 00401 /* Read x[9] */ 00402 x2 = _SIMD32_OFFSET(px+1); 00403 px += 2u; 00404 00405 /* Perform the multiply-accumulates */ 00406 acc0 = __SMLADX(x0, c0, acc0); 00407 acc1 = __SMLADX(x1, c0, acc1); 00408 acc2 = __SMLADX(x3, c0, acc2); 00409 acc3 = __SMLADX(x2, c0, acc3); 00410 } 00411 00412 if(k == 3u) 00413 { 00414 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 00415 c0 = _SIMD32_OFFSET(py); 00416 00417 /* Read x[7], x[8] */ 00418 x3 = *__SIMD32(px); 00419 00420 /* Read x[9] */ 00421 x2 = _SIMD32_OFFSET(px+1); 00422 00423 /* Perform the multiply-accumulates */ 00424 acc0 = __SMLADX(x0, c0, acc0); 00425 acc1 = __SMLADX(x1, c0, acc1); 00426 acc2 = __SMLADX(x3, c0, acc2); 00427 acc3 = __SMLADX(x2, c0, acc3); 00428 00429 /* Read y[srcBLen - 7] */ 00430 c0 = *(py-1); 00431 #ifdef ARM_MATH_BIG_ENDIAN 00432 00433 c0 = c0 << 16u; 00434 #else 00435 00436 c0 = c0 & 0x0000FFFF; 00437 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 00438 00439 /* Read x[10] */ 00440 x3 = _SIMD32_OFFSET(px+2); 00441 px += 3u; 00442 00443 /* Perform the multiply-accumulates */ 00444 acc0 = __SMLADX(x1, c0, acc0); 00445 acc1 = __SMLAD(x2, c0, acc1); 00446 acc2 = __SMLADX(x2, c0, acc2); 00447 acc3 = __SMLADX(x3, c0, acc3); 00448 } 00449 00450 /* Store the results in the accumulators in the 
destination buffer. */ 00451 #ifndef ARM_MATH_BIG_ENDIAN 00452 00453 *__SIMD32(pOut)++ = __PKHBT((acc0 >> 15), (acc1 >> 15), 16); 00454 *__SIMD32(pOut)++ = __PKHBT((acc2 >> 15), (acc3 >> 15), 16); 00455 00456 #else 00457 00458 *__SIMD32(pOut)++ = __PKHBT((acc1 >> 15), (acc0 >> 15), 16); 00459 *__SIMD32(pOut)++ = __PKHBT((acc3 >> 15), (acc2 >> 15), 16); 00460 00461 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00462 00463 /* Increment the pointer pIn1 index, count by 4 */ 00464 count += 4u; 00465 00466 /* Update the inputA and inputB pointers for next MAC calculation */ 00467 px = pIn1 + count; 00468 py = pSrc2; 00469 00470 /* Decrement the loop counter */ 00471 blkCnt--; 00472 } 00473 00474 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00475 ** No loop unrolling is used. */ 00476 blkCnt = blockSize2 % 0x4u; 00477 00478 while(blkCnt > 0u) 00479 { 00480 /* Accumulator is made zero for every iteration */ 00481 sum = 0; 00482 00483 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00484 k = srcBLen >> 2u; 00485 00486 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00487 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00488 while(k > 0u) 00489 { 00490 /* Perform the multiply-accumulates */ 00491 sum += ((q31_t) * px++ * *py--); 00492 sum += ((q31_t) * px++ * *py--); 00493 sum += ((q31_t) * px++ * *py--); 00494 sum += ((q31_t) * px++ * *py--); 00495 00496 /* Decrement the loop counter */ 00497 k--; 00498 } 00499 00500 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00501 ** No loop unrolling is used. */ 00502 k = srcBLen % 0x4u; 00503 00504 while(k > 0u) 00505 { 00506 /* Perform the multiply-accumulates */ 00507 sum += ((q31_t) * px++ * *py--); 00508 00509 /* Decrement the loop counter */ 00510 k--; 00511 } 00512 00513 /* Store the result in the accumulator in the destination buffer. 
*/ 00514 *pOut++ = (q15_t) (sum >> 15); 00515 00516 /* Increment the pointer pIn1 index, count by 1 */ 00517 count++; 00518 00519 /* Update the inputA and inputB pointers for next MAC calculation */ 00520 px = pIn1 + count; 00521 py = pSrc2; 00522 00523 /* Decrement the loop counter */ 00524 blkCnt--; 00525 } 00526 } 00527 else 00528 { 00529 /* If the srcBLen is not a multiple of 4, 00530 * the blockSize2 loop cannot be unrolled by 4 */ 00531 blkCnt = blockSize2; 00532 00533 while(blkCnt > 0u) 00534 { 00535 /* Accumulator is made zero for every iteration */ 00536 sum = 0; 00537 00538 /* srcBLen number of MACS should be performed */ 00539 k = srcBLen; 00540 00541 while(k > 0u) 00542 { 00543 /* Perform the multiply-accumulate */ 00544 sum += ((q31_t) * px++ * *py--); 00545 00546 /* Decrement the loop counter */ 00547 k--; 00548 } 00549 00550 /* Store the result in the accumulator in the destination buffer. */ 00551 *pOut++ = (q15_t) (sum >> 15); 00552 00553 /* Increment the MAC count */ 00554 count++; 00555 00556 /* Update the inputA and inputB pointers for next MAC calculation */ 00557 px = pIn1 + count; 00558 py = pSrc2; 00559 00560 /* Decrement the loop counter */ 00561 blkCnt--; 00562 } 00563 } 00564 00565 00566 /* -------------------------- 00567 * Initializations of stage3 00568 * -------------------------*/ 00569 00570 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00571 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00572 * .... 00573 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00574 * sum += x[srcALen-1] * y[srcBLen-1] 00575 */ 00576 00577 /* In this stage the MAC operations are decreased by 1 for every iteration. 
00578 The blockSize3 variable holds the number of MAC operations performed */ 00579 00580 /* Working pointer of inputA */ 00581 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00582 px = pSrc1; 00583 00584 /* Working pointer of inputB */ 00585 pSrc2 = pIn2 + (srcBLen - 1u); 00586 pIn2 = pSrc2 - 1u; 00587 py = pIn2; 00588 00589 /* ------------------- 00590 * Stage3 process 00591 * ------------------*/ 00592 00593 /* For loop unrolling by 4, this stage is divided into two. */ 00594 /* First part of this stage computes the MAC operations greater than 4 */ 00595 /* Second part of this stage computes the MAC operations less than or equal to 4 */ 00596 00597 /* The first part of the stage starts here */ 00598 j = blockSize3 >> 2u; 00599 00600 while((j > 0u) && (blockSize3 > 0u)) 00601 { 00602 /* Accumulator is made zero for every iteration */ 00603 sum = 0; 00604 00605 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00606 k = blockSize3 >> 2u; 00607 00608 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00609 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00610 while(k > 0u) 00611 { 00612 /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied 00613 * with y[srcBLen - 1], y[srcBLen - 2] respectively */ 00614 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00615 /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied 00616 * with y[srcBLen - 3], y[srcBLen - 4] respectively */ 00617 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00618 00619 /* Decrement the loop counter */ 00620 k--; 00621 } 00622 00623 /* For the next MAC operations, the pointer py is used without SIMD 00624 * So, py is incremented by 1 */ 00625 py = py + 1u; 00626 00627 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. 00628 ** No loop unrolling is used. 
*/ 00629 k = blockSize3 % 0x4u; 00630 00631 while(k > 0u) 00632 { 00633 /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */ 00634 sum = __SMLAD(*px++, *py--, sum); 00635 00636 /* Decrement the loop counter */ 00637 k--; 00638 } 00639 00640 /* Store the result in the accumulator in the destination buffer. */ 00641 *pOut++ = (q15_t) (sum >> 15); 00642 00643 /* Update the inputA and inputB pointers for next MAC calculation */ 00644 px = ++pSrc1; 00645 py = pIn2; 00646 00647 /* Decrement the loop counter */ 00648 blockSize3--; 00649 00650 j--; 00651 } 00652 00653 /* The second part of the stage starts here */ 00654 /* SIMD is not used for the next MAC operations, 00655 * so pointer py is updated to read only one sample at a time */ 00656 py = py + 1u; 00657 00658 while(blockSize3 > 0u) 00659 { 00660 /* Accumulator is made zero for every iteration */ 00661 sum = 0; 00662 00663 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00664 k = blockSize3; 00665 00666 while(k > 0u) 00667 { 00668 /* Perform the multiply-accumulates */ 00669 /* sum += x[srcALen-1] * y[srcBLen-1] */ 00670 sum = __SMLAD(*px++, *py--, sum); 00671 00672 /* Decrement the loop counter */ 00673 k--; 00674 } 00675 00676 /* Store the result in the accumulator in the destination buffer. 
*/ 00677 *pOut++ = (q15_t) (sum >> 15); 00678 00679 /* Update the inputA and inputB pointers for next MAC calculation */ 00680 px = ++pSrc1; 00681 py = pSrc2; 00682 00683 /* Decrement the loop counter */ 00684 blockSize3--; 00685 } 00686 00687 #else 00688 q15_t *pIn1; /* inputA pointer */ 00689 q15_t *pIn2; /* inputB pointer */ 00690 q15_t *pOut = pDst; /* output pointer */ 00691 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00692 q15_t *px; /* Intermediate inputA pointer */ 00693 q15_t *py; /* Intermediate inputB pointer */ 00694 q15_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00695 q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */ 00696 uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt; /* loop counter */ 00697 q15_t a, b; 00698 00699 /* The algorithm implementation is based on the lengths of the inputs. */ 00700 /* srcB is always made to slide across srcA. */ 00701 /* So srcBLen is always considered as shorter or equal to srcALen */ 00702 if(srcALen >= srcBLen) 00703 { 00704 /* Initialization of inputA pointer */ 00705 pIn1 = pSrcA; 00706 00707 /* Initialization of inputB pointer */ 00708 pIn2 = pSrcB; 00709 } 00710 else 00711 { 00712 /* Initialization of inputA pointer */ 00713 pIn1 = pSrcB; 00714 00715 /* Initialization of inputB pointer */ 00716 pIn2 = pSrcA; 00717 00718 /* srcBLen is always considered as shorter or equal to srcALen */ 00719 j = srcBLen; 00720 srcBLen = srcALen; 00721 srcALen = j; 00722 } 00723 00724 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00725 /* The function is internally 00726 * divided into three stages according to the number of multiplications that has to be 00727 * taken place between inputA samples and inputB samples. In the first stage of the 00728 * algorithm, the multiplications increase by one for every iteration. 00729 * In the second stage of the algorithm, srcBLen number of multiplications are done. 
00730 * In the third stage of the algorithm, the multiplications decrease by one 00731 * for every iteration. */ 00732 00733 /* The algorithm is implemented in three stages. 00734 The loop counters of each stage is initiated here. */ 00735 blockSize1 = srcBLen - 1u; 00736 blockSize2 = srcALen - (srcBLen - 1u); 00737 blockSize3 = blockSize1; 00738 00739 /* -------------------------- 00740 * Initializations of stage1 00741 * -------------------------*/ 00742 00743 /* sum = x[0] * y[0] 00744 * sum = x[0] * y[1] + x[1] * y[0] 00745 * .... 00746 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00747 */ 00748 00749 /* In this stage the MAC operations are increased by 1 for every iteration. 00750 The count variable holds the number of MAC operations performed */ 00751 count = 1u; 00752 00753 /* Working pointer of inputA */ 00754 px = pIn1; 00755 00756 /* Working pointer of inputB */ 00757 py = pIn2; 00758 00759 00760 /* ------------------------ 00761 * Stage1 process 00762 * ----------------------*/ 00763 00764 /* For loop unrolling by 4, this stage is divided into two. */ 00765 /* First part of this stage computes the MAC operations less than 4 */ 00766 /* Second part of this stage computes the MAC operations greater than or equal to 4 */ 00767 00768 /* The first part of the stage starts here */ 00769 while((count < 4u) && (blockSize1 > 0u)) 00770 { 00771 /* Accumulator is made zero for every iteration */ 00772 sum = 0; 00773 00774 /* Loop over number of MAC operations between 00775 * inputA samples and inputB samples */ 00776 k = count; 00777 00778 while(k > 0u) 00779 { 00780 /* Perform the multiply-accumulates */ 00781 sum += ((q31_t) * px++ * *py--); 00782 00783 /* Decrement the loop counter */ 00784 k--; 00785 } 00786 00787 /* Store the result in the accumulator in the destination buffer. 
*/ 00788 *pOut++ = (q15_t) (sum >> 15); 00789 00790 /* Update the inputA and inputB pointers for next MAC calculation */ 00791 py = pIn2 + count; 00792 px = pIn1; 00793 00794 /* Increment the MAC count */ 00795 count++; 00796 00797 /* Decrement the loop counter */ 00798 blockSize1--; 00799 } 00800 00801 /* The second part of the stage starts here */ 00802 /* The internal loop, over count, is unrolled by 4 */ 00803 /* To, read the last two inputB samples using SIMD: 00804 * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ 00805 py = py - 1; 00806 00807 while(blockSize1 > 0u) 00808 { 00809 /* Accumulator is made zero for every iteration */ 00810 sum = 0; 00811 00812 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00813 k = count >> 2u; 00814 00815 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00816 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00817 py++; 00818 00819 while(k > 0u) 00820 { 00821 /* Perform the multiply-accumulates */ 00822 sum += ((q31_t) * px++ * *py--); 00823 sum += ((q31_t) * px++ * *py--); 00824 sum += ((q31_t) * px++ * *py--); 00825 sum += ((q31_t) * px++ * *py--); 00826 00827 /* Decrement the loop counter */ 00828 k--; 00829 } 00830 00831 /* If the count is not a multiple of 4, compute any remaining MACs here. 00832 ** No loop unrolling is used. */ 00833 k = count % 0x4u; 00834 00835 while(k > 0u) 00836 { 00837 /* Perform the multiply-accumulates */ 00838 sum += ((q31_t) * px++ * *py--); 00839 00840 /* Decrement the loop counter */ 00841 k--; 00842 } 00843 00844 /* Store the result in the accumulator in the destination buffer. 
*/ 00845 *pOut++ = (q15_t) (sum >> 15); 00846 00847 /* Update the inputA and inputB pointers for next MAC calculation */ 00848 py = pIn2 + (count - 1u); 00849 px = pIn1; 00850 00851 /* Increment the MAC count */ 00852 count++; 00853 00854 /* Decrement the loop counter */ 00855 blockSize1--; 00856 } 00857 00858 /* -------------------------- 00859 * Initializations of stage2 00860 * ------------------------*/ 00861 00862 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00863 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00864 * .... 00865 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00866 */ 00867 00868 /* Working pointer of inputA */ 00869 px = pIn1; 00870 00871 /* Working pointer of inputB */ 00872 pSrc2 = pIn2 + (srcBLen - 1u); 00873 py = pSrc2; 00874 00875 /* count is the index by which the pointer pIn1 to be incremented */ 00876 count = 0u; 00877 00878 00879 /* -------------------- 00880 * Stage2 process 00881 * -------------------*/ 00882 00883 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00884 * So, to loop unroll over blockSize2, 00885 * srcBLen should be greater than or equal to 4 */ 00886 if(srcBLen >= 4u) 00887 { 00888 /* Loop unroll over blockSize2, by 4 */ 00889 blkCnt = blockSize2 >> 2u; 00890 00891 while(blkCnt > 0u) 00892 { 00893 py = py - 1u; 00894 00895 /* Set all accumulators to zero */ 00896 acc0 = 0; 00897 acc1 = 0; 00898 acc2 = 0; 00899 acc3 = 0; 00900 00901 /* read x[0], x[1] samples */ 00902 a = *px++; 00903 b = *px++; 00904 00905 #ifndef ARM_MATH_BIG_ENDIAN 00906 00907 x0 = __PKHBT(a, b, 16); 00908 a = *px; 00909 x1 = __PKHBT(b, a, 16); 00910 00911 #else 00912 00913 x0 = __PKHBT(b, a, 16); 00914 a = *px; 00915 x1 = __PKHBT(a, b, 16); 00916 00917 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00918 00919 /* Apply loop unrolling and compute 4 MACs simultaneously. 
*/ 00920 k = srcBLen >> 2u; 00921 00922 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00923 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00924 do 00925 { 00926 /* Read the last two inputB samples using SIMD: 00927 * y[srcBLen - 1] and y[srcBLen - 2] */ 00928 a = *py; 00929 b = *(py+1); 00930 py -= 2; 00931 00932 #ifndef ARM_MATH_BIG_ENDIAN 00933 00934 c0 = __PKHBT(a, b, 16); 00935 00936 #else 00937 00938 c0 = __PKHBT(b, a, 16);; 00939 00940 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00941 00942 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ 00943 acc0 = __SMLADX(x0, c0, acc0); 00944 00945 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ 00946 acc1 = __SMLADX(x1, c0, acc1); 00947 00948 a = *px; 00949 b = *(px + 1); 00950 00951 #ifndef ARM_MATH_BIG_ENDIAN 00952 00953 x2 = __PKHBT(a, b, 16); 00954 a = *(px + 2); 00955 x3 = __PKHBT(b, a, 16); 00956 00957 #else 00958 00959 x2 = __PKHBT(b, a, 16); 00960 a = *(px + 2); 00961 x3 = __PKHBT(a, b, 16); 00962 00963 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00964 00965 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ 00966 acc2 = __SMLADX(x2, c0, acc2); 00967 00968 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ 00969 acc3 = __SMLADX(x3, c0, acc3); 00970 00971 /* Read y[srcBLen - 3] and y[srcBLen - 4] */ 00972 a = *py; 00973 b = *(py+1); 00974 py -= 2; 00975 00976 #ifndef ARM_MATH_BIG_ENDIAN 00977 00978 c0 = __PKHBT(a, b, 16); 00979 00980 #else 00981 00982 c0 = __PKHBT(b, a, 16);; 00983 00984 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00985 00986 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ 00987 acc0 = __SMLADX(x2, c0, acc0); 00988 00989 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ 00990 acc1 = __SMLADX(x3, c0, acc1); 00991 00992 /* Read x[4], x[5], x[6] */ 00993 a = *(px + 2); 00994 b = *(px + 3); 00995 00996 #ifndef ARM_MATH_BIG_ENDIAN 00997 00998 x0 = __PKHBT(a, b, 16); 00999 a = *(px + 4); 
01000 x1 = __PKHBT(b, a, 16); 01001 01002 #else 01003 01004 x0 = __PKHBT(b, a, 16); 01005 a = *(px + 4); 01006 x1 = __PKHBT(a, b, 16); 01007 01008 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01009 01010 px += 4u; 01011 01012 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ 01013 acc2 = __SMLADX(x0, c0, acc2); 01014 01015 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ 01016 acc3 = __SMLADX(x1, c0, acc3); 01017 01018 } while(--k); 01019 01020 /* For the next MAC operations, SIMD is not used 01021 * So, the 16 bit pointer if inputB, py is updated */ 01022 01023 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 01024 ** No loop unrolling is used. */ 01025 k = srcBLen % 0x4u; 01026 01027 if(k == 1u) 01028 { 01029 /* Read y[srcBLen - 5] */ 01030 c0 = *(py+1); 01031 01032 #ifdef ARM_MATH_BIG_ENDIAN 01033 01034 c0 = c0 << 16u; 01035 01036 #else 01037 01038 c0 = c0 & 0x0000FFFF; 01039 01040 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 01041 01042 /* Read x[7] */ 01043 a = *px; 01044 b = *(px+1); 01045 px++; 01046 01047 #ifndef ARM_MATH_BIG_ENDIAN 01048 01049 x3 = __PKHBT(a, b, 16); 01050 01051 #else 01052 01053 x3 = __PKHBT(b, a, 16);; 01054 01055 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01056 01057 01058 /* Perform the multiply-accumulates */ 01059 acc0 = __SMLAD(x0, c0, acc0); 01060 acc1 = __SMLAD(x1, c0, acc1); 01061 acc2 = __SMLADX(x1, c0, acc2); 01062 acc3 = __SMLADX(x3, c0, acc3); 01063 } 01064 01065 if(k == 2u) 01066 { 01067 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 01068 a = *py; 01069 b = *(py+1); 01070 01071 #ifndef ARM_MATH_BIG_ENDIAN 01072 01073 c0 = __PKHBT(a, b, 16); 01074 01075 #else 01076 01077 c0 = __PKHBT(b, a, 16);; 01078 01079 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01080 01081 /* Read x[7], x[8], x[9] */ 01082 a = *px; 01083 b = *(px + 1); 01084 01085 #ifndef ARM_MATH_BIG_ENDIAN 01086 01087 x3 = __PKHBT(a, b, 16); 01088 a = *(px + 2); 01089 x2 = __PKHBT(b, a, 16); 01090 01091 #else 01092 01093 x3 = __PKHBT(b, a, 
16); 01094 a = *(px + 2); 01095 x2 = __PKHBT(a, b, 16); 01096 01097 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01098 px += 2u; 01099 01100 /* Perform the multiply-accumulates */ 01101 acc0 = __SMLADX(x0, c0, acc0); 01102 acc1 = __SMLADX(x1, c0, acc1); 01103 acc2 = __SMLADX(x3, c0, acc2); 01104 acc3 = __SMLADX(x2, c0, acc3); 01105 } 01106 01107 if(k == 3u) 01108 { 01109 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 01110 a = *py; 01111 b = *(py+1); 01112 01113 #ifndef ARM_MATH_BIG_ENDIAN 01114 01115 c0 = __PKHBT(a, b, 16); 01116 01117 #else 01118 01119 c0 = __PKHBT(b, a, 16);; 01120 01121 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01122 01123 /* Read x[7], x[8], x[9] */ 01124 a = *px; 01125 b = *(px + 1); 01126 01127 #ifndef ARM_MATH_BIG_ENDIAN 01128 01129 x3 = __PKHBT(a, b, 16); 01130 a = *(px + 2); 01131 x2 = __PKHBT(b, a, 16); 01132 01133 #else 01134 01135 x3 = __PKHBT(b, a, 16); 01136 a = *(px + 2); 01137 x2 = __PKHBT(a, b, 16); 01138 01139 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01140 01141 /* Perform the multiply-accumulates */ 01142 acc0 = __SMLADX(x0, c0, acc0); 01143 acc1 = __SMLADX(x1, c0, acc1); 01144 acc2 = __SMLADX(x3, c0, acc2); 01145 acc3 = __SMLADX(x2, c0, acc3); 01146 01147 /* Read y[srcBLen - 7] */ 01148 c0 = *(py-1); 01149 #ifdef ARM_MATH_BIG_ENDIAN 01150 01151 c0 = c0 << 16u; 01152 #else 01153 01154 c0 = c0 & 0x0000FFFF; 01155 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 01156 01157 /* Read x[10] */ 01158 a = *(px+2); 01159 b = *(px+3); 01160 01161 #ifndef ARM_MATH_BIG_ENDIAN 01162 01163 x3 = __PKHBT(a, b, 16); 01164 01165 #else 01166 01167 x3 = __PKHBT(b, a, 16);; 01168 01169 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01170 01171 px += 3u; 01172 01173 /* Perform the multiply-accumulates */ 01174 acc0 = __SMLADX(x1, c0, acc0); 01175 acc1 = __SMLAD(x2, c0, acc1); 01176 acc2 = __SMLADX(x2, c0, acc2); 01177 acc3 = __SMLADX(x3, c0, acc3); 01178 } 01179 01180 /* Store the results in the accumulators in the destination buffer. 
*/ 01181 *pOut++ = (q15_t)(acc0 >> 15); 01182 *pOut++ = (q15_t)(acc1 >> 15); 01183 *pOut++ = (q15_t)(acc2 >> 15); 01184 *pOut++ = (q15_t)(acc3 >> 15); 01185 01186 /* Increment the pointer pIn1 index, count by 4 */ 01187 count += 4u; 01188 01189 /* Update the inputA and inputB pointers for next MAC calculation */ 01190 px = pIn1 + count; 01191 py = pSrc2; 01192 01193 /* Decrement the loop counter */ 01194 blkCnt--; 01195 } 01196 01197 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 01198 ** No loop unrolling is used. */ 01199 blkCnt = blockSize2 % 0x4u; 01200 01201 while(blkCnt > 0u) 01202 { 01203 /* Accumulator is made zero for every iteration */ 01204 sum = 0; 01205 01206 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 01207 k = srcBLen >> 2u; 01208 01209 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 01210 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 01211 while(k > 0u) 01212 { 01213 /* Perform the multiply-accumulates */ 01214 sum += ((q31_t) * px++ * *py--); 01215 sum += ((q31_t) * px++ * *py--); 01216 sum += ((q31_t) * px++ * *py--); 01217 sum += ((q31_t) * px++ * *py--); 01218 01219 /* Decrement the loop counter */ 01220 k--; 01221 } 01222 01223 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 01224 ** No loop unrolling is used. */ 01225 k = srcBLen % 0x4u; 01226 01227 while(k > 0u) 01228 { 01229 /* Perform the multiply-accumulates */ 01230 sum += ((q31_t) * px++ * *py--); 01231 01232 /* Decrement the loop counter */ 01233 k--; 01234 } 01235 01236 /* Store the result in the accumulator in the destination buffer. 
*/ 01237 *pOut++ = (q15_t) (sum >> 15); 01238 01239 /* Increment the pointer pIn1 index, count by 1 */ 01240 count++; 01241 01242 /* Update the inputA and inputB pointers for next MAC calculation */ 01243 px = pIn1 + count; 01244 py = pSrc2; 01245 01246 /* Decrement the loop counter */ 01247 blkCnt--; 01248 } 01249 } 01250 else 01251 { 01252 /* If the srcBLen is not a multiple of 4, 01253 * the blockSize2 loop cannot be unrolled by 4 */ 01254 blkCnt = blockSize2; 01255 01256 while(blkCnt > 0u) 01257 { 01258 /* Accumulator is made zero for every iteration */ 01259 sum = 0; 01260 01261 /* srcBLen number of MACS should be performed */ 01262 k = srcBLen; 01263 01264 while(k > 0u) 01265 { 01266 /* Perform the multiply-accumulate */ 01267 sum += ((q31_t) * px++ * *py--); 01268 01269 /* Decrement the loop counter */ 01270 k--; 01271 } 01272 01273 /* Store the result in the accumulator in the destination buffer. */ 01274 *pOut++ = (q15_t) (sum >> 15); 01275 01276 /* Increment the MAC count */ 01277 count++; 01278 01279 /* Update the inputA and inputB pointers for next MAC calculation */ 01280 px = pIn1 + count; 01281 py = pSrc2; 01282 01283 /* Decrement the loop counter */ 01284 blkCnt--; 01285 } 01286 } 01287 01288 01289 /* -------------------------- 01290 * Initializations of stage3 01291 * -------------------------*/ 01292 01293 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 01294 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 01295 * .... 01296 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 01297 * sum += x[srcALen-1] * y[srcBLen-1] 01298 */ 01299 01300 /* In this stage the MAC operations are decreased by 1 for every iteration. 
01301 The blockSize3 variable holds the number of MAC operations performed */ 01302 01303 /* Working pointer of inputA */ 01304 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 01305 px = pSrc1; 01306 01307 /* Working pointer of inputB */ 01308 pSrc2 = pIn2 + (srcBLen - 1u); 01309 pIn2 = pSrc2 - 1u; 01310 py = pIn2; 01311 01312 /* ------------------- 01313 * Stage3 process 01314 * ------------------*/ 01315 01316 /* For loop unrolling by 4, this stage is divided into two. */ 01317 /* First part of this stage computes the MAC operations greater than 4 */ 01318 /* Second part of this stage computes the MAC operations less than or equal to 4 */ 01319 01320 /* The first part of the stage starts here */ 01321 j = blockSize3 >> 2u; 01322 01323 while((j > 0u) && (blockSize3 > 0u)) 01324 { 01325 /* Accumulator is made zero for every iteration */ 01326 sum = 0; 01327 01328 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 01329 k = blockSize3 >> 2u; 01330 01331 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 01332 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 01333 py++; 01334 01335 while(k > 0u) 01336 { 01337 sum += ((q31_t) * px++ * *py--); 01338 sum += ((q31_t) * px++ * *py--); 01339 sum += ((q31_t) * px++ * *py--); 01340 sum += ((q31_t) * px++ * *py--); 01341 /* Decrement the loop counter */ 01342 k--; 01343 } 01344 01345 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. 01346 ** No loop unrolling is used. */ 01347 k = blockSize3 % 0x4u; 01348 01349 while(k > 0u) 01350 { 01351 /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */ 01352 sum += ((q31_t) * px++ * *py--); 01353 01354 /* Decrement the loop counter */ 01355 k--; 01356 } 01357 01358 /* Store the result in the accumulator in the destination buffer. 
*/ 01359 *pOut++ = (q15_t) (sum >> 15); 01360 01361 /* Update the inputA and inputB pointers for next MAC calculation */ 01362 px = ++pSrc1; 01363 py = pIn2; 01364 01365 /* Decrement the loop counter */ 01366 blockSize3--; 01367 01368 j--; 01369 } 01370 01371 /* The second part of the stage starts here */ 01372 /* SIMD is not used for the next MAC operations, 01373 * so pointer py is updated to read only one sample at a time */ 01374 py = py + 1u; 01375 01376 while(blockSize3 > 0u) 01377 { 01378 /* Accumulator is made zero for every iteration */ 01379 sum = 0; 01380 01381 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 01382 k = blockSize3; 01383 01384 while(k > 0u) 01385 { 01386 /* Perform the multiply-accumulates */ 01387 /* sum += x[srcALen-1] * y[srcBLen-1] */ 01388 sum += ((q31_t) * px++ * *py--); 01389 01390 /* Decrement the loop counter */ 01391 k--; 01392 } 01393 01394 /* Store the result in the accumulator in the destination buffer. */ 01395 *pOut++ = (q15_t) (sum >> 15); 01396 01397 /* Update the inputA and inputB pointers for next MAC calculation */ 01398 px = ++pSrc1; 01399 py = pSrc2; 01400 01401 /* Decrement the loop counter */ 01402 blockSize3--; 01403 } 01404 01405 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 01406 } 01407 01408 /** 01409 * @} end of Conv group 01410 */
Generated on Tue Jul 12 2022 11:59:16 by Doxygen 1.7.2