Important changes to repositories hosted on mbed.com
Mbed-hosted Mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software, download the repository Zip archive or clone it locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
Fork of mbed-dsp by
arm_conv_fast_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 17. January 2013 00005 * $Revision: V1.4.1 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_fast_q15.c 00009 * 00010 * Description: Fast Q15 Convolution. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupFilters 00045 */ 00046 00047 /** 00048 * @addtogroup Conv 00049 * @{ 00050 */ 00051 00052 /** 00053 * @brief Convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4. 00054 * @param[in] *pSrcA points to the first input sequence. 00055 * @param[in] srcALen length of the first input sequence. 00056 * @param[in] *pSrcB points to the second input sequence. 00057 * @param[in] srcBLen length of the second input sequence. 00058 * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. 00059 * @return none. 00060 * 00061 * <b>Scaling and Overflow Behavior:</b> 00062 * 00063 * \par 00064 * This fast version uses a 32-bit accumulator with 2.30 format. 00065 * The accumulator maintains full precision of the intermediate multiplication results 00066 * but provides only a single guard bit. There is no saturation on intermediate additions. 00067 * Thus, if the accumulator overflows it wraps around and distorts the result. 00068 * The input signals should be scaled down to avoid intermediate overflows. 
00069 * Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows, 00070 * as maximum of min(srcALen, srcBLen) number of additions are carried internally. 00071 * The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result. 00072 * 00073 * \par 00074 * See <code>arm_conv_q15()</code> for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion. 00075 */ 00076 00077 void arm_conv_fast_q15( 00078 q15_t * pSrcA, 00079 uint32_t srcALen, 00080 q15_t * pSrcB, 00081 uint32_t srcBLen, 00082 q15_t * pDst) 00083 { 00084 #ifndef UNALIGNED_SUPPORT_DISABLE 00085 q15_t *pIn1; /* inputA pointer */ 00086 q15_t *pIn2; /* inputB pointer */ 00087 q15_t *pOut = pDst; /* output pointer */ 00088 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00089 q15_t *px; /* Intermediate inputA pointer */ 00090 q15_t *py; /* Intermediate inputB pointer */ 00091 q15_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00092 q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */ 00093 uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt; /* loop counter */ 00094 00095 /* The algorithm implementation is based on the lengths of the inputs. */ 00096 /* srcB is always made to slide across srcA. 
*/ 00097 /* So srcBLen is always considered as shorter or equal to srcALen */ 00098 if(srcALen >= srcBLen) 00099 { 00100 /* Initialization of inputA pointer */ 00101 pIn1 = pSrcA; 00102 00103 /* Initialization of inputB pointer */ 00104 pIn2 = pSrcB; 00105 } 00106 else 00107 { 00108 /* Initialization of inputA pointer */ 00109 pIn1 = pSrcB; 00110 00111 /* Initialization of inputB pointer */ 00112 pIn2 = pSrcA; 00113 00114 /* srcBLen is always considered as shorter or equal to srcALen */ 00115 j = srcBLen; 00116 srcBLen = srcALen; 00117 srcALen = j; 00118 } 00119 00120 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00121 /* The function is internally 00122 * divided into three stages according to the number of multiplications that has to be 00123 * taken place between inputA samples and inputB samples. In the first stage of the 00124 * algorithm, the multiplications increase by one for every iteration. 00125 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00126 * In the third stage of the algorithm, the multiplications decrease by one 00127 * for every iteration. */ 00128 00129 /* The algorithm is implemented in three stages. 00130 The loop counters of each stage is initiated here. */ 00131 blockSize1 = srcBLen - 1u; 00132 blockSize2 = srcALen - (srcBLen - 1u); 00133 blockSize3 = blockSize1; 00134 00135 /* -------------------------- 00136 * Initializations of stage1 00137 * -------------------------*/ 00138 00139 /* sum = x[0] * y[0] 00140 * sum = x[0] * y[1] + x[1] * y[0] 00141 * .... 00142 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00143 */ 00144 00145 /* In this stage the MAC operations are increased by 1 for every iteration. 
00146 The count variable holds the number of MAC operations performed */ 00147 count = 1u; 00148 00149 /* Working pointer of inputA */ 00150 px = pIn1; 00151 00152 /* Working pointer of inputB */ 00153 py = pIn2; 00154 00155 00156 /* ------------------------ 00157 * Stage1 process 00158 * ----------------------*/ 00159 00160 /* For loop unrolling by 4, this stage is divided into two. */ 00161 /* First part of this stage computes the MAC operations less than 4 */ 00162 /* Second part of this stage computes the MAC operations greater than or equal to 4 */ 00163 00164 /* The first part of the stage starts here */ 00165 while((count < 4u) && (blockSize1 > 0u)) 00166 { 00167 /* Accumulator is made zero for every iteration */ 00168 sum = 0; 00169 00170 /* Loop over number of MAC operations between 00171 * inputA samples and inputB samples */ 00172 k = count; 00173 00174 while(k > 0u) 00175 { 00176 /* Perform the multiply-accumulates */ 00177 sum = __SMLAD(*px++, *py--, sum); 00178 00179 /* Decrement the loop counter */ 00180 k--; 00181 } 00182 00183 /* Store the result in the accumulator in the destination buffer. */ 00184 *pOut++ = (q15_t) (sum >> 15); 00185 00186 /* Update the inputA and inputB pointers for next MAC calculation */ 00187 py = pIn2 + count; 00188 px = pIn1; 00189 00190 /* Increment the MAC count */ 00191 count++; 00192 00193 /* Decrement the loop counter */ 00194 blockSize1--; 00195 } 00196 00197 /* The second part of the stage starts here */ 00198 /* The internal loop, over count, is unrolled by 4 */ 00199 /* To, read the last two inputB samples using SIMD: 00200 * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ 00201 py = py - 1; 00202 00203 while(blockSize1 > 0u) 00204 { 00205 /* Accumulator is made zero for every iteration */ 00206 sum = 0; 00207 00208 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00209 k = count >> 2u; 00210 00211 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 
00212 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00213 while(k > 0u) 00214 { 00215 /* Perform the multiply-accumulates */ 00216 /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */ 00217 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00218 /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */ 00219 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00220 00221 /* Decrement the loop counter */ 00222 k--; 00223 } 00224 00225 /* For the next MAC operations, the pointer py is used without SIMD 00226 * So, py is incremented by 1 */ 00227 py = py + 1u; 00228 00229 /* If the count is not a multiple of 4, compute any remaining MACs here. 00230 ** No loop unrolling is used. */ 00231 k = count % 0x4u; 00232 00233 while(k > 0u) 00234 { 00235 /* Perform the multiply-accumulates */ 00236 sum = __SMLAD(*px++, *py--, sum); 00237 00238 /* Decrement the loop counter */ 00239 k--; 00240 } 00241 00242 /* Store the result in the accumulator in the destination buffer. */ 00243 *pOut++ = (q15_t) (sum >> 15); 00244 00245 /* Update the inputA and inputB pointers for next MAC calculation */ 00246 py = pIn2 + (count - 1u); 00247 px = pIn1; 00248 00249 /* Increment the MAC count */ 00250 count++; 00251 00252 /* Decrement the loop counter */ 00253 blockSize1--; 00254 } 00255 00256 /* -------------------------- 00257 * Initializations of stage2 00258 * ------------------------*/ 00259 00260 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00261 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00262 * .... 
00263 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00264 */ 00265 00266 /* Working pointer of inputA */ 00267 px = pIn1; 00268 00269 /* Working pointer of inputB */ 00270 pSrc2 = pIn2 + (srcBLen - 1u); 00271 py = pSrc2; 00272 00273 /* count is the index by which the pointer pIn1 to be incremented */ 00274 count = 0u; 00275 00276 00277 /* -------------------- 00278 * Stage2 process 00279 * -------------------*/ 00280 00281 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00282 * So, to loop unroll over blockSize2, 00283 * srcBLen should be greater than or equal to 4 */ 00284 if(srcBLen >= 4u) 00285 { 00286 /* Loop unroll over blockSize2, by 4 */ 00287 blkCnt = blockSize2 >> 2u; 00288 00289 while(blkCnt > 0u) 00290 { 00291 py = py - 1u; 00292 00293 /* Set all accumulators to zero */ 00294 acc0 = 0; 00295 acc1 = 0; 00296 acc2 = 0; 00297 acc3 = 0; 00298 00299 00300 /* read x[0], x[1] samples */ 00301 x0 = *__SIMD32(px); 00302 /* read x[1], x[2] samples */ 00303 x1 = _SIMD32_OFFSET(px+1); 00304 px+= 2u; 00305 00306 00307 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00308 k = srcBLen >> 2u; 00309 00310 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00311 ** a second loop below computes MACs for the remaining 1 to 3 samples. 
*/ 00312 do 00313 { 00314 /* Read the last two inputB samples using SIMD: 00315 * y[srcBLen - 1] and y[srcBLen - 2] */ 00316 c0 = *__SIMD32(py)--; 00317 00318 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ 00319 acc0 = __SMLADX(x0, c0, acc0); 00320 00321 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ 00322 acc1 = __SMLADX(x1, c0, acc1); 00323 00324 /* Read x[2], x[3] */ 00325 x2 = *__SIMD32(px); 00326 00327 /* Read x[3], x[4] */ 00328 x3 = _SIMD32_OFFSET(px+1); 00329 00330 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ 00331 acc2 = __SMLADX(x2, c0, acc2); 00332 00333 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ 00334 acc3 = __SMLADX(x3, c0, acc3); 00335 00336 /* Read y[srcBLen - 3] and y[srcBLen - 4] */ 00337 c0 = *__SIMD32(py)--; 00338 00339 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ 00340 acc0 = __SMLADX(x2, c0, acc0); 00341 00342 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ 00343 acc1 = __SMLADX(x3, c0, acc1); 00344 00345 /* Read x[4], x[5] */ 00346 x0 = _SIMD32_OFFSET(px+2); 00347 00348 /* Read x[5], x[6] */ 00349 x1 = _SIMD32_OFFSET(px+3); 00350 px += 4u; 00351 00352 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ 00353 acc2 = __SMLADX(x0, c0, acc2); 00354 00355 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ 00356 acc3 = __SMLADX(x1, c0, acc3); 00357 00358 } while(--k); 00359 00360 /* For the next MAC operations, SIMD is not used 00361 * So, the 16 bit pointer if inputB, py is updated */ 00362 00363 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00364 ** No loop unrolling is used. 
*/ 00365 k = srcBLen % 0x4u; 00366 00367 if(k == 1u) 00368 { 00369 /* Read y[srcBLen - 5] */ 00370 c0 = *(py+1); 00371 00372 #ifdef ARM_MATH_BIG_ENDIAN 00373 00374 c0 = c0 << 16u; 00375 00376 #else 00377 00378 c0 = c0 & 0x0000FFFF; 00379 00380 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 00381 00382 /* Read x[7] */ 00383 x3 = *__SIMD32(px); 00384 px++; 00385 00386 /* Perform the multiply-accumulates */ 00387 acc0 = __SMLAD(x0, c0, acc0); 00388 acc1 = __SMLAD(x1, c0, acc1); 00389 acc2 = __SMLADX(x1, c0, acc2); 00390 acc3 = __SMLADX(x3, c0, acc3); 00391 } 00392 00393 if(k == 2u) 00394 { 00395 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 00396 c0 = _SIMD32_OFFSET(py); 00397 00398 /* Read x[7], x[8] */ 00399 x3 = *__SIMD32(px); 00400 00401 /* Read x[9] */ 00402 x2 = _SIMD32_OFFSET(px+1); 00403 px += 2u; 00404 00405 /* Perform the multiply-accumulates */ 00406 acc0 = __SMLADX(x0, c0, acc0); 00407 acc1 = __SMLADX(x1, c0, acc1); 00408 acc2 = __SMLADX(x3, c0, acc2); 00409 acc3 = __SMLADX(x2, c0, acc3); 00410 } 00411 00412 if(k == 3u) 00413 { 00414 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 00415 c0 = _SIMD32_OFFSET(py); 00416 00417 /* Read x[7], x[8] */ 00418 x3 = *__SIMD32(px); 00419 00420 /* Read x[9] */ 00421 x2 = _SIMD32_OFFSET(px+1); 00422 00423 /* Perform the multiply-accumulates */ 00424 acc0 = __SMLADX(x0, c0, acc0); 00425 acc1 = __SMLADX(x1, c0, acc1); 00426 acc2 = __SMLADX(x3, c0, acc2); 00427 acc3 = __SMLADX(x2, c0, acc3); 00428 00429 /* Read y[srcBLen - 7] */ 00430 c0 = *(py-1); 00431 #ifdef ARM_MATH_BIG_ENDIAN 00432 00433 c0 = c0 << 16u; 00434 #else 00435 00436 c0 = c0 & 0x0000FFFF; 00437 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 00438 00439 /* Read x[10] */ 00440 x3 = _SIMD32_OFFSET(px+2); 00441 px += 3u; 00442 00443 /* Perform the multiply-accumulates */ 00444 acc0 = __SMLADX(x1, c0, acc0); 00445 acc1 = __SMLAD(x2, c0, acc1); 00446 acc2 = __SMLADX(x2, c0, acc2); 00447 acc3 = __SMLADX(x3, c0, acc3); 00448 } 00449 00450 /* Store the results in the accumulators in the 
destination buffer. */ 00451 #ifndef ARM_MATH_BIG_ENDIAN 00452 00453 *__SIMD32(pOut)++ = __PKHBT((acc0 >> 15), (acc1 >> 15), 16); 00454 *__SIMD32(pOut)++ = __PKHBT((acc2 >> 15), (acc3 >> 15), 16); 00455 00456 #else 00457 00458 *__SIMD32(pOut)++ = __PKHBT((acc1 >> 15), (acc0 >> 15), 16); 00459 *__SIMD32(pOut)++ = __PKHBT((acc3 >> 15), (acc2 >> 15), 16); 00460 00461 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00462 00463 /* Increment the pointer pIn1 index, count by 4 */ 00464 count += 4u; 00465 00466 /* Update the inputA and inputB pointers for next MAC calculation */ 00467 px = pIn1 + count; 00468 py = pSrc2; 00469 00470 /* Decrement the loop counter */ 00471 blkCnt--; 00472 } 00473 00474 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00475 ** No loop unrolling is used. */ 00476 blkCnt = blockSize2 % 0x4u; 00477 00478 while(blkCnt > 0u) 00479 { 00480 /* Accumulator is made zero for every iteration */ 00481 sum = 0; 00482 00483 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00484 k = srcBLen >> 2u; 00485 00486 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00487 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00488 while(k > 0u) 00489 { 00490 /* Perform the multiply-accumulates */ 00491 sum += ((q31_t) * px++ * *py--); 00492 sum += ((q31_t) * px++ * *py--); 00493 sum += ((q31_t) * px++ * *py--); 00494 sum += ((q31_t) * px++ * *py--); 00495 00496 /* Decrement the loop counter */ 00497 k--; 00498 } 00499 00500 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00501 ** No loop unrolling is used. */ 00502 k = srcBLen % 0x4u; 00503 00504 while(k > 0u) 00505 { 00506 /* Perform the multiply-accumulates */ 00507 sum += ((q31_t) * px++ * *py--); 00508 00509 /* Decrement the loop counter */ 00510 k--; 00511 } 00512 00513 /* Store the result in the accumulator in the destination buffer. 
*/ 00514 *pOut++ = (q15_t) (sum >> 15); 00515 00516 /* Increment the pointer pIn1 index, count by 1 */ 00517 count++; 00518 00519 /* Update the inputA and inputB pointers for next MAC calculation */ 00520 px = pIn1 + count; 00521 py = pSrc2; 00522 00523 /* Decrement the loop counter */ 00524 blkCnt--; 00525 } 00526 } 00527 else 00528 { 00529 /* If the srcBLen is not a multiple of 4, 00530 * the blockSize2 loop cannot be unrolled by 4 */ 00531 blkCnt = blockSize2; 00532 00533 while(blkCnt > 0u) 00534 { 00535 /* Accumulator is made zero for every iteration */ 00536 sum = 0; 00537 00538 /* srcBLen number of MACS should be performed */ 00539 k = srcBLen; 00540 00541 while(k > 0u) 00542 { 00543 /* Perform the multiply-accumulate */ 00544 sum += ((q31_t) * px++ * *py--); 00545 00546 /* Decrement the loop counter */ 00547 k--; 00548 } 00549 00550 /* Store the result in the accumulator in the destination buffer. */ 00551 *pOut++ = (q15_t) (sum >> 15); 00552 00553 /* Increment the MAC count */ 00554 count++; 00555 00556 /* Update the inputA and inputB pointers for next MAC calculation */ 00557 px = pIn1 + count; 00558 py = pSrc2; 00559 00560 /* Decrement the loop counter */ 00561 blkCnt--; 00562 } 00563 } 00564 00565 00566 /* -------------------------- 00567 * Initializations of stage3 00568 * -------------------------*/ 00569 00570 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00571 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00572 * .... 00573 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00574 * sum += x[srcALen-1] * y[srcBLen-1] 00575 */ 00576 00577 /* In this stage the MAC operations are decreased by 1 for every iteration. 
00578 The blockSize3 variable holds the number of MAC operations performed */ 00579 00580 /* Working pointer of inputA */ 00581 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00582 px = pSrc1; 00583 00584 /* Working pointer of inputB */ 00585 pSrc2 = pIn2 + (srcBLen - 1u); 00586 pIn2 = pSrc2 - 1u; 00587 py = pIn2; 00588 00589 /* ------------------- 00590 * Stage3 process 00591 * ------------------*/ 00592 00593 /* For loop unrolling by 4, this stage is divided into two. */ 00594 /* First part of this stage computes the MAC operations greater than 4 */ 00595 /* Second part of this stage computes the MAC operations less than or equal to 4 */ 00596 00597 /* The first part of the stage starts here */ 00598 j = blockSize3 >> 2u; 00599 00600 while((j > 0u) && (blockSize3 > 0u)) 00601 { 00602 /* Accumulator is made zero for every iteration */ 00603 sum = 0; 00604 00605 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00606 k = blockSize3 >> 2u; 00607 00608 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00609 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00610 while(k > 0u) 00611 { 00612 /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied 00613 * with y[srcBLen - 1], y[srcBLen - 2] respectively */ 00614 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00615 /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied 00616 * with y[srcBLen - 3], y[srcBLen - 4] respectively */ 00617 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00618 00619 /* Decrement the loop counter */ 00620 k--; 00621 } 00622 00623 /* For the next MAC operations, the pointer py is used without SIMD 00624 * So, py is incremented by 1 */ 00625 py = py + 1u; 00626 00627 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. 00628 ** No loop unrolling is used. 
*/ 00629 k = blockSize3 % 0x4u; 00630 00631 while(k > 0u) 00632 { 00633 /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */ 00634 sum = __SMLAD(*px++, *py--, sum); 00635 00636 /* Decrement the loop counter */ 00637 k--; 00638 } 00639 00640 /* Store the result in the accumulator in the destination buffer. */ 00641 *pOut++ = (q15_t) (sum >> 15); 00642 00643 /* Update the inputA and inputB pointers for next MAC calculation */ 00644 px = ++pSrc1; 00645 py = pIn2; 00646 00647 /* Decrement the loop counter */ 00648 blockSize3--; 00649 00650 j--; 00651 } 00652 00653 /* The second part of the stage starts here */ 00654 /* SIMD is not used for the next MAC operations, 00655 * so pointer py is updated to read only one sample at a time */ 00656 py = py + 1u; 00657 00658 while(blockSize3 > 0u) 00659 { 00660 /* Accumulator is made zero for every iteration */ 00661 sum = 0; 00662 00663 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00664 k = blockSize3; 00665 00666 while(k > 0u) 00667 { 00668 /* Perform the multiply-accumulates */ 00669 /* sum += x[srcALen-1] * y[srcBLen-1] */ 00670 sum = __SMLAD(*px++, *py--, sum); 00671 00672 /* Decrement the loop counter */ 00673 k--; 00674 } 00675 00676 /* Store the result in the accumulator in the destination buffer. 
*/ 00677 *pOut++ = (q15_t) (sum >> 15); 00678 00679 /* Update the inputA and inputB pointers for next MAC calculation */ 00680 px = ++pSrc1; 00681 py = pSrc2; 00682 00683 /* Decrement the loop counter */ 00684 blockSize3--; 00685 } 00686 00687 #else 00688 q15_t *pIn1; /* inputA pointer */ 00689 q15_t *pIn2; /* inputB pointer */ 00690 q15_t *pOut = pDst; /* output pointer */ 00691 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00692 q15_t *px; /* Intermediate inputA pointer */ 00693 q15_t *py; /* Intermediate inputB pointer */ 00694 q15_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00695 q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */ 00696 uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt; /* loop counter */ 00697 q15_t a, b; 00698 00699 /* The algorithm implementation is based on the lengths of the inputs. */ 00700 /* srcB is always made to slide across srcA. */ 00701 /* So srcBLen is always considered as shorter or equal to srcALen */ 00702 if(srcALen >= srcBLen) 00703 { 00704 /* Initialization of inputA pointer */ 00705 pIn1 = pSrcA; 00706 00707 /* Initialization of inputB pointer */ 00708 pIn2 = pSrcB; 00709 } 00710 else 00711 { 00712 /* Initialization of inputA pointer */ 00713 pIn1 = pSrcB; 00714 00715 /* Initialization of inputB pointer */ 00716 pIn2 = pSrcA; 00717 00718 /* srcBLen is always considered as shorter or equal to srcALen */ 00719 j = srcBLen; 00720 srcBLen = srcALen; 00721 srcALen = j; 00722 } 00723 00724 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00725 /* The function is internally 00726 * divided into three stages according to the number of multiplications that has to be 00727 * taken place between inputA samples and inputB samples. In the first stage of the 00728 * algorithm, the multiplications increase by one for every iteration. 00729 * In the second stage of the algorithm, srcBLen number of multiplications are done. 
00730 * In the third stage of the algorithm, the multiplications decrease by one 00731 * for every iteration. */ 00732 00733 /* The algorithm is implemented in three stages. 00734 The loop counters of each stage is initiated here. */ 00735 blockSize1 = srcBLen - 1u; 00736 blockSize2 = srcALen - (srcBLen - 1u); 00737 blockSize3 = blockSize1; 00738 00739 /* -------------------------- 00740 * Initializations of stage1 00741 * -------------------------*/ 00742 00743 /* sum = x[0] * y[0] 00744 * sum = x[0] * y[1] + x[1] * y[0] 00745 * .... 00746 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00747 */ 00748 00749 /* In this stage the MAC operations are increased by 1 for every iteration. 00750 The count variable holds the number of MAC operations performed */ 00751 count = 1u; 00752 00753 /* Working pointer of inputA */ 00754 px = pIn1; 00755 00756 /* Working pointer of inputB */ 00757 py = pIn2; 00758 00759 00760 /* ------------------------ 00761 * Stage1 process 00762 * ----------------------*/ 00763 00764 /* For loop unrolling by 4, this stage is divided into two. */ 00765 /* First part of this stage computes the MAC operations less than 4 */ 00766 /* Second part of this stage computes the MAC operations greater than or equal to 4 */ 00767 00768 /* The first part of the stage starts here */ 00769 while((count < 4u) && (blockSize1 > 0u)) 00770 { 00771 /* Accumulator is made zero for every iteration */ 00772 sum = 0; 00773 00774 /* Loop over number of MAC operations between 00775 * inputA samples and inputB samples */ 00776 k = count; 00777 00778 while(k > 0u) 00779 { 00780 /* Perform the multiply-accumulates */ 00781 sum += ((q31_t) * px++ * *py--); 00782 00783 /* Decrement the loop counter */ 00784 k--; 00785 } 00786 00787 /* Store the result in the accumulator in the destination buffer. 
*/ 00788 *pOut++ = (q15_t) (sum >> 15); 00789 00790 /* Update the inputA and inputB pointers for next MAC calculation */ 00791 py = pIn2 + count; 00792 px = pIn1; 00793 00794 /* Increment the MAC count */ 00795 count++; 00796 00797 /* Decrement the loop counter */ 00798 blockSize1--; 00799 } 00800 00801 /* The second part of the stage starts here */ 00802 /* The internal loop, over count, is unrolled by 4 */ 00803 /* To, read the last two inputB samples using SIMD: 00804 * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ 00805 py = py - 1; 00806 00807 while(blockSize1 > 0u) 00808 { 00809 /* Accumulator is made zero for every iteration */ 00810 sum = 0; 00811 00812 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00813 k = count >> 2u; 00814 00815 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00816 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00817 py++; 00818 00819 while(k > 0u) 00820 { 00821 /* Perform the multiply-accumulates */ 00822 sum += ((q31_t) * px++ * *py--); 00823 sum += ((q31_t) * px++ * *py--); 00824 sum += ((q31_t) * px++ * *py--); 00825 sum += ((q31_t) * px++ * *py--); 00826 00827 /* Decrement the loop counter */ 00828 k--; 00829 } 00830 00831 /* If the count is not a multiple of 4, compute any remaining MACs here. 00832 ** No loop unrolling is used. */ 00833 k = count % 0x4u; 00834 00835 while(k > 0u) 00836 { 00837 /* Perform the multiply-accumulates */ 00838 sum += ((q31_t) * px++ * *py--); 00839 00840 /* Decrement the loop counter */ 00841 k--; 00842 } 00843 00844 /* Store the result in the accumulator in the destination buffer. 
*/ 00845 *pOut++ = (q15_t) (sum >> 15); 00846 00847 /* Update the inputA and inputB pointers for next MAC calculation */ 00848 py = pIn2 + (count - 1u); 00849 px = pIn1; 00850 00851 /* Increment the MAC count */ 00852 count++; 00853 00854 /* Decrement the loop counter */ 00855 blockSize1--; 00856 } 00857 00858 /* -------------------------- 00859 * Initializations of stage2 00860 * ------------------------*/ 00861 00862 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00863 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00864 * .... 00865 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00866 */ 00867 00868 /* Working pointer of inputA */ 00869 px = pIn1; 00870 00871 /* Working pointer of inputB */ 00872 pSrc2 = pIn2 + (srcBLen - 1u); 00873 py = pSrc2; 00874 00875 /* count is the index by which the pointer pIn1 to be incremented */ 00876 count = 0u; 00877 00878 00879 /* -------------------- 00880 * Stage2 process 00881 * -------------------*/ 00882 00883 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00884 * So, to loop unroll over blockSize2, 00885 * srcBLen should be greater than or equal to 4 */ 00886 if(srcBLen >= 4u) 00887 { 00888 /* Loop unroll over blockSize2, by 4 */ 00889 blkCnt = blockSize2 >> 2u; 00890 00891 while(blkCnt > 0u) 00892 { 00893 py = py - 1u; 00894 00895 /* Set all accumulators to zero */ 00896 acc0 = 0; 00897 acc1 = 0; 00898 acc2 = 0; 00899 acc3 = 0; 00900 00901 /* read x[0], x[1] samples */ 00902 a = *px++; 00903 b = *px++; 00904 00905 #ifndef ARM_MATH_BIG_ENDIAN 00906 00907 x0 = __PKHBT(a, b, 16); 00908 a = *px; 00909 x1 = __PKHBT(b, a, 16); 00910 00911 #else 00912 00913 x0 = __PKHBT(b, a, 16); 00914 a = *px; 00915 x1 = __PKHBT(a, b, 16); 00916 00917 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00918 00919 /* Apply loop unrolling and compute 4 MACs simultaneously. 
*/ 00920 k = srcBLen >> 2u; 00921 00922 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00923 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00924 do 00925 { 00926 /* Read the last two inputB samples using SIMD: 00927 * y[srcBLen - 1] and y[srcBLen - 2] */ 00928 a = *py; 00929 b = *(py+1); 00930 py -= 2; 00931 00932 #ifndef ARM_MATH_BIG_ENDIAN 00933 00934 c0 = __PKHBT(a, b, 16); 00935 00936 #else 00937 00938 c0 = __PKHBT(b, a, 16);; 00939 00940 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00941 00942 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ 00943 acc0 = __SMLADX(x0, c0, acc0); 00944 00945 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ 00946 acc1 = __SMLADX(x1, c0, acc1); 00947 00948 a = *px; 00949 b = *(px + 1); 00950 00951 #ifndef ARM_MATH_BIG_ENDIAN 00952 00953 x2 = __PKHBT(a, b, 16); 00954 a = *(px + 2); 00955 x3 = __PKHBT(b, a, 16); 00956 00957 #else 00958 00959 x2 = __PKHBT(b, a, 16); 00960 a = *(px + 2); 00961 x3 = __PKHBT(a, b, 16); 00962 00963 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00964 00965 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ 00966 acc2 = __SMLADX(x2, c0, acc2); 00967 00968 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ 00969 acc3 = __SMLADX(x3, c0, acc3); 00970 00971 /* Read y[srcBLen - 3] and y[srcBLen - 4] */ 00972 a = *py; 00973 b = *(py+1); 00974 py -= 2; 00975 00976 #ifndef ARM_MATH_BIG_ENDIAN 00977 00978 c0 = __PKHBT(a, b, 16); 00979 00980 #else 00981 00982 c0 = __PKHBT(b, a, 16);; 00983 00984 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00985 00986 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ 00987 acc0 = __SMLADX(x2, c0, acc0); 00988 00989 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ 00990 acc1 = __SMLADX(x3, c0, acc1); 00991 00992 /* Read x[4], x[5], x[6] */ 00993 a = *(px + 2); 00994 b = *(px + 3); 00995 00996 #ifndef ARM_MATH_BIG_ENDIAN 00997 00998 x0 = __PKHBT(a, b, 16); 00999 a = *(px + 4); 
01000 x1 = __PKHBT(b, a, 16); 01001 01002 #else 01003 01004 x0 = __PKHBT(b, a, 16); 01005 a = *(px + 4); 01006 x1 = __PKHBT(a, b, 16); 01007 01008 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01009 01010 px += 4u; 01011 01012 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ 01013 acc2 = __SMLADX(x0, c0, acc2); 01014 01015 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ 01016 acc3 = __SMLADX(x1, c0, acc3); 01017 01018 } while(--k); 01019 01020 /* For the next MAC operations, SIMD is not used 01021 * So, the 16 bit pointer if inputB, py is updated */ 01022 01023 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 01024 ** No loop unrolling is used. */ 01025 k = srcBLen % 0x4u; 01026 01027 if(k == 1u) 01028 { 01029 /* Read y[srcBLen - 5] */ 01030 c0 = *(py+1); 01031 01032 #ifdef ARM_MATH_BIG_ENDIAN 01033 01034 c0 = c0 << 16u; 01035 01036 #else 01037 01038 c0 = c0 & 0x0000FFFF; 01039 01040 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 01041 01042 /* Read x[7] */ 01043 a = *px; 01044 b = *(px+1); 01045 px++; 01046 01047 #ifndef ARM_MATH_BIG_ENDIAN 01048 01049 x3 = __PKHBT(a, b, 16); 01050 01051 #else 01052 01053 x3 = __PKHBT(b, a, 16);; 01054 01055 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01056 01057 01058 /* Perform the multiply-accumulates */ 01059 acc0 = __SMLAD(x0, c0, acc0); 01060 acc1 = __SMLAD(x1, c0, acc1); 01061 acc2 = __SMLADX(x1, c0, acc2); 01062 acc3 = __SMLADX(x3, c0, acc3); 01063 } 01064 01065 if(k == 2u) 01066 { 01067 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 01068 a = *py; 01069 b = *(py+1); 01070 01071 #ifndef ARM_MATH_BIG_ENDIAN 01072 01073 c0 = __PKHBT(a, b, 16); 01074 01075 #else 01076 01077 c0 = __PKHBT(b, a, 16);; 01078 01079 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01080 01081 /* Read x[7], x[8], x[9] */ 01082 a = *px; 01083 b = *(px + 1); 01084 01085 #ifndef ARM_MATH_BIG_ENDIAN 01086 01087 x3 = __PKHBT(a, b, 16); 01088 a = *(px + 2); 01089 x2 = __PKHBT(b, a, 16); 01090 01091 #else 01092 01093 x3 = __PKHBT(b, a, 
16); 01094 a = *(px + 2); 01095 x2 = __PKHBT(a, b, 16); 01096 01097 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01098 px += 2u; 01099 01100 /* Perform the multiply-accumulates */ 01101 acc0 = __SMLADX(x0, c0, acc0); 01102 acc1 = __SMLADX(x1, c0, acc1); 01103 acc2 = __SMLADX(x3, c0, acc2); 01104 acc3 = __SMLADX(x2, c0, acc3); 01105 } 01106 01107 if(k == 3u) 01108 { 01109 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 01110 a = *py; 01111 b = *(py+1); 01112 01113 #ifndef ARM_MATH_BIG_ENDIAN 01114 01115 c0 = __PKHBT(a, b, 16); 01116 01117 #else 01118 01119 c0 = __PKHBT(b, a, 16);; 01120 01121 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01122 01123 /* Read x[7], x[8], x[9] */ 01124 a = *px; 01125 b = *(px + 1); 01126 01127 #ifndef ARM_MATH_BIG_ENDIAN 01128 01129 x3 = __PKHBT(a, b, 16); 01130 a = *(px + 2); 01131 x2 = __PKHBT(b, a, 16); 01132 01133 #else 01134 01135 x3 = __PKHBT(b, a, 16); 01136 a = *(px + 2); 01137 x2 = __PKHBT(a, b, 16); 01138 01139 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01140 01141 /* Perform the multiply-accumulates */ 01142 acc0 = __SMLADX(x0, c0, acc0); 01143 acc1 = __SMLADX(x1, c0, acc1); 01144 acc2 = __SMLADX(x3, c0, acc2); 01145 acc3 = __SMLADX(x2, c0, acc3); 01146 01147 /* Read y[srcBLen - 7] */ 01148 c0 = *(py-1); 01149 #ifdef ARM_MATH_BIG_ENDIAN 01150 01151 c0 = c0 << 16u; 01152 #else 01153 01154 c0 = c0 & 0x0000FFFF; 01155 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 01156 01157 /* Read x[10] */ 01158 a = *(px+2); 01159 b = *(px+3); 01160 01161 #ifndef ARM_MATH_BIG_ENDIAN 01162 01163 x3 = __PKHBT(a, b, 16); 01164 01165 #else 01166 01167 x3 = __PKHBT(b, a, 16);; 01168 01169 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01170 01171 px += 3u; 01172 01173 /* Perform the multiply-accumulates */ 01174 acc0 = __SMLADX(x1, c0, acc0); 01175 acc1 = __SMLAD(x2, c0, acc1); 01176 acc2 = __SMLADX(x2, c0, acc2); 01177 acc3 = __SMLADX(x3, c0, acc3); 01178 } 01179 01180 /* Store the results in the accumulators in the destination buffer. 
*/ 01181 *pOut++ = (q15_t)(acc0 >> 15); 01182 *pOut++ = (q15_t)(acc1 >> 15); 01183 *pOut++ = (q15_t)(acc2 >> 15); 01184 *pOut++ = (q15_t)(acc3 >> 15); 01185 01186 /* Increment the pointer pIn1 index, count by 4 */ 01187 count += 4u; 01188 01189 /* Update the inputA and inputB pointers for next MAC calculation */ 01190 px = pIn1 + count; 01191 py = pSrc2; 01192 01193 /* Decrement the loop counter */ 01194 blkCnt--; 01195 } 01196 01197 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 01198 ** No loop unrolling is used. */ 01199 blkCnt = blockSize2 % 0x4u; 01200 01201 while(blkCnt > 0u) 01202 { 01203 /* Accumulator is made zero for every iteration */ 01204 sum = 0; 01205 01206 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 01207 k = srcBLen >> 2u; 01208 01209 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 01210 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 01211 while(k > 0u) 01212 { 01213 /* Perform the multiply-accumulates */ 01214 sum += ((q31_t) * px++ * *py--); 01215 sum += ((q31_t) * px++ * *py--); 01216 sum += ((q31_t) * px++ * *py--); 01217 sum += ((q31_t) * px++ * *py--); 01218 01219 /* Decrement the loop counter */ 01220 k--; 01221 } 01222 01223 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 01224 ** No loop unrolling is used. */ 01225 k = srcBLen % 0x4u; 01226 01227 while(k > 0u) 01228 { 01229 /* Perform the multiply-accumulates */ 01230 sum += ((q31_t) * px++ * *py--); 01231 01232 /* Decrement the loop counter */ 01233 k--; 01234 } 01235 01236 /* Store the result in the accumulator in the destination buffer. 
*/ 01237 *pOut++ = (q15_t) (sum >> 15); 01238 01239 /* Increment the pointer pIn1 index, count by 1 */ 01240 count++; 01241 01242 /* Update the inputA and inputB pointers for next MAC calculation */ 01243 px = pIn1 + count; 01244 py = pSrc2; 01245 01246 /* Decrement the loop counter */ 01247 blkCnt--; 01248 } 01249 } 01250 else 01251 { 01252 /* If the srcBLen is not a multiple of 4, 01253 * the blockSize2 loop cannot be unrolled by 4 */ 01254 blkCnt = blockSize2; 01255 01256 while(blkCnt > 0u) 01257 { 01258 /* Accumulator is made zero for every iteration */ 01259 sum = 0; 01260 01261 /* srcBLen number of MACS should be performed */ 01262 k = srcBLen; 01263 01264 while(k > 0u) 01265 { 01266 /* Perform the multiply-accumulate */ 01267 sum += ((q31_t) * px++ * *py--); 01268 01269 /* Decrement the loop counter */ 01270 k--; 01271 } 01272 01273 /* Store the result in the accumulator in the destination buffer. */ 01274 *pOut++ = (q15_t) (sum >> 15); 01275 01276 /* Increment the MAC count */ 01277 count++; 01278 01279 /* Update the inputA and inputB pointers for next MAC calculation */ 01280 px = pIn1 + count; 01281 py = pSrc2; 01282 01283 /* Decrement the loop counter */ 01284 blkCnt--; 01285 } 01286 } 01287 01288 01289 /* -------------------------- 01290 * Initializations of stage3 01291 * -------------------------*/ 01292 01293 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 01294 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 01295 * .... 01296 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 01297 * sum += x[srcALen-1] * y[srcBLen-1] 01298 */ 01299 01300 /* In this stage the MAC operations are decreased by 1 for every iteration. 
01301 The blockSize3 variable holds the number of MAC operations performed */ 01302 01303 /* Working pointer of inputA */ 01304 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 01305 px = pSrc1; 01306 01307 /* Working pointer of inputB */ 01308 pSrc2 = pIn2 + (srcBLen - 1u); 01309 pIn2 = pSrc2 - 1u; 01310 py = pIn2; 01311 01312 /* ------------------- 01313 * Stage3 process 01314 * ------------------*/ 01315 01316 /* For loop unrolling by 4, this stage is divided into two. */ 01317 /* First part of this stage computes the MAC operations greater than 4 */ 01318 /* Second part of this stage computes the MAC operations less than or equal to 4 */ 01319 01320 /* The first part of the stage starts here */ 01321 j = blockSize3 >> 2u; 01322 01323 while((j > 0u) && (blockSize3 > 0u)) 01324 { 01325 /* Accumulator is made zero for every iteration */ 01326 sum = 0; 01327 01328 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 01329 k = blockSize3 >> 2u; 01330 01331 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 01332 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 01333 py++; 01334 01335 while(k > 0u) 01336 { 01337 sum += ((q31_t) * px++ * *py--); 01338 sum += ((q31_t) * px++ * *py--); 01339 sum += ((q31_t) * px++ * *py--); 01340 sum += ((q31_t) * px++ * *py--); 01341 /* Decrement the loop counter */ 01342 k--; 01343 } 01344 01345 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. 01346 ** No loop unrolling is used. */ 01347 k = blockSize3 % 0x4u; 01348 01349 while(k > 0u) 01350 { 01351 /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */ 01352 sum += ((q31_t) * px++ * *py--); 01353 01354 /* Decrement the loop counter */ 01355 k--; 01356 } 01357 01358 /* Store the result in the accumulator in the destination buffer. 
*/ 01359 *pOut++ = (q15_t) (sum >> 15); 01360 01361 /* Update the inputA and inputB pointers for next MAC calculation */ 01362 px = ++pSrc1; 01363 py = pIn2; 01364 01365 /* Decrement the loop counter */ 01366 blockSize3--; 01367 01368 j--; 01369 } 01370 01371 /* The second part of the stage starts here */ 01372 /* SIMD is not used for the next MAC operations, 01373 * so pointer py is updated to read only one sample at a time */ 01374 py = py + 1u; 01375 01376 while(blockSize3 > 0u) 01377 { 01378 /* Accumulator is made zero for every iteration */ 01379 sum = 0; 01380 01381 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 01382 k = blockSize3; 01383 01384 while(k > 0u) 01385 { 01386 /* Perform the multiply-accumulates */ 01387 /* sum += x[srcALen-1] * y[srcBLen-1] */ 01388 sum += ((q31_t) * px++ * *py--); 01389 01390 /* Decrement the loop counter */ 01391 k--; 01392 } 01393 01394 /* Store the result in the accumulator in the destination buffer. */ 01395 *pOut++ = (q15_t) (sum >> 15); 01396 01397 /* Update the inputA and inputB pointers for next MAC calculation */ 01398 px = ++pSrc1; 01399 py = pSrc2; 01400 01401 /* Decrement the loop counter */ 01402 blockSize3--; 01403 } 01404 01405 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 01406 } 01407 01408 /** 01409 * @} end of Conv group 01410 */
Generated on Tue Jul 12 2022 18:44:08 by Doxygen 1.7.2
