CMSIS DSP library
Dependents: performance_timer Surfboard_ gps2rtty Capstone ... more
arm_conv_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 19. March 2015 00005 * $Revision: V.1.4.5 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_q15.c 00009 * 00010 * Description: Convolution of Q15 sequences. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupFilters 00045 */ 00046 00047 /** 00048 * @addtogroup Conv 00049 * @{ 00050 */ 00051 00052 /** 00053 * @brief Convolution of Q15 sequences. 00054 * @param[in] *pSrcA points to the first input sequence. 00055 * @param[in] srcALen length of the first input sequence. 00056 * @param[in] *pSrcB points to the second input sequence. 00057 * @param[in] srcBLen length of the second input sequence. 00058 * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. 00059 * @return none. 00060 * 00061 * @details 00062 * <b>Scaling and Overflow Behavior:</b> 00063 * 00064 * \par 00065 * The function is implemented using a 64-bit internal accumulator. 00066 * Both inputs are in 1.15 format and multiplications yield a 2.30 result. 00067 * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format. 00068 * This approach provides 33 guard bits and there is no risk of overflow. 00069 * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format. 00070 * 00071 * \par 00072 * Refer to <code>arm_conv_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4. 00073 * 00074 * \par 00075 * Refer the function <code>arm_conv_opt_q15()</code> for a faster implementation of this function using scratch buffers. 00076 * 00077 */ 00078 00079 void arm_conv_q15( 00080 q15_t * pSrcA, 00081 uint32_t srcALen, 00082 q15_t * pSrcB, 00083 uint32_t srcBLen, 00084 q15_t * pDst) 00085 { 00086 00087 #if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) 00088 00089 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00090 00091 q15_t *pIn1; /* inputA pointer */ 00092 q15_t *pIn2; /* inputB pointer */ 00093 q15_t *pOut = pDst; /* output pointer */ 00094 q63_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00095 q15_t *px; /* Intermediate inputA pointer */ 00096 q15_t *py; /* Intermediate inputB pointer */ 00097 q15_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00098 q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */ 00099 uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt; /* loop counter */ 00100 00101 /* The algorithm implementation is based on the lengths of the inputs. */ 00102 /* srcB is always made to slide across srcA. */ 00103 /* So srcBLen is always considered as shorter or equal to srcALen */ 00104 if(srcALen >= srcBLen) 00105 { 00106 /* Initialization of inputA pointer */ 00107 pIn1 = pSrcA; 00108 00109 /* Initialization of inputB pointer */ 00110 pIn2 = pSrcB; 00111 } 00112 else 00113 { 00114 /* Initialization of inputA pointer */ 00115 pIn1 = pSrcB; 00116 00117 /* Initialization of inputB pointer */ 00118 pIn2 = pSrcA; 00119 00120 /* srcBLen is always considered as shorter or equal to srcALen */ 00121 j = srcBLen; 00122 srcBLen = srcALen; 00123 srcALen = j; 00124 } 00125 00126 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00127 /* The function is internally 00128 * divided into three stages according to the number of multiplications that has to be 00129 * taken place between inputA samples and inputB samples. In the first stage of the 00130 * algorithm, the multiplications increase by one for every iteration. 00131 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00132 * In the third stage of the algorithm, the multiplications decrease by one 00133 * for every iteration. */ 00134 00135 /* The algorithm is implemented in three stages. 00136 The loop counters of each stage is initiated here. */ 00137 blockSize1 = srcBLen - 1u; 00138 blockSize2 = srcALen - (srcBLen - 1u); 00139 00140 /* -------------------------- 00141 * Initializations of stage1 00142 * -------------------------*/ 00143 00144 /* sum = x[0] * y[0] 00145 * sum = x[0] * y[1] + x[1] * y[0] 00146 * .... 00147 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00148 */ 00149 00150 /* In this stage the MAC operations are increased by 1 for every iteration. 00151 The count variable holds the number of MAC operations performed */ 00152 count = 1u; 00153 00154 /* Working pointer of inputA */ 00155 px = pIn1; 00156 00157 /* Working pointer of inputB */ 00158 py = pIn2; 00159 00160 00161 /* ------------------------ 00162 * Stage1 process 00163 * ----------------------*/ 00164 00165 /* For loop unrolling by 4, this stage is divided into two. */ 00166 /* First part of this stage computes the MAC operations less than 4 */ 00167 /* Second part of this stage computes the MAC operations greater than or equal to 4 */ 00168 00169 /* The first part of the stage starts here */ 00170 while((count < 4u) && (blockSize1 > 0u)) 00171 { 00172 /* Accumulator is made zero for every iteration */ 00173 sum = 0; 00174 00175 /* Loop over number of MAC operations between 00176 * inputA samples and inputB samples */ 00177 k = count; 00178 00179 while(k > 0u) 00180 { 00181 /* Perform the multiply-accumulates */ 00182 sum = __SMLALD(*px++, *py--, sum); 00183 00184 /* Decrement the loop counter */ 00185 k--; 00186 } 00187 00188 /* Store the result in the accumulator in the destination buffer. */ 00189 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 00190 00191 /* Update the inputA and inputB pointers for next MAC calculation */ 00192 py = pIn2 + count; 00193 px = pIn1; 00194 00195 /* Increment the MAC count */ 00196 count++; 00197 00198 /* Decrement the loop counter */ 00199 blockSize1--; 00200 } 00201 00202 /* The second part of the stage starts here */ 00203 /* The internal loop, over count, is unrolled by 4 */ 00204 /* To, read the last two inputB samples using SIMD: 00205 * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ 00206 py = py - 1; 00207 00208 while(blockSize1 > 0u) 00209 { 00210 /* Accumulator is made zero for every iteration */ 00211 sum = 0; 00212 00213 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00214 k = count >> 2u; 00215 00216 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00217 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00218 while(k > 0u) 00219 { 00220 /* Perform the multiply-accumulates */ 00221 /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */ 00222 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00223 /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */ 00224 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00225 00226 /* Decrement the loop counter */ 00227 k--; 00228 } 00229 00230 /* For the next MAC operations, the pointer py is used without SIMD 00231 * So, py is incremented by 1 */ 00232 py = py + 1u; 00233 00234 /* If the count is not a multiple of 4, compute any remaining MACs here. 00235 ** No loop unrolling is used. */ 00236 k = count % 0x4u; 00237 00238 while(k > 0u) 00239 { 00240 /* Perform the multiply-accumulates */ 00241 sum = __SMLALD(*px++, *py--, sum); 00242 00243 /* Decrement the loop counter */ 00244 k--; 00245 } 00246 00247 /* Store the result in the accumulator in the destination buffer. */ 00248 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 00249 00250 /* Update the inputA and inputB pointers for next MAC calculation */ 00251 py = pIn2 + (count - 1u); 00252 px = pIn1; 00253 00254 /* Increment the MAC count */ 00255 count++; 00256 00257 /* Decrement the loop counter */ 00258 blockSize1--; 00259 } 00260 00261 /* -------------------------- 00262 * Initializations of stage2 00263 * ------------------------*/ 00264 00265 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00266 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00267 * .... 00268 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00269 */ 00270 00271 /* Working pointer of inputA */ 00272 px = pIn1; 00273 00274 /* Working pointer of inputB */ 00275 pSrc2 = pIn2 + (srcBLen - 1u); 00276 py = pSrc2; 00277 00278 /* count is the index by which the pointer pIn1 to be incremented */ 00279 count = 0u; 00280 00281 00282 /* -------------------- 00283 * Stage2 process 00284 * -------------------*/ 00285 00286 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00287 * So, to loop unroll over blockSize2, 00288 * srcBLen should be greater than or equal to 4 */ 00289 if(srcBLen >= 4u) 00290 { 00291 /* Loop unroll over blockSize2, by 4 */ 00292 blkCnt = blockSize2 >> 2u; 00293 00294 while(blkCnt > 0u) 00295 { 00296 py = py - 1u; 00297 00298 /* Set all accumulators to zero */ 00299 acc0 = 0; 00300 acc1 = 0; 00301 acc2 = 0; 00302 acc3 = 0; 00303 00304 00305 /* read x[0], x[1] samples */ 00306 x0 = *__SIMD32(px); 00307 /* read x[1], x[2] samples */ 00308 x1 = _SIMD32_OFFSET(px+1); 00309 px+= 2u; 00310 00311 00312 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00313 k = srcBLen >> 2u; 00314 00315 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00316 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00317 do 00318 { 00319 /* Read the last two inputB samples using SIMD: 00320 * y[srcBLen - 1] and y[srcBLen - 2] */ 00321 c0 = *__SIMD32(py)--; 00322 00323 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ 00324 acc0 = __SMLALDX(x0, c0, acc0); 00325 00326 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ 00327 acc1 = __SMLALDX(x1, c0, acc1); 00328 00329 /* Read x[2], x[3] */ 00330 x2 = *__SIMD32(px); 00331 00332 /* Read x[3], x[4] */ 00333 x3 = _SIMD32_OFFSET(px+1); 00334 00335 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ 00336 acc2 = __SMLALDX(x2, c0, acc2); 00337 00338 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ 00339 acc3 = __SMLALDX(x3, c0, acc3); 00340 00341 /* Read y[srcBLen - 3] and y[srcBLen - 4] */ 00342 c0 = *__SIMD32(py)--; 00343 00344 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ 00345 acc0 = __SMLALDX(x2, c0, acc0); 00346 00347 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ 00348 acc1 = __SMLALDX(x3, c0, acc1); 00349 00350 /* Read x[4], x[5] */ 00351 x0 = _SIMD32_OFFSET(px+2); 00352 00353 /* Read x[5], x[6] */ 00354 x1 = _SIMD32_OFFSET(px+3); 00355 px += 4u; 00356 00357 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ 00358 acc2 = __SMLALDX(x0, c0, acc2); 00359 00360 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ 00361 acc3 = __SMLALDX(x1, c0, acc3); 00362 00363 } while(--k); 00364 00365 /* For the next MAC operations, SIMD is not used 00366 * So, the 16 bit pointer if inputB, py is updated */ 00367 00368 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00369 ** No loop unrolling is used. */ 00370 k = srcBLen % 0x4u; 00371 00372 if(k == 1u) 00373 { 00374 /* Read y[srcBLen - 5] */ 00375 c0 = *(py+1); 00376 00377 #ifdef ARM_MATH_BIG_ENDIAN 00378 00379 c0 = c0 << 16u; 00380 00381 #else 00382 00383 c0 = c0 & 0x0000FFFF; 00384 00385 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 00386 /* Read x[7] */ 00387 x3 = *__SIMD32(px); 00388 px++; 00389 00390 /* Perform the multiply-accumulates */ 00391 acc0 = __SMLALD(x0, c0, acc0); 00392 acc1 = __SMLALD(x1, c0, acc1); 00393 acc2 = __SMLALDX(x1, c0, acc2); 00394 acc3 = __SMLALDX(x3, c0, acc3); 00395 } 00396 00397 if(k == 2u) 00398 { 00399 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 00400 c0 = _SIMD32_OFFSET(py); 00401 00402 /* Read x[7], x[8] */ 00403 x3 = *__SIMD32(px); 00404 00405 /* Read x[9] */ 00406 x2 = _SIMD32_OFFSET(px+1); 00407 px += 2u; 00408 00409 /* Perform the multiply-accumulates */ 00410 acc0 = __SMLALDX(x0, c0, acc0); 00411 acc1 = __SMLALDX(x1, c0, acc1); 00412 acc2 = __SMLALDX(x3, c0, acc2); 00413 acc3 = __SMLALDX(x2, c0, acc3); 00414 } 00415 00416 if(k == 3u) 00417 { 00418 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 00419 c0 = _SIMD32_OFFSET(py); 00420 00421 /* Read x[7], x[8] */ 00422 x3 = *__SIMD32(px); 00423 00424 /* Read x[9] */ 00425 x2 = _SIMD32_OFFSET(px+1); 00426 00427 /* Perform the multiply-accumulates */ 00428 acc0 = __SMLALDX(x0, c0, acc0); 00429 acc1 = __SMLALDX(x1, c0, acc1); 00430 acc2 = __SMLALDX(x3, c0, acc2); 00431 acc3 = __SMLALDX(x2, c0, acc3); 00432 00433 c0 = *(py-1); 00434 00435 #ifdef ARM_MATH_BIG_ENDIAN 00436 00437 c0 = c0 << 16u; 00438 #else 00439 00440 c0 = c0 & 0x0000FFFF; 00441 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 00442 /* Read x[10] */ 00443 x3 = _SIMD32_OFFSET(px+2); 00444 px += 3u; 00445 00446 /* Perform the multiply-accumulates */ 00447 acc0 = __SMLALDX(x1, c0, acc0); 00448 acc1 = __SMLALD(x2, c0, acc1); 00449 acc2 = __SMLALDX(x2, c0, acc2); 00450 acc3 = __SMLALDX(x3, c0, acc3); 00451 } 00452 00453 00454 /* Store the results in the accumulators in the destination buffer. */ 00455 00456 #ifndef ARM_MATH_BIG_ENDIAN 00457 00458 *__SIMD32(pOut)++ = 00459 __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); 00460 *__SIMD32(pOut)++ = 00461 __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); 00462 00463 #else 00464 00465 *__SIMD32(pOut)++ = 00466 __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16); 00467 *__SIMD32(pOut)++ = 00468 __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16); 00469 00470 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00471 00472 /* Increment the pointer pIn1 index, count by 4 */ 00473 count += 4u; 00474 00475 /* Update the inputA and inputB pointers for next MAC calculation */ 00476 px = pIn1 + count; 00477 py = pSrc2; 00478 00479 /* Decrement the loop counter */ 00480 blkCnt--; 00481 } 00482 00483 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00484 ** No loop unrolling is used. */ 00485 blkCnt = blockSize2 % 0x4u; 00486 00487 while(blkCnt > 0u) 00488 { 00489 /* Accumulator is made zero for every iteration */ 00490 sum = 0; 00491 00492 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00493 k = srcBLen >> 2u; 00494 00495 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00496 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00497 while(k > 0u) 00498 { 00499 /* Perform the multiply-accumulates */ 00500 sum += (q63_t) ((q31_t) * px++ * *py--); 00501 sum += (q63_t) ((q31_t) * px++ * *py--); 00502 sum += (q63_t) ((q31_t) * px++ * *py--); 00503 sum += (q63_t) ((q31_t) * px++ * *py--); 00504 00505 /* Decrement the loop counter */ 00506 k--; 00507 } 00508 00509 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00510 ** No loop unrolling is used. */ 00511 k = srcBLen % 0x4u; 00512 00513 while(k > 0u) 00514 { 00515 /* Perform the multiply-accumulates */ 00516 sum += (q63_t) ((q31_t) * px++ * *py--); 00517 00518 /* Decrement the loop counter */ 00519 k--; 00520 } 00521 00522 /* Store the result in the accumulator in the destination buffer. */ 00523 *pOut++ = (q15_t) (__SSAT(sum >> 15, 16)); 00524 00525 /* Increment the pointer pIn1 index, count by 1 */ 00526 count++; 00527 00528 /* Update the inputA and inputB pointers for next MAC calculation */ 00529 px = pIn1 + count; 00530 py = pSrc2; 00531 00532 /* Decrement the loop counter */ 00533 blkCnt--; 00534 } 00535 } 00536 else 00537 { 00538 /* If the srcBLen is not a multiple of 4, 00539 * the blockSize2 loop cannot be unrolled by 4 */ 00540 blkCnt = blockSize2; 00541 00542 while(blkCnt > 0u) 00543 { 00544 /* Accumulator is made zero for every iteration */ 00545 sum = 0; 00546 00547 /* srcBLen number of MACS should be performed */ 00548 k = srcBLen; 00549 00550 while(k > 0u) 00551 { 00552 /* Perform the multiply-accumulate */ 00553 sum += (q63_t) ((q31_t) * px++ * *py--); 00554 00555 /* Decrement the loop counter */ 00556 k--; 00557 } 00558 00559 /* Store the result in the accumulator in the destination buffer. */ 00560 *pOut++ = (q15_t) (__SSAT(sum >> 15, 16)); 00561 00562 /* Increment the MAC count */ 00563 count++; 00564 00565 /* Update the inputA and inputB pointers for next MAC calculation */ 00566 px = pIn1 + count; 00567 py = pSrc2; 00568 00569 /* Decrement the loop counter */ 00570 blkCnt--; 00571 } 00572 } 00573 00574 00575 /* -------------------------- 00576 * Initializations of stage3 00577 * -------------------------*/ 00578 00579 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00580 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00581 * .... 00582 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00583 * sum += x[srcALen-1] * y[srcBLen-1] 00584 */ 00585 00586 /* In this stage the MAC operations are decreased by 1 for every iteration. 00587 The blockSize3 variable holds the number of MAC operations performed */ 00588 00589 blockSize3 = srcBLen - 1u; 00590 00591 /* Working pointer of inputA */ 00592 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00593 px = pSrc1; 00594 00595 /* Working pointer of inputB */ 00596 pSrc2 = pIn2 + (srcBLen - 1u); 00597 pIn2 = pSrc2 - 1u; 00598 py = pIn2; 00599 00600 /* ------------------- 00601 * Stage3 process 00602 * ------------------*/ 00603 00604 /* For loop unrolling by 4, this stage is divided into two. */ 00605 /* First part of this stage computes the MAC operations greater than 4 */ 00606 /* Second part of this stage computes the MAC operations less than or equal to 4 */ 00607 00608 /* The first part of the stage starts here */ 00609 j = blockSize3 >> 2u; 00610 00611 while((j > 0u) && (blockSize3 > 0u)) 00612 { 00613 /* Accumulator is made zero for every iteration */ 00614 sum = 0; 00615 00616 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00617 k = blockSize3 >> 2u; 00618 00619 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00620 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00621 while(k > 0u) 00622 { 00623 /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied 00624 * with y[srcBLen - 1], y[srcBLen - 2] respectively */ 00625 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00626 /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied 00627 * with y[srcBLen - 3], y[srcBLen - 4] respectively */ 00628 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00629 00630 /* Decrement the loop counter */ 00631 k--; 00632 } 00633 00634 /* For the next MAC operations, the pointer py is used without SIMD 00635 * So, py is incremented by 1 */ 00636 py = py + 1u; 00637 00638 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. 00639 ** No loop unrolling is used. */ 00640 k = blockSize3 % 0x4u; 00641 00642 while(k > 0u) 00643 { 00644 /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */ 00645 sum = __SMLALD(*px++, *py--, sum); 00646 00647 /* Decrement the loop counter */ 00648 k--; 00649 } 00650 00651 /* Store the result in the accumulator in the destination buffer. */ 00652 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 00653 00654 /* Update the inputA and inputB pointers for next MAC calculation */ 00655 px = ++pSrc1; 00656 py = pIn2; 00657 00658 /* Decrement the loop counter */ 00659 blockSize3--; 00660 00661 j--; 00662 } 00663 00664 /* The second part of the stage starts here */ 00665 /* SIMD is not used for the next MAC operations, 00666 * so pointer py is updated to read only one sample at a time */ 00667 py = py + 1u; 00668 00669 while(blockSize3 > 0u) 00670 { 00671 /* Accumulator is made zero for every iteration */ 00672 sum = 0; 00673 00674 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00675 k = blockSize3; 00676 00677 while(k > 0u) 00678 { 00679 /* Perform the multiply-accumulates */ 00680 /* sum += x[srcALen-1] * y[srcBLen-1] */ 00681 sum = __SMLALD(*px++, *py--, sum); 00682 00683 /* Decrement the loop counter */ 00684 k--; 00685 } 00686 00687 /* Store the result in the accumulator in the destination buffer. */ 00688 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 00689 00690 /* Update the inputA and inputB pointers for next MAC calculation */ 00691 px = ++pSrc1; 00692 py = pSrc2; 00693 00694 /* Decrement the loop counter */ 00695 blockSize3--; 00696 } 00697 00698 #else 00699 00700 /* Run the below code for Cortex-M0 */ 00701 00702 q15_t *pIn1 = pSrcA; /* input pointer */ 00703 q15_t *pIn2 = pSrcB; /* coefficient pointer */ 00704 q63_t sum; /* Accumulator */ 00705 uint32_t i, j; /* loop counter */ 00706 00707 /* Loop to calculate output of convolution for output length number of times */ 00708 for (i = 0; i < (srcALen + srcBLen - 1); i++) 00709 { 00710 /* Initialize sum with zero to carry on MAC operations */ 00711 sum = 0; 00712 00713 /* Loop to perform MAC operations according to convolution equation */ 00714 for (j = 0; j <= i; j++) 00715 { 00716 /* Check the array limitations */ 00717 if(((i - j) < srcBLen) && (j < srcALen)) 00718 { 00719 /* z[i] += x[i-j] * y[j] */ 00720 sum += (q31_t) pIn1[j] * (pIn2[i - j]); 00721 } 00722 } 00723 00724 /* Store the output in the destination buffer */ 00725 pDst[i] = (q15_t) __SSAT((sum >> 15u), 16u); 00726 } 00727 00728 #endif /* #if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE)*/ 00729 00730 } 00731 00732 /** 00733 * @} end of Conv group 00734 */
Generated on Tue Jul 12 2022 11:59:16 by 1.7.2