CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details
Dependents: K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more
arm_conv_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 29. November 2010 00005 * $Revision: V1.0.3 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_q15.c 00009 * 00010 * Description: Q15 Convolution. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Version 1.0.3 2010/11/29 00015 * Re-organized the CMSIS folders and updated documentation. 00016 * 00017 * Version 1.0.2 2010/11/11 00018 * Documentation updated. 00019 * 00020 * Version 1.0.1 2010/10/05 00021 * Production release and review comments incorporated. 00022 * 00023 * Version 1.0.0 2010/09/20 00024 * Production release and review comments incorporated 00025 * 00026 * Version 0.0.7 2010/06/10 00027 * Misra-C changes done 00028 * 00029 * -------------------------------------------------------------------- */ 00030 00031 #include "arm_math.h" 00032 00033 /** 00034 * @ingroup groupFilters 00035 */ 00036 00037 /** 00038 * @addtogroup Conv 00039 * @{ 00040 */ 00041 00042 /** 00043 * @brief Convolution of Q15 sequences. 00044 * @param[in] *pSrcA points to the first input sequence. 00045 * @param[in] srcALen length of the first input sequence. 00046 * @param[in] *pSrcB points to the second input sequence. 00047 * @param[in] srcBLen length of the second input sequence. 00048 * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. 00049 * @return none. 00050 * 00051 * @details 00052 * <b>Scaling and Overflow Behavior:</b> 00053 * 00054 * \par 00055 * The function is implemented using a 64-bit internal accumulator. 00056 * Both inputs are in 1.15 format and multiplications yield a 2.30 result. 00057 * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format. 00058 * This approach provides 33 guard bits and there is no risk of overflow. 00059 * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format. 00060 * 00061 * \par 00062 * Refer to <code>arm_conv_fast_q15()</code> for a faster but less precise version of this function. 00063 */ 00064 00065 void arm_conv_q15( 00066 q15_t * pSrcA, 00067 uint32_t srcALen, 00068 q15_t * pSrcB, 00069 uint32_t srcBLen, 00070 q15_t * pDst) 00071 { 00072 q15_t *pIn1; /* inputA pointer */ 00073 q15_t *pIn2; /* inputB pointer */ 00074 q15_t *pOut = pDst; /* output pointer */ 00075 q63_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00076 q15_t *px; /* Intermediate inputA pointer */ 00077 q15_t *py; /* Intermediate inputB pointer */ 00078 q15_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00079 q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */ 00080 uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt; /* loop counter */ 00081 q31_t *pb; /* 32 bit pointer for inputB buffer */ 00082 00083 00084 /* The algorithm implementation is based on the lengths of the inputs. */ 00085 /* srcB is always made to slide across srcA. */ 00086 /* So srcBLen is always considered as shorter or equal to srcALen */ 00087 if(srcALen >= srcBLen) 00088 { 00089 /* Initialization of inputA pointer */ 00090 pIn1 = pSrcA; 00091 00092 /* Initialization of inputB pointer */ 00093 pIn2 = pSrcB; 00094 } 00095 else 00096 { 00097 /* Initialization of inputA pointer */ 00098 pIn1 = pSrcB; 00099 00100 /* Initialization of inputB pointer */ 00101 pIn2 = pSrcA; 00102 00103 /* srcBLen is always considered as shorter or equal to srcALen */ 00104 j = srcBLen; 00105 srcBLen = srcALen; 00106 srcALen = j; 00107 } 00108 00109 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00110 /* The function is internally 00111 * divided into three stages according to the number of multiplications that has to be 00112 * taken place between inputA samples and inputB samples. In the first stage of the 00113 * algorithm, the multiplications increase by one for every iteration. 00114 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00115 * In the third stage of the algorithm, the multiplications decrease by one 00116 * for every iteration. */ 00117 00118 /* The algorithm is implemented in three stages. 00119 The loop counters of each stage is initiated here. */ 00120 blockSize1 = srcBLen - 1u; 00121 blockSize2 = srcALen - (srcBLen - 1u); 00122 00123 /* -------------------------- 00124 * Initializations of stage1 00125 * -------------------------*/ 00126 00127 /* sum = x[0] * y[0] 00128 * sum = x[0] * y[1] + x[1] * y[0] 00129 * .... 00130 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00131 */ 00132 00133 /* In this stage the MAC operations are increased by 1 for every iteration. 00134 The count variable holds the number of MAC operations performed */ 00135 count = 1u; 00136 00137 /* Working pointer of inputA */ 00138 px = pIn1; 00139 00140 /* Working pointer of inputB */ 00141 py = pIn2; 00142 00143 00144 /* ------------------------ 00145 * Stage1 process 00146 * ----------------------*/ 00147 00148 /* For loop unrolling by 4, this stage is divided into two. */ 00149 /* First part of this stage computes the MAC operations less than 4 */ 00150 /* Second part of this stage computes the MAC operations greater than or equal to 4 */ 00151 00152 /* The first part of the stage starts here */ 00153 while((count < 4u) && (blockSize1 > 0u)) 00154 { 00155 /* Accumulator is made zero for every iteration */ 00156 sum = 0; 00157 00158 /* Loop over number of MAC operations between 00159 * inputA samples and inputB samples */ 00160 k = count; 00161 00162 while(k > 0u) 00163 { 00164 /* Perform the multiply-accumulates */ 00165 sum = __SMLALD(*px++, *py--, sum); 00166 00167 /* Decrement the loop counter */ 00168 k--; 00169 } 00170 00171 /* Store the result in the accumulator in the destination buffer. */ 00172 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 00173 00174 /* Update the inputA and inputB pointers for next MAC calculation */ 00175 py = pIn2 + count; 00176 px = pIn1; 00177 00178 /* Increment the MAC count */ 00179 count++; 00180 00181 /* Decrement the loop counter */ 00182 blockSize1--; 00183 } 00184 00185 /* The second part of the stage starts here */ 00186 /* The internal loop, over count, is unrolled by 4 */ 00187 /* To, read the last two inputB samples using SIMD: 00188 * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ 00189 py = py - 1; 00190 00191 while(blockSize1 > 0u) 00192 { 00193 /* Accumulator is made zero for every iteration */ 00194 sum = 0; 00195 00196 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00197 k = count >> 2u; 00198 00199 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00200 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00201 while(k > 0u) 00202 { 00203 /* Perform the multiply-accumulates */ 00204 /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */ 00205 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00206 /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */ 00207 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00208 00209 /* Decrement the loop counter */ 00210 k--; 00211 } 00212 00213 /* For the next MAC operations, the pointer py is used without SIMD 00214 * So, py is incremented by 1 */ 00215 py = py + 1u; 00216 00217 /* If the count is not a multiple of 4, compute any remaining MACs here. 00218 ** No loop unrolling is used. */ 00219 k = count % 0x4u; 00220 00221 while(k > 0u) 00222 { 00223 /* Perform the multiply-accumulates */ 00224 sum = __SMLALD(*px++, *py--, sum); 00225 00226 /* Decrement the loop counter */ 00227 k--; 00228 } 00229 00230 /* Store the result in the accumulator in the destination buffer. */ 00231 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 00232 00233 /* Update the inputA and inputB pointers for next MAC calculation */ 00234 py = pIn2 + (count - 1u); 00235 px = pIn1; 00236 00237 /* Increment the MAC count */ 00238 count++; 00239 00240 /* Decrement the loop counter */ 00241 blockSize1--; 00242 } 00243 00244 /* -------------------------- 00245 * Initializations of stage2 00246 * ------------------------*/ 00247 00248 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00249 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00250 * .... 00251 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00252 */ 00253 00254 /* Working pointer of inputA */ 00255 px = pIn1; 00256 00257 /* Working pointer of inputB */ 00258 pSrc2 = pIn2 + (srcBLen - 1u); 00259 py = pSrc2; 00260 00261 /* Initialize inputB pointer of type q31 */ 00262 pb = (q31_t *) (py - 1u); 00263 00264 /* count is the index by which the pointer pIn1 to be incremented */ 00265 count = 1u; 00266 00267 00268 /* -------------------- 00269 * Stage2 process 00270 * -------------------*/ 00271 00272 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00273 * So, to loop unroll over blockSize2, 00274 * srcBLen should be greater than or equal to 4 */ 00275 if(srcBLen >= 4u) 00276 { 00277 /* Loop unroll over blockSize2, by 4 */ 00278 blkCnt = blockSize2 >> 2u; 00279 00280 while(blkCnt > 0u) 00281 { 00282 /* Set all accumulators to zero */ 00283 acc0 = 0; 00284 acc1 = 0; 00285 acc2 = 0; 00286 acc3 = 0; 00287 00288 00289 /* read x[0], x[1] samples */ 00290 x0 = *(q31_t *) (px++); 00291 /* read x[1], x[2] samples */ 00292 x1 = *(q31_t *) (px++); 00293 00294 00295 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00296 k = srcBLen >> 2u; 00297 00298 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00299 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00300 do 00301 { 00302 /* Read the last two inputB samples using SIMD: 00303 * y[srcBLen - 1] and y[srcBLen - 2] */ 00304 c0 = *(pb--); 00305 00306 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ 00307 acc0 = __SMLALDX(x0, c0, acc0); 00308 00309 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ 00310 acc1 = __SMLALDX(x1, c0, acc1); 00311 00312 /* Read x[2], x[3] */ 00313 x2 = *(q31_t *) (px++); 00314 00315 /* Read x[3], x[4] */ 00316 x3 = *(q31_t *) (px++); 00317 00318 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ 00319 acc2 = __SMLALDX(x2, c0, acc2); 00320 00321 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ 00322 acc3 = __SMLALDX(x3, c0, acc3); 00323 00324 /* Read y[srcBLen - 3] and y[srcBLen - 4] */ 00325 c0 = *(pb--); 00326 00327 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ 00328 acc0 = __SMLALDX(x2, c0, acc0); 00329 00330 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ 00331 acc1 = __SMLALDX(x3, c0, acc1); 00332 00333 /* Read x[4], x[5] */ 00334 x0 = *(q31_t *) (px++); 00335 00336 /* Read x[5], x[6] */ 00337 x1 = *(q31_t *) (px++); 00338 00339 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ 00340 acc2 = __SMLALDX(x0, c0, acc2); 00341 00342 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ 00343 acc3 = __SMLALDX(x1, c0, acc3); 00344 00345 } while(--k); 00346 00347 /* For the next MAC operations, SIMD is not used 00348 * So, the 16 bit pointer if inputB, py is updated */ 00349 py = (q15_t *) pb; 00350 py = py + 1; 00351 00352 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00353 ** No loop unrolling is used. */ 00354 k = srcBLen % 0x4u; 00355 00356 if(k == 1u) 00357 { 00358 /* Read y[srcBLen - 5] */ 00359 c0 = *(py); 00360 00361 /* Read x[7] */ 00362 x3 = *(q31_t *) px++; 00363 00364 /* Perform the multiply-accumulates */ 00365 acc0 = __SMLALD(x0, c0, acc0); 00366 acc1 = __SMLALD(x1, c0, acc1); 00367 acc2 = __SMLALDX(x1, c0, acc2); 00368 acc3 = __SMLALDX(x3, c0, acc3); 00369 } 00370 00371 if(k == 2u) 00372 { 00373 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 00374 c0 = *(pb); 00375 00376 /* Read x[7], x[8] */ 00377 x3 = *(q31_t *) px++; 00378 00379 /* Read x[9] */ 00380 x2 = *(q31_t *) px++; 00381 00382 /* Perform the multiply-accumulates */ 00383 acc0 = __SMLALDX(x0, c0, acc0); 00384 acc1 = __SMLALDX(x1, c0, acc1); 00385 acc2 = __SMLALDX(x3, c0, acc2); 00386 acc3 = __SMLALDX(x2, c0, acc3); 00387 } 00388 00389 if(k == 3u) 00390 { 00391 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 00392 c0 = *pb--; 00393 00394 /* Read x[7], x[8] */ 00395 x3 = *(q31_t *) px++; 00396 00397 /* Read x[9] */ 00398 x2 = *(q31_t *) px++; 00399 00400 /* Perform the multiply-accumulates */ 00401 acc0 = __SMLALDX(x0, c0, acc0); 00402 acc1 = __SMLALDX(x1, c0, acc1); 00403 acc2 = __SMLALDX(x3, c0, acc2); 00404 acc3 = __SMLALDX(x2, c0, acc3); 00405 00406 /* Read y[srcBLen - 7] */ 00407 c0 = (q15_t) (*pb >> 16); 00408 00409 /* Read x[10] */ 00410 x3 = *(q31_t *) px++; 00411 00412 /* Perform the multiply-accumulates */ 00413 acc0 = __SMLALDX(x1, c0, acc0); 00414 acc1 = __SMLALD(x2, c0, acc1); 00415 acc2 = __SMLALDX(x2, c0, acc2); 00416 acc3 = __SMLALDX(x3, c0, acc3); 00417 } 00418 00419 /* Store the results in the accumulators in the destination buffer. */ 00420 *__SIMD32(pOut)++ = 00421 __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); 00422 *__SIMD32(pOut)++ = 00423 __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); 00424 00425 /* Update the inputA and inputB pointers for next MAC calculation */ 00426 px = pIn1 + (count * 4u); 00427 py = pSrc2; 00428 pb = (q31_t *) (py - 1); 00429 00430 /* Increment the pointer pIn1 index, count by 1 */ 00431 count++; 00432 00433 /* Decrement the loop counter */ 00434 blkCnt--; 00435 } 00436 00437 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00438 ** No loop unrolling is used. */ 00439 blkCnt = blockSize2 % 0x4u; 00440 00441 while(blkCnt > 0u) 00442 { 00443 /* Accumulator is made zero for every iteration */ 00444 sum = 0; 00445 00446 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00447 k = srcBLen >> 2u; 00448 00449 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00450 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00451 while(k > 0u) 00452 { 00453 /* Perform the multiply-accumulates */ 00454 sum += (q63_t) ((q31_t) * px++ * *py--); 00455 sum += (q63_t) ((q31_t) * px++ * *py--); 00456 sum += (q63_t) ((q31_t) * px++ * *py--); 00457 sum += (q63_t) ((q31_t) * px++ * *py--); 00458 00459 /* Decrement the loop counter */ 00460 k--; 00461 } 00462 00463 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00464 ** No loop unrolling is used. */ 00465 k = srcBLen % 0x4u; 00466 00467 while(k > 0u) 00468 { 00469 /* Perform the multiply-accumulates */ 00470 sum += (q63_t) ((q31_t) * px++ * *py--); 00471 00472 /* Decrement the loop counter */ 00473 k--; 00474 } 00475 00476 /* Store the result in the accumulator in the destination buffer. */ 00477 *pOut++ = (q15_t) (__SSAT(sum >> 15, 16)); 00478 00479 /* Update the inputA and inputB pointers for next MAC calculation */ 00480 px = pIn1 + count; 00481 py = pSrc2; 00482 00483 /* Increment the pointer pIn1 index, count by 1 */ 00484 count++; 00485 00486 /* Decrement the loop counter */ 00487 blkCnt--; 00488 } 00489 } 00490 else 00491 { 00492 /* If the srcBLen is not a multiple of 4, 00493 * the blockSize2 loop cannot be unrolled by 4 */ 00494 blkCnt = blockSize2; 00495 00496 while(blkCnt > 0u) 00497 { 00498 /* Accumulator is made zero for every iteration */ 00499 sum = 0; 00500 00501 /* srcBLen number of MACS should be performed */ 00502 k = srcBLen; 00503 00504 while(k > 0u) 00505 { 00506 /* Perform the multiply-accumulate */ 00507 sum += (q63_t) ((q31_t) * px++ * *py--); 00508 00509 /* Decrement the loop counter */ 00510 k--; 00511 } 00512 00513 /* Store the result in the accumulator in the destination buffer. */ 00514 *pOut++ = (q15_t) (__SSAT(sum >> 15, 16)); 00515 00516 /* Update the inputA and inputB pointers for next MAC calculation */ 00517 px = pIn1 + count; 00518 py = pSrc2; 00519 00520 /* Increment the MAC count */ 00521 count++; 00522 00523 /* Decrement the loop counter */ 00524 blkCnt--; 00525 } 00526 } 00527 00528 00529 /* -------------------------- 00530 * Initializations of stage3 00531 * -------------------------*/ 00532 00533 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00534 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00535 * .... 00536 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00537 * sum += x[srcALen-1] * y[srcBLen-1] 00538 */ 00539 00540 /* In this stage the MAC operations are decreased by 1 for every iteration. 00541 The blockSize3 variable holds the number of MAC operations performed */ 00542 00543 blockSize3 = srcBLen - 1u; 00544 00545 /* Working pointer of inputA */ 00546 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00547 px = pSrc1; 00548 00549 /* Working pointer of inputB */ 00550 pSrc2 = pIn2 + (srcBLen - 1u); 00551 pIn2 = pSrc2 - 1u; 00552 py = pIn2; 00553 00554 /* ------------------- 00555 * Stage3 process 00556 * ------------------*/ 00557 00558 /* For loop unrolling by 4, this stage is divided into two. */ 00559 /* First part of this stage computes the MAC operations greater than 4 */ 00560 /* Second part of this stage computes the MAC operations less than or equal to 4 */ 00561 00562 /* The first part of the stage starts here */ 00563 j = blockSize3 >> 2u; 00564 00565 while((j > 0u) && (blockSize3 > 0u)) 00566 { 00567 /* Accumulator is made zero for every iteration */ 00568 sum = 0; 00569 00570 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00571 k = blockSize3 >> 2u; 00572 00573 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00574 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00575 while(k > 0u) 00576 { 00577 /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied 00578 * with y[srcBLen - 1], y[srcBLen - 2] respectively */ 00579 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00580 /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied 00581 * with y[srcBLen - 3], y[srcBLen - 4] respectively */ 00582 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00583 00584 /* Decrement the loop counter */ 00585 k--; 00586 } 00587 00588 /* For the next MAC operations, the pointer py is used without SIMD 00589 * So, py is incremented by 1 */ 00590 py = py + 1u; 00591 00592 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. 00593 ** No loop unrolling is used. */ 00594 k = blockSize3 % 0x4u; 00595 00596 while(k > 0u) 00597 { 00598 /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */ 00599 sum = __SMLALD(*px++, *py--, sum); 00600 00601 /* Decrement the loop counter */ 00602 k--; 00603 } 00604 00605 /* Store the result in the accumulator in the destination buffer. */ 00606 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 00607 00608 /* Update the inputA and inputB pointers for next MAC calculation */ 00609 px = ++pSrc1; 00610 py = pIn2; 00611 00612 /* Decrement the loop counter */ 00613 blockSize3--; 00614 00615 j--; 00616 } 00617 00618 /* The second part of the stage starts here */ 00619 /* SIMD is not used for the next MAC operations, 00620 * so pointer py is updated to read only one sample at a time */ 00621 py = py + 1u; 00622 00623 while(blockSize3 > 0u) 00624 { 00625 /* Accumulator is made zero for every iteration */ 00626 sum = 0; 00627 00628 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00629 k = blockSize3; 00630 00631 while(k > 0u) 00632 { 00633 /* Perform the multiply-accumulates */ 00634 /* sum += x[srcALen-1] * y[srcBLen-1] */ 00635 sum = __SMLALD(*px++, *py--, sum); 00636 00637 /* Decrement the loop counter */ 00638 k--; 00639 } 00640 00641 /* Store the result in the accumulator in the destination buffer. */ 00642 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 00643 00644 /* Update the inputA and inputB pointers for next MAC calculation */ 00645 px = ++pSrc1; 00646 py = pSrc2; 00647 00648 /* Decrement the loop counter */ 00649 blockSize3--; 00650 } 00651 00652 } 00653 00654 /** 00655 * @} end of Conv group 00656 */
Generated on Tue Jul 12 2022 14:13:52 by 1.7.2