CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details
Dependents: K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more
arm_conv_fast_q31.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 29. November 2010 00005 * $Revision: V1.0.3 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_fast_q31.c 00009 * 00010 * Description: Q31 Convolution (fast version). 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Version 1.0.3 2010/11/29 00015 * Re-organized the CMSIS folders and updated documentation. 00016 * 00017 * Version 1.0.2 2010/11/11 00018 * Documentation updated. 00019 * 00020 * Version 1.0.1 2010/10/05 00021 * Production release and review comments incorporated. 00022 * 00023 * Version 1.0.0 2010/09/20 00024 * Production release and review comments incorporated. 00025 * -------------------------------------------------------------------- */ 00026 00027 #include "arm_math.h" 00028 00029 /** 00030 * @ingroup groupFilters 00031 */ 00032 00033 /** 00034 * @addtogroup Conv 00035 * @{ 00036 */ 00037 00038 /** 00039 * @param[in] *pSrcA points to the first input sequence. 00040 * @param[in] srcALen length of the first input sequence. 00041 * @param[in] *pSrcB points to the second input sequence. 00042 * @param[in] srcBLen length of the second input sequence. 00043 * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. 00044 * @return none. 00045 * 00046 * @details 00047 * <b>Scaling and Overflow Behavior:</b> 00048 * 00049 * \par 00050 * This function is optimized for speed at the expense of fixed-point precision and overflow protection. 00051 * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format. 00052 * These intermediate results are accumulated in a 32-bit register in 2.30 format. 00053 * Finally, the accumulator is saturated and converted to a 1.31 result. 00054 * 00055 * \par 00056 * The fast version has the same overflow behavior as the standard version but provides less precision since it discards the low 32 bits of each multiplication result. 00057 * In order to avoid overflows completely the input signals must be scaled down. 00058 * Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows, 00059 * as maximum of min(srcALen, srcBLen) number of additions are carried internally. 00060 * 00061 * \par 00062 * See <code>arm_conv_q31()</code> for a slower implementation of this function which uses 64-bit accumulation to provide higher precision. 00063 */ 00064 00065 void arm_conv_fast_q31( 00066 q31_t * pSrcA, 00067 uint32_t srcALen, 00068 q31_t * pSrcB, 00069 uint32_t srcBLen, 00070 q31_t * pDst) 00071 { 00072 q31_t *pIn1; /* inputA pointer */ 00073 q31_t *pIn2; /* inputB pointer */ 00074 q31_t *pOut = pDst; /* output pointer */ 00075 q31_t *px; /* Intermediate inputA pointer */ 00076 q31_t *py; /* Intermediate inputB pointer */ 00077 q31_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00078 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00079 q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */ 00080 uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counter */ 00081 00082 00083 /* The algorithm implementation is based on the lengths of the inputs. */ 00084 /* srcB is always made to slide across srcA. */ 00085 /* So srcBLen is always considered as shorter or equal to srcALen */ 00086 if(srcALen >= srcBLen) 00087 { 00088 /* Initialization of inputA pointer */ 00089 pIn1 = pSrcA; 00090 00091 /* Initialization of inputB pointer */ 00092 pIn2 = pSrcB; 00093 } 00094 else 00095 { 00096 /* Initialization of inputA pointer */ 00097 pIn1 = pSrcB; 00098 00099 /* Initialization of inputB pointer */ 00100 pIn2 = pSrcA; 00101 00102 /* srcBLen is always considered as shorter or equal to srcALen */ 00103 j = srcBLen; 00104 srcBLen = srcALen; 00105 srcALen = j; 00106 } 00107 00108 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00109 /* The function is internally 00110 * divided into three stages according to the number of multiplications that has to be 00111 * taken place between inputA samples and inputB samples. In the first stage of the 00112 * algorithm, the multiplications increase by one for every iteration. 00113 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00114 * In the third stage of the algorithm, the multiplications decrease by one 00115 * for every iteration. */ 00116 00117 /* The algorithm is implemented in three stages. 00118 The loop counters of each stage is initiated here. */ 00119 blockSize1 = srcBLen - 1u; 00120 blockSize2 = srcALen - (srcBLen - 1u); 00121 blockSize3 = blockSize1; 00122 00123 /* -------------------------- 00124 * Initializations of stage1 00125 * -------------------------*/ 00126 00127 /* sum = x[0] * y[0] 00128 * sum = x[0] * y[1] + x[1] * y[0] 00129 * .... 00130 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00131 */ 00132 00133 /* In this stage the MAC operations are increased by 1 for every iteration. 00134 The count variable holds the number of MAC operations performed */ 00135 count = 1u; 00136 00137 /* Working pointer of inputA */ 00138 px = pIn1; 00139 00140 /* Working pointer of inputB */ 00141 py = pIn2; 00142 00143 00144 /* ------------------------ 00145 * Stage1 process 00146 * ----------------------*/ 00147 00148 /* The first stage starts here */ 00149 while(blockSize1 > 0u) 00150 { 00151 /* Accumulator is made zero for every iteration */ 00152 sum = 0; 00153 00154 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00155 k = count >> 2u; 00156 00157 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00158 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00159 while(k > 0u) 00160 { 00161 /* x[0] * y[srcBLen - 1] */ 00162 sum = (q31_t) ((((q63_t) sum << 32) + 00163 ((q63_t) * px++ * (*py--))) >> 32); 00164 00165 /* x[1] * y[srcBLen - 2] */ 00166 sum = (q31_t) ((((q63_t) sum << 32) + 00167 ((q63_t) * px++ * (*py--))) >> 32); 00168 00169 /* x[2] * y[srcBLen - 3] */ 00170 sum = (q31_t) ((((q63_t) sum << 32) + 00171 ((q63_t) * px++ * (*py--))) >> 32); 00172 00173 /* x[3] * y[srcBLen - 4] */ 00174 sum = (q31_t) ((((q63_t) sum << 32) + 00175 ((q63_t) * px++ * (*py--))) >> 32); 00176 00177 /* Decrement the loop counter */ 00178 k--; 00179 } 00180 00181 /* If the count is not a multiple of 4, compute any remaining MACs here. 00182 ** No loop unrolling is used. */ 00183 k = count % 0x4u; 00184 00185 while(k > 0u) 00186 { 00187 /* Perform the multiply-accumulate */ 00188 sum = (q31_t) ((((q63_t) sum << 32) + 00189 ((q63_t) * px++ * (*py--))) >> 32); 00190 00191 /* Decrement the loop counter */ 00192 k--; 00193 } 00194 00195 /* Store the result in the accumulator in the destination buffer. */ 00196 *pOut++ = sum << 1; 00197 00198 /* Update the inputA and inputB pointers for next MAC calculation */ 00199 py = pIn2 + count; 00200 px = pIn1; 00201 00202 /* Increment the MAC count */ 00203 count++; 00204 00205 /* Decrement the loop counter */ 00206 blockSize1--; 00207 } 00208 00209 /* -------------------------- 00210 * Initializations of stage2 00211 * ------------------------*/ 00212 00213 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00214 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00215 * .... 00216 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00217 */ 00218 00219 /* Working pointer of inputA */ 00220 px = pIn1; 00221 00222 /* Working pointer of inputB */ 00223 pSrc2 = pIn2 + (srcBLen - 1u); 00224 py = pSrc2; 00225 00226 /* count is index by which the pointer pIn1 to be incremented */ 00227 count = 1u; 00228 00229 /* ------------------- 00230 * Stage2 process 00231 * ------------------*/ 00232 00233 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00234 * So, to loop unroll over blockSize2, 00235 * srcBLen should be greater than or equal to 4 */ 00236 if(srcBLen >= 4u) 00237 { 00238 /* Loop unroll over blockSize2, by 4 */ 00239 blkCnt = blockSize2 >> 2u; 00240 00241 while(blkCnt > 0u) 00242 { 00243 /* Set all accumulators to zero */ 00244 acc0 = 0; 00245 acc1 = 0; 00246 acc2 = 0; 00247 acc3 = 0; 00248 00249 /* read x[0], x[1], x[2] samples */ 00250 x0 = *(px++); 00251 x1 = *(px++); 00252 x2 = *(px++); 00253 00254 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00255 k = srcBLen >> 2u; 00256 00257 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00258 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00259 do 00260 { 00261 /* Read y[srcBLen - 1] sample */ 00262 c0 = *(py--); 00263 00264 /* Read x[3] sample */ 00265 x3 = *(px++); 00266 00267 /* Perform the multiply-accumulates */ 00268 /* acc0 += x[0] * y[srcBLen - 1] */ 00269 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); 00270 00271 /* acc1 += x[1] * y[srcBLen - 1] */ 00272 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); 00273 00274 /* acc2 += x[2] * y[srcBLen - 1] */ 00275 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); 00276 00277 /* acc3 += x[3] * y[srcBLen - 1] */ 00278 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); 00279 00280 /* Read y[srcBLen - 2] sample */ 00281 c0 = *(py--); 00282 00283 /* Read x[4] sample */ 00284 x0 = *(px++); 00285 00286 /* Perform the multiply-accumulate */ 00287 /* acc0 += x[1] * y[srcBLen - 2] */ 00288 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32); 00289 /* acc1 += x[2] * y[srcBLen - 2] */ 00290 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32); 00291 /* acc2 += x[3] * y[srcBLen - 2] */ 00292 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32); 00293 /* acc3 += x[4] * y[srcBLen - 2] */ 00294 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32); 00295 00296 /* Read y[srcBLen - 3] sample */ 00297 c0 = *(py--); 00298 00299 /* Read x[5] sample */ 00300 x1 = *(px++); 00301 00302 /* Perform the multiply-accumulates */ 00303 /* acc0 += x[2] * y[srcBLen - 3] */ 00304 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32); 00305 /* acc1 += x[3] * y[srcBLen - 2] */ 00306 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32); 00307 /* acc2 += x[4] * y[srcBLen - 2] */ 00308 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32); 00309 /* acc3 += x[5] * y[srcBLen - 2] */ 00310 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32); 00311 00312 /* Read y[srcBLen - 4] sample */ 00313 c0 = *(py--); 00314 00315 /* Read x[6] sample */ 00316 x2 = *(px++); 00317 00318 /* Perform the multiply-accumulates */ 00319 /* acc0 += x[3] * y[srcBLen - 4] */ 00320 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32); 00321 /* acc1 += x[4] * y[srcBLen - 4] */ 00322 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32); 00323 /* acc2 += x[5] * y[srcBLen - 4] */ 00324 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32); 00325 /* acc3 += x[6] * y[srcBLen - 4] */ 00326 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32); 00327 00328 00329 } while(--k); 00330 00331 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00332 ** No loop unrolling is used. */ 00333 k = srcBLen % 0x4u; 00334 00335 while(k > 0u) 00336 { 00337 /* Read y[srcBLen - 5] sample */ 00338 c0 = *(py--); 00339 00340 /* Read x[7] sample */ 00341 x3 = *(px++); 00342 00343 /* Perform the multiply-accumulates */ 00344 /* acc0 += x[4] * y[srcBLen - 5] */ 00345 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); 00346 /* acc1 += x[5] * y[srcBLen - 5] */ 00347 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); 00348 /* acc2 += x[6] * y[srcBLen - 5] */ 00349 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); 00350 /* acc3 += x[7] * y[srcBLen - 5] */ 00351 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); 00352 00353 /* Reuse the present samples for the next MAC */ 00354 x0 = x1; 00355 x1 = x2; 00356 x2 = x3; 00357 00358 /* Decrement the loop counter */ 00359 k--; 00360 } 00361 00362 /* Store the results in the accumulators in the destination buffer. */ 00363 *pOut++ = (q31_t) (acc0 << 1); 00364 *pOut++ = (q31_t) (acc1 << 1); 00365 *pOut++ = (q31_t) (acc2 << 1); 00366 *pOut++ = (q31_t) (acc3 << 1); 00367 00368 /* Update the inputA and inputB pointers for next MAC calculation */ 00369 px = pIn1 + (count * 4u); 00370 py = pSrc2; 00371 00372 /* Increment the pointer pIn1 index, count by 1 */ 00373 count++; 00374 00375 /* Decrement the loop counter */ 00376 blkCnt--; 00377 } 00378 00379 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00380 ** No loop unrolling is used. */ 00381 blkCnt = blockSize2 % 0x4u; 00382 00383 while(blkCnt > 0u) 00384 { 00385 /* Accumulator is made zero for every iteration */ 00386 sum = 0; 00387 00388 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00389 k = srcBLen >> 2u; 00390 00391 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00392 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00393 while(k > 0u) 00394 { 00395 /* Perform the multiply-accumulates */ 00396 sum = (q31_t) ((((q63_t) sum << 32) + 00397 ((q63_t) * px++ * (*py--))) >> 32); 00398 sum = (q31_t) ((((q63_t) sum << 32) + 00399 ((q63_t) * px++ * (*py--))) >> 32); 00400 sum = (q31_t) ((((q63_t) sum << 32) + 00401 ((q63_t) * px++ * (*py--))) >> 32); 00402 sum = (q31_t) ((((q63_t) sum << 32) + 00403 ((q63_t) * px++ * (*py--))) >> 32); 00404 00405 /* Decrement the loop counter */ 00406 k--; 00407 } 00408 00409 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00410 ** No loop unrolling is used. */ 00411 k = srcBLen % 0x4u; 00412 00413 while(k > 0u) 00414 { 00415 /* Perform the multiply-accumulate */ 00416 sum = (q31_t) ((((q63_t) sum << 32) + 00417 ((q63_t) * px++ * (*py--))) >> 32); 00418 00419 /* Decrement the loop counter */ 00420 k--; 00421 } 00422 00423 /* Store the result in the accumulator in the destination buffer. */ 00424 *pOut++ = sum << 1; 00425 00426 /* Update the inputA and inputB pointers for next MAC calculation */ 00427 px = pIn1 + count; 00428 py = pSrc2; 00429 00430 /* Increment the MAC count */ 00431 count++; 00432 00433 /* Decrement the loop counter */ 00434 blkCnt--; 00435 } 00436 } 00437 else 00438 { 00439 /* If the srcBLen is not a multiple of 4, 00440 * the blockSize2 loop cannot be unrolled by 4 */ 00441 blkCnt = blockSize2; 00442 00443 while(blkCnt > 0u) 00444 { 00445 /* Accumulator is made zero for every iteration */ 00446 sum = 0; 00447 00448 /* srcBLen number of MACS should be performed */ 00449 k = srcBLen; 00450 00451 while(k > 0u) 00452 { 00453 /* Perform the multiply-accumulate */ 00454 sum = (q31_t) ((((q63_t) sum << 32) + 00455 ((q63_t) * px++ * (*py--))) >> 32); 00456 00457 /* Decrement the loop counter */ 00458 k--; 00459 } 00460 00461 /* Store the result in the accumulator in the destination buffer. */ 00462 *pOut++ = sum << 1; 00463 00464 /* Update the inputA and inputB pointers for next MAC calculation */ 00465 px = pIn1 + count; 00466 py = pSrc2; 00467 00468 /* Increment the MAC count */ 00469 count++; 00470 00471 /* Decrement the loop counter */ 00472 blkCnt--; 00473 } 00474 } 00475 00476 00477 /* -------------------------- 00478 * Initializations of stage3 00479 * -------------------------*/ 00480 00481 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00482 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00483 * .... 00484 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00485 * sum += x[srcALen-1] * y[srcBLen-1] 00486 */ 00487 00488 /* In this stage the MAC operations are decreased by 1 for every iteration. 00489 The blockSize3 variable holds the number of MAC operations performed */ 00490 00491 /* Working pointer of inputA */ 00492 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00493 px = pSrc1; 00494 00495 /* Working pointer of inputB */ 00496 pSrc2 = pIn2 + (srcBLen - 1u); 00497 py = pSrc2; 00498 00499 /* ------------------- 00500 * Stage3 process 00501 * ------------------*/ 00502 00503 while(blockSize3 > 0u) 00504 { 00505 /* Accumulator is made zero for every iteration */ 00506 sum = 0; 00507 00508 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00509 k = blockSize3 >> 2u; 00510 00511 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00512 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00513 while(k > 0u) 00514 { 00515 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ 00516 sum = (q31_t) ((((q63_t) sum << 32) + 00517 ((q63_t) * px++ * (*py--))) >> 32); 00518 00519 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ 00520 sum = (q31_t) ((((q63_t) sum << 32) + 00521 ((q63_t) * px++ * (*py--))) >> 32); 00522 00523 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ 00524 sum = (q31_t) ((((q63_t) sum << 32) + 00525 ((q63_t) * px++ * (*py--))) >> 32); 00526 00527 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ 00528 sum = (q31_t) ((((q63_t) sum << 32) + 00529 ((q63_t) * px++ * (*py--))) >> 32); 00530 00531 /* Decrement the loop counter */ 00532 k--; 00533 } 00534 00535 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. 00536 ** No loop unrolling is used. */ 00537 k = blockSize3 % 0x4u; 00538 00539 while(k > 0u) 00540 { 00541 /* Perform the multiply-accumulate */ 00542 sum = (q31_t) ((((q63_t) sum << 32) + 00543 ((q63_t) * px++ * (*py--))) >> 32); 00544 00545 /* Decrement the loop counter */ 00546 k--; 00547 } 00548 00549 /* Store the result in the accumulator in the destination buffer. */ 00550 *pOut++ = sum << 1; 00551 00552 /* Update the inputA and inputB pointers for next MAC calculation */ 00553 px = ++pSrc1; 00554 py = pSrc2; 00555 00556 /* Decrement the loop counter */ 00557 blockSize3--; 00558 } 00559 00560 } 00561 00562 /** 00563 * @} end of Conv group 00564 */
Generated on Tue Jul 12 2022 14:13:52 by 1.7.2