CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details
Dependents: K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more
arm_conv_partial_f32.c
00001 /* ---------------------------------------------------------------------------- 00002 * Copyright (C) 2010 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 29. November 2010 00005 * $Revision: V1.0.3 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_partial_f32.c 00009 * 00010 * Description: Partial Convolution of floating-point sequences 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Version 1.0.3 2010/11/29 00015 * Re-organized the CMSIS folders and updated documentation. 00016 * 00017 * Version 1.0.2 2010/11/11 00018 * Documentation updated. 00019 * 00020 * Version 1.0.1 2010/10/05 00021 * Production release and review comments incorporated. 00022 * 00023 * Version 1.0.0 2010/09/20 00024 * Production release and review comments incorporated 00025 * 00026 * Version 0.0.7 2010/06/10 00027 * Misra-C changes done 00028 * 00029 * -------------------------------------------------------------------------- */ 00030 00031 #include "arm_math.h" 00032 00033 /** 00034 * @ingroup groupFilters 00035 */ 00036 00037 /** 00038 * @defgroup PartialConv Partial Convolution 00039 * 00040 * Partial Convolution is equivalent to Convolution except that a subset of the output samples is generated. 00041 * Each function has two additional arguments. 00042 * <code>firstIndex</code> specifies the starting index of the subset of output samples. 00043 * <code>numPoints</code> is the number of output samples to compute. 00044 * The function computes the output in the range 00045 * <code>[firstIndex, ..., firstIndex+numPoints-1]</code>. 00046 * The output array <code>pDst</code> contains <code>numPoints</code> values. 00047 * 00048 * The allowable range of output indices is [0 srcALen+srcBLen-2]. 00049 * If the requested subset does not fall in this range then the functions return ARM_MATH_ARGUMENT_ERROR. 00050 * Otherwise the functions return ARM_MATH_SUCCESS. 00051 * \note Refer arm_conv_f32() for details on fixed point behavior. 
00052 */ 00053 00054 /** 00055 * @addtogroup PartialConv 00056 * @{ 00057 */ 00058 00059 /** 00060 * @brief Partial convolution of floating-point sequences. 00061 * @param[in] *pSrcA points to the first input sequence. 00062 * @param[in] srcALen length of the first input sequence. 00063 * @param[in] *pSrcB points to the second input sequence. 00064 * @param[in] srcBLen length of the second input sequence. 00065 * @param[out] *pDst points to the location where the output result is written. 00066 * @param[in] firstIndex is the first output sample to start with. 00067 * @param[in] numPoints is the number of output points to be computed. 00068 * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. 00069 */ 00070 00071 arm_status arm_conv_partial_f32( 00072 float32_t * pSrcA, 00073 uint32_t srcALen, 00074 float32_t * pSrcB, 00075 uint32_t srcBLen, 00076 float32_t * pDst, 00077 uint32_t firstIndex, 00078 uint32_t numPoints) 00079 { 00080 float32_t *pIn1 = pSrcA; /* inputA pointer */ 00081 float32_t *pIn2 = pSrcB; /* inputB pointer */ 00082 float32_t *pOut = pDst; /* output pointer */ 00083 float32_t *px; /* Intermediate inputA pointer */ 00084 float32_t *py; /* Intermediate inputB pointer */ 00085 float32_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00086 float32_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00087 float32_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */ 00088 uint32_t j, k, count = 0u, blkCnt, check; 00089 int32_t blockSize1, blockSize2, blockSize3; /* loop counters */ 00090 arm_status status; /* status of Partial convolution */ 00091 00092 00093 /* Check for range of output samples to be calculated */ 00094 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00095 { 00096 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00097 status = ARM_MATH_ARGUMENT_ERROR; 00098 } 00099 else 00100 { 00101 
00102 /* The algorithm implementation is based on the lengths of the inputs. */ 00103 /* srcB is always made to slide across srcA. */ 00104 /* So srcBLen is always considered as shorter or equal to srcALen */ 00105 if(srcALen >= srcBLen) 00106 { 00107 /* Initialization of inputA pointer */ 00108 pIn1 = pSrcA; 00109 00110 /* Initialization of inputB pointer */ 00111 pIn2 = pSrcB; 00112 } 00113 else 00114 { 00115 /* Initialization of inputA pointer */ 00116 pIn1 = pSrcB; 00117 00118 /* Initialization of inputB pointer */ 00119 pIn2 = pSrcA; 00120 00121 /* srcBLen is always considered as shorter or equal to srcALen */ 00122 j = srcBLen; 00123 srcBLen = srcALen; 00124 srcALen = j; 00125 } 00126 00127 /* Conditions to check which loopCounter holds 00128 * the first and last indices of the output samples to be calculated. */ 00129 check = firstIndex + numPoints; 00130 blockSize3 = (int32_t) check - (int32_t) srcALen; 00131 blockSize3 = (blockSize3 > 0) ? blockSize3 : 0; 00132 blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex; 00133 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 : 00134 (int32_t) numPoints) : 0; 00135 blockSize2 = ((int32_t) check - blockSize3) - 00136 (blockSize1 + (int32_t) firstIndex); 00137 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; 00138 00139 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00140 /* The function is internally 00141 * divided into three stages according to the number of multiplications that has to be 00142 * taken place between inputA samples and inputB samples. In the first stage of the 00143 * algorithm, the multiplications increase by one for every iteration. 00144 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00145 * In the third stage of the algorithm, the multiplications decrease by one 00146 * for every iteration. 
*/ 00147 00148 /* Set the output pointer to point to the firstIndex 00149 * of the output sample to be calculated. */ 00150 pOut = pDst + firstIndex; 00151 00152 /* -------------------------- 00153 * Initializations of stage1 00154 * -------------------------*/ 00155 00156 /* sum = x[0] * y[0] 00157 * sum = x[0] * y[1] + x[1] * y[0] 00158 * .... 00159 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00160 */ 00161 00162 /* In this stage the MAC operations are increased by 1 for every iteration. 00163 The count variable holds the number of MAC operations performed. 00164 Since the partial convolution starts from from firstIndex 00165 Number of Macs to be performed is firstIndex + 1 */ 00166 count = 1u + firstIndex; 00167 00168 /* Working pointer of inputA */ 00169 px = pIn1; 00170 00171 /* Working pointer of inputB */ 00172 pSrc1 = pIn2 + firstIndex; 00173 py = pSrc1; 00174 00175 /* ------------------------ 00176 * Stage1 process 00177 * ----------------------*/ 00178 00179 /* The first stage starts here */ 00180 while(blockSize1 > 0) 00181 { 00182 /* Accumulator is made zero for every iteration */ 00183 sum = 0.0f; 00184 00185 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00186 k = count >> 2u; 00187 00188 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00189 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00190 while(k > 0u) 00191 { 00192 /* x[0] * y[srcBLen - 1] */ 00193 sum += *px++ * *py--; 00194 00195 /* x[1] * y[srcBLen - 2] */ 00196 sum += *px++ * *py--; 00197 00198 /* x[2] * y[srcBLen - 3] */ 00199 sum += *px++ * *py--; 00200 00201 /* x[3] * y[srcBLen - 4] */ 00202 sum += *px++ * *py--; 00203 00204 /* Decrement the loop counter */ 00205 k--; 00206 } 00207 00208 /* If the count is not a multiple of 4, compute any remaining MACs here. 00209 ** No loop unrolling is used. 
*/ 00210 k = count % 0x4u; 00211 00212 while(k > 0u) 00213 { 00214 /* Perform the multiply-accumulates */ 00215 sum += *px++ * *py--; 00216 00217 /* Decrement the loop counter */ 00218 k--; 00219 } 00220 00221 /* Store the result in the accumulator in the destination buffer. */ 00222 *pOut++ = sum; 00223 00224 /* Update the inputA and inputB pointers for next MAC calculation */ 00225 py = ++pSrc1; 00226 px = pIn1; 00227 00228 /* Increment the MAC count */ 00229 count++; 00230 00231 /* Decrement the loop counter */ 00232 blockSize1--; 00233 } 00234 00235 /* -------------------------- 00236 * Initializations of stage2 00237 * ------------------------*/ 00238 00239 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00240 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00241 * .... 00242 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00243 */ 00244 00245 /* Working pointer of inputA */ 00246 px = pIn1; 00247 00248 /* Working pointer of inputB */ 00249 pSrc2 = pIn2 + (srcBLen - 1u); 00250 py = pSrc2; 00251 00252 /* count is index by which the pointer pIn1 to be incremented */ 00253 count = 1u; 00254 00255 /* ------------------- 00256 * Stage2 process 00257 * ------------------*/ 00258 00259 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00260 * So, to loop unroll over blockSize2, 00261 * srcBLen should be greater than or equal to 4 */ 00262 if(srcBLen >= 4u) 00263 { 00264 /* Loop unroll over blockSize2, by 4 */ 00265 blkCnt = ((uint32_t) blockSize2 >> 2u); 00266 00267 while(blkCnt > 0u) 00268 { 00269 /* Set all accumulators to zero */ 00270 acc0 = 0.0f; 00271 acc1 = 0.0f; 00272 acc2 = 0.0f; 00273 acc3 = 0.0f; 00274 00275 /* read x[0], x[1], x[2] samples */ 00276 x0 = *(px++); 00277 x1 = *(px++); 00278 x2 = *(px++); 00279 00280 /* Apply loop unrolling and compute 4 MACs simultaneously. 
*/ 00281 k = srcBLen >> 2u; 00282 00283 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00284 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00285 do 00286 { 00287 /* Read y[srcBLen - 1] sample */ 00288 c0 = *(py--); 00289 00290 /* Read x[3] sample */ 00291 x3 = *(px++); 00292 00293 /* Perform the multiply-accumulate */ 00294 /* acc0 += x[0] * y[srcBLen - 1] */ 00295 acc0 += x0 * c0; 00296 00297 /* acc1 += x[1] * y[srcBLen - 1] */ 00298 acc1 += x1 * c0; 00299 00300 /* acc2 += x[2] * y[srcBLen - 1] */ 00301 acc2 += x2 * c0; 00302 00303 /* acc3 += x[3] * y[srcBLen - 1] */ 00304 acc3 += x3 * c0; 00305 00306 /* Read y[srcBLen - 2] sample */ 00307 c0 = *(py--); 00308 00309 /* Read x[4] sample */ 00310 x0 = *(px++); 00311 00312 /* Perform the multiply-accumulate */ 00313 /* acc0 += x[1] * y[srcBLen - 2] */ 00314 acc0 += x1 * c0; 00315 /* acc1 += x[2] * y[srcBLen - 2] */ 00316 acc1 += x2 * c0; 00317 /* acc2 += x[3] * y[srcBLen - 2] */ 00318 acc2 += x3 * c0; 00319 /* acc3 += x[4] * y[srcBLen - 2] */ 00320 acc3 += x0 * c0; 00321 00322 /* Read y[srcBLen - 3] sample */ 00323 c0 = *(py--); 00324 00325 /* Read x[5] sample */ 00326 x1 = *(px++); 00327 00328 /* Perform the multiply-accumulates */ 00329 /* acc0 += x[2] * y[srcBLen - 3] */ 00330 acc0 += x2 * c0; 00331 /* acc1 += x[3] * y[srcBLen - 2] */ 00332 acc1 += x3 * c0; 00333 /* acc2 += x[4] * y[srcBLen - 2] */ 00334 acc2 += x0 * c0; 00335 /* acc3 += x[5] * y[srcBLen - 2] */ 00336 acc3 += x1 * c0; 00337 00338 /* Read y[srcBLen - 4] sample */ 00339 c0 = *(py--); 00340 00341 /* Read x[6] sample */ 00342 x2 = *(px++); 00343 00344 /* Perform the multiply-accumulates */ 00345 /* acc0 += x[3] * y[srcBLen - 4] */ 00346 acc0 += x3 * c0; 00347 /* acc1 += x[4] * y[srcBLen - 4] */ 00348 acc1 += x0 * c0; 00349 /* acc2 += x[5] * y[srcBLen - 4] */ 00350 acc2 += x1 * c0; 00351 /* acc3 += x[6] * y[srcBLen - 4] */ 00352 acc3 += x2 * c0; 00353 00354 00355 } while(--k); 00356 00357 /* If 
the srcBLen is not a multiple of 4, compute any remaining MACs here. 00358 ** No loop unrolling is used. */ 00359 k = srcBLen % 0x4u; 00360 00361 while(k > 0u) 00362 { 00363 /* Read y[srcBLen - 5] sample */ 00364 c0 = *(py--); 00365 00366 /* Read x[7] sample */ 00367 x3 = *(px++); 00368 00369 /* Perform the multiply-accumulates */ 00370 /* acc0 += x[4] * y[srcBLen - 5] */ 00371 acc0 += x0 * c0; 00372 /* acc1 += x[5] * y[srcBLen - 5] */ 00373 acc1 += x1 * c0; 00374 /* acc2 += x[6] * y[srcBLen - 5] */ 00375 acc2 += x2 * c0; 00376 /* acc3 += x[7] * y[srcBLen - 5] */ 00377 acc3 += x3 * c0; 00378 00379 /* Reuse the present samples for the next MAC */ 00380 x0 = x1; 00381 x1 = x2; 00382 x2 = x3; 00383 00384 /* Decrement the loop counter */ 00385 k--; 00386 } 00387 00388 /* Store the result in the accumulator in the destination buffer. */ 00389 *pOut++ = acc0; 00390 *pOut++ = acc1; 00391 *pOut++ = acc2; 00392 *pOut++ = acc3; 00393 00394 /* Update the inputA and inputB pointers for next MAC calculation */ 00395 px = pIn1 + (count * 4u); 00396 py = pSrc2; 00397 00398 /* Increment the pointer pIn1 index, count by 1 */ 00399 count++; 00400 00401 /* Decrement the loop counter */ 00402 blkCnt--; 00403 } 00404 00405 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00406 ** No loop unrolling is used. */ 00407 blkCnt = (uint32_t) blockSize2 % 0x4u; 00408 00409 while(blkCnt > 0u) 00410 { 00411 /* Accumulator is made zero for every iteration */ 00412 sum = 0.0f; 00413 00414 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00415 k = srcBLen >> 2u; 00416 00417 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00418 ** a second loop below computes MACs for the remaining 1 to 3 samples. 
*/ 00419 while(k > 0u) 00420 { 00421 /* Perform the multiply-accumulates */ 00422 sum += *px++ * *py--; 00423 sum += *px++ * *py--; 00424 sum += *px++ * *py--; 00425 sum += *px++ * *py--; 00426 00427 /* Decrement the loop counter */ 00428 k--; 00429 } 00430 00431 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00432 ** No loop unrolling is used. */ 00433 k = srcBLen % 0x4u; 00434 00435 while(k > 0u) 00436 { 00437 /* Perform the multiply-accumulate */ 00438 sum += *px++ * *py--; 00439 00440 /* Decrement the loop counter */ 00441 k--; 00442 } 00443 00444 /* Store the result in the accumulator in the destination buffer. */ 00445 *pOut++ = sum; 00446 00447 /* Update the inputA and inputB pointers for next MAC calculation */ 00448 px = pIn1 + count; 00449 py = pSrc2; 00450 00451 /* Increment the MAC count */ 00452 count++; 00453 00454 /* Decrement the loop counter */ 00455 blkCnt--; 00456 } 00457 } 00458 else 00459 { 00460 /* If the srcBLen is not a multiple of 4, 00461 * the blockSize2 loop cannot be unrolled by 4 */ 00462 blkCnt = (uint32_t) blockSize2; 00463 00464 while(blkCnt > 0u) 00465 { 00466 /* Accumulator is made zero for every iteration */ 00467 sum = 0.0f; 00468 00469 /* srcBLen number of MACS should be performed */ 00470 k = srcBLen; 00471 00472 while(k > 0u) 00473 { 00474 /* Perform the multiply-accumulate */ 00475 sum += *px++ * *py--; 00476 00477 /* Decrement the loop counter */ 00478 k--; 00479 } 00480 00481 /* Store the result in the accumulator in the destination buffer. 
*/ 00482 *pOut++ = sum; 00483 00484 /* Update the inputA and inputB pointers for next MAC calculation */ 00485 px = pIn1 + count; 00486 py = pSrc2; 00487 00488 /* Increment the MAC count */ 00489 count++; 00490 00491 /* Decrement the loop counter */ 00492 blkCnt--; 00493 } 00494 } 00495 00496 00497 /* -------------------------- 00498 * Initializations of stage3 00499 * -------------------------*/ 00500 00501 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00502 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00503 * .... 00504 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00505 * sum += x[srcALen-1] * y[srcBLen-1] 00506 */ 00507 00508 /* In this stage the MAC operations are decreased by 1 for every iteration. 00509 The count variable holds the number of MAC operations performed */ 00510 count = srcBLen - 1u; 00511 00512 /* Working pointer of inputA */ 00513 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00514 px = pSrc1; 00515 00516 /* Working pointer of inputB */ 00517 pSrc2 = pIn2 + (srcBLen - 1u); 00518 py = pSrc2; 00519 00520 while(blockSize3 > 0) 00521 { 00522 /* Accumulator is made zero for every iteration */ 00523 sum = 0.0f; 00524 00525 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00526 k = count >> 2u; 00527 00528 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00529 ** a second loop below computes MACs for the remaining 1 to 3 samples. 
*/ 00530 while(k > 0u) 00531 { 00532 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ 00533 sum += *px++ * *py--; 00534 00535 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ 00536 sum += *px++ * *py--; 00537 00538 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ 00539 sum += *px++ * *py--; 00540 00541 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ 00542 sum += *px++ * *py--; 00543 00544 /* Decrement the loop counter */ 00545 k--; 00546 } 00547 00548 /* If the count is not a multiple of 4, compute any remaining MACs here. 00549 ** No loop unrolling is used. */ 00550 k = count % 0x4u; 00551 00552 while(k > 0u) 00553 { 00554 /* Perform the multiply-accumulates */ 00555 /* sum += x[srcALen-1] * y[srcBLen-1] */ 00556 sum += *px++ * *py--; 00557 00558 /* Decrement the loop counter */ 00559 k--; 00560 } 00561 00562 /* Store the result in the accumulator in the destination buffer. */ 00563 *pOut++ = sum; 00564 00565 /* Update the inputA and inputB pointers for next MAC calculation */ 00566 px = ++pSrc1; 00567 py = pSrc2; 00568 00569 /* Decrement the MAC count */ 00570 count--; 00571 00572 /* Decrement the loop counter */ 00573 blockSize3--; 00574 00575 } 00576 00577 /* set status as ARM_MATH_SUCCESS */ 00578 status = ARM_MATH_SUCCESS; 00579 } 00580 00581 /* Return to application */ 00582 return (status); 00583 00584 } 00585 00586 /** 00587 * @} end of PartialConv group 00588 */
Generated on Tue Jul 12 2022 14:13:52 by Doxygen 1.7.2