CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details
Dependents: K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more
arm_conv_partial_q31.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 29. November 2010 00005 * $Revision: V1.0.3 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_partial_q31.c 00009 * 00010 * Description: Q31 Partial convolution. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Version 1.0.3 2010/11/29 00015 * Re-organized the CMSIS folders and updated documentation. 00016 * 00017 * Version 1.0.2 2010/11/11 00018 * Documentation updated. 00019 * 00020 * Version 1.0.1 2010/10/05 00021 * Production release and review comments incorporated. 00022 * 00023 * Version 1.0.0 2010/09/20 00024 * Production release and review comments incorporated 00025 * 00026 * Version 0.0.7 2010/06/10 00027 * Misra-C changes done 00028 * 00029 * -------------------------------------------------------------------- */ 00030 00031 #include "arm_math.h" 00032 00033 /** 00034 * @ingroup groupFilters 00035 */ 00036 00037 /** 00038 * @addtogroup PartialConv 00039 * @{ 00040 */ 00041 00042 /** 00043 * @brief Partial convolution of Q31 sequences. 00044 * @param[in] *pSrcA points to the first input sequence. 00045 * @param[in] srcALen length of the first input sequence. 00046 * @param[in] *pSrcB points to the second input sequence. 00047 * @param[in] srcBLen length of the second input sequence. 00048 * @param[out] *pDst points to the location where the output result is written. 00049 * @param[in] firstIndex is the first output sample to start with. 00050 * @param[in] numPoints is the number of output points to be computed. 00051 * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. 00052 * 00053 * See <code>arm_conv_partial_fast_q31()</code> for a faster but less precise implementation of this function. 
/**
 * @brief Partial convolution of Q31 sequences.
 * @param[in]  *pSrcA      points to the first input sequence.
 * @param[in]  srcALen     length of the first input sequence.
 * @param[in]  *pSrcB      points to the second input sequence.
 * @param[in]  srcBLen     length of the second input sequence.
 * @param[out] *pDst       points to the location where the output result is written.
 * @param[in]  firstIndex  is the first output sample to start with.
 * @param[in]  numPoints   is the number of output points to be computed.
 * @return ARM_MATH_SUCCESS if the requested samples were computed, or
 *         ARM_MATH_ARGUMENT_ERROR if either input is empty or the window
 *         [firstIndex, firstIndex + numPoints) does not lie inside
 *         [0, srcALen + srcBLen - 1). Nothing is written on error.
 *
 * Output sample n of the full convolution is stored at pDst[n], so the
 * results occupy pDst[firstIndex] .. pDst[firstIndex + numPoints - 1] and
 * every other element of pDst is left untouched.
 *
 * Every product is accumulated at full precision in a 64-bit accumulator and
 * the sum is converted to Q31 by an arithmetic shift right of 31 bits. No
 * saturation is applied; the inputs must be scaled so that the 64-bit sum
 * cannot overflow.
 *
 * See <code>arm_conv_partial_fast_q31()</code> for a faster but less precise
 * implementation of this function.
 */
arm_status arm_conv_partial_q31(
  q31_t * pSrcA,
  uint32_t srcALen,
  q31_t * pSrcB,
  uint32_t srcBLen,
  q31_t * pDst,
  uint32_t firstIndex,
  uint32_t numPoints)
{
  const q31_t *pLong;           /* longer input, walked with rising index   */
  const q31_t *pShort;          /* shorter input, walked with falling index */
  uint32_t longLen, shortLen;   /* lengths of pLong / pShort                */
  uint32_t outLen;              /* length of the full convolution           */
  uint32_t n;                   /* index of the output sample in progress   */
  uint32_t end;                 /* one past the last requested output       */
  uint32_t fullEnd;             /* one past the last requested output that  */
                                /* still uses every tap of pShort           */
  arm_status status;            /* status of Partial convolution            */

  /* Only meaningful when both lengths are non-zero, which is tested first. */
  outLen = srcALen + (srcBLen - 1u);

  if((srcALen == 0u) || (srcBLen == 0u))
  {
    /* The convolution with an empty sequence has no samples at all. */
    status = ARM_MATH_ARGUMENT_ERROR;
  }
  else if((numPoints > outLen) || (firstIndex > (outLen - numPoints)))
  {
    /* Written without forming firstIndex + numPoints so that a wrapping
     * 32-bit sum cannot slip through the range check. */
    status = ARM_MATH_ARGUMENT_ERROR;
  }
  else
  {
    /* Convolution is commutative, so the shorter sequence is always the one
     * that slides across the longer one. */
    if(srcALen >= srcBLen)
    {
      pLong = pSrcA;
      longLen = srcALen;
      pShort = pSrcB;
      shortLen = srcBLen;
    }
    else
    {
      pLong = pSrcB;
      longLen = srcBLen;
      pShort = pSrcA;
      shortLen = srcALen;
    }

    n = firstIndex;
    end = firstIndex + numPoints;       /* <= outLen, so it cannot wrap */
    fullEnd = (end < longLen) ? end : longLen;

    /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + ... + x[n-N+1] * y[N-1]
     * Outputs below shortLen-1 use a growing number of taps, outputs in
     * [shortLen-1, longLen) use all shortLen taps and outputs from longLen
     * onwards use a shrinking number of taps. */
    while(n < end)
    {
      if((shortLen >= 4u) && (n >= (shortLen - 1u)) &&
         (n < fullEnd) && ((fullEnd - n) >= 4u))
      {
        /* Four consecutive outputs that all use every tap: each tap and
         * each input sample is loaded once and shared by four accumulators. */
        const q31_t *px = pLong + (n - (shortLen - 1u));
        q63_t acc0 = 0;
        q63_t acc1 = 0;
        q63_t acc2 = 0;
        q63_t acc3 = 0;
        q31_t x0 = px[0];
        q31_t x1 = px[1];
        q31_t x2 = px[2];
        q31_t x3;
        q31_t c0;
        uint32_t j = shortLen;          /* taps left; the next one is pShort[j-1] */

        px += 3;

        /* Four taps per pass; x0..x3 rotate so that no sample is re-read. */
        while(j >= 4u)
        {
          c0 = pShort[j - 1u];
          x3 = px[0];
          acc0 += (q63_t) x0 * c0;
          acc1 += (q63_t) x1 * c0;
          acc2 += (q63_t) x2 * c0;
          acc3 += (q63_t) x3 * c0;

          c0 = pShort[j - 2u];
          x0 = px[1];
          acc0 += (q63_t) x1 * c0;
          acc1 += (q63_t) x2 * c0;
          acc2 += (q63_t) x3 * c0;
          acc3 += (q63_t) x0 * c0;

          c0 = pShort[j - 3u];
          x1 = px[2];
          acc0 += (q63_t) x2 * c0;
          acc1 += (q63_t) x3 * c0;
          acc2 += (q63_t) x0 * c0;
          acc3 += (q63_t) x1 * c0;

          c0 = pShort[j - 4u];
          x2 = px[3];
          acc0 += (q63_t) x3 * c0;
          acc1 += (q63_t) x0 * c0;
          acc2 += (q63_t) x1 * c0;
          acc3 += (q63_t) x2 * c0;

          px += 4;
          j -= 4u;
        }

        /* Remaining 0..3 taps when shortLen is not a multiple of 4. */
        while(j > 0u)
        {
          j--;
          c0 = pShort[j];
          x3 = *px++;
          acc0 += (q63_t) x0 * c0;
          acc1 += (q63_t) x1 * c0;
          acc2 += (q63_t) x2 * c0;
          acc3 += (q63_t) x3 * c0;

          /* Reuse the present samples for the next tap */
          x0 = x1;
          x1 = x2;
          x2 = x3;
        }

        pDst[n] = (q31_t) (acc0 >> 31);
        pDst[n + 1u] = (q31_t) (acc1 >> 31);
        pDst[n + 2u] = (q31_t) (acc2 >> 31);
        pDst[n + 3u] = (q31_t) (acc3 >> 31);

        n += 4u;
      }
      else
      {
        /* One output. Only the samples pLong[kLo..kHi] overlap the taps;
         * pLong[kLo + i] is paired with pShort[n - kLo - i]. */
        uint32_t kLo = (n >= (shortLen - 1u)) ? (n - (shortLen - 1u)) : 0u;
        uint32_t kHi = (n < longLen) ? n : (longLen - 1u);
        const q31_t *px = pLong + kLo;
        const q31_t *py = pShort + (n - kHi);   /* lowest tap that is used */
        uint32_t j = (kHi - kLo) + 1u;          /* MACs left; next tap is py[j-1] */
        q63_t sum = 0;

        /* Four MACs per pass. */
        while(j >= 4u)
        {
          sum += (q63_t) px[0] * py[j - 1u];
          sum += (q63_t) px[1] * py[j - 2u];
          sum += (q63_t) px[2] * py[j - 3u];
          sum += (q63_t) px[3] * py[j - 4u];

          px += 4;
          j -= 4u;
        }

        /* Remaining 0..3 MACs. */
        while(j > 0u)
        {
          j--;
          sum += (q63_t) (*px) * py[j];
          px++;
        }

        pDst[n] = (q31_t) (sum >> 31);

        n++;
      }
    }

    /* set status as ARM_MATH_SUCCESS */
    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}

/**
 * @} end of PartialConv group
 */
Generated on Tue Jul 12 2022 14:13:52 by 1.7.2