CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details
Dependents: K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more
arm_conv_partial_fast_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 29. November 2010 00005 * $Revision: V1.0.3 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_partial_fast_q15.c 00009 * 00010 * Description: Fast Q15 Partial convolution. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Version 1.0.3 2010/11/29 00015 * Re-organized the CMSIS folders and updated documentation. 00016 * 00017 * Version 1.0.2 2010/11/11 00018 * Documentation updated. 00019 * 00020 * Version 1.0.1 2010/10/05 00021 * Production release and review comments incorporated. 00022 * 00023 * Version 1.0.0 2010/09/20 00024 * Production release and review comments incorporated. 00025 * -------------------------------------------------------------------- */ 00026 00027 #include "arm_math.h" 00028 00029 /** 00030 * @ingroup groupFilters 00031 */ 00032 00033 /** 00034 * @addtogroup PartialConv 00035 * @{ 00036 */ 00037 00038 /** 00039 * @brief Partial convolution of Q15 sequences (fast version). 00040 * @param[in] *pSrcA points to the first input sequence. 00041 * @param[in] srcALen length of the first input sequence. 00042 * @param[in] *pSrcB points to the second input sequence. 00043 * @param[in] srcBLen length of the second input sequence. 00044 * @param[out] *pDst points to the location where the output result is written. 00045 * @param[in] firstIndex is the first output sample to start with. 00046 * @param[in] numPoints is the number of output points to be computed. 00047 * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. 00048 * 00049 * See <code>arm_conv_partial_q15()</code> for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion. 00050 */ 00051 00052 00053 arm_status arm_conv_partial_fast_q15( 00054 q15_t * pSrcA, 00055 uint32_t srcALen, 00056 q15_t * pSrcB, 00057 uint32_t srcBLen, 00058 q15_t * pDst, 00059 uint32_t firstIndex, 00060 uint32_t numPoints) 00061 { 00062 q15_t *pIn1; /* inputA pointer */ 00063 q15_t *pIn2; /* inputB pointer */ 00064 q15_t *pOut = pDst; /* output pointer */ 00065 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00066 q15_t *px; /* Intermediate inputA pointer */ 00067 q15_t *py; /* Intermediate inputB pointer */ 00068 q15_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00069 q31_t x0, x1, x2, x3, c0; 00070 uint32_t j, k, count, check, blkCnt; 00071 int32_t blockSize1, blockSize2, blockSize3; /* loop counters */ 00072 arm_status status; /* status of Partial convolution */ 00073 q31_t *pb; /* 32 bit pointer for inputB buffer */ 00074 00075 /* Check for range of output samples to be calculated */ 00076 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00077 { 00078 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00079 status = ARM_MATH_ARGUMENT_ERROR; 00080 } 00081 else 00082 { 00083 00084 /* The algorithm implementation is based on the lengths of the inputs. */ 00085 /* srcB is always made to slide across srcA. */ 00086 /* So srcBLen is always considered as shorter or equal to srcALen */ 00087 if(srcALen >= srcBLen) 00088 { 00089 /* Initialization of inputA pointer */ 00090 pIn1 = pSrcA; 00091 00092 /* Initialization of inputB pointer */ 00093 pIn2 = pSrcB; 00094 } 00095 else 00096 { 00097 /* Initialization of inputA pointer */ 00098 pIn1 = pSrcB; 00099 00100 /* Initialization of inputB pointer */ 00101 pIn2 = pSrcA; 00102 00103 /* srcBLen is always considered as shorter or equal to srcALen */ 00104 j = srcBLen; 00105 srcBLen = srcALen; 00106 srcALen = j; 00107 } 00108 00109 /* Conditions to check which loopCounter holds 00110 * the first and last indices of the output samples to be calculated. */ 00111 check = firstIndex + numPoints; 00112 blockSize3 = ((int32_t) check - (int32_t) srcALen); 00113 blockSize3 = (blockSize3 > 0) ? blockSize3 : 0; 00114 blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); 00115 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 : 00116 (int32_t) numPoints) : 0; 00117 blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + 00118 (int32_t) firstIndex); 00119 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; 00120 00121 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00122 /* The function is internally 00123 * divided into three stages according to the number of multiplications that has to be 00124 * taken place between inputA samples and inputB samples. In the first stage of the 00125 * algorithm, the multiplications increase by one for every iteration. 00126 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00127 * In the third stage of the algorithm, the multiplications decrease by one 00128 * for every iteration. */ 00129 00130 /* Set the output pointer to point to the firstIndex 00131 * of the output sample to be calculated. */ 00132 pOut = pDst + firstIndex; 00133 00134 /* -------------------------- 00135 * Initializations of stage1 00136 * -------------------------*/ 00137 00138 /* sum = x[0] * y[0] 00139 * sum = x[0] * y[1] + x[1] * y[0] 00140 * .... 00141 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00142 */ 00143 00144 /* In this stage the MAC operations are increased by 1 for every iteration. 00145 The count variable holds the number of MAC operations performed. 00146 Since the partial convolution starts from firstIndex 00147 Number of Macs to be performed is firstIndex + 1 */ 00148 count = 1u + firstIndex; 00149 00150 /* Working pointer of inputA */ 00151 px = pIn1; 00152 00153 /* Working pointer of inputB */ 00154 pSrc2 = pIn2 + firstIndex; 00155 py = pSrc2; 00156 00157 /* ------------------------ 00158 * Stage1 process 00159 * ----------------------*/ 00160 00161 /* For loop unrolling by 4, this stage is divided into two. */ 00162 /* First part of this stage computes the MAC operations less than 4 */ 00163 /* Second part of this stage computes the MAC operations greater than or equal to 4 */ 00164 00165 /* The first part of the stage starts here */ 00166 while((count < 4u) && (blockSize1 > 0)) 00167 { 00168 /* Accumulator is made zero for every iteration */ 00169 sum = 0; 00170 00171 /* Loop over number of MAC operations between 00172 * inputA samples and inputB samples */ 00173 k = count; 00174 00175 while(k > 0u) 00176 { 00177 /* Perform the multiply-accumulates */ 00178 sum = __SMLAD(*px++, *py--, sum); 00179 00180 /* Decrement the loop counter */ 00181 k--; 00182 } 00183 00184 /* Store the result in the accumulator in the destination buffer. */ 00185 *pOut++ = (q15_t) (sum >> 15); 00186 00187 /* Update the inputA and inputB pointers for next MAC calculation */ 00188 py = ++pSrc2; 00189 px = pIn1; 00190 00191 /* Increment the MAC count */ 00192 count++; 00193 00194 /* Decrement the loop counter */ 00195 blockSize1--; 00196 } 00197 00198 /* The second part of the stage starts here */ 00199 /* The internal loop, over count, is unrolled by 4 */ 00200 /* To, read the last two inputB samples using SIMD: 00201 * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ 00202 py = py - 1; 00203 00204 while(blockSize1 > 0) 00205 { 00206 /* Accumulator is made zero for every iteration */ 00207 sum = 0; 00208 00209 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00210 k = count >> 2u; 00211 00212 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00213 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00214 while(k > 0u) 00215 { 00216 /* Perform the multiply-accumulates */ 00217 /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */ 00218 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00219 /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */ 00220 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00221 00222 /* Decrement the loop counter */ 00223 k--; 00224 } 00225 00226 /* For the next MAC operations, the pointer py is used without SIMD 00227 * So, py is incremented by 1 */ 00228 py = py + 1u; 00229 00230 /* If the count is not a multiple of 4, compute any remaining MACs here. 00231 ** No loop unrolling is used. */ 00232 k = count % 0x4u; 00233 00234 while(k > 0u) 00235 { 00236 /* Perform the multiply-accumulates */ 00237 sum = __SMLAD(*px++, *py--, sum); 00238 00239 /* Decrement the loop counter */ 00240 k--; 00241 } 00242 00243 /* Store the result in the accumulator in the destination buffer. */ 00244 *pOut++ = (q15_t) (sum >> 15); 00245 00246 /* Update the inputA and inputB pointers for next MAC calculation */ 00247 py = ++pSrc2 - 1u; 00248 px = pIn1; 00249 00250 /* Increment the MAC count */ 00251 count++; 00252 00253 /* Decrement the loop counter */ 00254 blockSize1--; 00255 } 00256 00257 /* -------------------------- 00258 * Initializations of stage2 00259 * ------------------------*/ 00260 00261 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00262 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00263 * .... 00264 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00265 */ 00266 00267 /* Working pointer of inputA */ 00268 px = pIn1; 00269 00270 /* Working pointer of inputB */ 00271 pSrc2 = pIn2 + (srcBLen - 1u); 00272 py = pSrc2; 00273 00274 /* Initialize inputB pointer of type q31 */ 00275 pb = (q31_t *) (py - 1u); 00276 00277 /* count is the index by which the pointer pIn1 to be incremented */ 00278 count = 1u; 00279 00280 00281 /* -------------------- 00282 * Stage2 process 00283 * -------------------*/ 00284 00285 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00286 * So, to loop unroll over blockSize2, 00287 * srcBLen should be greater than or equal to 4 */ 00288 if(srcBLen >= 4u) 00289 { 00290 /* Loop unroll over blockSize2, by 4 */ 00291 blkCnt = ((uint32_t) blockSize2 >> 2u); 00292 00293 while(blkCnt > 0u) 00294 { 00295 /* Set all accumulators to zero */ 00296 acc0 = 0; 00297 acc1 = 0; 00298 acc2 = 0; 00299 acc3 = 0; 00300 00301 00302 /* read x[0], x[1] samples */ 00303 x0 = *(q31_t *) (px++); 00304 /* read x[1], x[2] samples */ 00305 x1 = *(q31_t *) (px++); 00306 00307 00308 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00309 k = srcBLen >> 2u; 00310 00311 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00312 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00313 do 00314 { 00315 /* Read the last two inputB samples using SIMD: 00316 * y[srcBLen - 1] and y[srcBLen - 2] */ 00317 c0 = *(pb--); 00318 00319 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ 00320 acc0 = __SMLADX(x0, c0, acc0); 00321 00322 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ 00323 acc1 = __SMLADX(x1, c0, acc1); 00324 00325 /* Read x[2], x[3] */ 00326 x2 = *(q31_t *) (px++); 00327 00328 /* Read x[3], x[4] */ 00329 x3 = *(q31_t *) (px++); 00330 00331 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ 00332 acc2 = __SMLADX(x2, c0, acc2); 00333 00334 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ 00335 acc3 = __SMLADX(x3, c0, acc3); 00336 00337 /* Read y[srcBLen - 3] and y[srcBLen - 4] */ 00338 c0 = *(pb--); 00339 00340 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ 00341 acc0 = __SMLADX(x2, c0, acc0); 00342 00343 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ 00344 acc1 = __SMLADX(x3, c0, acc1); 00345 00346 /* Read x[4], x[5] */ 00347 x0 = *(q31_t *) (px++); 00348 00349 /* Read x[5], x[6] */ 00350 x1 = *(q31_t *) (px++); 00351 00352 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ 00353 acc2 = __SMLADX(x0, c0, acc2); 00354 00355 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ 00356 acc3 = __SMLADX(x1, c0, acc3); 00357 00358 } while(--k); 00359 00360 /* For the next MAC operations, SIMD is not used 00361 * So, the 16 bit pointer if inputB, py is updated */ 00362 py = (q15_t *) pb; 00363 py = py + 1; 00364 00365 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00366 ** No loop unrolling is used. */ 00367 k = srcBLen % 0x4u; 00368 00369 if(k == 1u) 00370 { 00371 /* Read y[srcBLen - 5] */ 00372 c0 = *(py); 00373 00374 /* Read x[7] */ 00375 x3 = *(q31_t *) px++; 00376 00377 /* Perform the multiply-accumulates */ 00378 acc0 = __SMLAD(x0, c0, acc0); 00379 acc1 = __SMLAD(x1, c0, acc1); 00380 acc2 = __SMLADX(x1, c0, acc2); 00381 acc3 = __SMLADX(x3, c0, acc3); 00382 } 00383 00384 if(k == 2u) 00385 { 00386 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 00387 c0 = *(pb); 00388 00389 /* Read x[7], x[8] */ 00390 x3 = *(q31_t *) px++; 00391 00392 /* Read x[9] */ 00393 x2 = *(q31_t *) px++; 00394 00395 /* Perform the multiply-accumulates */ 00396 acc0 = __SMLADX(x0, c0, acc0); 00397 acc1 = __SMLADX(x1, c0, acc1); 00398 acc2 = __SMLADX(x3, c0, acc2); 00399 acc3 = __SMLADX(x2, c0, acc3); 00400 } 00401 00402 if(k == 3u) 00403 { 00404 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 00405 c0 = *pb--; 00406 00407 /* Read x[7], x[8] */ 00408 x3 = *(q31_t *) px++; 00409 00410 /* Read x[9] */ 00411 x2 = *(q31_t *) px++; 00412 00413 /* Perform the multiply-accumulates */ 00414 acc0 = __SMLADX(x0, c0, acc0); 00415 acc1 = __SMLADX(x1, c0, acc1); 00416 acc2 = __SMLADX(x3, c0, acc2); 00417 acc3 = __SMLADX(x2, c0, acc3); 00418 00419 /* Read y[srcBLen - 7] */ 00420 c0 = (q15_t) (*pb >> 16); 00421 00422 /* Read x[10] */ 00423 x3 = *(q31_t *) px++; 00424 00425 /* Perform the multiply-accumulates */ 00426 acc0 = __SMLADX(x1, c0, acc0); 00427 acc1 = __SMLAD(x2, c0, acc1); 00428 acc2 = __SMLADX(x2, c0, acc2); 00429 acc3 = __SMLADX(x3, c0, acc3); 00430 } 00431 00432 /* Store the results in the accumulators in the destination buffer. */ 00433 *__SIMD32(pOut)++ = __PKHBT(acc0 >> 15, acc1 >> 15, 16); 00434 *__SIMD32(pOut)++ = __PKHBT(acc2 >> 15, acc3 >> 15, 16); 00435 00436 /* Update the inputA and inputB pointers for next MAC calculation */ 00437 px = pIn1 + (count * 4u); 00438 py = pSrc2; 00439 pb = (q31_t *) (py - 1); 00440 00441 /* Increment the pointer pIn1 index, count by 1 */ 00442 count++; 00443 00444 /* Decrement the loop counter */ 00445 blkCnt--; 00446 } 00447 00448 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00449 ** No loop unrolling is used. */ 00450 blkCnt = (uint32_t) blockSize2 % 0x4u; 00451 00452 while(blkCnt > 0u) 00453 { 00454 /* Accumulator is made zero for every iteration */ 00455 sum = 0; 00456 00457 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00458 k = srcBLen >> 2u; 00459 00460 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00461 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00462 while(k > 0u) 00463 { 00464 /* Perform the multiply-accumulates */ 00465 sum += ((q31_t) * px++ * *py--); 00466 sum += ((q31_t) * px++ * *py--); 00467 sum += ((q31_t) * px++ * *py--); 00468 sum += ((q31_t) * px++ * *py--); 00469 00470 /* Decrement the loop counter */ 00471 k--; 00472 } 00473 00474 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00475 ** No loop unrolling is used. */ 00476 k = srcBLen % 0x4u; 00477 00478 while(k > 0u) 00479 { 00480 /* Perform the multiply-accumulates */ 00481 sum += ((q31_t) * px++ * *py--); 00482 00483 /* Decrement the loop counter */ 00484 k--; 00485 } 00486 00487 /* Store the result in the accumulator in the destination buffer. */ 00488 *pOut++ = (q15_t) (sum >> 15); 00489 00490 /* Update the inputA and inputB pointers for next MAC calculation */ 00491 px = pIn1 + count; 00492 py = pSrc2; 00493 00494 /* Increment the pointer pIn1 index, count by 1 */ 00495 count++; 00496 00497 /* Decrement the loop counter */ 00498 blkCnt--; 00499 } 00500 } 00501 else 00502 { 00503 /* If the srcBLen is not a multiple of 4, 00504 * the blockSize2 loop cannot be unrolled by 4 */ 00505 blkCnt = (uint32_t) blockSize2; 00506 00507 while(blkCnt > 0u) 00508 { 00509 /* Accumulator is made zero for every iteration */ 00510 sum = 0; 00511 00512 /* srcBLen number of MACS should be performed */ 00513 k = srcBLen; 00514 00515 while(k > 0u) 00516 { 00517 /* Perform the multiply-accumulate */ 00518 sum += ((q31_t) * px++ * *py--); 00519 00520 /* Decrement the loop counter */ 00521 k--; 00522 } 00523 00524 /* Store the result in the accumulator in the destination buffer. */ 00525 *pOut++ = (q15_t) (sum >> 15); 00526 00527 /* Update the inputA and inputB pointers for next MAC calculation */ 00528 px = pIn1 + count; 00529 py = pSrc2; 00530 00531 /* Increment the MAC count */ 00532 count++; 00533 00534 /* Decrement the loop counter */ 00535 blkCnt--; 00536 } 00537 } 00538 00539 00540 /* -------------------------- 00541 * Initializations of stage3 00542 * -------------------------*/ 00543 00544 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00545 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00546 * .... 00547 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00548 * sum += x[srcALen-1] * y[srcBLen-1] 00549 */ 00550 00551 /* In this stage the MAC operations are decreased by 1 for every iteration. 00552 The count variable holds the number of MAC operations performed */ 00553 count = srcBLen - 1u; 00554 00555 /* Working pointer of inputA */ 00556 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00557 px = pSrc1; 00558 00559 /* Working pointer of inputB */ 00560 pSrc2 = pIn2 + (srcBLen - 1u); 00561 pIn2 = pSrc2 - 1u; 00562 py = pIn2; 00563 00564 /* ------------------- 00565 * Stage3 process 00566 * ------------------*/ 00567 00568 /* For loop unrolling by 4, this stage is divided into two. */ 00569 /* First part of this stage computes the MAC operations greater than 4 */ 00570 /* Second part of this stage computes the MAC operations less than or equal to 4 */ 00571 00572 /* The first part of the stage starts here */ 00573 j = count >> 2u; 00574 00575 while((j > 0u) && (blockSize3 > 0)) 00576 { 00577 /* Accumulator is made zero for every iteration */ 00578 sum = 0; 00579 00580 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00581 k = count >> 2u; 00582 00583 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00584 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00585 while(k > 0u) 00586 { 00587 /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied 00588 * with y[srcBLen - 1], y[srcBLen - 2] respectively */ 00589 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00590 /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied 00591 * with y[srcBLen - 3], y[srcBLen - 4] respectively */ 00592 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00593 00594 /* Decrement the loop counter */ 00595 k--; 00596 } 00597 00598 /* For the next MAC operations, the pointer py is used without SIMD 00599 * So, py is incremented by 1 */ 00600 py = py + 1u; 00601 00602 /* If the count is not a multiple of 4, compute any remaining MACs here. 00603 ** No loop unrolling is used. */ 00604 k = count % 0x4u; 00605 00606 while(k > 0u) 00607 { 00608 /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */ 00609 sum = __SMLAD(*px++, *py--, sum); 00610 00611 /* Decrement the loop counter */ 00612 k--; 00613 } 00614 00615 /* Store the result in the accumulator in the destination buffer. */ 00616 *pOut++ = (q15_t) (sum >> 15); 00617 00618 /* Update the inputA and inputB pointers for next MAC calculation */ 00619 px = ++pSrc1; 00620 py = pIn2; 00621 00622 /* Decrement the MAC count */ 00623 count--; 00624 00625 /* Decrement the loop counter */ 00626 blockSize3--; 00627 00628 j--; 00629 } 00630 00631 /* The second part of the stage starts here */ 00632 /* SIMD is not used for the next MAC operations, 00633 * so pointer py is updated to read only one sample at a time */ 00634 py = py + 1u; 00635 00636 while(blockSize3 > 0) 00637 { 00638 /* Accumulator is made zero for every iteration */ 00639 sum = 0; 00640 00641 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00642 k = count; 00643 00644 while(k > 0u) 00645 { 00646 /* Perform the multiply-accumulates */ 00647 /* sum += x[srcALen-1] * y[srcBLen-1] */ 00648 sum = __SMLAD(*px++, *py--, sum); 00649 00650 /* Decrement the loop counter */ 00651 k--; 00652 } 00653 00654 /* Store the result in the accumulator in the destination buffer. */ 00655 *pOut++ = (q15_t) (sum >> 15); 00656 00657 /* Update the inputA and inputB pointers for next MAC calculation */ 00658 px = ++pSrc1; 00659 py = pSrc2; 00660 00661 /* Decrement the MAC count */ 00662 count--; 00663 00664 /* Decrement the loop counter */ 00665 blockSize3--; 00666 } 00667 00668 /* set status as ARM_MATH_SUCCESS */ 00669 status = ARM_MATH_SUCCESS; 00670 } 00671 00672 /* Return to application */ 00673 return (status); 00674 00675 } 00676 00677 /** 00678 * @} end of PartialConv group 00679 */
Generated on Tue Jul 12 2022 14:13:52 by 1.7.2