CMSIS DSP library
Dependents: performance_timer Surfboard_ gps2rtty Capstone ... more
arm_conv_partial_f32.c
00001 /* ---------------------------------------------------------------------------- 00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 19. March 2015 00005 * $Revision: V.1.4.5 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_partial_f32.c 00009 * 00010 * Description: Partial convolution of floating-point sequences. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupFilters 00045 */ 00046 00047 /** 00048 * @defgroup PartialConv Partial Convolution 00049 * 00050 * Partial Convolution is equivalent to Convolution except that a subset of the output samples is generated. 00051 * Each function has two additional arguments. 00052 * <code>firstIndex</code> specifies the starting index of the subset of output samples. 00053 * <code>numPoints</code> is the number of output samples to compute. 00054 * The function computes the output in the range 00055 * <code>[firstIndex, ..., firstIndex+numPoints-1]</code>. 00056 * The output array <code>pDst</code> contains <code>numPoints</code> values. 00057 * 00058 * The allowable range of output indices is [0 srcALen+srcBLen-2]. 00059 * If the requested subset does not fall in this range then the functions return ARM_MATH_ARGUMENT_ERROR. 00060 * Otherwise the functions return ARM_MATH_SUCCESS. 00061 * \note Refer arm_conv_f32() for details on fixed point behavior. 00062 * 00063 * 00064 * <b>Fast Versions</b> 00065 * 00066 * \par 00067 * Fast versions are supported for Q31 and Q15 of partial convolution. Cycles for Fast versions are less compared to Q31 and Q15 of partial conv and the design requires 00068 * the input signals should be scaled down to avoid intermediate overflows. 00069 * 00070 * 00071 * <b>Opt Versions</b> 00072 * 00073 * \par 00074 * Opt versions are supported for Q15 and Q7. Design uses internal scratch buffer for getting good optimisation. 00075 * These versions are optimised in cycles and consumes more memory(Scratch memory) compared to Q15 and Q7 versions of partial convolution 00076 */ 00077 00078 /** 00079 * @addtogroup PartialConv 00080 * @{ 00081 */ 00082 00083 /** 00084 * @brief Partial convolution of floating-point sequences. 00085 * @param[in] *pSrcA points to the first input sequence. 00086 * @param[in] srcALen length of the first input sequence. 00087 * @param[in] *pSrcB points to the second input sequence. 00088 * @param[in] srcBLen length of the second input sequence. 00089 * @param[out] *pDst points to the location where the output result is written. 00090 * @param[in] firstIndex is the first output sample to start with. 00091 * @param[in] numPoints is the number of output points to be computed. 00092 * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. 00093 */ 00094 00095 arm_status arm_conv_partial_f32( 00096 float32_t * pSrcA, 00097 uint32_t srcALen, 00098 float32_t * pSrcB, 00099 uint32_t srcBLen, 00100 float32_t * pDst, 00101 uint32_t firstIndex, 00102 uint32_t numPoints) 00103 { 00104 00105 00106 #ifndef ARM_MATH_CM0_FAMILY 00107 00108 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00109 00110 float32_t *pIn1 = pSrcA; /* inputA pointer */ 00111 float32_t *pIn2 = pSrcB; /* inputB pointer */ 00112 float32_t *pOut = pDst; /* output pointer */ 00113 float32_t *px; /* Intermediate inputA pointer */ 00114 float32_t *py; /* Intermediate inputB pointer */ 00115 float32_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00116 float32_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00117 float32_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */ 00118 uint32_t j, k, count = 0u, blkCnt, check; 00119 int32_t blockSize1, blockSize2, blockSize3; /* loop counters */ 00120 arm_status status; /* status of Partial convolution */ 00121 00122 00123 /* Check for range of output samples to be calculated */ 00124 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00125 { 00126 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00127 status = ARM_MATH_ARGUMENT_ERROR; 00128 } 00129 else 00130 { 00131 00132 /* The algorithm implementation is based on the lengths of the inputs. */ 00133 /* srcB is always made to slide across srcA. */ 00134 /* So srcBLen is always considered as shorter or equal to srcALen */ 00135 if(srcALen >= srcBLen) 00136 { 00137 /* Initialization of inputA pointer */ 00138 pIn1 = pSrcA; 00139 00140 /* Initialization of inputB pointer */ 00141 pIn2 = pSrcB; 00142 } 00143 else 00144 { 00145 /* Initialization of inputA pointer */ 00146 pIn1 = pSrcB; 00147 00148 /* Initialization of inputB pointer */ 00149 pIn2 = pSrcA; 00150 00151 /* srcBLen is always considered as shorter or equal to srcALen */ 00152 j = srcBLen; 00153 srcBLen = srcALen; 00154 srcALen = j; 00155 } 00156 00157 /* Conditions to check which loopCounter holds 00158 * the first and last indices of the output samples to be calculated. */ 00159 check = firstIndex + numPoints; 00160 blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0; 00161 blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3; 00162 blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex; 00163 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 : 00164 (int32_t) numPoints) : 0; 00165 blockSize2 = ((int32_t) check - blockSize3) - 00166 (blockSize1 + (int32_t) firstIndex); 00167 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; 00168 00169 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00170 /* The function is internally 00171 * divided into three stages according to the number of multiplications that has to be 00172 * taken place between inputA samples and inputB samples. In the first stage of the 00173 * algorithm, the multiplications increase by one for every iteration. 00174 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00175 * In the third stage of the algorithm, the multiplications decrease by one 00176 * for every iteration. */ 00177 00178 /* Set the output pointer to point to the firstIndex 00179 * of the output sample to be calculated. */ 00180 pOut = pDst + firstIndex; 00181 00182 /* -------------------------- 00183 * Initializations of stage1 00184 * -------------------------*/ 00185 00186 /* sum = x[0] * y[0] 00187 * sum = x[0] * y[1] + x[1] * y[0] 00188 * .... 00189 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00190 */ 00191 00192 /* In this stage the MAC operations are increased by 1 for every iteration. 00193 The count variable holds the number of MAC operations performed. 00194 Since the partial convolution starts from from firstIndex 00195 Number of Macs to be performed is firstIndex + 1 */ 00196 count = 1u + firstIndex; 00197 00198 /* Working pointer of inputA */ 00199 px = pIn1; 00200 00201 /* Working pointer of inputB */ 00202 pSrc1 = pIn2 + firstIndex; 00203 py = pSrc1; 00204 00205 /* ------------------------ 00206 * Stage1 process 00207 * ----------------------*/ 00208 00209 /* The first stage starts here */ 00210 while(blockSize1 > 0) 00211 { 00212 /* Accumulator is made zero for every iteration */ 00213 sum = 0.0f; 00214 00215 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00216 k = count >> 2u; 00217 00218 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00219 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00220 while(k > 0u) 00221 { 00222 /* x[0] * y[srcBLen - 1] */ 00223 sum += *px++ * *py--; 00224 00225 /* x[1] * y[srcBLen - 2] */ 00226 sum += *px++ * *py--; 00227 00228 /* x[2] * y[srcBLen - 3] */ 00229 sum += *px++ * *py--; 00230 00231 /* x[3] * y[srcBLen - 4] */ 00232 sum += *px++ * *py--; 00233 00234 /* Decrement the loop counter */ 00235 k--; 00236 } 00237 00238 /* If the count is not a multiple of 4, compute any remaining MACs here. 00239 ** No loop unrolling is used. */ 00240 k = count % 0x4u; 00241 00242 while(k > 0u) 00243 { 00244 /* Perform the multiply-accumulates */ 00245 sum += *px++ * *py--; 00246 00247 /* Decrement the loop counter */ 00248 k--; 00249 } 00250 00251 /* Store the result in the accumulator in the destination buffer. */ 00252 *pOut++ = sum; 00253 00254 /* Update the inputA and inputB pointers for next MAC calculation */ 00255 py = ++pSrc1; 00256 px = pIn1; 00257 00258 /* Increment the MAC count */ 00259 count++; 00260 00261 /* Decrement the loop counter */ 00262 blockSize1--; 00263 } 00264 00265 /* -------------------------- 00266 * Initializations of stage2 00267 * ------------------------*/ 00268 00269 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00270 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00271 * .... 00272 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00273 */ 00274 00275 /* Working pointer of inputA */ 00276 if((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) 00277 { 00278 px = pIn1 + firstIndex - srcBLen + 1; 00279 } 00280 else 00281 { 00282 px = pIn1; 00283 } 00284 00285 /* Working pointer of inputB */ 00286 pSrc2 = pIn2 + (srcBLen - 1u); 00287 py = pSrc2; 00288 00289 /* count is index by which the pointer pIn1 to be incremented */ 00290 count = 0u; 00291 00292 /* ------------------- 00293 * Stage2 process 00294 * ------------------*/ 00295 00296 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00297 * So, to loop unroll over blockSize2, 00298 * srcBLen should be greater than or equal to 4 */ 00299 if(srcBLen >= 4u) 00300 { 00301 /* Loop unroll over blockSize2, by 4 */ 00302 blkCnt = ((uint32_t) blockSize2 >> 2u); 00303 00304 while(blkCnt > 0u) 00305 { 00306 /* Set all accumulators to zero */ 00307 acc0 = 0.0f; 00308 acc1 = 0.0f; 00309 acc2 = 0.0f; 00310 acc3 = 0.0f; 00311 00312 /* read x[0], x[1], x[2] samples */ 00313 x0 = *(px++); 00314 x1 = *(px++); 00315 x2 = *(px++); 00316 00317 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00318 k = srcBLen >> 2u; 00319 00320 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00321 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00322 do 00323 { 00324 /* Read y[srcBLen - 1] sample */ 00325 c0 = *(py--); 00326 00327 /* Read x[3] sample */ 00328 x3 = *(px++); 00329 00330 /* Perform the multiply-accumulate */ 00331 /* acc0 += x[0] * y[srcBLen - 1] */ 00332 acc0 += x0 * c0; 00333 00334 /* acc1 += x[1] * y[srcBLen - 1] */ 00335 acc1 += x1 * c0; 00336 00337 /* acc2 += x[2] * y[srcBLen - 1] */ 00338 acc2 += x2 * c0; 00339 00340 /* acc3 += x[3] * y[srcBLen - 1] */ 00341 acc3 += x3 * c0; 00342 00343 /* Read y[srcBLen - 2] sample */ 00344 c0 = *(py--); 00345 00346 /* Read x[4] sample */ 00347 x0 = *(px++); 00348 00349 /* Perform the multiply-accumulate */ 00350 /* acc0 += x[1] * y[srcBLen - 2] */ 00351 acc0 += x1 * c0; 00352 /* acc1 += x[2] * y[srcBLen - 2] */ 00353 acc1 += x2 * c0; 00354 /* acc2 += x[3] * y[srcBLen - 2] */ 00355 acc2 += x3 * c0; 00356 /* acc3 += x[4] * y[srcBLen - 2] */ 00357 acc3 += x0 * c0; 00358 00359 /* Read y[srcBLen - 3] sample */ 00360 c0 = *(py--); 00361 00362 /* Read x[5] sample */ 00363 x1 = *(px++); 00364 00365 /* Perform the multiply-accumulates */ 00366 /* acc0 += x[2] * y[srcBLen - 3] */ 00367 acc0 += x2 * c0; 00368 /* acc1 += x[3] * y[srcBLen - 2] */ 00369 acc1 += x3 * c0; 00370 /* acc2 += x[4] * y[srcBLen - 2] */ 00371 acc2 += x0 * c0; 00372 /* acc3 += x[5] * y[srcBLen - 2] */ 00373 acc3 += x1 * c0; 00374 00375 /* Read y[srcBLen - 4] sample */ 00376 c0 = *(py--); 00377 00378 /* Read x[6] sample */ 00379 x2 = *(px++); 00380 00381 /* Perform the multiply-accumulates */ 00382 /* acc0 += x[3] * y[srcBLen - 4] */ 00383 acc0 += x3 * c0; 00384 /* acc1 += x[4] * y[srcBLen - 4] */ 00385 acc1 += x0 * c0; 00386 /* acc2 += x[5] * y[srcBLen - 4] */ 00387 acc2 += x1 * c0; 00388 /* acc3 += x[6] * y[srcBLen - 4] */ 00389 acc3 += x2 * c0; 00390 00391 00392 } while(--k); 00393 00394 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00395 ** No loop unrolling is used. */ 00396 k = srcBLen % 0x4u; 00397 00398 while(k > 0u) 00399 { 00400 /* Read y[srcBLen - 5] sample */ 00401 c0 = *(py--); 00402 00403 /* Read x[7] sample */ 00404 x3 = *(px++); 00405 00406 /* Perform the multiply-accumulates */ 00407 /* acc0 += x[4] * y[srcBLen - 5] */ 00408 acc0 += x0 * c0; 00409 /* acc1 += x[5] * y[srcBLen - 5] */ 00410 acc1 += x1 * c0; 00411 /* acc2 += x[6] * y[srcBLen - 5] */ 00412 acc2 += x2 * c0; 00413 /* acc3 += x[7] * y[srcBLen - 5] */ 00414 acc3 += x3 * c0; 00415 00416 /* Reuse the present samples for the next MAC */ 00417 x0 = x1; 00418 x1 = x2; 00419 x2 = x3; 00420 00421 /* Decrement the loop counter */ 00422 k--; 00423 } 00424 00425 /* Store the result in the accumulator in the destination buffer. */ 00426 *pOut++ = acc0; 00427 *pOut++ = acc1; 00428 *pOut++ = acc2; 00429 *pOut++ = acc3; 00430 00431 /* Increment the pointer pIn1 index, count by 1 */ 00432 count += 4u; 00433 00434 /* Update the inputA and inputB pointers for next MAC calculation */ 00435 px = pIn1 + count; 00436 py = pSrc2; 00437 00438 /* Decrement the loop counter */ 00439 blkCnt--; 00440 } 00441 00442 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00443 ** No loop unrolling is used. */ 00444 blkCnt = (uint32_t) blockSize2 % 0x4u; 00445 00446 while(blkCnt > 0u) 00447 { 00448 /* Accumulator is made zero for every iteration */ 00449 sum = 0.0f; 00450 00451 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00452 k = srcBLen >> 2u; 00453 00454 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00455 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00456 while(k > 0u) 00457 { 00458 /* Perform the multiply-accumulates */ 00459 sum += *px++ * *py--; 00460 sum += *px++ * *py--; 00461 sum += *px++ * *py--; 00462 sum += *px++ * *py--; 00463 00464 /* Decrement the loop counter */ 00465 k--; 00466 } 00467 00468 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00469 ** No loop unrolling is used. */ 00470 k = srcBLen % 0x4u; 00471 00472 while(k > 0u) 00473 { 00474 /* Perform the multiply-accumulate */ 00475 sum += *px++ * *py--; 00476 00477 /* Decrement the loop counter */ 00478 k--; 00479 } 00480 00481 /* Store the result in the accumulator in the destination buffer. */ 00482 *pOut++ = sum; 00483 00484 /* Increment the MAC count */ 00485 count++; 00486 00487 /* Update the inputA and inputB pointers for next MAC calculation */ 00488 px = pIn1 + count; 00489 py = pSrc2; 00490 00491 /* Decrement the loop counter */ 00492 blkCnt--; 00493 } 00494 } 00495 else 00496 { 00497 /* If the srcBLen is not a multiple of 4, 00498 * the blockSize2 loop cannot be unrolled by 4 */ 00499 blkCnt = (uint32_t) blockSize2; 00500 00501 while(blkCnt > 0u) 00502 { 00503 /* Accumulator is made zero for every iteration */ 00504 sum = 0.0f; 00505 00506 /* srcBLen number of MACS should be performed */ 00507 k = srcBLen; 00508 00509 while(k > 0u) 00510 { 00511 /* Perform the multiply-accumulate */ 00512 sum += *px++ * *py--; 00513 00514 /* Decrement the loop counter */ 00515 k--; 00516 } 00517 00518 /* Store the result in the accumulator in the destination buffer. */ 00519 *pOut++ = sum; 00520 00521 /* Increment the MAC count */ 00522 count++; 00523 00524 /* Update the inputA and inputB pointers for next MAC calculation */ 00525 px = pIn1 + count; 00526 py = pSrc2; 00527 00528 /* Decrement the loop counter */ 00529 blkCnt--; 00530 } 00531 } 00532 00533 00534 /* -------------------------- 00535 * Initializations of stage3 00536 * -------------------------*/ 00537 00538 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00539 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00540 * .... 00541 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00542 * sum += x[srcALen-1] * y[srcBLen-1] 00543 */ 00544 00545 /* In this stage the MAC operations are decreased by 1 for every iteration. 00546 The count variable holds the number of MAC operations performed */ 00547 count = srcBLen - 1u; 00548 00549 /* Working pointer of inputA */ 00550 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00551 px = pSrc1; 00552 00553 /* Working pointer of inputB */ 00554 pSrc2 = pIn2 + (srcBLen - 1u); 00555 py = pSrc2; 00556 00557 while(blockSize3 > 0) 00558 { 00559 /* Accumulator is made zero for every iteration */ 00560 sum = 0.0f; 00561 00562 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00563 k = count >> 2u; 00564 00565 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00566 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00567 while(k > 0u) 00568 { 00569 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ 00570 sum += *px++ * *py--; 00571 00572 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ 00573 sum += *px++ * *py--; 00574 00575 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ 00576 sum += *px++ * *py--; 00577 00578 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ 00579 sum += *px++ * *py--; 00580 00581 /* Decrement the loop counter */ 00582 k--; 00583 } 00584 00585 /* If the count is not a multiple of 4, compute any remaining MACs here. 00586 ** No loop unrolling is used. */ 00587 k = count % 0x4u; 00588 00589 while(k > 0u) 00590 { 00591 /* Perform the multiply-accumulates */ 00592 /* sum += x[srcALen-1] * y[srcBLen-1] */ 00593 sum += *px++ * *py--; 00594 00595 /* Decrement the loop counter */ 00596 k--; 00597 } 00598 00599 /* Store the result in the accumulator in the destination buffer. */ 00600 *pOut++ = sum; 00601 00602 /* Update the inputA and inputB pointers for next MAC calculation */ 00603 px = ++pSrc1; 00604 py = pSrc2; 00605 00606 /* Decrement the MAC count */ 00607 count--; 00608 00609 /* Decrement the loop counter */ 00610 blockSize3--; 00611 00612 } 00613 00614 /* set status as ARM_MATH_SUCCESS */ 00615 status = ARM_MATH_SUCCESS; 00616 } 00617 00618 /* Return to application */ 00619 return (status); 00620 00621 #else 00622 00623 /* Run the below code for Cortex-M0 */ 00624 00625 float32_t *pIn1 = pSrcA; /* inputA pointer */ 00626 float32_t *pIn2 = pSrcB; /* inputB pointer */ 00627 float32_t sum; /* Accumulator */ 00628 uint32_t i, j; /* loop counters */ 00629 arm_status status; /* status of Partial convolution */ 00630 00631 /* Check for range of output samples to be calculated */ 00632 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00633 { 00634 /* Set status as ARM_ARGUMENT_ERROR */ 00635 status = ARM_MATH_ARGUMENT_ERROR; 00636 } 00637 else 00638 { 00639 /* Loop to calculate convolution for output length number of values */ 00640 for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++) 00641 { 00642 /* Initialize sum with zero to carry on MAC operations */ 00643 sum = 0.0f; 00644 00645 /* Loop to perform MAC operations according to convolution equation */ 00646 for (j = 0u; j <= i; j++) 00647 { 00648 /* Check the array limitations for inputs */ 00649 if((((i - j) < srcBLen) && (j < srcALen))) 00650 { 00651 /* z[i] += x[i-j] * y[j] */ 00652 sum += pIn1[j] * pIn2[i - j]; 00653 } 00654 } 00655 /* Store the output in the destination buffer */ 00656 pDst[i] = sum; 00657 } 00658 /* set status as ARM_SUCCESS as there are no argument errors */ 00659 status = ARM_MATH_SUCCESS; 00660 } 00661 return (status); 00662 00663 #endif /* #ifndef ARM_MATH_CM0_FAMILY */ 00664 00665 } 00666 00667 /** 00668 * @} end of PartialConv group 00669 */
Generated on Tue Jul 12 2022 11:59:16 by 1.7.2