Aded CMSIS5 DSP and NN folder. Needs some work
Embed:
(wiki syntax)
Show/hide line numbers
arm_conv_f32.c
00001 /* ---------------------------------------------------------------------- 00002 * Project: CMSIS DSP Library 00003 * Title: arm_conv_f32.c 00004 * Description: Convolution of floating-point sequences 00005 * 00006 * $Date: 27. January 2017 00007 * $Revision: V.1.5.1 00008 * 00009 * Target Processor: Cortex-M cores 00010 * -------------------------------------------------------------------- */ 00011 /* 00012 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. 00013 * 00014 * SPDX-License-Identifier: Apache-2.0 00015 * 00016 * Licensed under the Apache License, Version 2.0 (the License); you may 00017 * not use this file except in compliance with the License. 00018 * You may obtain a copy of the License at 00019 * 00020 * www.apache.org/licenses/LICENSE-2.0 00021 * 00022 * Unless required by applicable law or agreed to in writing, software 00023 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 00024 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00025 * See the License for the specific language governing permissions and 00026 * limitations under the License. 00027 */ 00028 00029 #include "arm_math.h" 00030 00031 /** 00032 * @ingroup groupFilters 00033 */ 00034 00035 /** 00036 * @defgroup Conv Convolution 00037 * 00038 * Convolution is a mathematical operation that operates on two finite length vectors to generate a finite length output vector. 00039 * Convolution is similar to correlation and is frequently used in filtering and data analysis. 00040 * The CMSIS DSP library contains functions for convolving Q7, Q15, Q31, and floating-point data types. 00041 * The library also provides fast versions of the Q15 and Q31 functions on Cortex-M4 and Cortex-M3. 00042 * 00043 * \par Algorithm 00044 * Let <code>a[n]</code> and <code>b[n]</code> be sequences of length <code>srcALen</code> and <code>srcBLen</code> samples respectively. 00045 * Then the convolution 00046 * 00047 * <pre> 00048 * c[n] = a[n] * b[n] 00049 * </pre> 00050 * 00051 * \par 00052 * is defined as 00053 * \image html ConvolutionEquation.gif 00054 * \par 00055 * Note that <code>c[n]</code> is of length <code>srcALen + srcBLen - 1</code> and is defined over the interval <code>n=0, 1, 2, ..., srcALen + srcBLen - 2</code>. 00056 * <code>pSrcA</code> points to the first input vector of length <code>srcALen</code> and 00057 * <code>pSrcB</code> points to the second input vector of length <code>srcBLen</code>. 00058 * The output result is written to <code>pDst</code> and the calling function must allocate <code>srcALen+srcBLen-1</code> words for the result. 00059 * 00060 * \par 00061 * Conceptually, when two signals <code>a[n]</code> and <code>b[n]</code> are convolved, 00062 * the signal <code>b[n]</code> slides over <code>a[n]</code>. 00063 * For each offset \c n, the overlapping portions of a[n] and b[n] are multiplied and summed together. 00064 * 00065 * \par 00066 * Note that convolution is a commutative operation: 00067 * 00068 * <pre> 00069 * a[n] * b[n] = b[n] * a[n]. 00070 * </pre> 00071 * 00072 * \par 00073 * This means that switching the A and B arguments to the convolution functions has no effect. 00074 * 00075 * <b>Fixed-Point Behavior</b> 00076 * 00077 * \par 00078 * Convolution requires summing up a large number of intermediate products. 00079 * As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation. 00080 * Refer to the function specific documentation below for further details of the particular algorithm used. 00081 * 00082 * 00083 * <b>Fast Versions</b> 00084 * 00085 * \par 00086 * Fast versions are supported for Q31 and Q15. Cycles for Fast versions are less compared to Q31 and Q15 of conv and the design requires 00087 * the input signals should be scaled down to avoid intermediate overflows. 00088 * 00089 * 00090 * <b>Opt Versions</b> 00091 * 00092 * \par 00093 * Opt versions are supported for Q15 and Q7. Design uses internal scratch buffer for getting good optimisation. 00094 * These versions are optimised in cycles and consumes more memory(Scratch memory) compared to Q15 and Q7 versions 00095 */ 00096 00097 /** 00098 * @addtogroup Conv 00099 * @{ 00100 */ 00101 00102 /** 00103 * @brief Convolution of floating-point sequences. 00104 * @param[in] *pSrcA points to the first input sequence. 00105 * @param[in] srcALen length of the first input sequence. 00106 * @param[in] *pSrcB points to the second input sequence. 00107 * @param[in] srcBLen length of the second input sequence. 00108 * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. 00109 * @return none. 00110 */ 00111 00112 void arm_conv_f32( 00113 float32_t * pSrcA, 00114 uint32_t srcALen, 00115 float32_t * pSrcB, 00116 uint32_t srcBLen, 00117 float32_t * pDst) 00118 { 00119 00120 00121 #if defined (ARM_MATH_DSP) 00122 00123 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00124 00125 float32_t *pIn1; /* inputA pointer */ 00126 float32_t *pIn2; /* inputB pointer */ 00127 float32_t *pOut = pDst; /* output pointer */ 00128 float32_t *px; /* Intermediate inputA pointer */ 00129 float32_t *py; /* Intermediate inputB pointer */ 00130 float32_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00131 float32_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00132 float32_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */ 00133 uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counters */ 00134 00135 /* The algorithm implementation is based on the lengths of the inputs. */ 00136 /* srcB is always made to slide across srcA. */ 00137 /* So srcBLen is always considered as shorter or equal to srcALen */ 00138 if (srcALen >= srcBLen) 00139 { 00140 /* Initialization of inputA pointer */ 00141 pIn1 = pSrcA; 00142 00143 /* Initialization of inputB pointer */ 00144 pIn2 = pSrcB; 00145 } 00146 else 00147 { 00148 /* Initialization of inputA pointer */ 00149 pIn1 = pSrcB; 00150 00151 /* Initialization of inputB pointer */ 00152 pIn2 = pSrcA; 00153 00154 /* srcBLen is always considered as shorter or equal to srcALen */ 00155 j = srcBLen; 00156 srcBLen = srcALen; 00157 srcALen = j; 00158 } 00159 00160 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00161 /* The function is internally 00162 * divided into three stages according to the number of multiplications that has to be 00163 * taken place between inputA samples and inputB samples. In the first stage of the 00164 * algorithm, the multiplications increase by one for every iteration. 00165 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00166 * In the third stage of the algorithm, the multiplications decrease by one 00167 * for every iteration. */ 00168 00169 /* The algorithm is implemented in three stages. 00170 The loop counters of each stage is initiated here. */ 00171 blockSize1 = srcBLen - 1U; 00172 blockSize2 = srcALen - (srcBLen - 1U); 00173 blockSize3 = blockSize1; 00174 00175 /* -------------------------- 00176 * initializations of stage1 00177 * -------------------------*/ 00178 00179 /* sum = x[0] * y[0] 00180 * sum = x[0] * y[1] + x[1] * y[0] 00181 * .... 00182 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00183 */ 00184 00185 /* In this stage the MAC operations are increased by 1 for every iteration. 00186 The count variable holds the number of MAC operations performed */ 00187 count = 1U; 00188 00189 /* Working pointer of inputA */ 00190 px = pIn1; 00191 00192 /* Working pointer of inputB */ 00193 py = pIn2; 00194 00195 00196 /* ------------------------ 00197 * Stage1 process 00198 * ----------------------*/ 00199 00200 /* The first stage starts here */ 00201 while (blockSize1 > 0U) 00202 { 00203 /* Accumulator is made zero for every iteration */ 00204 sum = 0.0f; 00205 00206 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00207 k = count >> 2U; 00208 00209 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00210 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00211 while (k > 0U) 00212 { 00213 /* x[0] * y[srcBLen - 1] */ 00214 sum += *px++ * *py--; 00215 00216 /* x[1] * y[srcBLen - 2] */ 00217 sum += *px++ * *py--; 00218 00219 /* x[2] * y[srcBLen - 3] */ 00220 sum += *px++ * *py--; 00221 00222 /* x[3] * y[srcBLen - 4] */ 00223 sum += *px++ * *py--; 00224 00225 /* Decrement the loop counter */ 00226 k--; 00227 } 00228 00229 /* If the count is not a multiple of 4, compute any remaining MACs here. 00230 ** No loop unrolling is used. */ 00231 k = count % 0x4U; 00232 00233 while (k > 0U) 00234 { 00235 /* Perform the multiply-accumulate */ 00236 sum += *px++ * *py--; 00237 00238 /* Decrement the loop counter */ 00239 k--; 00240 } 00241 00242 /* Store the result in the accumulator in the destination buffer. */ 00243 *pOut++ = sum; 00244 00245 /* Update the inputA and inputB pointers for next MAC calculation */ 00246 py = pIn2 + count; 00247 px = pIn1; 00248 00249 /* Increment the MAC count */ 00250 count++; 00251 00252 /* Decrement the loop counter */ 00253 blockSize1--; 00254 } 00255 00256 /* -------------------------- 00257 * Initializations of stage2 00258 * ------------------------*/ 00259 00260 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00261 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00262 * .... 00263 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00264 */ 00265 00266 /* Working pointer of inputA */ 00267 px = pIn1; 00268 00269 /* Working pointer of inputB */ 00270 pSrc2 = pIn2 + (srcBLen - 1U); 00271 py = pSrc2; 00272 00273 /* count is index by which the pointer pIn1 to be incremented */ 00274 count = 0U; 00275 00276 /* ------------------- 00277 * Stage2 process 00278 * ------------------*/ 00279 00280 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00281 * So, to loop unroll over blockSize2, 00282 * srcBLen should be greater than or equal to 4 */ 00283 if (srcBLen >= 4U) 00284 { 00285 /* Loop unroll over blockSize2, by 4 */ 00286 blkCnt = blockSize2 >> 2U; 00287 00288 while (blkCnt > 0U) 00289 { 00290 /* Set all accumulators to zero */ 00291 acc0 = 0.0f; 00292 acc1 = 0.0f; 00293 acc2 = 0.0f; 00294 acc3 = 0.0f; 00295 00296 /* read x[0], x[1], x[2] samples */ 00297 x0 = *(px++); 00298 x1 = *(px++); 00299 x2 = *(px++); 00300 00301 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00302 k = srcBLen >> 2U; 00303 00304 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00305 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00306 do 00307 { 00308 /* Read y[srcBLen - 1] sample */ 00309 c0 = *(py--); 00310 00311 /* Read x[3] sample */ 00312 x3 = *(px); 00313 00314 /* Perform the multiply-accumulate */ 00315 /* acc0 += x[0] * y[srcBLen - 1] */ 00316 acc0 += x0 * c0; 00317 00318 /* acc1 += x[1] * y[srcBLen - 1] */ 00319 acc1 += x1 * c0; 00320 00321 /* acc2 += x[2] * y[srcBLen - 1] */ 00322 acc2 += x2 * c0; 00323 00324 /* acc3 += x[3] * y[srcBLen - 1] */ 00325 acc3 += x3 * c0; 00326 00327 /* Read y[srcBLen - 2] sample */ 00328 c0 = *(py--); 00329 00330 /* Read x[4] sample */ 00331 x0 = *(px + 1U); 00332 00333 /* Perform the multiply-accumulate */ 00334 /* acc0 += x[1] * y[srcBLen - 2] */ 00335 acc0 += x1 * c0; 00336 /* acc1 += x[2] * y[srcBLen - 2] */ 00337 acc1 += x2 * c0; 00338 /* acc2 += x[3] * y[srcBLen - 2] */ 00339 acc2 += x3 * c0; 00340 /* acc3 += x[4] * y[srcBLen - 2] */ 00341 acc3 += x0 * c0; 00342 00343 /* Read y[srcBLen - 3] sample */ 00344 c0 = *(py--); 00345 00346 /* Read x[5] sample */ 00347 x1 = *(px + 2U); 00348 00349 /* Perform the multiply-accumulates */ 00350 /* acc0 += x[2] * y[srcBLen - 3] */ 00351 acc0 += x2 * c0; 00352 /* acc1 += x[3] * y[srcBLen - 2] */ 00353 acc1 += x3 * c0; 00354 /* acc2 += x[4] * y[srcBLen - 2] */ 00355 acc2 += x0 * c0; 00356 /* acc3 += x[5] * y[srcBLen - 2] */ 00357 acc3 += x1 * c0; 00358 00359 /* Read y[srcBLen - 4] sample */ 00360 c0 = *(py--); 00361 00362 /* Read x[6] sample */ 00363 x2 = *(px + 3U); 00364 px += 4U; 00365 00366 /* Perform the multiply-accumulates */ 00367 /* acc0 += x[3] * y[srcBLen - 4] */ 00368 acc0 += x3 * c0; 00369 /* acc1 += x[4] * y[srcBLen - 4] */ 00370 acc1 += x0 * c0; 00371 /* acc2 += x[5] * y[srcBLen - 4] */ 00372 acc2 += x1 * c0; 00373 /* acc3 += x[6] * y[srcBLen - 4] */ 00374 acc3 += x2 * c0; 00375 00376 00377 } while (--k); 00378 00379 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00380 ** No loop unrolling is used. */ 00381 k = srcBLen % 0x4U; 00382 00383 while (k > 0U) 00384 { 00385 /* Read y[srcBLen - 5] sample */ 00386 c0 = *(py--); 00387 00388 /* Read x[7] sample */ 00389 x3 = *(px++); 00390 00391 /* Perform the multiply-accumulates */ 00392 /* acc0 += x[4] * y[srcBLen - 5] */ 00393 acc0 += x0 * c0; 00394 /* acc1 += x[5] * y[srcBLen - 5] */ 00395 acc1 += x1 * c0; 00396 /* acc2 += x[6] * y[srcBLen - 5] */ 00397 acc2 += x2 * c0; 00398 /* acc3 += x[7] * y[srcBLen - 5] */ 00399 acc3 += x3 * c0; 00400 00401 /* Reuse the present samples for the next MAC */ 00402 x0 = x1; 00403 x1 = x2; 00404 x2 = x3; 00405 00406 /* Decrement the loop counter */ 00407 k--; 00408 } 00409 00410 /* Store the result in the accumulator in the destination buffer. */ 00411 *pOut++ = acc0; 00412 *pOut++ = acc1; 00413 *pOut++ = acc2; 00414 *pOut++ = acc3; 00415 00416 /* Increment the pointer pIn1 index, count by 4 */ 00417 count += 4U; 00418 00419 /* Update the inputA and inputB pointers for next MAC calculation */ 00420 px = pIn1 + count; 00421 py = pSrc2; 00422 00423 00424 /* Decrement the loop counter */ 00425 blkCnt--; 00426 } 00427 00428 00429 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00430 ** No loop unrolling is used. */ 00431 blkCnt = blockSize2 % 0x4U; 00432 00433 while (blkCnt > 0U) 00434 { 00435 /* Accumulator is made zero for every iteration */ 00436 sum = 0.0f; 00437 00438 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00439 k = srcBLen >> 2U; 00440 00441 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00442 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00443 while (k > 0U) 00444 { 00445 /* Perform the multiply-accumulates */ 00446 sum += *px++ * *py--; 00447 sum += *px++ * *py--; 00448 sum += *px++ * *py--; 00449 sum += *px++ * *py--; 00450 00451 /* Decrement the loop counter */ 00452 k--; 00453 } 00454 00455 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00456 ** No loop unrolling is used. */ 00457 k = srcBLen % 0x4U; 00458 00459 while (k > 0U) 00460 { 00461 /* Perform the multiply-accumulate */ 00462 sum += *px++ * *py--; 00463 00464 /* Decrement the loop counter */ 00465 k--; 00466 } 00467 00468 /* Store the result in the accumulator in the destination buffer. */ 00469 *pOut++ = sum; 00470 00471 /* Increment the MAC count */ 00472 count++; 00473 00474 /* Update the inputA and inputB pointers for next MAC calculation */ 00475 px = pIn1 + count; 00476 py = pSrc2; 00477 00478 /* Decrement the loop counter */ 00479 blkCnt--; 00480 } 00481 } 00482 else 00483 { 00484 /* If the srcBLen is not a multiple of 4, 00485 * the blockSize2 loop cannot be unrolled by 4 */ 00486 blkCnt = blockSize2; 00487 00488 while (blkCnt > 0U) 00489 { 00490 /* Accumulator is made zero for every iteration */ 00491 sum = 0.0f; 00492 00493 /* srcBLen number of MACS should be performed */ 00494 k = srcBLen; 00495 00496 while (k > 0U) 00497 { 00498 /* Perform the multiply-accumulate */ 00499 sum += *px++ * *py--; 00500 00501 /* Decrement the loop counter */ 00502 k--; 00503 } 00504 00505 /* Store the result in the accumulator in the destination buffer. */ 00506 *pOut++ = sum; 00507 00508 /* Increment the MAC count */ 00509 count++; 00510 00511 /* Update the inputA and inputB pointers for next MAC calculation */ 00512 px = pIn1 + count; 00513 py = pSrc2; 00514 00515 /* Decrement the loop counter */ 00516 blkCnt--; 00517 } 00518 } 00519 00520 00521 /* -------------------------- 00522 * Initializations of stage3 00523 * -------------------------*/ 00524 00525 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00526 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00527 * .... 00528 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00529 * sum += x[srcALen-1] * y[srcBLen-1] 00530 */ 00531 00532 /* In this stage the MAC operations are decreased by 1 for every iteration. 00533 The blockSize3 variable holds the number of MAC operations performed */ 00534 00535 /* Working pointer of inputA */ 00536 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U); 00537 px = pSrc1; 00538 00539 /* Working pointer of inputB */ 00540 pSrc2 = pIn2 + (srcBLen - 1U); 00541 py = pSrc2; 00542 00543 /* ------------------- 00544 * Stage3 process 00545 * ------------------*/ 00546 00547 while (blockSize3 > 0U) 00548 { 00549 /* Accumulator is made zero for every iteration */ 00550 sum = 0.0f; 00551 00552 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00553 k = blockSize3 >> 2U; 00554 00555 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00556 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00557 while (k > 0U) 00558 { 00559 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ 00560 sum += *px++ * *py--; 00561 00562 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ 00563 sum += *px++ * *py--; 00564 00565 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ 00566 sum += *px++ * *py--; 00567 00568 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ 00569 sum += *px++ * *py--; 00570 00571 /* Decrement the loop counter */ 00572 k--; 00573 } 00574 00575 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. 00576 ** No loop unrolling is used. */ 00577 k = blockSize3 % 0x4U; 00578 00579 while (k > 0U) 00580 { 00581 /* Perform the multiply-accumulates */ 00582 /* sum += x[srcALen-1] * y[srcBLen-1] */ 00583 sum += *px++ * *py--; 00584 00585 /* Decrement the loop counter */ 00586 k--; 00587 } 00588 00589 /* Store the result in the accumulator in the destination buffer. */ 00590 *pOut++ = sum; 00591 00592 /* Update the inputA and inputB pointers for next MAC calculation */ 00593 px = ++pSrc1; 00594 py = pSrc2; 00595 00596 /* Decrement the loop counter */ 00597 blockSize3--; 00598 } 00599 00600 #else 00601 00602 /* Run the below code for Cortex-M0 */ 00603 00604 float32_t *pIn1 = pSrcA; /* inputA pointer */ 00605 float32_t *pIn2 = pSrcB; /* inputB pointer */ 00606 float32_t sum; /* Accumulator */ 00607 uint32_t i, j; /* loop counters */ 00608 00609 /* Loop to calculate convolution for output length number of times */ 00610 for (i = 0U; i < ((srcALen + srcBLen) - 1U); i++) 00611 { 00612 /* Initialize sum with zero to carry out MAC operations */ 00613 sum = 0.0f; 00614 00615 /* Loop to perform MAC operations according to convolution equation */ 00616 for (j = 0U; j <= i; j++) 00617 { 00618 /* Check the array limitations */ 00619 if ((((i - j) < srcBLen) && (j < srcALen))) 00620 { 00621 /* z[i] += x[i-j] * y[j] */ 00622 sum += pIn1[j] * pIn2[i - j]; 00623 } 00624 } 00625 /* Store the output in the destination buffer */ 00626 pDst[i] = sum; 00627 } 00628 00629 #endif /* #if defined (ARM_MATH_DSP) */ 00630 00631 } 00632 00633 /** 00634 * @} end of Conv group 00635 */ 00636
Generated on Tue Jul 12 2022 16:46:23 by 1.7.2