Aded CMSIS5 DSP and NN folder. Needs some work

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_f32.c Source File

arm_conv_f32.c

00001 /* ----------------------------------------------------------------------
00002  * Project:      CMSIS DSP Library
00003  * Title:        arm_conv_f32.c
00004  * Description:  Convolution of floating-point sequences
00005  *
00006  * $Date:        27. January 2017
00007  * $Revision:    V.1.5.1
00008  *
00009  * Target Processor: Cortex-M cores
00010  * -------------------------------------------------------------------- */
00011 /*
00012  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
00013  *
00014  * SPDX-License-Identifier: Apache-2.0
00015  *
00016  * Licensed under the Apache License, Version 2.0 (the License); you may
00017  * not use this file except in compliance with the License.
00018  * You may obtain a copy of the License at
00019  *
00020  * www.apache.org/licenses/LICENSE-2.0
00021  *
00022  * Unless required by applicable law or agreed to in writing, software
00023  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
00024  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00025  * See the License for the specific language governing permissions and
00026  * limitations under the License.
00027  */
00028 
00029 #include "arm_math.h"
00030 
00031 /**
00032  * @ingroup groupFilters
00033  */
00034 
00035 /**
00036  * @defgroup Conv Convolution
00037  *
00038  * Convolution is a mathematical operation that operates on two finite length vectors to generate a finite length output vector.
00039  * Convolution is similar to correlation and is frequently used in filtering and data analysis.
00040  * The CMSIS DSP library contains functions for convolving Q7, Q15, Q31, and floating-point data types.
00041  * The library also provides fast versions of the Q15 and Q31 functions on Cortex-M4 and Cortex-M3.
00042  *
00043  * \par Algorithm
00044  * Let <code>a[n]</code> and <code>b[n]</code> be sequences of length <code>srcALen</code> and <code>srcBLen</code> samples respectively.
00045  * Then the convolution
00046  *
00047  * <pre>
00048  *                   c[n] = a[n] * b[n]
00049  * </pre>
00050  *
00051  * \par
00052  * is defined as
00053  * \image html ConvolutionEquation.gif
00054  * \par
00055  * Note that <code>c[n]</code> is of length <code>srcALen + srcBLen - 1</code> and is defined over the interval <code>n=0, 1, 2, ..., srcALen + srcBLen - 2</code>.
00056  * <code>pSrcA</code> points to the first input vector of length <code>srcALen</code> and
00057  * <code>pSrcB</code> points to the second input vector of length <code>srcBLen</code>.
00058  * The output result is written to <code>pDst</code> and the calling function must allocate <code>srcALen+srcBLen-1</code> words for the result.
00059  *
00060  * \par
00061  * Conceptually, when two signals <code>a[n]</code> and <code>b[n]</code> are convolved,
00062  * the signal <code>b[n]</code> slides over <code>a[n]</code>.
00063  * For each offset \c n, the overlapping portions of a[n] and b[n] are multiplied and summed together.
00064  *
00065  * \par
00066  * Note that convolution is a commutative operation:
00067  *
00068  * <pre>
00069  *                   a[n] * b[n] = b[n] * a[n].
00070  * </pre>
00071  *
00072  * \par
00073  * This means that switching the A and B arguments to the convolution functions has no effect.
00074  *
00075  * <b>Fixed-Point Behavior</b>
00076  *
00077  * \par
00078  * Convolution requires summing up a large number of intermediate products.
00079  * As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation.
00080  * Refer to the function specific documentation below for further details of the particular algorithm used.
00081  *
00082  *
00083  * <b>Fast Versions</b>
00084  *
00085  * \par
00086  * Fast versions are supported for Q31 and Q15.  Cycles for Fast versions are less compared to Q31 and Q15 of conv and the design requires
00087  * the input signals should be scaled down to avoid intermediate overflows.
00088  *
00089  *
00090  * <b>Opt Versions</b>
00091  *
00092  * \par
00093  * Opt versions are supported for Q15 and Q7.  Design uses internal scratch buffer for getting good optimisation.
00094  * These versions are optimised in cycles and consumes more memory(Scratch memory) compared to Q15 and Q7 versions
00095  */
00096 
00097 /**
00098  * @addtogroup Conv
00099  * @{
00100  */
00101 
00102 /**
00103  * @brief Convolution of floating-point sequences.
00104  * @param[in] *pSrcA points to the first input sequence.
00105  * @param[in] srcALen length of the first input sequence.
00106  * @param[in] *pSrcB points to the second input sequence.
00107  * @param[in] srcBLen length of the second input sequence.
00108  * @param[out] *pDst points to the location where the output result is written.  Length srcALen+srcBLen-1.
00109  * @return none.
00110  */
00111 
00112 void arm_conv_f32(
00113   float32_t * pSrcA,
00114   uint32_t srcALen,
00115   float32_t * pSrcB,
00116   uint32_t srcBLen,
00117   float32_t * pDst)
00118 {
00119 
00120 
00121 #if defined (ARM_MATH_DSP)
00122 
00123   /* Run the below code for Cortex-M4 and Cortex-M3 */
00124 
00125   float32_t *pIn1;                               /* inputA pointer */
00126   float32_t *pIn2;                               /* inputB pointer */
00127   float32_t *pOut = pDst;                        /* output pointer */
00128   float32_t *px;                                 /* Intermediate inputA pointer */
00129   float32_t *py;                                 /* Intermediate inputB pointer */
00130   float32_t *pSrc1, *pSrc2;                      /* Intermediate pointers */
00131   float32_t sum, acc0, acc1, acc2, acc3;         /* Accumulator */
00132   float32_t x0, x1, x2, x3, c0;                  /* Temporary variables to hold state and coefficient values */
00133   uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3;     /* loop counters */
00134 
00135   /* The algorithm implementation is based on the lengths of the inputs. */
00136   /* srcB is always made to slide across srcA. */
00137   /* So srcBLen is always considered as shorter or equal to srcALen */
00138   if (srcALen >= srcBLen)
00139   {
00140     /* Initialization of inputA pointer */
00141     pIn1 = pSrcA;
00142 
00143     /* Initialization of inputB pointer */
00144     pIn2 = pSrcB;
00145   }
00146   else
00147   {
00148     /* Initialization of inputA pointer */
00149     pIn1 = pSrcB;
00150 
00151     /* Initialization of inputB pointer */
00152     pIn2 = pSrcA;
00153 
00154     /* srcBLen is always considered as shorter or equal to srcALen */
00155     j = srcBLen;
00156     srcBLen = srcALen;
00157     srcALen = j;
00158   }
00159 
00160   /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
00161   /* The function is internally
00162    * divided into three stages according to the number of multiplications that has to be
00163    * taken place between inputA samples and inputB samples. In the first stage of the
00164    * algorithm, the multiplications increase by one for every iteration.
00165    * In the second stage of the algorithm, srcBLen number of multiplications are done.
00166    * In the third stage of the algorithm, the multiplications decrease by one
00167    * for every iteration. */
00168 
00169   /* The algorithm is implemented in three stages.
00170      The loop counters of each stage is initiated here. */
00171   blockSize1 = srcBLen - 1U;
00172   blockSize2 = srcALen - (srcBLen - 1U);
00173   blockSize3 = blockSize1;
00174 
00175   /* --------------------------
00176    * initializations of stage1
00177    * -------------------------*/
00178 
00179   /* sum = x[0] * y[0]
00180    * sum = x[0] * y[1] + x[1] * y[0]
00181    * ....
00182    * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
00183    */
00184 
00185   /* In this stage the MAC operations are increased by 1 for every iteration.
00186      The count variable holds the number of MAC operations performed */
00187   count = 1U;
00188 
00189   /* Working pointer of inputA */
00190   px = pIn1;
00191 
00192   /* Working pointer of inputB */
00193   py = pIn2;
00194 
00195 
00196   /* ------------------------
00197    * Stage1 process
00198    * ----------------------*/
00199 
00200   /* The first stage starts here */
00201   while (blockSize1 > 0U)
00202   {
00203     /* Accumulator is made zero for every iteration */
00204     sum = 0.0f;
00205 
00206     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00207     k = count >> 2U;
00208 
00209     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00210      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00211     while (k > 0U)
00212     {
00213       /* x[0] * y[srcBLen - 1] */
00214       sum += *px++ * *py--;
00215 
00216       /* x[1] * y[srcBLen - 2] */
00217       sum += *px++ * *py--;
00218 
00219       /* x[2] * y[srcBLen - 3] */
00220       sum += *px++ * *py--;
00221 
00222       /* x[3] * y[srcBLen - 4] */
00223       sum += *px++ * *py--;
00224 
00225       /* Decrement the loop counter */
00226       k--;
00227     }
00228 
00229     /* If the count is not a multiple of 4, compute any remaining MACs here.
00230      ** No loop unrolling is used. */
00231     k = count % 0x4U;
00232 
00233     while (k > 0U)
00234     {
00235       /* Perform the multiply-accumulate */
00236       sum += *px++ * *py--;
00237 
00238       /* Decrement the loop counter */
00239       k--;
00240     }
00241 
00242     /* Store the result in the accumulator in the destination buffer. */
00243     *pOut++ = sum;
00244 
00245     /* Update the inputA and inputB pointers for next MAC calculation */
00246     py = pIn2 + count;
00247     px = pIn1;
00248 
00249     /* Increment the MAC count */
00250     count++;
00251 
00252     /* Decrement the loop counter */
00253     blockSize1--;
00254   }
00255 
00256   /* --------------------------
00257    * Initializations of stage2
00258    * ------------------------*/
00259 
00260   /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
00261    * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
00262    * ....
00263    * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
00264    */
00265 
00266   /* Working pointer of inputA */
00267   px = pIn1;
00268 
00269   /* Working pointer of inputB */
00270   pSrc2 = pIn2 + (srcBLen - 1U);
00271   py = pSrc2;
00272 
00273   /* count is index by which the pointer pIn1 to be incremented */
00274   count = 0U;
00275 
00276   /* -------------------
00277    * Stage2 process
00278    * ------------------*/
00279 
00280   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
00281    * So, to loop unroll over blockSize2,
00282    * srcBLen should be greater than or equal to 4 */
00283   if (srcBLen >= 4U)
00284   {
00285     /* Loop unroll over blockSize2, by 4 */
00286     blkCnt = blockSize2 >> 2U;
00287 
00288     while (blkCnt > 0U)
00289     {
00290       /* Set all accumulators to zero */
00291       acc0 = 0.0f;
00292       acc1 = 0.0f;
00293       acc2 = 0.0f;
00294       acc3 = 0.0f;
00295 
00296       /* read x[0], x[1], x[2] samples */
00297       x0 = *(px++);
00298       x1 = *(px++);
00299       x2 = *(px++);
00300 
00301       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00302       k = srcBLen >> 2U;
00303 
00304       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00305        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00306       do
00307       {
00308         /* Read y[srcBLen - 1] sample */
00309         c0 = *(py--);
00310 
00311         /* Read x[3] sample */
00312         x3 = *(px);
00313 
00314         /* Perform the multiply-accumulate */
00315         /* acc0 +=  x[0] * y[srcBLen - 1] */
00316         acc0 += x0 * c0;
00317 
00318         /* acc1 +=  x[1] * y[srcBLen - 1] */
00319         acc1 += x1 * c0;
00320 
00321         /* acc2 +=  x[2] * y[srcBLen - 1] */
00322         acc2 += x2 * c0;
00323 
00324         /* acc3 +=  x[3] * y[srcBLen - 1] */
00325         acc3 += x3 * c0;
00326 
00327         /* Read y[srcBLen - 2] sample */
00328         c0 = *(py--);
00329 
00330         /* Read x[4] sample */
00331         x0 = *(px + 1U);
00332 
00333         /* Perform the multiply-accumulate */
00334         /* acc0 +=  x[1] * y[srcBLen - 2] */
00335         acc0 += x1 * c0;
00336         /* acc1 +=  x[2] * y[srcBLen - 2] */
00337         acc1 += x2 * c0;
00338         /* acc2 +=  x[3] * y[srcBLen - 2] */
00339         acc2 += x3 * c0;
00340         /* acc3 +=  x[4] * y[srcBLen - 2] */
00341         acc3 += x0 * c0;
00342 
00343         /* Read y[srcBLen - 3] sample */
00344         c0 = *(py--);
00345 
00346         /* Read x[5] sample */
00347         x1 = *(px + 2U);
00348 
00349         /* Perform the multiply-accumulates */
00350         /* acc0 +=  x[2] * y[srcBLen - 3] */
00351         acc0 += x2 * c0;
00352         /* acc1 +=  x[3] * y[srcBLen - 2] */
00353         acc1 += x3 * c0;
00354         /* acc2 +=  x[4] * y[srcBLen - 2] */
00355         acc2 += x0 * c0;
00356         /* acc3 +=  x[5] * y[srcBLen - 2] */
00357         acc3 += x1 * c0;
00358 
00359         /* Read y[srcBLen - 4] sample */
00360         c0 = *(py--);
00361 
00362         /* Read x[6] sample */
00363         x2 = *(px + 3U);
00364         px += 4U;
00365 
00366         /* Perform the multiply-accumulates */
00367         /* acc0 +=  x[3] * y[srcBLen - 4] */
00368         acc0 += x3 * c0;
00369         /* acc1 +=  x[4] * y[srcBLen - 4] */
00370         acc1 += x0 * c0;
00371         /* acc2 +=  x[5] * y[srcBLen - 4] */
00372         acc2 += x1 * c0;
00373         /* acc3 +=  x[6] * y[srcBLen - 4] */
00374         acc3 += x2 * c0;
00375 
00376 
00377       } while (--k);
00378 
00379       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
00380        ** No loop unrolling is used. */
00381       k = srcBLen % 0x4U;
00382 
00383       while (k > 0U)
00384       {
00385         /* Read y[srcBLen - 5] sample */
00386         c0 = *(py--);
00387 
00388         /* Read x[7] sample */
00389         x3 = *(px++);
00390 
00391         /* Perform the multiply-accumulates */
00392         /* acc0 +=  x[4] * y[srcBLen - 5] */
00393         acc0 += x0 * c0;
00394         /* acc1 +=  x[5] * y[srcBLen - 5] */
00395         acc1 += x1 * c0;
00396         /* acc2 +=  x[6] * y[srcBLen - 5] */
00397         acc2 += x2 * c0;
00398         /* acc3 +=  x[7] * y[srcBLen - 5] */
00399         acc3 += x3 * c0;
00400 
00401         /* Reuse the present samples for the next MAC */
00402         x0 = x1;
00403         x1 = x2;
00404         x2 = x3;
00405 
00406         /* Decrement the loop counter */
00407         k--;
00408       }
00409 
00410       /* Store the result in the accumulator in the destination buffer. */
00411       *pOut++ = acc0;
00412       *pOut++ = acc1;
00413       *pOut++ = acc2;
00414       *pOut++ = acc3;
00415 
00416       /* Increment the pointer pIn1 index, count by 4 */
00417       count += 4U;
00418 
00419       /* Update the inputA and inputB pointers for next MAC calculation */
00420       px = pIn1 + count;
00421       py = pSrc2;
00422 
00423 
00424       /* Decrement the loop counter */
00425       blkCnt--;
00426     }
00427 
00428 
00429     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
00430      ** No loop unrolling is used. */
00431     blkCnt = blockSize2 % 0x4U;
00432 
00433     while (blkCnt > 0U)
00434     {
00435       /* Accumulator is made zero for every iteration */
00436       sum = 0.0f;
00437 
00438       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00439       k = srcBLen >> 2U;
00440 
00441       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00442        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00443       while (k > 0U)
00444       {
00445         /* Perform the multiply-accumulates */
00446         sum += *px++ * *py--;
00447         sum += *px++ * *py--;
00448         sum += *px++ * *py--;
00449         sum += *px++ * *py--;
00450 
00451         /* Decrement the loop counter */
00452         k--;
00453       }
00454 
00455       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
00456        ** No loop unrolling is used. */
00457       k = srcBLen % 0x4U;
00458 
00459       while (k > 0U)
00460       {
00461         /* Perform the multiply-accumulate */
00462         sum += *px++ * *py--;
00463 
00464         /* Decrement the loop counter */
00465         k--;
00466       }
00467 
00468       /* Store the result in the accumulator in the destination buffer. */
00469       *pOut++ = sum;
00470 
00471       /* Increment the MAC count */
00472       count++;
00473 
00474       /* Update the inputA and inputB pointers for next MAC calculation */
00475       px = pIn1 + count;
00476       py = pSrc2;
00477 
00478       /* Decrement the loop counter */
00479       blkCnt--;
00480     }
00481   }
00482   else
00483   {
00484     /* If the srcBLen is not a multiple of 4,
00485      * the blockSize2 loop cannot be unrolled by 4 */
00486     blkCnt = blockSize2;
00487 
00488     while (blkCnt > 0U)
00489     {
00490       /* Accumulator is made zero for every iteration */
00491       sum = 0.0f;
00492 
00493       /* srcBLen number of MACS should be performed */
00494       k = srcBLen;
00495 
00496       while (k > 0U)
00497       {
00498         /* Perform the multiply-accumulate */
00499         sum += *px++ * *py--;
00500 
00501         /* Decrement the loop counter */
00502         k--;
00503       }
00504 
00505       /* Store the result in the accumulator in the destination buffer. */
00506       *pOut++ = sum;
00507 
00508       /* Increment the MAC count */
00509       count++;
00510 
00511       /* Update the inputA and inputB pointers for next MAC calculation */
00512       px = pIn1 + count;
00513       py = pSrc2;
00514 
00515       /* Decrement the loop counter */
00516       blkCnt--;
00517     }
00518   }
00519 
00520 
00521   /* --------------------------
00522    * Initializations of stage3
00523    * -------------------------*/
00524 
00525   /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
00526    * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
00527    * ....
00528    * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
00529    * sum +=  x[srcALen-1] * y[srcBLen-1]
00530    */
00531 
00532   /* In this stage the MAC operations are decreased by 1 for every iteration.
00533      The blockSize3 variable holds the number of MAC operations performed */
00534 
00535   /* Working pointer of inputA */
00536   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
00537   px = pSrc1;
00538 
00539   /* Working pointer of inputB */
00540   pSrc2 = pIn2 + (srcBLen - 1U);
00541   py = pSrc2;
00542 
00543   /* -------------------
00544    * Stage3 process
00545    * ------------------*/
00546 
00547   while (blockSize3 > 0U)
00548   {
00549     /* Accumulator is made zero for every iteration */
00550     sum = 0.0f;
00551 
00552     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00553     k = blockSize3 >> 2U;
00554 
00555     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00556      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00557     while (k > 0U)
00558     {
00559       /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
00560       sum += *px++ * *py--;
00561 
00562       /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
00563       sum += *px++ * *py--;
00564 
00565       /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
00566       sum += *px++ * *py--;
00567 
00568       /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
00569       sum += *px++ * *py--;
00570 
00571       /* Decrement the loop counter */
00572       k--;
00573     }
00574 
00575     /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
00576      ** No loop unrolling is used. */
00577     k = blockSize3 % 0x4U;
00578 
00579     while (k > 0U)
00580     {
00581       /* Perform the multiply-accumulates */
00582       /* sum +=  x[srcALen-1] * y[srcBLen-1] */
00583       sum += *px++ * *py--;
00584 
00585       /* Decrement the loop counter */
00586       k--;
00587     }
00588 
00589     /* Store the result in the accumulator in the destination buffer. */
00590     *pOut++ = sum;
00591 
00592     /* Update the inputA and inputB pointers for next MAC calculation */
00593     px = ++pSrc1;
00594     py = pSrc2;
00595 
00596     /* Decrement the loop counter */
00597     blockSize3--;
00598   }
00599 
00600 #else
00601 
00602   /* Run the below code for Cortex-M0 */
00603 
00604   float32_t *pIn1 = pSrcA;                       /* inputA pointer */
00605   float32_t *pIn2 = pSrcB;                       /* inputB pointer */
00606   float32_t sum;                                 /* Accumulator */
00607   uint32_t i, j;                                 /* loop counters */
00608 
00609   /* Loop to calculate convolution for output length number of times */
00610   for (i = 0U; i < ((srcALen + srcBLen) - 1U); i++)
00611   {
00612     /* Initialize sum with zero to carry out MAC operations */
00613     sum = 0.0f;
00614 
00615     /* Loop to perform MAC operations according to convolution equation */
00616     for (j = 0U; j <= i; j++)
00617     {
00618       /* Check the array limitations */
00619       if ((((i - j) < srcBLen) && (j < srcALen)))
00620       {
00621         /* z[i] += x[i-j] * y[j] */
00622         sum += pIn1[j] * pIn2[i - j];
00623       }
00624     }
00625     /* Store the output in the destination buffer */
00626     pDst[i] = sum;
00627   }
00628 
00629 #endif /*   #if defined (ARM_MATH_DSP)        */
00630 
00631 }
00632 
00633 /**
00634  * @} end of Conv group
00635  */
00636