Marco Zecchini / Mbed OS Example_RTOS
Committer:
marcozecchini
Date:
Sat Feb 23 12:13:36 2019 +0000
Revision:
0:9fca2b23d0ba
final commit

Who changed what in which revision?

User | Revision | Line number | New contents of line
marcozecchini 0:9fca2b23d0ba 1 /* ----------------------------------------------------------------------------
marcozecchini 0:9fca2b23d0ba 2 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
marcozecchini 0:9fca2b23d0ba 3 *
marcozecchini 0:9fca2b23d0ba 4 * $Date: 19. March 2015
marcozecchini 0:9fca2b23d0ba 5 * $Revision: V.1.4.5
marcozecchini 0:9fca2b23d0ba 6 *
marcozecchini 0:9fca2b23d0ba 7 * Project: CMSIS DSP Library
marcozecchini 0:9fca2b23d0ba 8 * Title: arm_conv_f32.c
marcozecchini 0:9fca2b23d0ba 9 *
marcozecchini 0:9fca2b23d0ba 10 * Description: Convolution of floating-point sequences.
marcozecchini 0:9fca2b23d0ba 11 *
marcozecchini 0:9fca2b23d0ba 12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
marcozecchini 0:9fca2b23d0ba 13 *
marcozecchini 0:9fca2b23d0ba 14 * Redistribution and use in source and binary forms, with or without
marcozecchini 0:9fca2b23d0ba 15 * modification, are permitted provided that the following conditions
marcozecchini 0:9fca2b23d0ba 16 * are met:
marcozecchini 0:9fca2b23d0ba 17 * - Redistributions of source code must retain the above copyright
marcozecchini 0:9fca2b23d0ba 18 * notice, this list of conditions and the following disclaimer.
marcozecchini 0:9fca2b23d0ba 19 * - Redistributions in binary form must reproduce the above copyright
marcozecchini 0:9fca2b23d0ba 20 * notice, this list of conditions and the following disclaimer in
marcozecchini 0:9fca2b23d0ba 21 * the documentation and/or other materials provided with the
marcozecchini 0:9fca2b23d0ba 22 * distribution.
marcozecchini 0:9fca2b23d0ba 23 * - Neither the name of ARM LIMITED nor the names of its contributors
marcozecchini 0:9fca2b23d0ba 24 * may be used to endorse or promote products derived from this
marcozecchini 0:9fca2b23d0ba 25 * software without specific prior written permission.
marcozecchini 0:9fca2b23d0ba 26 *
marcozecchini 0:9fca2b23d0ba 27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
marcozecchini 0:9fca2b23d0ba 28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
marcozecchini 0:9fca2b23d0ba 29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
marcozecchini 0:9fca2b23d0ba 30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
marcozecchini 0:9fca2b23d0ba 31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
marcozecchini 0:9fca2b23d0ba 32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
marcozecchini 0:9fca2b23d0ba 33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
marcozecchini 0:9fca2b23d0ba 34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
marcozecchini 0:9fca2b23d0ba 35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
marcozecchini 0:9fca2b23d0ba 36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
marcozecchini 0:9fca2b23d0ba 37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
marcozecchini 0:9fca2b23d0ba 38 * POSSIBILITY OF SUCH DAMAGE.
marcozecchini 0:9fca2b23d0ba 39 * -------------------------------------------------------------------------- */
marcozecchini 0:9fca2b23d0ba 40
marcozecchini 0:9fca2b23d0ba 41 #include "arm_math.h"
marcozecchini 0:9fca2b23d0ba 42
marcozecchini 0:9fca2b23d0ba 43 /**
marcozecchini 0:9fca2b23d0ba 44 * @ingroup groupFilters
marcozecchini 0:9fca2b23d0ba 45 */
marcozecchini 0:9fca2b23d0ba 46
marcozecchini 0:9fca2b23d0ba 47 /**
marcozecchini 0:9fca2b23d0ba 48 * @defgroup Conv Convolution
marcozecchini 0:9fca2b23d0ba 49 *
marcozecchini 0:9fca2b23d0ba 50 * Convolution is a mathematical operation that operates on two finite length vectors to generate a finite length output vector.
marcozecchini 0:9fca2b23d0ba 51 * Convolution is similar to correlation and is frequently used in filtering and data analysis.
marcozecchini 0:9fca2b23d0ba 52 * The CMSIS DSP library contains functions for convolving Q7, Q15, Q31, and floating-point data types.
marcozecchini 0:9fca2b23d0ba 53 * The library also provides fast versions of the Q15 and Q31 functions on Cortex-M4 and Cortex-M3.
marcozecchini 0:9fca2b23d0ba 54 *
marcozecchini 0:9fca2b23d0ba 55 * \par Algorithm
marcozecchini 0:9fca2b23d0ba 56 * Let <code>a[n]</code> and <code>b[n]</code> be sequences of length <code>srcALen</code> and <code>srcBLen</code> samples respectively.
marcozecchini 0:9fca2b23d0ba 57 * Then the convolution
marcozecchini 0:9fca2b23d0ba 58 *
marcozecchini 0:9fca2b23d0ba 59 * <pre>
marcozecchini 0:9fca2b23d0ba 60 * c[n] = a[n] * b[n]
marcozecchini 0:9fca2b23d0ba 61 * </pre>
marcozecchini 0:9fca2b23d0ba 62 *
marcozecchini 0:9fca2b23d0ba 63 * \par
marcozecchini 0:9fca2b23d0ba 64 * is defined as
marcozecchini 0:9fca2b23d0ba 65 * \image html ConvolutionEquation.gif
marcozecchini 0:9fca2b23d0ba 66 * \par
marcozecchini 0:9fca2b23d0ba 67 * Note that <code>c[n]</code> is of length <code>srcALen + srcBLen - 1</code> and is defined over the interval <code>n=0, 1, 2, ..., srcALen + srcBLen - 2</code>.
marcozecchini 0:9fca2b23d0ba 68 * <code>pSrcA</code> points to the first input vector of length <code>srcALen</code> and
marcozecchini 0:9fca2b23d0ba 69 * <code>pSrcB</code> points to the second input vector of length <code>srcBLen</code>.
marcozecchini 0:9fca2b23d0ba 70 * The output result is written to <code>pDst</code> and the calling function must allocate <code>srcALen+srcBLen-1</code> words for the result.
marcozecchini 0:9fca2b23d0ba 71 *
marcozecchini 0:9fca2b23d0ba 72 * \par
marcozecchini 0:9fca2b23d0ba 73 * Conceptually, when two signals <code>a[n]</code> and <code>b[n]</code> are convolved,
marcozecchini 0:9fca2b23d0ba 74 * the signal <code>b[n]</code> slides over <code>a[n]</code>.
marcozecchini 0:9fca2b23d0ba 75 * For each offset \c n, the overlapping portions of a[n] and b[n] are multiplied and summed together.
marcozecchini 0:9fca2b23d0ba 76 *
marcozecchini 0:9fca2b23d0ba 77 * \par
marcozecchini 0:9fca2b23d0ba 78 * Note that convolution is a commutative operation:
marcozecchini 0:9fca2b23d0ba 79 *
marcozecchini 0:9fca2b23d0ba 80 * <pre>
marcozecchini 0:9fca2b23d0ba 81 * a[n] * b[n] = b[n] * a[n].
marcozecchini 0:9fca2b23d0ba 82 * </pre>
marcozecchini 0:9fca2b23d0ba 83 *
marcozecchini 0:9fca2b23d0ba 84 * \par
marcozecchini 0:9fca2b23d0ba 85 * This means that switching the A and B arguments to the convolution functions has no effect.
marcozecchini 0:9fca2b23d0ba 86 *
marcozecchini 0:9fca2b23d0ba 87 * <b>Fixed-Point Behavior</b>
marcozecchini 0:9fca2b23d0ba 88 *
marcozecchini 0:9fca2b23d0ba 89 * \par
marcozecchini 0:9fca2b23d0ba 90 * Convolution requires summing up a large number of intermediate products.
marcozecchini 0:9fca2b23d0ba 91 * As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation.
marcozecchini 0:9fca2b23d0ba 92 * Refer to the function specific documentation below for further details of the particular algorithm used.
marcozecchini 0:9fca2b23d0ba 93 *
marcozecchini 0:9fca2b23d0ba 94 *
marcozecchini 0:9fca2b23d0ba 95 * <b>Fast Versions</b>
marcozecchini 0:9fca2b23d0ba 96 *
marcozecchini 0:9fca2b23d0ba 97 * \par
marcozecchini 0:9fca2b23d0ba 98 * Fast versions are supported for Q31 and Q15. The fast versions take fewer cycles than the standard Q31 and Q15 convolution functions, but the design requires
marcozecchini 0:9fca2b23d0ba 99 * the input signals to be scaled down to avoid intermediate overflows.
marcozecchini 0:9fca2b23d0ba 100 *
marcozecchini 0:9fca2b23d0ba 101 *
marcozecchini 0:9fca2b23d0ba 102 * <b>Opt Versions</b>
marcozecchini 0:9fca2b23d0ba 103 *
marcozecchini 0:9fca2b23d0ba 104 * \par
marcozecchini 0:9fca2b23d0ba 105 * Opt versions are supported for Q15 and Q7. The design uses an internal scratch buffer to achieve better optimisation.
marcozecchini 0:9fca2b23d0ba 106 * These versions are optimised for cycle count and consume more memory (scratch memory) than the standard Q15 and Q7 versions.
marcozecchini 0:9fca2b23d0ba 107 */
marcozecchini 0:9fca2b23d0ba 108
marcozecchini 0:9fca2b23d0ba 109 /**
marcozecchini 0:9fca2b23d0ba 110 * @addtogroup Conv
marcozecchini 0:9fca2b23d0ba 111 * @{
marcozecchini 0:9fca2b23d0ba 112 */
marcozecchini 0:9fca2b23d0ba 113
marcozecchini 0:9fca2b23d0ba 114 /**
marcozecchini 0:9fca2b23d0ba 115 * @brief Convolution of floating-point sequences.
marcozecchini 0:9fca2b23d0ba 116 * @param[in] *pSrcA points to the first input sequence.
marcozecchini 0:9fca2b23d0ba 117 * @param[in] srcALen length of the first input sequence.
marcozecchini 0:9fca2b23d0ba 118 * @param[in] *pSrcB points to the second input sequence.
marcozecchini 0:9fca2b23d0ba 119 * @param[in] srcBLen length of the second input sequence.
marcozecchini 0:9fca2b23d0ba 120 * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
marcozecchini 0:9fca2b23d0ba 121 * @return none.
marcozecchini 0:9fca2b23d0ba 122 */
marcozecchini 0:9fca2b23d0ba 123
marcozecchini 0:9fca2b23d0ba 124 void arm_conv_f32(
marcozecchini 0:9fca2b23d0ba 125 float32_t * pSrcA,
marcozecchini 0:9fca2b23d0ba 126 uint32_t srcALen,
marcozecchini 0:9fca2b23d0ba 127 float32_t * pSrcB,
marcozecchini 0:9fca2b23d0ba 128 uint32_t srcBLen,
marcozecchini 0:9fca2b23d0ba 129 float32_t * pDst)
marcozecchini 0:9fca2b23d0ba 130 {
marcozecchini 0:9fca2b23d0ba 131
marcozecchini 0:9fca2b23d0ba 132
marcozecchini 0:9fca2b23d0ba 133 #ifndef ARM_MATH_CM0_FAMILY
marcozecchini 0:9fca2b23d0ba 134
marcozecchini 0:9fca2b23d0ba 135 /* Run the below code for Cortex-M4 and Cortex-M3 */
marcozecchini 0:9fca2b23d0ba 136
marcozecchini 0:9fca2b23d0ba 137 float32_t *pIn1; /* inputA pointer */
marcozecchini 0:9fca2b23d0ba 138 float32_t *pIn2; /* inputB pointer */
marcozecchini 0:9fca2b23d0ba 139 float32_t *pOut = pDst; /* output pointer */
marcozecchini 0:9fca2b23d0ba 140 float32_t *px; /* Intermediate inputA pointer */
marcozecchini 0:9fca2b23d0ba 141 float32_t *py; /* Intermediate inputB pointer */
marcozecchini 0:9fca2b23d0ba 142 float32_t *pSrc1, *pSrc2; /* Intermediate pointers */
marcozecchini 0:9fca2b23d0ba 143 float32_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
marcozecchini 0:9fca2b23d0ba 144 float32_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
marcozecchini 0:9fca2b23d0ba 145 uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counters */
marcozecchini 0:9fca2b23d0ba 146
marcozecchini 0:9fca2b23d0ba 147 /* The algorithm implementation is based on the lengths of the inputs. */
marcozecchini 0:9fca2b23d0ba 148 /* srcB is always made to slide across srcA. */
marcozecchini 0:9fca2b23d0ba 149 /* So srcBLen is always considered as shorter or equal to srcALen */
marcozecchini 0:9fca2b23d0ba 150 if(srcALen >= srcBLen)
marcozecchini 0:9fca2b23d0ba 151 {
marcozecchini 0:9fca2b23d0ba 152 /* Initialization of inputA pointer */
marcozecchini 0:9fca2b23d0ba 153 pIn1 = pSrcA;
marcozecchini 0:9fca2b23d0ba 154
marcozecchini 0:9fca2b23d0ba 155 /* Initialization of inputB pointer */
marcozecchini 0:9fca2b23d0ba 156 pIn2 = pSrcB;
marcozecchini 0:9fca2b23d0ba 157 }
marcozecchini 0:9fca2b23d0ba 158 else
marcozecchini 0:9fca2b23d0ba 159 {
marcozecchini 0:9fca2b23d0ba 160 /* Initialization of inputA pointer */
marcozecchini 0:9fca2b23d0ba 161 pIn1 = pSrcB;
marcozecchini 0:9fca2b23d0ba 162
marcozecchini 0:9fca2b23d0ba 163 /* Initialization of inputB pointer */
marcozecchini 0:9fca2b23d0ba 164 pIn2 = pSrcA;
marcozecchini 0:9fca2b23d0ba 165
marcozecchini 0:9fca2b23d0ba 166 /* srcBLen is always considered as shorter or equal to srcALen */
marcozecchini 0:9fca2b23d0ba 167 j = srcBLen;
marcozecchini 0:9fca2b23d0ba 168 srcBLen = srcALen;
marcozecchini 0:9fca2b23d0ba 169 srcALen = j;
marcozecchini 0:9fca2b23d0ba 170 }
marcozecchini 0:9fca2b23d0ba 171
marcozecchini 0:9fca2b23d0ba 172 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
marcozecchini 0:9fca2b23d0ba 173 /* The function is internally
marcozecchini 0:9fca2b23d0ba 174 * divided into three stages according to the number of multiplications that has to be
marcozecchini 0:9fca2b23d0ba 175 * taken place between inputA samples and inputB samples. In the first stage of the
marcozecchini 0:9fca2b23d0ba 176 * algorithm, the multiplications increase by one for every iteration.
marcozecchini 0:9fca2b23d0ba 177 * In the second stage of the algorithm, srcBLen number of multiplications are done.
marcozecchini 0:9fca2b23d0ba 178 * In the third stage of the algorithm, the multiplications decrease by one
marcozecchini 0:9fca2b23d0ba 179 * for every iteration. */
marcozecchini 0:9fca2b23d0ba 180
marcozecchini 0:9fca2b23d0ba 181 /* The algorithm is implemented in three stages.
marcozecchini 0:9fca2b23d0ba 182 The loop counters of each stage is initiated here. */
marcozecchini 0:9fca2b23d0ba 183 blockSize1 = srcBLen - 1u;
marcozecchini 0:9fca2b23d0ba 184 blockSize2 = srcALen - (srcBLen - 1u);
marcozecchini 0:9fca2b23d0ba 185 blockSize3 = blockSize1;
marcozecchini 0:9fca2b23d0ba 186
marcozecchini 0:9fca2b23d0ba 187 /* --------------------------
marcozecchini 0:9fca2b23d0ba 188 * initializations of stage1
marcozecchini 0:9fca2b23d0ba 189 * -------------------------*/
marcozecchini 0:9fca2b23d0ba 190
marcozecchini 0:9fca2b23d0ba 191 /* sum = x[0] * y[0]
marcozecchini 0:9fca2b23d0ba 192 * sum = x[0] * y[1] + x[1] * y[0]
marcozecchini 0:9fca2b23d0ba 193 * ....
marcozecchini 0:9fca2b23d0ba 194 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
marcozecchini 0:9fca2b23d0ba 195 */
marcozecchini 0:9fca2b23d0ba 196
marcozecchini 0:9fca2b23d0ba 197 /* In this stage the MAC operations are increased by 1 for every iteration.
marcozecchini 0:9fca2b23d0ba 198 The count variable holds the number of MAC operations performed */
marcozecchini 0:9fca2b23d0ba 199 count = 1u;
marcozecchini 0:9fca2b23d0ba 200
marcozecchini 0:9fca2b23d0ba 201 /* Working pointer of inputA */
marcozecchini 0:9fca2b23d0ba 202 px = pIn1;
marcozecchini 0:9fca2b23d0ba 203
marcozecchini 0:9fca2b23d0ba 204 /* Working pointer of inputB */
marcozecchini 0:9fca2b23d0ba 205 py = pIn2;
marcozecchini 0:9fca2b23d0ba 206
marcozecchini 0:9fca2b23d0ba 207
marcozecchini 0:9fca2b23d0ba 208 /* ------------------------
marcozecchini 0:9fca2b23d0ba 209 * Stage1 process
marcozecchini 0:9fca2b23d0ba 210 * ----------------------*/
marcozecchini 0:9fca2b23d0ba 211
marcozecchini 0:9fca2b23d0ba 212 /* The first stage starts here */
marcozecchini 0:9fca2b23d0ba 213 while(blockSize1 > 0u)
marcozecchini 0:9fca2b23d0ba 214 {
marcozecchini 0:9fca2b23d0ba 215 /* Accumulator is made zero for every iteration */
marcozecchini 0:9fca2b23d0ba 216 sum = 0.0f;
marcozecchini 0:9fca2b23d0ba 217
marcozecchini 0:9fca2b23d0ba 218 /* Apply loop unrolling and compute 4 MACs simultaneously. */
marcozecchini 0:9fca2b23d0ba 219 k = count >> 2u;
marcozecchini 0:9fca2b23d0ba 220
marcozecchini 0:9fca2b23d0ba 221 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
marcozecchini 0:9fca2b23d0ba 222 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
marcozecchini 0:9fca2b23d0ba 223 while(k > 0u)
marcozecchini 0:9fca2b23d0ba 224 {
marcozecchini 0:9fca2b23d0ba 225 /* x[0] * y[srcBLen - 1] */
marcozecchini 0:9fca2b23d0ba 226 sum += *px++ * *py--;
marcozecchini 0:9fca2b23d0ba 227
marcozecchini 0:9fca2b23d0ba 228 /* x[1] * y[srcBLen - 2] */
marcozecchini 0:9fca2b23d0ba 229 sum += *px++ * *py--;
marcozecchini 0:9fca2b23d0ba 230
marcozecchini 0:9fca2b23d0ba 231 /* x[2] * y[srcBLen - 3] */
marcozecchini 0:9fca2b23d0ba 232 sum += *px++ * *py--;
marcozecchini 0:9fca2b23d0ba 233
marcozecchini 0:9fca2b23d0ba 234 /* x[3] * y[srcBLen - 4] */
marcozecchini 0:9fca2b23d0ba 235 sum += *px++ * *py--;
marcozecchini 0:9fca2b23d0ba 236
marcozecchini 0:9fca2b23d0ba 237 /* Decrement the loop counter */
marcozecchini 0:9fca2b23d0ba 238 k--;
marcozecchini 0:9fca2b23d0ba 239 }
marcozecchini 0:9fca2b23d0ba 240
marcozecchini 0:9fca2b23d0ba 241 /* If the count is not a multiple of 4, compute any remaining MACs here.
marcozecchini 0:9fca2b23d0ba 242 ** No loop unrolling is used. */
marcozecchini 0:9fca2b23d0ba 243 k = count % 0x4u;
marcozecchini 0:9fca2b23d0ba 244
marcozecchini 0:9fca2b23d0ba 245 while(k > 0u)
marcozecchini 0:9fca2b23d0ba 246 {
marcozecchini 0:9fca2b23d0ba 247 /* Perform the multiply-accumulate */
marcozecchini 0:9fca2b23d0ba 248 sum += *px++ * *py--;
marcozecchini 0:9fca2b23d0ba 249
marcozecchini 0:9fca2b23d0ba 250 /* Decrement the loop counter */
marcozecchini 0:9fca2b23d0ba 251 k--;
marcozecchini 0:9fca2b23d0ba 252 }
marcozecchini 0:9fca2b23d0ba 253
marcozecchini 0:9fca2b23d0ba 254 /* Store the result in the accumulator in the destination buffer. */
marcozecchini 0:9fca2b23d0ba 255 *pOut++ = sum;
marcozecchini 0:9fca2b23d0ba 256
marcozecchini 0:9fca2b23d0ba 257 /* Update the inputA and inputB pointers for next MAC calculation */
marcozecchini 0:9fca2b23d0ba 258 py = pIn2 + count;
marcozecchini 0:9fca2b23d0ba 259 px = pIn1;
marcozecchini 0:9fca2b23d0ba 260
marcozecchini 0:9fca2b23d0ba 261 /* Increment the MAC count */
marcozecchini 0:9fca2b23d0ba 262 count++;
marcozecchini 0:9fca2b23d0ba 263
marcozecchini 0:9fca2b23d0ba 264 /* Decrement the loop counter */
marcozecchini 0:9fca2b23d0ba 265 blockSize1--;
marcozecchini 0:9fca2b23d0ba 266 }
marcozecchini 0:9fca2b23d0ba 267
marcozecchini 0:9fca2b23d0ba 268 /* --------------------------
marcozecchini 0:9fca2b23d0ba 269 * Initializations of stage2
marcozecchini 0:9fca2b23d0ba 270 * ------------------------*/
marcozecchini 0:9fca2b23d0ba 271
marcozecchini 0:9fca2b23d0ba 272 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
marcozecchini 0:9fca2b23d0ba 273 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
marcozecchini 0:9fca2b23d0ba 274 * ....
marcozecchini 0:9fca2b23d0ba 275 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
marcozecchini 0:9fca2b23d0ba 276 */
marcozecchini 0:9fca2b23d0ba 277
marcozecchini 0:9fca2b23d0ba 278 /* Working pointer of inputA */
marcozecchini 0:9fca2b23d0ba 279 px = pIn1;
marcozecchini 0:9fca2b23d0ba 280
marcozecchini 0:9fca2b23d0ba 281 /* Working pointer of inputB */
marcozecchini 0:9fca2b23d0ba 282 pSrc2 = pIn2 + (srcBLen - 1u);
marcozecchini 0:9fca2b23d0ba 283 py = pSrc2;
marcozecchini 0:9fca2b23d0ba 284
marcozecchini 0:9fca2b23d0ba 285 /* count is index by which the pointer pIn1 to be incremented */
marcozecchini 0:9fca2b23d0ba 286 count = 0u;
marcozecchini 0:9fca2b23d0ba 287
marcozecchini 0:9fca2b23d0ba 288 /* -------------------
marcozecchini 0:9fca2b23d0ba 289 * Stage2 process
marcozecchini 0:9fca2b23d0ba 290 * ------------------*/
marcozecchini 0:9fca2b23d0ba 291
marcozecchini 0:9fca2b23d0ba 292 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
marcozecchini 0:9fca2b23d0ba 293 * So, to loop unroll over blockSize2,
marcozecchini 0:9fca2b23d0ba 294 * srcBLen should be greater than or equal to 4 */
marcozecchini 0:9fca2b23d0ba 295 if(srcBLen >= 4u)
marcozecchini 0:9fca2b23d0ba 296 {
marcozecchini 0:9fca2b23d0ba 297 /* Loop unroll over blockSize2, by 4 */
marcozecchini 0:9fca2b23d0ba 298 blkCnt = blockSize2 >> 2u;
marcozecchini 0:9fca2b23d0ba 299
marcozecchini 0:9fca2b23d0ba 300 while(blkCnt > 0u)
marcozecchini 0:9fca2b23d0ba 301 {
marcozecchini 0:9fca2b23d0ba 302 /* Set all accumulators to zero */
marcozecchini 0:9fca2b23d0ba 303 acc0 = 0.0f;
marcozecchini 0:9fca2b23d0ba 304 acc1 = 0.0f;
marcozecchini 0:9fca2b23d0ba 305 acc2 = 0.0f;
marcozecchini 0:9fca2b23d0ba 306 acc3 = 0.0f;
marcozecchini 0:9fca2b23d0ba 307
marcozecchini 0:9fca2b23d0ba 308 /* read x[0], x[1], x[2] samples */
marcozecchini 0:9fca2b23d0ba 309 x0 = *(px++);
marcozecchini 0:9fca2b23d0ba 310 x1 = *(px++);
marcozecchini 0:9fca2b23d0ba 311 x2 = *(px++);
marcozecchini 0:9fca2b23d0ba 312
marcozecchini 0:9fca2b23d0ba 313 /* Apply loop unrolling and compute 4 MACs simultaneously. */
marcozecchini 0:9fca2b23d0ba 314 k = srcBLen >> 2u;
marcozecchini 0:9fca2b23d0ba 315
marcozecchini 0:9fca2b23d0ba 316 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
marcozecchini 0:9fca2b23d0ba 317 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
marcozecchini 0:9fca2b23d0ba 318 do
marcozecchini 0:9fca2b23d0ba 319 {
marcozecchini 0:9fca2b23d0ba 320 /* Read y[srcBLen - 1] sample */
marcozecchini 0:9fca2b23d0ba 321 c0 = *(py--);
marcozecchini 0:9fca2b23d0ba 322
marcozecchini 0:9fca2b23d0ba 323 /* Read x[3] sample */
marcozecchini 0:9fca2b23d0ba 324 x3 = *(px);
marcozecchini 0:9fca2b23d0ba 325
marcozecchini 0:9fca2b23d0ba 326 /* Perform the multiply-accumulate */
marcozecchini 0:9fca2b23d0ba 327 /* acc0 += x[0] * y[srcBLen - 1] */
marcozecchini 0:9fca2b23d0ba 328 acc0 += x0 * c0;
marcozecchini 0:9fca2b23d0ba 329
marcozecchini 0:9fca2b23d0ba 330 /* acc1 += x[1] * y[srcBLen - 1] */
marcozecchini 0:9fca2b23d0ba 331 acc1 += x1 * c0;
marcozecchini 0:9fca2b23d0ba 332
marcozecchini 0:9fca2b23d0ba 333 /* acc2 += x[2] * y[srcBLen - 1] */
marcozecchini 0:9fca2b23d0ba 334 acc2 += x2 * c0;
marcozecchini 0:9fca2b23d0ba 335
marcozecchini 0:9fca2b23d0ba 336 /* acc3 += x[3] * y[srcBLen - 1] */
marcozecchini 0:9fca2b23d0ba 337 acc3 += x3 * c0;
marcozecchini 0:9fca2b23d0ba 338
marcozecchini 0:9fca2b23d0ba 339 /* Read y[srcBLen - 2] sample */
marcozecchini 0:9fca2b23d0ba 340 c0 = *(py--);
marcozecchini 0:9fca2b23d0ba 341
marcozecchini 0:9fca2b23d0ba 342 /* Read x[4] sample */
marcozecchini 0:9fca2b23d0ba 343 x0 = *(px + 1u);
marcozecchini 0:9fca2b23d0ba 344
marcozecchini 0:9fca2b23d0ba 345 /* Perform the multiply-accumulate */
marcozecchini 0:9fca2b23d0ba 346 /* acc0 += x[1] * y[srcBLen - 2] */
marcozecchini 0:9fca2b23d0ba 347 acc0 += x1 * c0;
marcozecchini 0:9fca2b23d0ba 348 /* acc1 += x[2] * y[srcBLen - 2] */
marcozecchini 0:9fca2b23d0ba 349 acc1 += x2 * c0;
marcozecchini 0:9fca2b23d0ba 350 /* acc2 += x[3] * y[srcBLen - 2] */
marcozecchini 0:9fca2b23d0ba 351 acc2 += x3 * c0;
marcozecchini 0:9fca2b23d0ba 352 /* acc3 += x[4] * y[srcBLen - 2] */
marcozecchini 0:9fca2b23d0ba 353 acc3 += x0 * c0;
marcozecchini 0:9fca2b23d0ba 354
marcozecchini 0:9fca2b23d0ba 355 /* Read y[srcBLen - 3] sample */
marcozecchini 0:9fca2b23d0ba 356 c0 = *(py--);
marcozecchini 0:9fca2b23d0ba 357
marcozecchini 0:9fca2b23d0ba 358 /* Read x[5] sample */
marcozecchini 0:9fca2b23d0ba 359 x1 = *(px + 2u);
marcozecchini 0:9fca2b23d0ba 360
marcozecchini 0:9fca2b23d0ba 361 /* Perform the multiply-accumulates */
marcozecchini 0:9fca2b23d0ba 362 /* acc0 += x[2] * y[srcBLen - 3] */
marcozecchini 0:9fca2b23d0ba 363 acc0 += x2 * c0;
marcozecchini 0:9fca2b23d0ba 364 /* acc1 += x[3] * y[srcBLen - 2] */
marcozecchini 0:9fca2b23d0ba 365 acc1 += x3 * c0;
marcozecchini 0:9fca2b23d0ba 366 /* acc2 += x[4] * y[srcBLen - 2] */
marcozecchini 0:9fca2b23d0ba 367 acc2 += x0 * c0;
marcozecchini 0:9fca2b23d0ba 368 /* acc3 += x[5] * y[srcBLen - 2] */
marcozecchini 0:9fca2b23d0ba 369 acc3 += x1 * c0;
marcozecchini 0:9fca2b23d0ba 370
marcozecchini 0:9fca2b23d0ba 371 /* Read y[srcBLen - 4] sample */
marcozecchini 0:9fca2b23d0ba 372 c0 = *(py--);
marcozecchini 0:9fca2b23d0ba 373
marcozecchini 0:9fca2b23d0ba 374 /* Read x[6] sample */
marcozecchini 0:9fca2b23d0ba 375 x2 = *(px + 3u);
marcozecchini 0:9fca2b23d0ba 376 px += 4u;
marcozecchini 0:9fca2b23d0ba 377
marcozecchini 0:9fca2b23d0ba 378 /* Perform the multiply-accumulates */
marcozecchini 0:9fca2b23d0ba 379 /* acc0 += x[3] * y[srcBLen - 4] */
marcozecchini 0:9fca2b23d0ba 380 acc0 += x3 * c0;
marcozecchini 0:9fca2b23d0ba 381 /* acc1 += x[4] * y[srcBLen - 4] */
marcozecchini 0:9fca2b23d0ba 382 acc1 += x0 * c0;
marcozecchini 0:9fca2b23d0ba 383 /* acc2 += x[5] * y[srcBLen - 4] */
marcozecchini 0:9fca2b23d0ba 384 acc2 += x1 * c0;
marcozecchini 0:9fca2b23d0ba 385 /* acc3 += x[6] * y[srcBLen - 4] */
marcozecchini 0:9fca2b23d0ba 386 acc3 += x2 * c0;
marcozecchini 0:9fca2b23d0ba 387
marcozecchini 0:9fca2b23d0ba 388
marcozecchini 0:9fca2b23d0ba 389 } while(--k);
marcozecchini 0:9fca2b23d0ba 390
marcozecchini 0:9fca2b23d0ba 391 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
marcozecchini 0:9fca2b23d0ba 392 ** No loop unrolling is used. */
marcozecchini 0:9fca2b23d0ba 393 k = srcBLen % 0x4u;
marcozecchini 0:9fca2b23d0ba 394
marcozecchini 0:9fca2b23d0ba 395 while(k > 0u)
marcozecchini 0:9fca2b23d0ba 396 {
marcozecchini 0:9fca2b23d0ba 397 /* Read y[srcBLen - 5] sample */
marcozecchini 0:9fca2b23d0ba 398 c0 = *(py--);
marcozecchini 0:9fca2b23d0ba 399
marcozecchini 0:9fca2b23d0ba 400 /* Read x[7] sample */
marcozecchini 0:9fca2b23d0ba 401 x3 = *(px++);
marcozecchini 0:9fca2b23d0ba 402
marcozecchini 0:9fca2b23d0ba 403 /* Perform the multiply-accumulates */
marcozecchini 0:9fca2b23d0ba 404 /* acc0 += x[4] * y[srcBLen - 5] */
marcozecchini 0:9fca2b23d0ba 405 acc0 += x0 * c0;
marcozecchini 0:9fca2b23d0ba 406 /* acc1 += x[5] * y[srcBLen - 5] */
marcozecchini 0:9fca2b23d0ba 407 acc1 += x1 * c0;
marcozecchini 0:9fca2b23d0ba 408 /* acc2 += x[6] * y[srcBLen - 5] */
marcozecchini 0:9fca2b23d0ba 409 acc2 += x2 * c0;
marcozecchini 0:9fca2b23d0ba 410 /* acc3 += x[7] * y[srcBLen - 5] */
marcozecchini 0:9fca2b23d0ba 411 acc3 += x3 * c0;
marcozecchini 0:9fca2b23d0ba 412
marcozecchini 0:9fca2b23d0ba 413 /* Reuse the present samples for the next MAC */
marcozecchini 0:9fca2b23d0ba 414 x0 = x1;
marcozecchini 0:9fca2b23d0ba 415 x1 = x2;
marcozecchini 0:9fca2b23d0ba 416 x2 = x3;
marcozecchini 0:9fca2b23d0ba 417
marcozecchini 0:9fca2b23d0ba 418 /* Decrement the loop counter */
marcozecchini 0:9fca2b23d0ba 419 k--;
marcozecchini 0:9fca2b23d0ba 420 }
marcozecchini 0:9fca2b23d0ba 421
marcozecchini 0:9fca2b23d0ba 422 /* Store the result in the accumulator in the destination buffer. */
marcozecchini 0:9fca2b23d0ba 423 *pOut++ = acc0;
marcozecchini 0:9fca2b23d0ba 424 *pOut++ = acc1;
marcozecchini 0:9fca2b23d0ba 425 *pOut++ = acc2;
marcozecchini 0:9fca2b23d0ba 426 *pOut++ = acc3;
marcozecchini 0:9fca2b23d0ba 427
marcozecchini 0:9fca2b23d0ba 428 /* Increment the pointer pIn1 index, count by 4 */
marcozecchini 0:9fca2b23d0ba 429 count += 4u;
marcozecchini 0:9fca2b23d0ba 430
marcozecchini 0:9fca2b23d0ba 431 /* Update the inputA and inputB pointers for next MAC calculation */
marcozecchini 0:9fca2b23d0ba 432 px = pIn1 + count;
marcozecchini 0:9fca2b23d0ba 433 py = pSrc2;
marcozecchini 0:9fca2b23d0ba 434
marcozecchini 0:9fca2b23d0ba 435
marcozecchini 0:9fca2b23d0ba 436 /* Decrement the loop counter */
marcozecchini 0:9fca2b23d0ba 437 blkCnt--;
marcozecchini 0:9fca2b23d0ba 438 }
marcozecchini 0:9fca2b23d0ba 439
marcozecchini 0:9fca2b23d0ba 440
marcozecchini 0:9fca2b23d0ba 441 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
marcozecchini 0:9fca2b23d0ba 442 ** No loop unrolling is used. */
marcozecchini 0:9fca2b23d0ba 443 blkCnt = blockSize2 % 0x4u;
marcozecchini 0:9fca2b23d0ba 444
marcozecchini 0:9fca2b23d0ba 445 while(blkCnt > 0u)
marcozecchini 0:9fca2b23d0ba 446 {
marcozecchini 0:9fca2b23d0ba 447 /* Accumulator is made zero for every iteration */
marcozecchini 0:9fca2b23d0ba 448 sum = 0.0f;
marcozecchini 0:9fca2b23d0ba 449
marcozecchini 0:9fca2b23d0ba 450 /* Apply loop unrolling and compute 4 MACs simultaneously. */
marcozecchini 0:9fca2b23d0ba 451 k = srcBLen >> 2u;
marcozecchini 0:9fca2b23d0ba 452
marcozecchini 0:9fca2b23d0ba 453 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
marcozecchini 0:9fca2b23d0ba 454 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
marcozecchini 0:9fca2b23d0ba 455 while(k > 0u)
marcozecchini 0:9fca2b23d0ba 456 {
marcozecchini 0:9fca2b23d0ba 457 /* Perform the multiply-accumulates */
marcozecchini 0:9fca2b23d0ba 458 sum += *px++ * *py--;
marcozecchini 0:9fca2b23d0ba 459 sum += *px++ * *py--;
marcozecchini 0:9fca2b23d0ba 460 sum += *px++ * *py--;
marcozecchini 0:9fca2b23d0ba 461 sum += *px++ * *py--;
marcozecchini 0:9fca2b23d0ba 462
marcozecchini 0:9fca2b23d0ba 463 /* Decrement the loop counter */
marcozecchini 0:9fca2b23d0ba 464 k--;
marcozecchini 0:9fca2b23d0ba 465 }
marcozecchini 0:9fca2b23d0ba 466
marcozecchini 0:9fca2b23d0ba 467 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
marcozecchini 0:9fca2b23d0ba 468 ** No loop unrolling is used. */
marcozecchini 0:9fca2b23d0ba 469 k = srcBLen % 0x4u;
marcozecchini 0:9fca2b23d0ba 470
marcozecchini 0:9fca2b23d0ba 471 while(k > 0u)
marcozecchini 0:9fca2b23d0ba 472 {
marcozecchini 0:9fca2b23d0ba 473 /* Perform the multiply-accumulate */
marcozecchini 0:9fca2b23d0ba 474 sum += *px++ * *py--;
marcozecchini 0:9fca2b23d0ba 475
marcozecchini 0:9fca2b23d0ba 476 /* Decrement the loop counter */
marcozecchini 0:9fca2b23d0ba 477 k--;
marcozecchini 0:9fca2b23d0ba 478 }
marcozecchini 0:9fca2b23d0ba 479
marcozecchini 0:9fca2b23d0ba 480 /* Store the result in the accumulator in the destination buffer. */
marcozecchini 0:9fca2b23d0ba 481 *pOut++ = sum;
marcozecchini 0:9fca2b23d0ba 482
marcozecchini 0:9fca2b23d0ba 483 /* Increment the MAC count */
marcozecchini 0:9fca2b23d0ba 484 count++;
marcozecchini 0:9fca2b23d0ba 485
marcozecchini 0:9fca2b23d0ba 486 /* Update the inputA and inputB pointers for next MAC calculation */
marcozecchini 0:9fca2b23d0ba 487 px = pIn1 + count;
marcozecchini 0:9fca2b23d0ba 488 py = pSrc2;
marcozecchini 0:9fca2b23d0ba 489
marcozecchini 0:9fca2b23d0ba 490 /* Decrement the loop counter */
marcozecchini 0:9fca2b23d0ba 491 blkCnt--;
marcozecchini 0:9fca2b23d0ba 492 }
marcozecchini 0:9fca2b23d0ba 493 }
marcozecchini 0:9fca2b23d0ba 494 else
marcozecchini 0:9fca2b23d0ba 495 {
marcozecchini 0:9fca2b23d0ba 496 /* If the srcBLen is not a multiple of 4,
marcozecchini 0:9fca2b23d0ba 497 * the blockSize2 loop cannot be unrolled by 4 */
marcozecchini 0:9fca2b23d0ba 498 blkCnt = blockSize2;
marcozecchini 0:9fca2b23d0ba 499
marcozecchini 0:9fca2b23d0ba 500 while(blkCnt > 0u)
marcozecchini 0:9fca2b23d0ba 501 {
marcozecchini 0:9fca2b23d0ba 502 /* Accumulator is made zero for every iteration */
marcozecchini 0:9fca2b23d0ba 503 sum = 0.0f;
marcozecchini 0:9fca2b23d0ba 504
marcozecchini 0:9fca2b23d0ba 505 /* srcBLen number of MACS should be performed */
marcozecchini 0:9fca2b23d0ba 506 k = srcBLen;
marcozecchini 0:9fca2b23d0ba 507
marcozecchini 0:9fca2b23d0ba 508 while(k > 0u)
marcozecchini 0:9fca2b23d0ba 509 {
marcozecchini 0:9fca2b23d0ba 510 /* Perform the multiply-accumulate */
marcozecchini 0:9fca2b23d0ba 511 sum += *px++ * *py--;
marcozecchini 0:9fca2b23d0ba 512
marcozecchini 0:9fca2b23d0ba 513 /* Decrement the loop counter */
marcozecchini 0:9fca2b23d0ba 514 k--;
marcozecchini 0:9fca2b23d0ba 515 }
marcozecchini 0:9fca2b23d0ba 516
marcozecchini 0:9fca2b23d0ba 517 /* Store the result in the accumulator in the destination buffer. */
marcozecchini 0:9fca2b23d0ba 518 *pOut++ = sum;
marcozecchini 0:9fca2b23d0ba 519
marcozecchini 0:9fca2b23d0ba 520 /* Increment the MAC count */
marcozecchini 0:9fca2b23d0ba 521 count++;
marcozecchini 0:9fca2b23d0ba 522
marcozecchini 0:9fca2b23d0ba 523 /* Update the inputA and inputB pointers for next MAC calculation */
marcozecchini 0:9fca2b23d0ba 524 px = pIn1 + count;
marcozecchini 0:9fca2b23d0ba 525 py = pSrc2;
marcozecchini 0:9fca2b23d0ba 526
marcozecchini 0:9fca2b23d0ba 527 /* Decrement the loop counter */
marcozecchini 0:9fca2b23d0ba 528 blkCnt--;
marcozecchini 0:9fca2b23d0ba 529 }
marcozecchini 0:9fca2b23d0ba 530 }
marcozecchini 0:9fca2b23d0ba 531
marcozecchini 0:9fca2b23d0ba 532
marcozecchini 0:9fca2b23d0ba 533 /* --------------------------
marcozecchini 0:9fca2b23d0ba 534 * Initializations of stage3
marcozecchini 0:9fca2b23d0ba 535 * -------------------------*/
marcozecchini 0:9fca2b23d0ba 536
marcozecchini 0:9fca2b23d0ba 537 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
marcozecchini 0:9fca2b23d0ba 538 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
marcozecchini 0:9fca2b23d0ba 539 * ....
marcozecchini 0:9fca2b23d0ba 540 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
marcozecchini 0:9fca2b23d0ba 541 * sum += x[srcALen-1] * y[srcBLen-1]
marcozecchini 0:9fca2b23d0ba 542 */
marcozecchini 0:9fca2b23d0ba 543
marcozecchini 0:9fca2b23d0ba 544 /* In this stage the number of MAC operations decreases by 1 with every iteration.
marcozecchini 0:9fca2b23d0ba 545 The blockSize3 variable holds the number of outputs left, which equals the number of MAC operations for the current output */
marcozecchini 0:9fca2b23d0ba 546
marcozecchini 0:9fca2b23d0ba 547 /* Working pointer of inputA */
marcozecchini 0:9fca2b23d0ba 548 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
marcozecchini 0:9fca2b23d0ba 549 px = pSrc1;
marcozecchini 0:9fca2b23d0ba 550
marcozecchini 0:9fca2b23d0ba 551 /* Working pointer of inputB */
marcozecchini 0:9fca2b23d0ba 552 pSrc2 = pIn2 + (srcBLen - 1u);
marcozecchini 0:9fca2b23d0ba 553 py = pSrc2;
marcozecchini 0:9fca2b23d0ba 554
marcozecchini 0:9fca2b23d0ba 555 /* -------------------
marcozecchini 0:9fca2b23d0ba 556 * Stage3 process
marcozecchini 0:9fca2b23d0ba 557 * ------------------*/
marcozecchini 0:9fca2b23d0ba 558
marcozecchini 0:9fca2b23d0ba 559 while(blockSize3 > 0u)
marcozecchini 0:9fca2b23d0ba 560 {
marcozecchini 0:9fca2b23d0ba 561 /* Accumulator is made zero for every iteration */
marcozecchini 0:9fca2b23d0ba 562 sum = 0.0f;
marcozecchini 0:9fca2b23d0ba 563
marcozecchini 0:9fca2b23d0ba 564 /* Apply loop unrolling and compute 4 MACs simultaneously. */
marcozecchini 0:9fca2b23d0ba 565 k = blockSize3 >> 2u;
marcozecchini 0:9fca2b23d0ba 566
marcozecchini 0:9fca2b23d0ba 567 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
marcozecchini 0:9fca2b23d0ba 568 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
marcozecchini 0:9fca2b23d0ba 569 while(k > 0u)
marcozecchini 0:9fca2b23d0ba 570 {
marcozecchini 0:9fca2b23d0ba 571 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
marcozecchini 0:9fca2b23d0ba 572 sum += *px++ * *py--;
marcozecchini 0:9fca2b23d0ba 573
marcozecchini 0:9fca2b23d0ba 574 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
marcozecchini 0:9fca2b23d0ba 575 sum += *px++ * *py--;
marcozecchini 0:9fca2b23d0ba 576
marcozecchini 0:9fca2b23d0ba 577 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
marcozecchini 0:9fca2b23d0ba 578 sum += *px++ * *py--;
marcozecchini 0:9fca2b23d0ba 579
marcozecchini 0:9fca2b23d0ba 580 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
marcozecchini 0:9fca2b23d0ba 581 sum += *px++ * *py--;
marcozecchini 0:9fca2b23d0ba 582
marcozecchini 0:9fca2b23d0ba 583 /* Decrement the loop counter */
marcozecchini 0:9fca2b23d0ba 584 k--;
marcozecchini 0:9fca2b23d0ba 585 }
marcozecchini 0:9fca2b23d0ba 586
marcozecchini 0:9fca2b23d0ba 587 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
marcozecchini 0:9fca2b23d0ba 588 ** No loop unrolling is used. */
marcozecchini 0:9fca2b23d0ba 589 k = blockSize3 % 0x4u;
marcozecchini 0:9fca2b23d0ba 590
marcozecchini 0:9fca2b23d0ba 591 while(k > 0u)
marcozecchini 0:9fca2b23d0ba 592 {
marcozecchini 0:9fca2b23d0ba 593 /* Perform the multiply-accumulates */
marcozecchini 0:9fca2b23d0ba 594 /* sum += x[srcALen-1] * y[srcBLen-1] */
marcozecchini 0:9fca2b23d0ba 595 sum += *px++ * *py--;
marcozecchini 0:9fca2b23d0ba 596
marcozecchini 0:9fca2b23d0ba 597 /* Decrement the loop counter */
marcozecchini 0:9fca2b23d0ba 598 k--;
marcozecchini 0:9fca2b23d0ba 599 }
marcozecchini 0:9fca2b23d0ba 600
marcozecchini 0:9fca2b23d0ba 601 /* Store the accumulated result in the destination buffer. */
marcozecchini 0:9fca2b23d0ba 602 *pOut++ = sum;
marcozecchini 0:9fca2b23d0ba 603
marcozecchini 0:9fca2b23d0ba 604 /* Update the inputA and inputB pointers for next MAC calculation */
marcozecchini 0:9fca2b23d0ba 605 px = ++pSrc1;
marcozecchini 0:9fca2b23d0ba 606 py = pSrc2;
marcozecchini 0:9fca2b23d0ba 607
marcozecchini 0:9fca2b23d0ba 608 /* Decrement the loop counter */
marcozecchini 0:9fca2b23d0ba 609 blockSize3--;
marcozecchini 0:9fca2b23d0ba 610 }
marcozecchini 0:9fca2b23d0ba 611
marcozecchini 0:9fca2b23d0ba 612 #else
marcozecchini 0:9fca2b23d0ba 613
marcozecchini 0:9fca2b23d0ba 614 /* Run the below code for Cortex-M0 */
marcozecchini 0:9fca2b23d0ba 615
marcozecchini 0:9fca2b23d0ba 616 float32_t *pIn1 = pSrcA; /* inputA pointer */
marcozecchini 0:9fca2b23d0ba 617 float32_t *pIn2 = pSrcB; /* inputB pointer */
marcozecchini 0:9fca2b23d0ba 618 float32_t sum; /* Accumulator */
marcozecchini 0:9fca2b23d0ba 619 uint32_t i, j; /* loop counters */
marcozecchini 0:9fca2b23d0ba 620
marcozecchini 0:9fca2b23d0ba 621 /* Loop to calculate convolution for output length number of times */
marcozecchini 0:9fca2b23d0ba 622 for (i = 0u; i < ((srcALen + srcBLen) - 1u); i++)
marcozecchini 0:9fca2b23d0ba 623 {
marcozecchini 0:9fca2b23d0ba 624 /* Initialize sum with zero to carry out MAC operations */
marcozecchini 0:9fca2b23d0ba 625 sum = 0.0f;
marcozecchini 0:9fca2b23d0ba 626
marcozecchini 0:9fca2b23d0ba 627 /* Loop to perform MAC operations according to convolution equation */
marcozecchini 0:9fca2b23d0ba 628 for (j = 0u; j <= i; j++)
marcozecchini 0:9fca2b23d0ba 629 {
marcozecchini 0:9fca2b23d0ba 630 /* Check the array limitations */
marcozecchini 0:9fca2b23d0ba 631 if((((i - j) < srcBLen) && (j < srcALen)))
marcozecchini 0:9fca2b23d0ba 632 {
marcozecchini 0:9fca2b23d0ba 633 /* z[i] += x[j] * y[i-j] */
marcozecchini 0:9fca2b23d0ba 634 sum += pIn1[j] * pIn2[i - j];
marcozecchini 0:9fca2b23d0ba 635 }
marcozecchini 0:9fca2b23d0ba 636 }
marcozecchini 0:9fca2b23d0ba 637 /* Store the output in the destination buffer */
marcozecchini 0:9fca2b23d0ba 638 pDst[i] = sum;
marcozecchini 0:9fca2b23d0ba 639 }
marcozecchini 0:9fca2b23d0ba 640
marcozecchini 0:9fca2b23d0ba 641 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
marcozecchini 0:9fca2b23d0ba 642
marcozecchini 0:9fca2b23d0ba 643 }
marcozecchini 0:9fca2b23d0ba 644
marcozecchini 0:9fca2b23d0ba 645 /**
marcozecchini 0:9fca2b23d0ba 646 * @} end of Conv group
marcozecchini 0:9fca2b23d0ba 647 */