Dependencies:   DHT mbed

Committer:
elt14lpo
Date:
Fri May 12 08:01:26 2017 +0000
Revision:
4:a89e836d9faf
Parent:
3:3f71950ceb71

        

Who changed what in which revision?

User | Revision | Line number | New contents of line
elt14lpo 2:1161fea84522 1 /* ----------------------------------------------------------------------------
elt14lpo 2:1161fea84522 2 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
elt14lpo 2:1161fea84522 3 *
elt14lpo 2:1161fea84522 4 * $Date: 19. March 2015
elt14lpo 2:1161fea84522 5 * $Revision: V.1.4.5
elt14lpo 2:1161fea84522 6 *
elt14lpo 2:1161fea84522 7 * Project: CMSIS DSP Library
elt14lpo 2:1161fea84522 8 * Title: arm_correlate_f32.c
elt14lpo 2:1161fea84522 9 *
elt14lpo 2:1161fea84522 10 * Description: Correlation of floating-point sequences.
elt14lpo 2:1161fea84522 11 *
elt14lpo 2:1161fea84522 12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
elt14lpo 2:1161fea84522 13 *
elt14lpo 2:1161fea84522 14 * Redistribution and use in source and binary forms, with or without
elt14lpo 2:1161fea84522 15 * modification, are permitted provided that the following conditions
elt14lpo 2:1161fea84522 16 * are met:
elt14lpo 2:1161fea84522 17 * - Redistributions of source code must retain the above copyright
elt14lpo 2:1161fea84522 18 * notice, this list of conditions and the following disclaimer.
elt14lpo 2:1161fea84522 19 * - Redistributions in binary form must reproduce the above copyright
elt14lpo 2:1161fea84522 20 * notice, this list of conditions and the following disclaimer in
elt14lpo 2:1161fea84522 21 * the documentation and/or other materials provided with the
elt14lpo 2:1161fea84522 22 * distribution.
elt14lpo 2:1161fea84522 23 * - Neither the name of ARM LIMITED nor the names of its contributors
elt14lpo 2:1161fea84522 24 * may be used to endorse or promote products derived from this
elt14lpo 2:1161fea84522 25 * software without specific prior written permission.
elt14lpo 1:d0884279b41d 26 *
elt14lpo 2:1161fea84522 27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
elt14lpo 2:1161fea84522 28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
elt14lpo 2:1161fea84522 29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
elt14lpo 2:1161fea84522 30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
elt14lpo 2:1161fea84522 31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
elt14lpo 2:1161fea84522 32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
elt14lpo 2:1161fea84522 33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
elt14lpo 2:1161fea84522 34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
elt14lpo 2:1161fea84522 35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
elt14lpo 2:1161fea84522 36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
elt14lpo 2:1161fea84522 37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
elt14lpo 2:1161fea84522 38 * POSSIBILITY OF SUCH DAMAGE.
elt14lpo 1:d0884279b41d 39 * -------------------------------------------------------------------------- */
elt14lpo 1:d0884279b41d 40
elt14lpo 1:d0884279b41d 41 #include "arm_math.h"
elt14lpo 3:3f71950ceb71 42 #include "luke_correlate_f32.h"
elt14lpo 1:d0884279b41d 43
elt14lpo 2:1161fea84522 44 /**
elt14lpo 2:1161fea84522 45 * @ingroup groupFilters
elt14lpo 1:d0884279b41d 46 */
elt14lpo 1:d0884279b41d 47
elt14lpo 2:1161fea84522 48 /**
elt14lpo 2:1161fea84522 49 * @defgroup Corr Correlation
elt14lpo 2:1161fea84522 50 *
elt14lpo 2:1161fea84522 51 * Correlation is a mathematical operation that is similar to convolution.
elt14lpo 2:1161fea84522 52 * As with convolution, correlation uses two signals to produce a third signal.
elt14lpo 2:1161fea84522 53 * The underlying algorithms in correlation and convolution are identical except that one of the inputs is flipped in convolution.
elt14lpo 2:1161fea84522 54 * Correlation is commonly used to measure the similarity between two signals.
elt14lpo 2:1161fea84522 55 * It has applications in pattern recognition, cryptanalysis, and searching.
elt14lpo 2:1161fea84522 56 * The CMSIS library provides correlation functions for Q7, Q15, Q31 and floating-point data types.
elt14lpo 2:1161fea84522 57 * Fast versions of the Q15 and Q31 functions are also provided.
elt14lpo 2:1161fea84522 58 *
elt14lpo 2:1161fea84522 59 * \par Algorithm
elt14lpo 2:1161fea84522 60 * Let <code>a[n]</code> and <code>b[n]</code> be sequences of length <code>srcALen</code> and <code>srcBLen</code> samples respectively.
elt14lpo 2:1161fea84522 61 * The convolution of the two signals is denoted by
elt14lpo 2:1161fea84522 62 * <pre>
elt14lpo 2:1161fea84522 63 * c[n] = a[n] * b[n]
elt14lpo 2:1161fea84522 64 * </pre>
elt14lpo 2:1161fea84522 65 * In correlation, one of the signals is flipped in time
elt14lpo 2:1161fea84522 66 * <pre>
elt14lpo 2:1161fea84522 67 * c[n] = a[n] * b[-n]
elt14lpo 2:1161fea84522 68 * </pre>
elt14lpo 2:1161fea84522 69 *
elt14lpo 2:1161fea84522 70 * \par
elt14lpo 2:1161fea84522 71 * and this is mathematically defined as
elt14lpo 2:1161fea84522 72 * \image html CorrelateEquation.gif
elt14lpo 2:1161fea84522 73 * \par
elt14lpo 2:1161fea84522 74 * The <code>pSrcA</code> points to the first input vector of length <code>srcALen</code> and <code>pSrcB</code> points to the second input vector of length <code>srcBLen</code>.
elt14lpo 2:1161fea84522 75 * The result <code>c[n]</code> is of length <code>2 * max(srcALen, srcBLen) - 1</code> and is defined over the interval <code>n=0, 1, 2, ..., (2 * max(srcALen, srcBLen) - 2)</code>.
elt14lpo 2:1161fea84522 76 * The output result is written to <code>pDst</code> and the calling function must allocate <code>2 * max(srcALen, srcBLen) - 1</code> words for the result.
elt14lpo 2:1161fea84522 77 *
elt14lpo 2:1161fea84522 78 * <b>Note</b>
elt14lpo 2:1161fea84522 79 * \par
elt14lpo 2:1161fea84522 80 * The <code>pDst</code> should be initialized to all zeros before being used.
elt14lpo 2:1161fea84522 81 *
elt14lpo 2:1161fea84522 82 * <b>Fixed-Point Behavior</b>
elt14lpo 2:1161fea84522 83 * \par
elt14lpo 2:1161fea84522 84 * Correlation requires summing up a large number of intermediate products.
elt14lpo 2:1161fea84522 85 * As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation.
elt14lpo 2:1161fea84522 86 * Refer to the function specific documentation below for further details of the particular algorithm used.
elt14lpo 1:d0884279b41d 87 *
elt14lpo 2:1161fea84522 88 *
elt14lpo 2:1161fea84522 89 * <b>Fast Versions</b>
elt14lpo 2:1161fea84522 90 *
elt14lpo 2:1161fea84522 91 * \par
elt14lpo 2:1161fea84522 92 * Fast versions are supported for Q31 and Q15. The fast versions take fewer cycles than the standard Q31 and Q15 correlate functions, but the design requires
elt14lpo 2:1161fea84522 93 * the input signals to be scaled down to avoid intermediate overflows.
elt14lpo 1:d0884279b41d 94 *
elt14lpo 2:1161fea84522 95 *
elt14lpo 2:1161fea84522 96 * <b>Opt Versions</b>
elt14lpo 1:d0884279b41d 97 *
elt14lpo 2:1161fea84522 98 * \par
elt14lpo 2:1161fea84522 99 * Opt versions are supported for Q15 and Q7. The design uses an internal scratch buffer to achieve better optimisation.
elt14lpo 2:1161fea84522 100 * These versions are optimised for cycles and consume more memory (scratch memory) than the standard Q15 and Q7 versions of correlate.
elt14lpo 1:d0884279b41d 101 */
elt14lpo 1:d0884279b41d 102
elt14lpo 2:1161fea84522 103 /**
elt14lpo 2:1161fea84522 104 * @addtogroup Corr
elt14lpo 2:1161fea84522 105 * @{
elt14lpo 1:d0884279b41d 106 */
elt14lpo 2:1161fea84522 107 /**
elt14lpo 2:1161fea84522 108 * @brief Correlation of floating-point sequences.
elt14lpo 2:1161fea84522 109 * @param[in] *pSrcA points to the first input sequence.
elt14lpo 2:1161fea84522 110 * @param[in] srcALen length of the first input sequence.
elt14lpo 2:1161fea84522 111 * @param[in] *pSrcB points to the second input sequence.
elt14lpo 2:1161fea84522 112 * @param[in] srcBLen length of the second input sequence.
elt14lpo 2:1161fea84522 113 * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
elt14lpo 2:1161fea84522 114 * @return none.
elt14lpo 1:d0884279b41d 115 */
elt14lpo 1:d0884279b41d 116
elt14lpo 3:3f71950ceb71 117 void luke_correlate_f32(
elt14lpo 4:a89e836d9faf 118 float* pSrcA,
elt14lpo 2:1161fea84522 119 int srcALen,
elt14lpo 4:a89e836d9faf 120 float* pSrcB,
elt14lpo 2:1161fea84522 121 int srcBLen,
elt14lpo 4:a89e836d9faf 122 float* pDst)
elt14lpo 2:1161fea84522 123 {
elt14lpo 2:1161fea84522 124
elt14lpo 2:1161fea84522 125
elt14lpo 2:1161fea84522 126 #ifndef ARM_MATH_CM0_FAMILY
elt14lpo 2:1161fea84522 127
elt14lpo 2:1161fea84522 128 /* Run the below code for Cortex-M4 and Cortex-M3 */
elt14lpo 2:1161fea84522 129
elt14lpo 2:1161fea84522 130 float *pIn1; /* inputA pointer */
elt14lpo 2:1161fea84522 131 float *pIn2; /* inputB pointer */
elt14lpo 2:1161fea84522 132 float *pOut = pDst; /* output pointer */
elt14lpo 2:1161fea84522 133 float *px; /* Intermediate inputA pointer */
elt14lpo 2:1161fea84522 134 float *py; /* Intermediate inputB pointer */
elt14lpo 2:1161fea84522 135 float *pSrc1; /* Intermediate pointers */
elt14lpo 2:1161fea84522 136 float sum, acc0, acc1, acc2, acc3; /* Accumulators */
elt14lpo 2:1161fea84522 137 float x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */
elt14lpo 2:1161fea84522 138 int j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counters */
elt14lpo 2:1161fea84522 139 int32_t inc = 1; /* Destination address modifier */
elt14lpo 1:d0884279b41d 140
elt14lpo 1:d0884279b41d 141
elt14lpo 2:1161fea84522 142 /* The algorithm implementation is based on the lengths of the inputs. */
elt14lpo 2:1161fea84522 143 /* srcB is always made to slide across srcA. */
elt14lpo 2:1161fea84522 144 /* So srcBLen is always considered as shorter or equal to srcALen */
elt14lpo 2:1161fea84522 145 /* But CORR(x, y) is reverse of CORR(y, x) */
elt14lpo 2:1161fea84522 146 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
elt14lpo 2:1161fea84522 147 /* and the destination pointer modifier, inc is set to -1 */
elt14lpo 2:1161fea84522 148 /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
elt14lpo 2:1161fea84522 149 /* But to improve the performance,
elt14lpo 2:1161fea84522 150 * we assume zeroes in the output instead of zero padding either of the the inputs*/
elt14lpo 2:1161fea84522 151 /* If srcALen > srcBLen,
elt14lpo 2:1161fea84522 152 * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
elt14lpo 2:1161fea84522 153 /* If srcALen < srcBLen,
elt14lpo 2:1161fea84522 154 * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
elt14lpo 2:1161fea84522 155 if(srcALen >= srcBLen)
elt14lpo 2:1161fea84522 156 {
elt14lpo 2:1161fea84522 157 /* Initialization of inputA pointer */
elt14lpo 2:1161fea84522 158 pIn1 = pSrcA;
elt14lpo 2:1161fea84522 159
elt14lpo 2:1161fea84522 160 /* Initialization of inputB pointer */
elt14lpo 2:1161fea84522 161 pIn2 = pSrcB;
elt14lpo 2:1161fea84522 162
elt14lpo 2:1161fea84522 163 /* Number of output samples is calculated */
elt14lpo 2:1161fea84522 164 outBlockSize = (2u * srcALen) - 1u;
elt14lpo 2:1161fea84522 165
elt14lpo 2:1161fea84522 166 /* When srcALen > srcBLen, zero padding has to be done to srcB
elt14lpo 2:1161fea84522 167 * to make their lengths equal.
elt14lpo 2:1161fea84522 168 * Instead, (outBlockSize - (srcALen + srcBLen - 1))
elt14lpo 2:1161fea84522 169 * number of output samples are made zero */
elt14lpo 2:1161fea84522 170 j = outBlockSize - (srcALen + (srcBLen - 1u));
elt14lpo 1:d0884279b41d 171
elt14lpo 2:1161fea84522 172 /* Updating the pointer position to non zero value */
elt14lpo 2:1161fea84522 173 pOut += j;
elt14lpo 2:1161fea84522 174
elt14lpo 2:1161fea84522 175 //while(j > 0u)
elt14lpo 2:1161fea84522 176 //{
elt14lpo 2:1161fea84522 177 // /* Zero is stored in the destination buffer */
elt14lpo 2:1161fea84522 178 // *pOut++ = 0.0f;
elt14lpo 2:1161fea84522 179
elt14lpo 2:1161fea84522 180 // /* Decrement the loop counter */
elt14lpo 2:1161fea84522 181 // j--;
elt14lpo 2:1161fea84522 182 //}
elt14lpo 1:d0884279b41d 183
elt14lpo 2:1161fea84522 184 }
elt14lpo 2:1161fea84522 185 else
elt14lpo 2:1161fea84522 186 {
elt14lpo 2:1161fea84522 187 /* Initialization of inputA pointer */
elt14lpo 2:1161fea84522 188 pIn1 = pSrcB;
elt14lpo 2:1161fea84522 189
elt14lpo 2:1161fea84522 190 /* Initialization of inputB pointer */
elt14lpo 2:1161fea84522 191 pIn2 = pSrcA;
elt14lpo 2:1161fea84522 192
elt14lpo 2:1161fea84522 193 /* srcBLen is always considered as shorter or equal to srcALen */
elt14lpo 2:1161fea84522 194 j = srcBLen;
elt14lpo 2:1161fea84522 195 srcBLen = srcALen;
elt14lpo 2:1161fea84522 196 srcALen = j;
elt14lpo 2:1161fea84522 197
elt14lpo 2:1161fea84522 198 /* CORR(x, y) = Reverse order(CORR(y, x)) */
elt14lpo 2:1161fea84522 199 /* Hence set the destination pointer to point to the last output sample */
elt14lpo 2:1161fea84522 200 pOut = pDst + ((srcALen + srcBLen) - 2u);
elt14lpo 2:1161fea84522 201
elt14lpo 2:1161fea84522 202 /* Destination address modifier is set to -1 */
elt14lpo 2:1161fea84522 203 inc = -1;
elt14lpo 2:1161fea84522 204
elt14lpo 2:1161fea84522 205 }
elt14lpo 1:d0884279b41d 206
elt14lpo 2:1161fea84522 207 /* The function is internally
elt14lpo 2:1161fea84522 208 * divided into three parts according to the number of multiplications that has to be
elt14lpo 2:1161fea84522 209 * taken place between inputA samples and inputB samples. In the first part of the
elt14lpo 2:1161fea84522 210 * algorithm, the multiplications increase by one for every iteration.
elt14lpo 2:1161fea84522 211 * In the second part of the algorithm, srcBLen number of multiplications are done.
elt14lpo 2:1161fea84522 212 * In the third part of the algorithm, the multiplications decrease by one
elt14lpo 2:1161fea84522 213 * for every iteration.*/
elt14lpo 2:1161fea84522 214 /* The algorithm is implemented in three stages.
elt14lpo 2:1161fea84522 215 * The loop counters of each stage is initiated here. */
elt14lpo 2:1161fea84522 216 blockSize1 = srcBLen - 1u;
elt14lpo 2:1161fea84522 217 blockSize2 = srcALen - (srcBLen - 1u);
elt14lpo 2:1161fea84522 218 blockSize3 = blockSize1;
elt14lpo 2:1161fea84522 219
elt14lpo 2:1161fea84522 220 /* --------------------------
elt14lpo 2:1161fea84522 221 * Initializations of stage1
elt14lpo 2:1161fea84522 222 * -------------------------*/
elt14lpo 1:d0884279b41d 223
elt14lpo 2:1161fea84522 224 /* sum = x[0] * y[srcBlen - 1]
elt14lpo 2:1161fea84522 225 * sum = x[0] * y[srcBlen-2] + x[1] * y[srcBlen - 1]
elt14lpo 2:1161fea84522 226 * ....
elt14lpo 2:1161fea84522 227 * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
elt14lpo 2:1161fea84522 228 */
elt14lpo 2:1161fea84522 229
elt14lpo 2:1161fea84522 230 /* In this stage the MAC operations are increased by 1 for every iteration.
elt14lpo 2:1161fea84522 231 The count variable holds the number of MAC operations performed */
elt14lpo 2:1161fea84522 232 count = 1u;
elt14lpo 1:d0884279b41d 233
elt14lpo 2:1161fea84522 234 /* Working pointer of inputA */
elt14lpo 2:1161fea84522 235 px = pIn1;
elt14lpo 2:1161fea84522 236
elt14lpo 2:1161fea84522 237 /* Working pointer of inputB */
elt14lpo 2:1161fea84522 238 pSrc1 = pIn2 + (srcBLen - 1u);
elt14lpo 2:1161fea84522 239 py = pSrc1;
elt14lpo 2:1161fea84522 240
elt14lpo 2:1161fea84522 241 /* ------------------------
elt14lpo 2:1161fea84522 242 * Stage1 process
elt14lpo 2:1161fea84522 243 * ----------------------*/
elt14lpo 1:d0884279b41d 244
elt14lpo 2:1161fea84522 245 /* The first stage starts here */
elt14lpo 2:1161fea84522 246 while(blockSize1 > 0u)
elt14lpo 2:1161fea84522 247 {
elt14lpo 2:1161fea84522 248 /* Accumulator is made zero for every iteration */
elt14lpo 2:1161fea84522 249 sum = 0.0f;
elt14lpo 1:d0884279b41d 250
elt14lpo 2:1161fea84522 251 /* Apply loop unrolling and compute 4 MACs simultaneously. */
elt14lpo 2:1161fea84522 252 k = count >> 2u;
elt14lpo 1:d0884279b41d 253
elt14lpo 2:1161fea84522 254 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
elt14lpo 2:1161fea84522 255 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
elt14lpo 2:1161fea84522 256 while(k > 0u)
elt14lpo 2:1161fea84522 257 {
elt14lpo 2:1161fea84522 258 /* x[0] * y[srcBLen - 4] */
elt14lpo 2:1161fea84522 259 sum += *px++ * *py++;
elt14lpo 2:1161fea84522 260 /* x[1] * y[srcBLen - 3] */
elt14lpo 2:1161fea84522 261 sum += *px++ * *py++;
elt14lpo 2:1161fea84522 262 /* x[2] * y[srcBLen - 2] */
elt14lpo 2:1161fea84522 263 sum += *px++ * *py++;
elt14lpo 2:1161fea84522 264 /* x[3] * y[srcBLen - 1] */
elt14lpo 2:1161fea84522 265 sum += *px++ * *py++;
elt14lpo 1:d0884279b41d 266
elt14lpo 2:1161fea84522 267 /* Decrement the loop counter */
elt14lpo 2:1161fea84522 268 k--;
elt14lpo 2:1161fea84522 269 }
elt14lpo 2:1161fea84522 270
elt14lpo 2:1161fea84522 271 /* If the count is not a multiple of 4, compute any remaining MACs here.
elt14lpo 2:1161fea84522 272 ** No loop unrolling is used. */
elt14lpo 2:1161fea84522 273 k = count % 0x4u;
elt14lpo 1:d0884279b41d 274
elt14lpo 2:1161fea84522 275 while(k > 0u)
elt14lpo 2:1161fea84522 276 {
elt14lpo 2:1161fea84522 277 /* Perform the multiply-accumulate */
elt14lpo 2:1161fea84522 278 /* x[0] * y[srcBLen - 1] */
elt14lpo 2:1161fea84522 279 sum += *px++ * *py++;
elt14lpo 1:d0884279b41d 280
elt14lpo 2:1161fea84522 281 /* Decrement the loop counter */
elt14lpo 2:1161fea84522 282 k--;
elt14lpo 1:d0884279b41d 283 }
elt14lpo 1:d0884279b41d 284
elt14lpo 2:1161fea84522 285 /* Store the result in the accumulator in the destination buffer. */
elt14lpo 2:1161fea84522 286 *pOut = sum;
elt14lpo 2:1161fea84522 287 /* Destination pointer is updated according to the address modifier, inc */
elt14lpo 2:1161fea84522 288 pOut += inc;
elt14lpo 1:d0884279b41d 289
elt14lpo 2:1161fea84522 290 /* Update the inputA and inputB pointers for next MAC calculation */
elt14lpo 2:1161fea84522 291 py = pSrc1 - count;
elt14lpo 1:d0884279b41d 292 px = pIn1;
elt14lpo 1:d0884279b41d 293
elt14lpo 2:1161fea84522 294 /* Increment the MAC count */
elt14lpo 2:1161fea84522 295 count++;
elt14lpo 2:1161fea84522 296
elt14lpo 2:1161fea84522 297 /* Decrement the loop counter */
elt14lpo 2:1161fea84522 298 blockSize1--;
elt14lpo 2:1161fea84522 299 }
elt14lpo 2:1161fea84522 300
elt14lpo 2:1161fea84522 301 /* --------------------------
elt14lpo 2:1161fea84522 302 * Initializations of stage2
elt14lpo 2:1161fea84522 303 * ------------------------*/
elt14lpo 1:d0884279b41d 304
elt14lpo 2:1161fea84522 305 /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
elt14lpo 2:1161fea84522 306 * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
elt14lpo 2:1161fea84522 307 * ....
elt14lpo 2:1161fea84522 308 * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
elt14lpo 2:1161fea84522 309 */
elt14lpo 2:1161fea84522 310
elt14lpo 2:1161fea84522 311 /* Working pointer of inputA */
elt14lpo 2:1161fea84522 312 px = pIn1;
elt14lpo 1:d0884279b41d 313
elt14lpo 2:1161fea84522 314 /* Working pointer of inputB */
elt14lpo 2:1161fea84522 315 py = pIn2;
elt14lpo 2:1161fea84522 316
elt14lpo 2:1161fea84522 317 /* count is index by which the pointer pIn1 to be incremented */
elt14lpo 2:1161fea84522 318 count = 0u;
elt14lpo 2:1161fea84522 319
elt14lpo 2:1161fea84522 320 /* -------------------
elt14lpo 2:1161fea84522 321 * Stage2 process
elt14lpo 2:1161fea84522 322 * ------------------*/
elt14lpo 1:d0884279b41d 323
elt14lpo 2:1161fea84522 324 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
elt14lpo 2:1161fea84522 325 * So, to loop unroll over blockSize2,
elt14lpo 2:1161fea84522 326 * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */
elt14lpo 2:1161fea84522 327 if(srcBLen >= 4u)
elt14lpo 2:1161fea84522 328 {
elt14lpo 2:1161fea84522 329 /* Loop unroll over blockSize2, by 4 */
elt14lpo 2:1161fea84522 330 blkCnt = blockSize2 >> 2u;
elt14lpo 2:1161fea84522 331
elt14lpo 2:1161fea84522 332 while(blkCnt > 0u)
elt14lpo 2:1161fea84522 333 {
elt14lpo 2:1161fea84522 334 /* Set all accumulators to zero */
elt14lpo 2:1161fea84522 335 acc0 = 0.0f;
elt14lpo 2:1161fea84522 336 acc1 = 0.0f;
elt14lpo 2:1161fea84522 337 acc2 = 0.0f;
elt14lpo 2:1161fea84522 338 acc3 = 0.0f;
elt14lpo 2:1161fea84522 339
elt14lpo 2:1161fea84522 340 /* read x[0], x[1], x[2] samples */
elt14lpo 2:1161fea84522 341 x0 = *(px++);
elt14lpo 2:1161fea84522 342 x1 = *(px++);
elt14lpo 2:1161fea84522 343 x2 = *(px++);
elt14lpo 1:d0884279b41d 344
elt14lpo 2:1161fea84522 345 /* Apply loop unrolling and compute 4 MACs simultaneously. */
elt14lpo 2:1161fea84522 346 k = srcBLen >> 2u;
elt14lpo 2:1161fea84522 347
elt14lpo 2:1161fea84522 348 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
elt14lpo 2:1161fea84522 349 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
elt14lpo 2:1161fea84522 350 do
elt14lpo 2:1161fea84522 351 {
elt14lpo 2:1161fea84522 352 /* Read y[0] sample */
elt14lpo 2:1161fea84522 353 c0 = *(py++);
elt14lpo 2:1161fea84522 354
elt14lpo 2:1161fea84522 355 /* Read x[3] sample */
elt14lpo 2:1161fea84522 356 x3 = *(px++);
elt14lpo 2:1161fea84522 357
elt14lpo 2:1161fea84522 358 /* Perform the multiply-accumulate */
elt14lpo 2:1161fea84522 359 /* acc0 += x[0] * y[0] */
elt14lpo 2:1161fea84522 360 acc0 += x0 * c0;
elt14lpo 2:1161fea84522 361 /* acc1 += x[1] * y[0] */
elt14lpo 2:1161fea84522 362 acc1 += x1 * c0;
elt14lpo 2:1161fea84522 363 /* acc2 += x[2] * y[0] */
elt14lpo 2:1161fea84522 364 acc2 += x2 * c0;
elt14lpo 2:1161fea84522 365 /* acc3 += x[3] * y[0] */
elt14lpo 2:1161fea84522 366 acc3 += x3 * c0;
elt14lpo 1:d0884279b41d 367
elt14lpo 2:1161fea84522 368 /* Read y[1] sample */
elt14lpo 2:1161fea84522 369 c0 = *(py++);
elt14lpo 2:1161fea84522 370
elt14lpo 2:1161fea84522 371 /* Read x[4] sample */
elt14lpo 2:1161fea84522 372 x0 = *(px++);
elt14lpo 2:1161fea84522 373
elt14lpo 2:1161fea84522 374 /* Perform the multiply-accumulate */
elt14lpo 2:1161fea84522 375 /* acc0 += x[1] * y[1] */
elt14lpo 2:1161fea84522 376 acc0 += x1 * c0;
elt14lpo 2:1161fea84522 377 /* acc1 += x[2] * y[1] */
elt14lpo 2:1161fea84522 378 acc1 += x2 * c0;
elt14lpo 2:1161fea84522 379 /* acc2 += x[3] * y[1] */
elt14lpo 2:1161fea84522 380 acc2 += x3 * c0;
elt14lpo 2:1161fea84522 381 /* acc3 += x[4] * y[1] */
elt14lpo 2:1161fea84522 382 acc3 += x0 * c0;
elt14lpo 2:1161fea84522 383
elt14lpo 2:1161fea84522 384 /* Read y[2] sample */
elt14lpo 2:1161fea84522 385 c0 = *(py++);
elt14lpo 1:d0884279b41d 386
elt14lpo 2:1161fea84522 387 /* Read x[5] sample */
elt14lpo 2:1161fea84522 388 x1 = *(px++);
elt14lpo 1:d0884279b41d 389
elt14lpo 2:1161fea84522 390 /* Perform the multiply-accumulates */
elt14lpo 2:1161fea84522 391 /* acc0 += x[2] * y[2] */
elt14lpo 2:1161fea84522 392 acc0 += x2 * c0;
elt14lpo 2:1161fea84522 393 /* acc1 += x[3] * y[2] */
elt14lpo 2:1161fea84522 394 acc1 += x3 * c0;
elt14lpo 2:1161fea84522 395 /* acc2 += x[4] * y[2] */
elt14lpo 2:1161fea84522 396 acc2 += x0 * c0;
elt14lpo 2:1161fea84522 397 /* acc3 += x[5] * y[2] */
elt14lpo 2:1161fea84522 398 acc3 += x1 * c0;
elt14lpo 2:1161fea84522 399
elt14lpo 2:1161fea84522 400 /* Read y[3] sample */
elt14lpo 2:1161fea84522 401 c0 = *(py++);
elt14lpo 2:1161fea84522 402
elt14lpo 2:1161fea84522 403 /* Read x[6] sample */
elt14lpo 2:1161fea84522 404 x2 = *(px++);
elt14lpo 1:d0884279b41d 405
elt14lpo 2:1161fea84522 406 /* Perform the multiply-accumulates */
elt14lpo 2:1161fea84522 407 /* acc0 += x[3] * y[3] */
elt14lpo 2:1161fea84522 408 acc0 += x3 * c0;
elt14lpo 2:1161fea84522 409 /* acc1 += x[4] * y[3] */
elt14lpo 2:1161fea84522 410 acc1 += x0 * c0;
elt14lpo 2:1161fea84522 411 /* acc2 += x[5] * y[3] */
elt14lpo 2:1161fea84522 412 acc2 += x1 * c0;
elt14lpo 2:1161fea84522 413 /* acc3 += x[6] * y[3] */
elt14lpo 2:1161fea84522 414 acc3 += x2 * c0;
elt14lpo 2:1161fea84522 415
elt14lpo 2:1161fea84522 416
elt14lpo 2:1161fea84522 417 } while(--k);
elt14lpo 2:1161fea84522 418
elt14lpo 2:1161fea84522 419 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
elt14lpo 2:1161fea84522 420 ** No loop unrolling is used. */
elt14lpo 2:1161fea84522 421 k = srcBLen % 0x4u;
elt14lpo 1:d0884279b41d 422
elt14lpo 2:1161fea84522 423 while(k > 0u)
elt14lpo 2:1161fea84522 424 {
elt14lpo 2:1161fea84522 425 /* Read y[4] sample */
elt14lpo 2:1161fea84522 426 c0 = *(py++);
elt14lpo 2:1161fea84522 427
elt14lpo 2:1161fea84522 428 /* Read x[7] sample */
elt14lpo 2:1161fea84522 429 x3 = *(px++);
elt14lpo 1:d0884279b41d 430
elt14lpo 2:1161fea84522 431 /* Perform the multiply-accumulates */
elt14lpo 2:1161fea84522 432 /* acc0 += x[4] * y[4] */
elt14lpo 2:1161fea84522 433 acc0 += x0 * c0;
elt14lpo 2:1161fea84522 434 /* acc1 += x[5] * y[4] */
elt14lpo 2:1161fea84522 435 acc1 += x1 * c0;
elt14lpo 2:1161fea84522 436 /* acc2 += x[6] * y[4] */
elt14lpo 2:1161fea84522 437 acc2 += x2 * c0;
elt14lpo 2:1161fea84522 438 /* acc3 += x[7] * y[4] */
elt14lpo 2:1161fea84522 439 acc3 += x3 * c0;
elt14lpo 1:d0884279b41d 440
elt14lpo 2:1161fea84522 441 /* Reuse the present samples for the next MAC */
elt14lpo 2:1161fea84522 442 x0 = x1;
elt14lpo 2:1161fea84522 443 x1 = x2;
elt14lpo 2:1161fea84522 444 x2 = x3;
elt14lpo 1:d0884279b41d 445
elt14lpo 1:d0884279b41d 446 /* Decrement the loop counter */
elt14lpo 2:1161fea84522 447 k--;
elt14lpo 2:1161fea84522 448 }
elt14lpo 2:1161fea84522 449
elt14lpo 2:1161fea84522 450 /* Store the result in the accumulator in the destination buffer. */
elt14lpo 2:1161fea84522 451 *pOut = acc0;
elt14lpo 2:1161fea84522 452 /* Destination pointer is updated according to the address modifier, inc */
elt14lpo 2:1161fea84522 453 pOut += inc;
elt14lpo 2:1161fea84522 454
elt14lpo 2:1161fea84522 455 *pOut = acc1;
elt14lpo 2:1161fea84522 456 pOut += inc;
elt14lpo 2:1161fea84522 457
elt14lpo 2:1161fea84522 458 *pOut = acc2;
elt14lpo 2:1161fea84522 459 pOut += inc;
elt14lpo 2:1161fea84522 460
elt14lpo 2:1161fea84522 461 *pOut = acc3;
elt14lpo 2:1161fea84522 462 pOut += inc;
elt14lpo 2:1161fea84522 463
elt14lpo 2:1161fea84522 464 /* Increment the pointer pIn1 index, count by 4 */
elt14lpo 2:1161fea84522 465 count += 4u;
elt14lpo 2:1161fea84522 466
elt14lpo 2:1161fea84522 467 /* Update the inputA and inputB pointers for next MAC calculation */
elt14lpo 2:1161fea84522 468 px = pIn1 + count;
elt14lpo 2:1161fea84522 469 py = pIn2;
elt14lpo 2:1161fea84522 470
elt14lpo 2:1161fea84522 471 /* Decrement the loop counter */
elt14lpo 2:1161fea84522 472 blkCnt--;
elt14lpo 1:d0884279b41d 473 }
elt14lpo 1:d0884279b41d 474
elt14lpo 2:1161fea84522 475 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
elt14lpo 2:1161fea84522 476 ** No loop unrolling is used. */
elt14lpo 2:1161fea84522 477 blkCnt = blockSize2 % 0x4u;
elt14lpo 2:1161fea84522 478
elt14lpo 2:1161fea84522 479 while(blkCnt > 0u)
elt14lpo 2:1161fea84522 480 {
elt14lpo 2:1161fea84522 481 /* Accumulator is made zero for every iteration */
elt14lpo 2:1161fea84522 482 sum = 0.0f;
elt14lpo 2:1161fea84522 483
elt14lpo 2:1161fea84522 484 /* Apply loop unrolling and compute 4 MACs simultaneously. */
elt14lpo 2:1161fea84522 485 k = srcBLen >> 2u;
elt14lpo 2:1161fea84522 486
elt14lpo 2:1161fea84522 487 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
elt14lpo 2:1161fea84522 488 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
elt14lpo 2:1161fea84522 489 while(k > 0u)
elt14lpo 2:1161fea84522 490 {
elt14lpo 2:1161fea84522 491 /* Perform the multiply-accumulates */
elt14lpo 2:1161fea84522 492 sum += *px++ * *py++;
elt14lpo 2:1161fea84522 493 sum += *px++ * *py++;
elt14lpo 2:1161fea84522 494 sum += *px++ * *py++;
elt14lpo 2:1161fea84522 495 sum += *px++ * *py++;
elt14lpo 2:1161fea84522 496
elt14lpo 2:1161fea84522 497 /* Decrement the loop counter */
elt14lpo 2:1161fea84522 498 k--;
elt14lpo 2:1161fea84522 499 }
elt14lpo 2:1161fea84522 500
elt14lpo 2:1161fea84522 501 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
elt14lpo 2:1161fea84522 502 ** No loop unrolling is used. */
elt14lpo 2:1161fea84522 503 k = srcBLen % 0x4u;
elt14lpo 2:1161fea84522 504
elt14lpo 2:1161fea84522 505 while(k > 0u)
elt14lpo 2:1161fea84522 506 {
elt14lpo 2:1161fea84522 507 /* Perform the multiply-accumulate */
elt14lpo 2:1161fea84522 508 sum += *px++ * *py++;
elt14lpo 2:1161fea84522 509
elt14lpo 2:1161fea84522 510 /* Decrement the loop counter */
elt14lpo 2:1161fea84522 511 k--;
elt14lpo 2:1161fea84522 512 }
elt14lpo 2:1161fea84522 513
elt14lpo 2:1161fea84522 514 /* Store the result in the accumulator in the destination buffer. */
elt14lpo 2:1161fea84522 515 *pOut = sum;
elt14lpo 2:1161fea84522 516 /* Destination pointer is updated according to the address modifier, inc */
elt14lpo 2:1161fea84522 517 pOut += inc;
elt14lpo 2:1161fea84522 518
elt14lpo 2:1161fea84522 519 /* Increment the pointer pIn1 index, count by 1 */
elt14lpo 2:1161fea84522 520 count++;
elt14lpo 2:1161fea84522 521
elt14lpo 2:1161fea84522 522 /* Update the inputA and inputB pointers for next MAC calculation */
elt14lpo 2:1161fea84522 523 px = pIn1 + count;
elt14lpo 2:1161fea84522 524 py = pIn2;
elt14lpo 2:1161fea84522 525
elt14lpo 2:1161fea84522 526 /* Decrement the loop counter */
elt14lpo 2:1161fea84522 527 blkCnt--;
elt14lpo 2:1161fea84522 528 }
elt14lpo 2:1161fea84522 529 }
elt14lpo 2:1161fea84522 530 else
elt14lpo 2:1161fea84522 531 {
elt14lpo 2:1161fea84522 532 /* If the srcBLen is not a multiple of 4,
elt14lpo 2:1161fea84522 533 * the blockSize2 loop cannot be unrolled by 4 */
elt14lpo 2:1161fea84522 534 blkCnt = blockSize2;
elt14lpo 2:1161fea84522 535
elt14lpo 2:1161fea84522 536 while(blkCnt > 0u)
elt14lpo 2:1161fea84522 537 {
elt14lpo 2:1161fea84522 538 /* Accumulator is made zero for every iteration */
elt14lpo 2:1161fea84522 539 sum = 0.0f;
elt14lpo 2:1161fea84522 540
elt14lpo 2:1161fea84522 541 /* Loop over srcBLen */
elt14lpo 2:1161fea84522 542 k = srcBLen;
elt14lpo 2:1161fea84522 543
elt14lpo 2:1161fea84522 544 while(k > 0u)
elt14lpo 2:1161fea84522 545 {
elt14lpo 2:1161fea84522 546 /* Perform the multiply-accumulate */
elt14lpo 2:1161fea84522 547 sum += *px++ * *py++;
elt14lpo 2:1161fea84522 548
elt14lpo 2:1161fea84522 549 /* Decrement the loop counter */
elt14lpo 2:1161fea84522 550 k--;
elt14lpo 2:1161fea84522 551 }
elt14lpo 2:1161fea84522 552
elt14lpo 2:1161fea84522 553 /* Store the result in the accumulator in the destination buffer. */
elt14lpo 2:1161fea84522 554 *pOut = sum;
elt14lpo 2:1161fea84522 555 /* Destination pointer is updated according to the address modifier, inc */
elt14lpo 2:1161fea84522 556 pOut += inc;
elt14lpo 1:d0884279b41d 557
elt14lpo 2:1161fea84522 558 /* Increment the pointer pIn1 index, count by 1 */
elt14lpo 2:1161fea84522 559 count++;
elt14lpo 2:1161fea84522 560
elt14lpo 2:1161fea84522 561 /* Update the inputA and inputB pointers for next MAC calculation */
elt14lpo 2:1161fea84522 562 px = pIn1 + count;
elt14lpo 2:1161fea84522 563 py = pIn2;
elt14lpo 2:1161fea84522 564
elt14lpo 2:1161fea84522 565 /* Decrement the loop counter */
elt14lpo 2:1161fea84522 566 blkCnt--;
elt14lpo 2:1161fea84522 567 }
elt14lpo 2:1161fea84522 568 }
elt14lpo 2:1161fea84522 569
elt14lpo 2:1161fea84522 570 /* --------------------------
elt14lpo 2:1161fea84522 571 * Initializations of stage3
elt14lpo 2:1161fea84522 572 * -------------------------*/
elt14lpo 2:1161fea84522 573
elt14lpo 2:1161fea84522 574 /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
elt14lpo 2:1161fea84522 575 * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
elt14lpo 2:1161fea84522 576 * ....
elt14lpo 2:1161fea84522 577 * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
elt14lpo 2:1161fea84522 578 * sum += x[srcALen-1] * y[0]
elt14lpo 2:1161fea84522 579 */
elt14lpo 2:1161fea84522 580
elt14lpo 2:1161fea84522 581 /* In this stage the MAC operations are decreased by 1 for every iteration.
elt14lpo 2:1161fea84522 582 The count variable holds the number of MAC operations performed */
elt14lpo 2:1161fea84522 583 count = srcBLen - 1u;
elt14lpo 2:1161fea84522 584
elt14lpo 2:1161fea84522 585 /* Working pointer of inputA */
elt14lpo 2:1161fea84522 586 pSrc1 = pIn1 + (srcALen - (srcBLen - 1u));
elt14lpo 2:1161fea84522 587 px = pSrc1;
elt14lpo 2:1161fea84522 588
elt14lpo 2:1161fea84522 589 /* Working pointer of inputB */
elt14lpo 2:1161fea84522 590 py = pIn2;
elt14lpo 2:1161fea84522 591
elt14lpo 2:1161fea84522 592 /* -------------------
elt14lpo 2:1161fea84522 593 * Stage3 process
elt14lpo 2:1161fea84522 594 * ------------------*/
elt14lpo 1:d0884279b41d 595
elt14lpo 2:1161fea84522 596 while(blockSize3 > 0u)
elt14lpo 2:1161fea84522 597 {
elt14lpo 2:1161fea84522 598 /* Accumulator is made zero for every iteration */
elt14lpo 2:1161fea84522 599 sum = 0.0f;
elt14lpo 2:1161fea84522 600
elt14lpo 2:1161fea84522 601 /* Apply loop unrolling and compute 4 MACs simultaneously. */
elt14lpo 2:1161fea84522 602 k = count >> 2u;
elt14lpo 2:1161fea84522 603
elt14lpo 2:1161fea84522 604 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
elt14lpo 2:1161fea84522 605 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
elt14lpo 2:1161fea84522 606 while(k > 0u)
elt14lpo 2:1161fea84522 607 {
elt14lpo 2:1161fea84522 608 /* Perform the multiply-accumulates */
elt14lpo 2:1161fea84522 609 /* sum += x[srcALen - srcBLen + 1] * y[0] */
elt14lpo 2:1161fea84522 610 sum += *px++ * *py++;
elt14lpo 2:1161fea84522 611 /* sum += x[srcALen - srcBLen + 2] * y[1] */
elt14lpo 2:1161fea84522 612 sum += *px++ * *py++;
elt14lpo 2:1161fea84522 613 /* sum += x[srcALen - srcBLen + 3] * y[2] */
elt14lpo 2:1161fea84522 614 sum += *px++ * *py++;
elt14lpo 2:1161fea84522 615 /* sum += x[srcALen - srcBLen + 4] * y[3] */
elt14lpo 2:1161fea84522 616 sum += *px++ * *py++;
elt14lpo 1:d0884279b41d 617
elt14lpo 2:1161fea84522 618 /* Decrement the loop counter */
elt14lpo 2:1161fea84522 619 k--;
elt14lpo 2:1161fea84522 620 }
elt14lpo 2:1161fea84522 621
elt14lpo 2:1161fea84522 622 /* If the count is not a multiple of 4, compute any remaining MACs here.
elt14lpo 2:1161fea84522 623 ** No loop unrolling is used. */
elt14lpo 2:1161fea84522 624 k = count % 0x4u;
elt14lpo 2:1161fea84522 625
elt14lpo 2:1161fea84522 626 while(k > 0u)
elt14lpo 2:1161fea84522 627 {
elt14lpo 2:1161fea84522 628 /* Perform the multiply-accumulates */
elt14lpo 2:1161fea84522 629 sum += *px++ * *py++;
elt14lpo 2:1161fea84522 630
elt14lpo 2:1161fea84522 631 /* Decrement the loop counter */
elt14lpo 2:1161fea84522 632 k--;
elt14lpo 2:1161fea84522 633 }
elt14lpo 2:1161fea84522 634
elt14lpo 2:1161fea84522 635 /* Store the accumulated result in the destination buffer. */
elt14lpo 2:1161fea84522 636 *pOut = sum;
elt14lpo 2:1161fea84522 637 /* Destination pointer is updated according to the address modifier, inc */
elt14lpo 2:1161fea84522 638 pOut += inc;
elt14lpo 2:1161fea84522 639
elt14lpo 2:1161fea84522 640 /* Update the inputA and inputB pointers for next MAC calculation */
elt14lpo 2:1161fea84522 641 px = ++pSrc1;
elt14lpo 1:d0884279b41d 642 py = pIn2;
elt14lpo 1:d0884279b41d 643
elt14lpo 2:1161fea84522 644 /* Decrement the MAC count */
elt14lpo 2:1161fea84522 645 count--;
elt14lpo 1:d0884279b41d 646
elt14lpo 2:1161fea84522 647 /* Decrement the loop counter */
elt14lpo 2:1161fea84522 648 blockSize3--;
elt14lpo 2:1161fea84522 649 }
elt14lpo 1:d0884279b41d 650
elt14lpo 2:1161fea84522 651 #else
elt14lpo 1:d0884279b41d 652
elt14lpo 2:1161fea84522 653 /* Run the below code for Cortex-M0 */
elt14lpo 1:d0884279b41d 654
elt14lpo 2:1161fea84522 655 float *pIn1 = pSrcA; /* inputA pointer */
elt14lpo 2:1161fea84522 656 float *pIn2 = pSrcB + (srcBLen - 1u); /* inputB pointer */
elt14lpo 2:1161fea84522 657 float sum; /* Accumulator */
elt14lpo 2:1161fea84522 658 int i = 0u, j; /* loop counters */
elt14lpo 2:1161fea84522 659 int inv = 0u; /* Reverse order flag */
elt14lpo 2:1161fea84522 660 int tot = 0u; /* Length */
elt14lpo 1:d0884279b41d 661
elt14lpo 2:1161fea84522 662 /* The algorithm implementation is based on the lengths of the inputs. */
elt14lpo 2:1161fea84522 663 /* srcB is always made to slide across srcA. */
elt14lpo 2:1161fea84522 664 /* So srcBLen is always considered as shorter or equal to srcALen */
elt14lpo 2:1161fea84522 665 /* But CORR(x, y) is reverse of CORR(y, x) */
elt14lpo 2:1161fea84522 666 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
elt14lpo 2:1161fea84522 667 /* and a variable, inv, is set to 1 */
elt14lpo 2:1161fea84522 668 /* If the lengths are not equal, zero padding would be needed to make the two
elt14lpo 2:1161fea84522 669 * inputs the same length. But to improve performance, we assume zeroes
elt14lpo 2:1161fea84522 670 * in the output instead of zero padding either of the inputs. */
elt14lpo 2:1161fea84522 671 /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes have to be included at the
elt14lpo 2:1161fea84522 672 * start of the output buffer */
elt14lpo 2:1161fea84522 673 /* If srcALen < srcBLen, (srcBLen - srcALen) zeroes have to be included at the
elt14lpo 2:1161fea84522 674 * end of the output buffer */
elt14lpo 2:1161fea84522 675 /* Once the zero padding is done, the remainder of the output is calculated
elt14lpo 2:1161fea84522 676 * using convolution, but with the shorter signal time-shifted. */
elt14lpo 1:d0884279b41d 677
elt14lpo 2:1161fea84522 678 /* Calculate the length of the remaining sequence */
elt14lpo 2:1161fea84522 679 tot = ((srcALen + srcBLen) - 2u);
elt14lpo 1:d0884279b41d 680
elt14lpo 2:1161fea84522 681 if(srcALen > srcBLen)
elt14lpo 2:1161fea84522 682 {
elt14lpo 2:1161fea84522 683 /* Calculating the number of zeros to be padded to the output */
elt14lpo 2:1161fea84522 684 j = srcALen - srcBLen;
elt14lpo 1:d0884279b41d 685
elt14lpo 2:1161fea84522 686 /* Initialise the pointer after zero padding */
elt14lpo 2:1161fea84522 687 pDst += j;
elt14lpo 2:1161fea84522 688 }
elt14lpo 1:d0884279b41d 689
elt14lpo 2:1161fea84522 690 else if(srcALen < srcBLen)
elt14lpo 2:1161fea84522 691 {
elt14lpo 2:1161fea84522 692 /* Initialization to inputB pointer */
elt14lpo 2:1161fea84522 693 pIn1 = pSrcB;
elt14lpo 1:d0884279b41d 694
elt14lpo 2:1161fea84522 695 /* Initialization to the end of inputA pointer */
elt14lpo 2:1161fea84522 696 pIn2 = pSrcA + (srcALen - 1u);
elt14lpo 1:d0884279b41d 697
elt14lpo 2:1161fea84522 698 /* Initialisation of the pointer after zero padding */
elt14lpo 2:1161fea84522 699 pDst = pDst + tot;
elt14lpo 1:d0884279b41d 700
elt14lpo 2:1161fea84522 701 /* Swapping the lengths */
elt14lpo 2:1161fea84522 702 j = srcALen;
elt14lpo 2:1161fea84522 703 srcALen = srcBLen;
elt14lpo 2:1161fea84522 704 srcBLen = j;
elt14lpo 1:d0884279b41d 705
elt14lpo 2:1161fea84522 706 /* Setting the reverse flag */
elt14lpo 2:1161fea84522 707 inv = 1;
elt14lpo 1:d0884279b41d 708
elt14lpo 2:1161fea84522 709 }
elt14lpo 1:d0884279b41d 710
elt14lpo 2:1161fea84522 711 /* Loop to calculate convolution for output length number of times */
elt14lpo 2:1161fea84522 712 for (i = 0u; i <= tot; i++)
elt14lpo 2:1161fea84522 713 {
elt14lpo 2:1161fea84522 714 /* Initialize sum with zero to carry on MAC operations */
elt14lpo 2:1161fea84522 715 sum = 0.0f;
elt14lpo 1:d0884279b41d 716
elt14lpo 2:1161fea84522 717 /* Loop to perform MAC operations according to convolution equation */
elt14lpo 2:1161fea84522 718 for (j = 0u; j <= i; j++)
elt14lpo 2:1161fea84522 719 {
elt14lpo 2:1161fea84522 720 /* Check the array limitations */
elt14lpo 2:1161fea84522 721 if((((i - j) < srcBLen) && (j < srcALen)))
elt14lpo 2:1161fea84522 722 {
elt14lpo 2:1161fea84522 723 /* z[i] += x[i-j] * y[j] */
elt14lpo 2:1161fea84522 724 sum += pIn1[j] * pIn2[-((int32_t) i - j)];
elt14lpo 2:1161fea84522 725 }
elt14lpo 2:1161fea84522 726 }
elt14lpo 2:1161fea84522 727 /* Store the output in the destination buffer */
elt14lpo 2:1161fea84522 728 if(inv == 1)
elt14lpo 2:1161fea84522 729 *pDst-- = sum;
elt14lpo 2:1161fea84522 730 else
elt14lpo 2:1161fea84522 731 *pDst++ = sum;
elt14lpo 2:1161fea84522 732 }
elt14lpo 1:d0884279b41d 733
elt14lpo 2:1161fea84522 734 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
elt14lpo 1:d0884279b41d 735
elt14lpo 1:d0884279b41d 736 }
elt14lpo 1:d0884279b41d 737
elt14lpo 2:1161fea84522 738 /**
elt14lpo 2:1161fea84522 739 * @} end of Corr group
elt14lpo 2:1161fea84522 740 */