Dependencies:   DHT mbed

Committer:
elt14lpo
Date:
Fri May 12 06:27:31 2017 +0000
Revision:
1:d0884279b41d
Child:
2:1161fea84522
;

Who changed what in which revision?

User | Revision | Line number | New contents of line
elt14lpo 1:d0884279b41d 1 /* ----------------------------------------------------------------------------
elt14lpo 1:d0884279b41d 2 * Copyright (C) 2010 ARM Limited. All rights reserved.
elt14lpo 1:d0884279b41d 3 *
elt14lpo 1:d0884279b41d 4 * $Date: 29. November 2010
elt14lpo 1:d0884279b41d 5 * $Revision: V1.0.3
elt14lpo 1:d0884279b41d 6 *
elt14lpo 1:d0884279b41d 7 * Project: CMSIS DSP Library
elt14lpo 1:d0884279b41d 8 * Title: arm_correlate_f32.c
elt14lpo 1:d0884279b41d 9 *
elt14lpo 1:d0884279b41d 10 * Description: Correlation for floating-point sequences.
elt14lpo 1:d0884279b41d 11 *
elt14lpo 1:d0884279b41d 12 * Target Processor: Cortex-M4/Cortex-M3
elt14lpo 1:d0884279b41d 13 *
elt14lpo 1:d0884279b41d 14 * Version 1.0.3 2010/11/29
elt14lpo 1:d0884279b41d 15 * Re-organized the CMSIS folders and updated documentation.
elt14lpo 1:d0884279b41d 16 *
elt14lpo 1:d0884279b41d 17 * Version 1.0.2 2010/11/11
elt14lpo 1:d0884279b41d 18 * Documentation updated.
elt14lpo 1:d0884279b41d 19 *
elt14lpo 1:d0884279b41d 20 * Version 1.0.1 2010/10/05
elt14lpo 1:d0884279b41d 21 * Production release and review comments incorporated.
elt14lpo 1:d0884279b41d 22 *
elt14lpo 1:d0884279b41d 23 * Version 1.0.0 2010/09/20
elt14lpo 1:d0884279b41d 24 * Production release and review comments incorporated
elt14lpo 1:d0884279b41d 25 *
elt14lpo 1:d0884279b41d 26 * Version 0.0.7 2010/06/10
elt14lpo 1:d0884279b41d 27 * Misra-C changes done
elt14lpo 1:d0884279b41d 28 *
elt14lpo 1:d0884279b41d 29 * -------------------------------------------------------------------------- */
elt14lpo 1:d0884279b41d 30
elt14lpo 1:d0884279b41d 31 #include "arm_math.h"
elt14lpo 1:d0884279b41d 32 #include "arm_correlate_f32.h"
elt14lpo 1:d0884279b41d 33
elt14lpo 1:d0884279b41d 34 /**
elt14lpo 1:d0884279b41d 35 * @ingroup groupFilters
elt14lpo 1:d0884279b41d 36 */
elt14lpo 1:d0884279b41d 37
elt14lpo 1:d0884279b41d 38 /**
elt14lpo 1:d0884279b41d 39 * @defgroup Corr Correlation
elt14lpo 1:d0884279b41d 40 *
elt14lpo 1:d0884279b41d 41 * Correlation is a mathematical operation that is similar to convolution.
elt14lpo 1:d0884279b41d 42 * As with convolution, correlation uses two signals to produce a third signal.
elt14lpo 1:d0884279b41d 43 * The underlying algorithms in correlation and convolution are identical except that one of the inputs is flipped in convolution.
elt14lpo 1:d0884279b41d 44 * Correlation is commonly used to measure the similarity between two signals.
elt14lpo 1:d0884279b41d 45 * It has applications in pattern recognition, cryptanalysis, and searching.
elt14lpo 1:d0884279b41d 46 * The CMSIS library provides correlation functions for Q7, Q15, Q31 and floating-point data types.
elt14lpo 1:d0884279b41d 47 * Fast versions of the Q15 and Q31 functions are also provided.
elt14lpo 1:d0884279b41d 48 *
elt14lpo 1:d0884279b41d 49 * \par Algorithm
elt14lpo 1:d0884279b41d 50 * Let <code>a[n]</code> and <code>b[n]</code> be sequences of length <code>srcALen</code> and <code>srcBLen</code> samples respectively.
elt14lpo 1:d0884279b41d 51 * The convolution of the two signals is denoted by
elt14lpo 1:d0884279b41d 52 * <pre>
elt14lpo 1:d0884279b41d 53 * c[n] = a[n] * b[n]
elt14lpo 1:d0884279b41d 54 * </pre>
elt14lpo 1:d0884279b41d 55 * In correlation, one of the signals is flipped in time
elt14lpo 1:d0884279b41d 56 * <pre>
elt14lpo 1:d0884279b41d 57 * c[n] = a[n] * b[-n]
elt14lpo 1:d0884279b41d 58 * </pre>
elt14lpo 1:d0884279b41d 59 *
elt14lpo 1:d0884279b41d 60 * \par
elt14lpo 1:d0884279b41d 61 * and this is mathematically defined as
elt14lpo 1:d0884279b41d 62 * \image html CorrelateEquation.gif
elt14lpo 1:d0884279b41d 63 * \par
elt14lpo 1:d0884279b41d 64 * The <code>pSrcA</code> points to the first input vector of length <code>srcALen</code> and <code>pSrcB</code> points to the second input vector of length <code>srcBLen</code>.
elt14lpo 1:d0884279b41d 65 * The result <code>c[n]</code> is of length <code>2 * max(srcALen, srcBLen) - 1</code> and is defined over the interval <code>n=0, 1, 2, ..., (2 * max(srcALen, srcBLen) - 2)</code>.
elt14lpo 1:d0884279b41d 66 * The output result is written to <code>pDst</code> and the calling function must allocate <code>2 * max(srcALen, srcBLen) - 1</code> words for the result.
elt14lpo 1:d0884279b41d 67 *
elt14lpo 1:d0884279b41d 68 * <b>Fixed-Point Behavior</b>
elt14lpo 1:d0884279b41d 69 * \par
elt14lpo 1:d0884279b41d 70 * Correlation requires summing up a large number of intermediate products.
elt14lpo 1:d0884279b41d 71 * As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation.
elt14lpo 1:d0884279b41d 72 * Refer to the function specific documentation below for further details of the particular algorithm used.
elt14lpo 1:d0884279b41d 73 */
elt14lpo 1:d0884279b41d 74
elt14lpo 1:d0884279b41d 75 /**
elt14lpo 1:d0884279b41d 76 * @addtogroup Corr
elt14lpo 1:d0884279b41d 77 * @{
elt14lpo 1:d0884279b41d 78 */
/*
 * Multiply-accumulate helper shared by all three correlation stages.
 *
 * Returns pX[0]*pY[0] + pX[1]*pY[1] + ... + pX[numMacs-1]*pY[numMacs-1],
 * accumulated strictly in that order starting from 0.0f.  The loop is
 * unrolled by four with a tail loop for the remaining 0 to 3 products;
 * the unrolling does not change the order of the additions.
 * numMacs == 0 yields 0.0f and reads nothing.
 */
static float32_t arm_correlate_f32_mac(const float32_t * pX, const float32_t * pY, uint32_t numMacs)
{
  float32_t sum = 0.0f;       /* running accumulator */
  uint32_t k = numMacs >> 2u; /* number of groups of four products */

  while(k > 0u) {
    sum += *pX++ * *pY++;
    sum += *pX++ * *pY++;
    sum += *pX++ * *pY++;
    sum += *pX++ * *pY++;
    k--;
  }

  /* Remaining 0 to 3 products, one at a time. */
  k = numMacs % 0x4u;
  while(k > 0u) {
    sum += *pX++ * *pY++;
    k--;
  }

  return sum;
}

/**
 * @brief Correlation of floating-point sequences
 * @param[in] *pSrcA points to the first input sequence.
 * @param[in] srcALen length of the first input sequence.
 * @param[in] *pSrcB points to the second input sequence.
 * @param[in] srcBLen length of the second input sequence.
 * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
 * @return none.
 *
 * \par Output layout
 * The shorter sequence is always slid across the longer one, producing
 * <code>srcALen + srcBLen - 1</code> correlation samples.
 * - If <code>srcALen >= srcBLen</code>, <code>srcALen - srcBLen</code> zeros are
 *   written first, followed by the correlation samples in increasing order.
 * - If <code>srcALen < srcBLen</code>, the inputs are swapped internally and the
 *   samples are written in reverse order into
 *   <code>pDst[0 .. srcALen + srcBLen - 2]</code>.  Elements of <code>pDst</code>
 *   beyond that range are NOT written by this function.
 * - If either length is zero the function returns immediately and
 *   <code>pDst</code> is left untouched.
 */

void arm_correlate_f32(float32_t * pSrcA, uint32_t srcALen, float32_t * pSrcB, uint32_t srcBLen, float32_t * pDst)
{
  float32_t *pIn1;                  /* longer (or equal length) input, "x" below */
  float32_t *pIn2;                  /* shorter input, "y" below */
  float32_t *pOut = pDst;           /* next output location */
  float32_t *px;                    /* working pointer into x */
  float32_t *py;                    /* working pointer into y */
  float32_t acc0, acc1, acc2, acc3; /* four adjacent output accumulators */
  float32_t x0, x1, x2, x3, c0;     /* cached x samples and current y sample */
  uint32_t j, k, count, blkCnt, blockSize2; /* loop counters / offsets */
  int32_t inc = 1;                  /* output step: +1 forward, -1 reversed */

  /* With an empty input the "length - 1" expressions below would wrap
   * around to 0xFFFFFFFF and drive the loops far out of bounds. */
  if((srcALen == 0u) || (srcBLen == 0u)) {
    return;
  }

  if(srcALen >= srcBLen) {
    pIn1 = pSrcA;
    pIn2 = pSrcB;

    /* Instead of zero padding srcB up to srcALen, emit the zeros the padding
     * would have produced:
     * (2 * srcALen - 1) - (srcALen + srcBLen - 1) = srcALen - srcBLen. */
    j = srcALen - srcBLen;
    while(j > 0u) {
      *pOut++ = 0.0f;
      j--;
    }
  } else {
    /* Swap roles so that srcBLen is the shorter length from here on. */
    pIn1 = pSrcB;
    pIn2 = pSrcA;

    j = srcBLen;
    srcBLen = srcALen;
    srcALen = j;

    /* CORR(x, y) is CORR(y, x) reversed: start at the last correlation
     * sample and walk the output backwards. */
    pOut = pDst + ((srcALen + srcBLen) - 2u);
    inc = -1;
  }

  /* Number of outputs that use all srcBLen products (stage 2).
   * Always >= 1 because srcALen >= srcBLen >= 1 at this point. */
  blockSize2 = srcALen - (srcBLen - 1u);

  /* ------------------------------------------------------------------
   * Stage 1: partial overlap at the start, 1 .. srcBLen-1 products.
   *   out = x[0] * y[srcBLen-1]
   *   out = x[0] * y[srcBLen-2] + x[1] * y[srcBLen-1]
   *   ...
   * ------------------------------------------------------------------ */
  for(count = 1u; count < srcBLen; count++) {
    *pOut = arm_correlate_f32_mac(pIn1, pIn2 + (srcBLen - count), count);
    pOut += inc;
  }

  /* ------------------------------------------------------------------
   * Stage 2: full overlap, srcBLen products per output.
   *   out = x[n] * y[0] + x[n+1] * y[1] + ... + x[n+srcBLen-1] * y[srcBLen-1]
   * 'count' is the sample offset n of the current window into x.
   * ------------------------------------------------------------------ */
  count = 0u;

  if(srcBLen >= 4u) {
    /* Produce four adjacent outputs per pass so every y sample is loaded
     * once and reused for four products. */
    blkCnt = blockSize2 >> 2u;

    while(blkCnt > 0u) {
      px = pIn1 + count;
      py = pIn2;

      acc0 = 0.0f;
      acc1 = 0.0f;
      acc2 = 0.0f;
      acc3 = 0.0f;

      /* Prime the sliding window with x[n], x[n+1], x[n+2]. */
      x0 = *(px++);
      x1 = *(px++);
      x2 = *(px++);

      /* srcBLen >= 4 guarantees k >= 1, so the do/while below is safe. */
      k = srcBLen >> 2u;

      do {
        /* Each step loads one y sample and one new x sample; the x
         * registers rotate so no sample is loaded twice. */
        c0 = *(py++);
        x3 = *(px++);
        acc0 += x0 * c0;
        acc1 += x1 * c0;
        acc2 += x2 * c0;
        acc3 += x3 * c0;

        c0 = *(py++);
        x0 = *(px++);
        acc0 += x1 * c0;
        acc1 += x2 * c0;
        acc2 += x3 * c0;
        acc3 += x0 * c0;

        c0 = *(py++);
        x1 = *(px++);
        acc0 += x2 * c0;
        acc1 += x3 * c0;
        acc2 += x0 * c0;
        acc3 += x1 * c0;

        c0 = *(py++);
        x2 = *(px++);
        acc0 += x3 * c0;
        acc1 += x0 * c0;
        acc2 += x1 * c0;
        acc3 += x2 * c0;
      } while(--k);

      /* Remaining 0 to 3 y samples when srcBLen is not a multiple of 4. */
      k = srcBLen % 0x4u;

      while(k > 0u) {
        c0 = *(py++);
        x3 = *(px++);

        acc0 += x0 * c0;
        acc1 += x1 * c0;
        acc2 += x2 * c0;
        acc3 += x3 * c0;

        /* Slide the window by one sample. */
        x0 = x1;
        x1 = x2;
        x2 = x3;

        k--;
      }

      *pOut = acc0;
      pOut += inc;
      *pOut = acc1;
      pOut += inc;
      *pOut = acc2;
      pOut += inc;
      *pOut = acc3;
      pOut += inc;

      /* Four outputs were produced, so the window moves by four samples.
       * Keeping 'count' in samples (not in blocks) is what lets the
       * remainder loop below continue from the right position. */
      count += 4u;
      blkCnt--;
    }

    /* Outputs left over when blockSize2 is not a multiple of 4. */
    blkCnt = blockSize2 % 0x4u;
  } else {
    /* Fewer than four products per output: compute every output singly. */
    blkCnt = blockSize2;
  }

  while(blkCnt > 0u) {
    *pOut = arm_correlate_f32_mac(pIn1 + count, pIn2, srcBLen);
    pOut += inc;

    count++;
    blkCnt--;
  }

  /* ------------------------------------------------------------------
   * Stage 3: partial overlap at the end, srcBLen-1 .. 1 products.
   *   out = x[srcALen-srcBLen+1] * y[0] + ... + x[srcALen-1] * y[srcBLen-2]
   *   ...
   *   out = x[srcALen-1] * y[0]
   * ------------------------------------------------------------------ */
  px = pIn1 + blockSize2;

  for(count = srcBLen - 1u; count > 0u; count--) {
    *pOut = arm_correlate_f32_mac(px, pIn2, count);
    pOut += inc;

    px++;
  }
}
elt14lpo 1:d0884279b41d 588
elt14lpo 1:d0884279b41d 589 /**
elt14lpo 1:d0884279b41d 590 * @} end of Corr group
elt14lpo 1:d0884279b41d 591 */