CMSIS DSP library

Dependents:   performance_timer Surfboard_ gps2rtty Capstone ... more

Legacy Warning

This is an mbed 2 library. To learn more about mbed OS 5, visit the docs.

Committer:
emilmont
Date:
Thu May 30 17:10:11 2013 +0100
Revision:
2:da51fb522205
Parent:
1:fdd22bb7aa52
Child:
3:7a284390b0ce
Keep "cmsis-dsp" module in synch with its source

Who changed what in which revision?

User | Revision | Line number | New contents of line
emilmont 1:fdd22bb7aa52 1 /* ----------------------------------------------------------------------
emilmont 1:fdd22bb7aa52 2 * Copyright (C) 2010 ARM Limited. All rights reserved.
emilmont 1:fdd22bb7aa52 3 *
emilmont 1:fdd22bb7aa52 4 * $Date: 15. February 2012
emilmont 2:da51fb522205 5 * $Revision: V1.1.0
emilmont 1:fdd22bb7aa52 6 *
emilmont 2:da51fb522205 7 * Project: CMSIS DSP Library
emilmont 2:da51fb522205 8 * Title: arm_conv_fast_q31.c
emilmont 1:fdd22bb7aa52 9 *
emilmont 2:da51fb522205 10 * Description: Q31 Convolution (fast version).
emilmont 1:fdd22bb7aa52 11 *
emilmont 1:fdd22bb7aa52 12 * Target Processor: Cortex-M4/Cortex-M3
emilmont 1:fdd22bb7aa52 13 *
emilmont 1:fdd22bb7aa52 14 * Version 1.1.0 2012/02/15
emilmont 1:fdd22bb7aa52 15 * Updated with more optimizations, bug fixes and minor API changes.
emilmont 1:fdd22bb7aa52 16 *
emilmont 1:fdd22bb7aa52 17 * Version 1.0.11 2011/10/18
emilmont 1:fdd22bb7aa52 18 * Bug Fix in conv, correlation, partial convolution.
emilmont 1:fdd22bb7aa52 19 *
emilmont 1:fdd22bb7aa52 20 * Version 1.0.10 2011/7/15
emilmont 1:fdd22bb7aa52 21 * Big Endian support added and Merged M0 and M3/M4 Source code.
emilmont 1:fdd22bb7aa52 22 *
emilmont 1:fdd22bb7aa52 23 * Version 1.0.3 2010/11/29
emilmont 1:fdd22bb7aa52 24 * Re-organized the CMSIS folders and updated documentation.
emilmont 1:fdd22bb7aa52 25 *
emilmont 1:fdd22bb7aa52 26 * Version 1.0.2 2010/11/11
emilmont 1:fdd22bb7aa52 27 * Documentation updated.
emilmont 1:fdd22bb7aa52 28 *
emilmont 1:fdd22bb7aa52 29 * Version 1.0.1 2010/10/05
emilmont 1:fdd22bb7aa52 30 * Production release and review comments incorporated.
emilmont 1:fdd22bb7aa52 31 *
emilmont 1:fdd22bb7aa52 32 * Version 1.0.0 2010/09/20
emilmont 1:fdd22bb7aa52 33 * Production release and review comments incorporated.
emilmont 1:fdd22bb7aa52 34 * -------------------------------------------------------------------- */
emilmont 1:fdd22bb7aa52 35
emilmont 1:fdd22bb7aa52 36 #include "arm_math.h"
emilmont 1:fdd22bb7aa52 37
emilmont 1:fdd22bb7aa52 38 /**
emilmont 1:fdd22bb7aa52 39 * @ingroup groupFilters
emilmont 1:fdd22bb7aa52 40 */
emilmont 1:fdd22bb7aa52 41
emilmont 1:fdd22bb7aa52 42 /**
emilmont 1:fdd22bb7aa52 43 * @addtogroup Conv
emilmont 1:fdd22bb7aa52 44 * @{
emilmont 1:fdd22bb7aa52 45 */
emilmont 1:fdd22bb7aa52 46
/*
 * Multiply-accumulate that keeps only the upper word of the product:
 * evaluates to acc + ((x * y) >> 32) in a 32-bit accumulator.
 * Each argument is expanded exactly once, so operands with side effects
 * (for example "*px++") are safe.
 *
 * NOTE(review): the expression is kept in the library's original form,
 * presumably so that Cortex-M compilers can map it onto a single
 * multiply-accumulate-high instruction - confirm before reshaping it.
 * It left-shifts a possibly negative value, which ISO C leaves undefined.
 */
#define CONV_FAST_Q31_MAC(acc, x, y) \
  ((q31_t) ((((q63_t) (acc) << 32) + ((q63_t) (x) * (y))) >> 32))

/*
 * Converts the 2.30 accumulator to the 1.31 output word by shifting left one
 * bit. The shift is done on the unsigned representation so that a negative
 * accumulator does not trigger undefined behaviour; the top bit is discarded
 * (no saturation).
 */
#define CONV_FAST_Q31_OUT(acc) \
  ((q31_t) ((uint32_t) (acc) << 1))

/*
 * Sum of numTaps products x[0]*y[0] + x[1]*y[-1] + ... where px walks forward
 * and py walks backward. The main loop handles four products per pass and a
 * tail loop handles the remaining 0 to 3.
 */
static q31_t arm_conv_fast_q31_dot(
  const q31_t * px,
  const q31_t * py,
  uint32_t numTaps)
{
  q31_t sum = 0;
  uint32_t k;

  for (k = numTaps >> 2u; k > 0u; k--)
  {
    sum = CONV_FAST_Q31_MAC(sum, *px++, *py--);
    sum = CONV_FAST_Q31_MAC(sum, *px++, *py--);
    sum = CONV_FAST_Q31_MAC(sum, *px++, *py--);
    sum = CONV_FAST_Q31_MAC(sum, *px++, *py--);
  }

  for (k = numTaps % 0x4u; k > 0u; k--)
  {
    sum = CONV_FAST_Q31_MAC(sum, *px++, *py--);
  }

  return sum;
}

/**
 * @brief Convolution of Q31 sequences (fast version).
 * @param[in] *pSrcA points to the first input sequence.
 * @param[in] srcALen length of the first input sequence.
 * @param[in] *pSrcB points to the second input sequence.
 * @param[in] srcBLen length of the second input sequence.
 * @param[out] *pDst points to the location where the output result is written.  Length srcALen+srcBLen-1.
 * @return none.
 *
 * @details
 * If either length is zero the function returns without writing to pDst.
 *
 * <b>Scaling and Overflow Behavior:</b>
 *
 * \par
 * This function is optimized for speed at the expense of fixed-point precision and overflow protection.
 * Only the upper 32 bits of each 1.31 x 1.31 product are kept, giving a 2.30 value.
 * These intermediate results are accumulated in a 32-bit register in 2.30 format.
 * Finally, the accumulator is shifted left by one bit to produce the 1.31 result.
 * No saturation is applied at any point: an accumulator or result that does not fit simply loses its upper bits.
 *
 * \par
 * The fast version provides less precision than the standard version since it discards the low 32 bits of each multiplication result.
 * In order to avoid overflows completely the input signals must be scaled down.
 * Scale down the inputs by log2(min(srcALen, srcBLen)) bits (log2 is the logarithm to the base 2),
 * as at most min(srcALen, srcBLen) additions are carried out for one output sample.
 *
 * \par
 * See <code>arm_conv_q31()</code> for a slower implementation of this function which uses 64-bit accumulation to provide higher precision.
 */

void arm_conv_fast_q31(
  q31_t * pSrcA,
  uint32_t srcALen,
  q31_t * pSrcB,
  uint32_t srcBLen,
  q31_t * pDst)
{
  q31_t *pLong;                       /* start of the longer sequence (x)        */
  q31_t *pShort;                      /* start of the shorter sequence (y)       */
  q31_t *pShortLast;                  /* last element of the shorter sequence    */
  q31_t *pOut = pDst;                 /* next output sample                      */
  q31_t *px;                          /* running pointer into x                  */
  q31_t *py;                          /* running pointer into y                  */
  q31_t acc0, acc1, acc2, acc3;       /* accumulators of four adjacent outputs   */
  q31_t x0, x1, x2, x3, c0;           /* cached x samples and current y sample   */
  uint32_t k, count, blkCnt, blockSize2, lenTmp;

  /* An empty input has no output; without this check the stage lengths
   * computed below would wrap around as unsigned values. */
  if ((srcALen == 0u) || (srcBLen == 0u))
  {
    return;
  }

  /* The shorter sequence always slides across the longer one, so swap the
   * roles when A is the shorter input. From here on srcBLen <= srcALen. */
  if (srcALen >= srcBLen)
  {
    pLong = pSrcA;
    pShort = pSrcB;
  }
  else
  {
    pLong = pSrcB;
    pShort = pSrcA;

    lenTmp = srcBLen;
    srcBLen = srcALen;
    srcALen = lenTmp;
  }

  pShortLast = pShort + (srcBLen - 1u);

  /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + ... + x[n-N+1] * y[N-1]
   *
   * The output is produced in three stages:
   *   stage 1: srcBLen-1 outputs, the number of products grows 1, 2, ...
   *   stage 2: srcALen-srcBLen+1 outputs, srcBLen products each
   *   stage 3: srcBLen-1 outputs, the number of products shrinks ..., 2, 1
   */

  /* ------------------------
   * Stage 1
   * ----------------------*/

  /* out[0] = x[0] * y[0]
   * out[1] = x[0] * y[1] + x[1] * y[0]
   * ....
   * count is the number of products of the current output sample. */
  for (count = 1u; count < srcBLen; count++)
  {
    *pOut++ =
      CONV_FAST_Q31_OUT(arm_conv_fast_q31_dot(pLong,
                                              pShort + (count - 1u),
                                              count));
  }

  /* ------------------------
   * Stage 2
   * ----------------------*/

  /* out = x[count] * y[srcBLen-1] + ... + x[count+srcBLen-1] * y[0]
   * count is the offset into x of the current output sample. */
  blockSize2 = srcALen - (srcBLen - 1u);
  count = 0u;

  /* The four-outputs-at-a-time kernel runs its inner loop at least once with
   * four y samples, so it is only usable when srcBLen >= 4. */
  if (srcBLen >= 4u)
  {
    for (blkCnt = blockSize2 >> 2u; blkCnt > 0u; blkCnt--)
    {
      px = pLong + count;
      py = pShortLast;

      acc0 = 0;
      acc1 = 0;
      acc2 = 0;
      acc3 = 0;

      /* Preload the first three x samples; the fourth is read in the loop. */
      x0 = *px++;
      x1 = *px++;
      x2 = *px++;

      /* Each pass consumes four y samples. The roles of x0..x3 rotate by one
       * after every y sample so each x sample is loaded only once. */
      k = srcBLen >> 2u;

      do
      {
        c0 = *py--;
        x3 = *px++;
        acc0 = CONV_FAST_Q31_MAC(acc0, x0, c0);
        acc1 = CONV_FAST_Q31_MAC(acc1, x1, c0);
        acc2 = CONV_FAST_Q31_MAC(acc2, x2, c0);
        acc3 = CONV_FAST_Q31_MAC(acc3, x3, c0);

        c0 = *py--;
        x0 = *px++;
        acc0 = CONV_FAST_Q31_MAC(acc0, x1, c0);
        acc1 = CONV_FAST_Q31_MAC(acc1, x2, c0);
        acc2 = CONV_FAST_Q31_MAC(acc2, x3, c0);
        acc3 = CONV_FAST_Q31_MAC(acc3, x0, c0);

        c0 = *py--;
        x1 = *px++;
        acc0 = CONV_FAST_Q31_MAC(acc0, x2, c0);
        acc1 = CONV_FAST_Q31_MAC(acc1, x3, c0);
        acc2 = CONV_FAST_Q31_MAC(acc2, x0, c0);
        acc3 = CONV_FAST_Q31_MAC(acc3, x1, c0);

        c0 = *py--;
        x2 = *px++;
        acc0 = CONV_FAST_Q31_MAC(acc0, x3, c0);
        acc1 = CONV_FAST_Q31_MAC(acc1, x0, c0);
        acc2 = CONV_FAST_Q31_MAC(acc2, x1, c0);
        acc3 = CONV_FAST_Q31_MAC(acc3, x2, c0);
      } while (--k);

      /* Remaining 0 to 3 y samples, one per pass. */
      for (k = srcBLen % 0x4u; k > 0u; k--)
      {
        c0 = *py--;
        x3 = *px++;
        acc0 = CONV_FAST_Q31_MAC(acc0, x0, c0);
        acc1 = CONV_FAST_Q31_MAC(acc1, x1, c0);
        acc2 = CONV_FAST_Q31_MAC(acc2, x2, c0);
        acc3 = CONV_FAST_Q31_MAC(acc3, x3, c0);

        /* Slide the x window by one sample. */
        x0 = x1;
        x1 = x2;
        x2 = x3;
      }

      *pOut++ = CONV_FAST_Q31_OUT(acc0);
      *pOut++ = CONV_FAST_Q31_OUT(acc1);
      *pOut++ = CONV_FAST_Q31_OUT(acc2);
      *pOut++ = CONV_FAST_Q31_OUT(acc3);

      count += 4u;
    }
  }

  /* Outputs not covered by the kernel above: the last blockSize2 % 4 samples
   * when srcBLen >= 4, or all of stage 2 when srcBLen < 4. */
  for (; count < blockSize2; count++)
  {
    *pOut++ =
      CONV_FAST_Q31_OUT(arm_conv_fast_q31_dot(pLong + count,
                                              pShortLast,
                                              srcBLen));
  }

  /* ------------------------
   * Stage 3
   * ----------------------*/

  /* out = x[srcALen-srcBLen+1] * y[srcBLen-1] + ... + x[srcALen-1] * y[1]
   * ....
   * out = x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
   * out = x[srcALen-1] * y[srcBLen-1]
   * k is the number of products of the current output sample. */
  px = (pLong + srcALen) - (srcBLen - 1u);

  for (k = srcBLen - 1u; k > 0u; k--)
  {
    *pOut++ = CONV_FAST_Q31_OUT(arm_conv_fast_q31_dot(px, pShortLast, k));
    px++;
  }
}

#undef CONV_FAST_Q31_OUT
#undef CONV_FAST_Q31_MAC
emilmont 1:fdd22bb7aa52 569
emilmont 1:fdd22bb7aa52 570 /**
emilmont 1:fdd22bb7aa52 571 * @} end of Conv group
emilmont 1:fdd22bb7aa52 572 */