Neil Tan / dsp

Fork of dsp by Simon Ford

Committer:
nprobably
Date:
Thu Jun 29 06:56:55 2017 +0000
Revision:
3:ad02f4ea1fbe
Parent:
0:1014af42efd9
A hack to make it compile with mbed 5

Who changed what in which revision?

UserRevisionLine numberNew contents of line
simon 0:1014af42efd9 1 /* ----------------------------------------------------------------------
simon 0:1014af42efd9 2 * Copyright (C) 2010 ARM Limited. All rights reserved.
simon 0:1014af42efd9 3 *
simon 0:1014af42efd9 4 * $Date: 29. November 2010
simon 0:1014af42efd9 5 * $Revision: V1.0.3
simon 0:1014af42efd9 6 *
simon 0:1014af42efd9 7 * Project: CMSIS DSP Library
simon 0:1014af42efd9 8 * Title: arm_fir_q15.c
simon 0:1014af42efd9 9 *
simon 0:1014af42efd9 10 * Description: Q15 FIR filter processing function.
simon 0:1014af42efd9 11 *
simon 0:1014af42efd9 12 * Target Processor: Cortex-M4/Cortex-M3
simon 0:1014af42efd9 13 *
simon 0:1014af42efd9 14 * Version 1.0.3 2010/11/29
simon 0:1014af42efd9 15 * Re-organized the CMSIS folders and updated documentation.
simon 0:1014af42efd9 16 *
simon 0:1014af42efd9 17 * Version 1.0.2 2010/11/11
simon 0:1014af42efd9 18 * Documentation updated.
simon 0:1014af42efd9 19 *
simon 0:1014af42efd9 20 * Version 1.0.1 2010/10/05
simon 0:1014af42efd9 21 * Production release and review comments incorporated.
simon 0:1014af42efd9 22 *
simon 0:1014af42efd9 23 * Version 1.0.0 2010/09/20
simon 0:1014af42efd9 24 * Production release and review comments incorporated.
simon 0:1014af42efd9 25 *
simon 0:1014af42efd9 26 * Version 0.0.5 2010/04/26
simon 0:1014af42efd9 27 * incorporated review comments and updated with latest CMSIS layer
simon 0:1014af42efd9 28 *
simon 0:1014af42efd9 29 * Version 0.0.3 2010/03/10
simon 0:1014af42efd9 30 * Initial version
simon 0:1014af42efd9 31 * -------------------------------------------------------------------- */
simon 0:1014af42efd9 32
simon 0:1014af42efd9 33 #include "arm_math.h"
simon 0:1014af42efd9 34
simon 0:1014af42efd9 35 /**
simon 0:1014af42efd9 36 * @ingroup groupFilters
simon 0:1014af42efd9 37 */
simon 0:1014af42efd9 38
simon 0:1014af42efd9 39 /**
simon 0:1014af42efd9 40 * @addtogroup FIR
simon 0:1014af42efd9 41 * @{
simon 0:1014af42efd9 42 */
simon 0:1014af42efd9 43
simon 0:1014af42efd9 44 /**
simon 0:1014af42efd9 45 * @brief Processing function for the Q15 FIR filter.
simon 0:1014af42efd9 46 * @param[in] *S points to an instance of the Q15 FIR structure.
simon 0:1014af42efd9 47 * @param[in] *pSrc points to the block of input data.
simon 0:1014af42efd9 48 * @param[out] *pDst points to the block of output data.
simon 0:1014af42efd9 49 * @param[in] blockSize number of samples to process per call.
simon 0:1014af42efd9 50 * @return none.
simon 0:1014af42efd9 51 *
simon 0:1014af42efd9 52 * <b>Scaling and Overflow Behavior:</b>
simon 0:1014af42efd9 53 * \par
simon 0:1014af42efd9 54 * The function is implemented using a 64-bit internal accumulator.
simon 0:1014af42efd9 55 * Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
simon 0:1014af42efd9 56 * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
simon 0:1014af42efd9 57 * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
simon 0:1014af42efd9 58 * After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits.
simon 0:1014af42efd9 59 * Lastly, the accumulator is saturated to yield a result in 1.15 format.
simon 0:1014af42efd9 60 *
simon 0:1014af42efd9 61 * \par
simon 0:1014af42efd9 62 * Refer to the function <code>arm_fir_fast_q15()</code> for a faster but less precise implementation of this function.
simon 0:1014af42efd9 63 */
simon 0:1014af42efd9 64
simon 0:1014af42efd9 65 void arm_fir_q15(
simon 0:1014af42efd9 66 const arm_fir_instance_q15 * S,
simon 0:1014af42efd9 67 q15_t * pSrc,
simon 0:1014af42efd9 68 q15_t * pDst,
simon 0:1014af42efd9 69 uint32_t blockSize)
simon 0:1014af42efd9 70 {
simon 0:1014af42efd9 71 q15_t *pState = S->pState; /* State pointer */
simon 0:1014af42efd9 72 q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
simon 0:1014af42efd9 73 q15_t *pStateCurnt; /* Points to the current sample of the state */
simon 0:1014af42efd9 74 q15_t *px1; /* Temporary q15 pointer for state buffer */
simon 0:1014af42efd9 75 q31_t *pb; /* Temporary pointer for coefficient buffer */
simon 0:1014af42efd9 76 q31_t *px2; /* Temporary q31 pointer for SIMD state buffer accesses */
simon 0:1014af42efd9 77 q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold SIMD state and coefficient values */
simon 0:1014af42efd9 78 q63_t acc0, acc1, acc2, acc3; /* Accumulators */
simon 0:1014af42efd9 79 uint32_t numTaps = S->numTaps; /* Number of taps in the filter */
simon 0:1014af42efd9 80 uint32_t tapCnt, blkCnt; /* Loop counters */
simon 0:1014af42efd9 81
simon 0:1014af42efd9 82 /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
simon 0:1014af42efd9 83 /* pStateCurnt points to the location where the new input data should be written */
simon 0:1014af42efd9 84 pStateCurnt = &(S->pState[(numTaps - 1u)]);
simon 0:1014af42efd9 85
simon 0:1014af42efd9 86 /* Apply loop unrolling and compute 4 output values simultaneously.
simon 0:1014af42efd9 87 * The variables acc0 ... acc3 hold output values that are being computed:
simon 0:1014af42efd9 88 *
simon 0:1014af42efd9 89 * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
simon 0:1014af42efd9 90 * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
simon 0:1014af42efd9 91 * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
simon 0:1014af42efd9 92 * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
simon 0:1014af42efd9 93 */
simon 0:1014af42efd9 94 blkCnt = blockSize >> 2;
simon 0:1014af42efd9 95
simon 0:1014af42efd9 96 /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
simon 0:1014af42efd9 97 ** a second loop below computes the remaining 1 to 3 samples. */
simon 0:1014af42efd9 98 while(blkCnt > 0u)
simon 0:1014af42efd9 99 {
simon 0:1014af42efd9 100 /* Copy four new input samples into the state buffer.
simon 0:1014af42efd9 101 ** Use 32-bit SIMD to move the 16-bit data. Only requires two copies. */
simon 0:1014af42efd9 102 *__SIMD32(pStateCurnt)++ = *__SIMD32(pSrc)++;
simon 0:1014af42efd9 103 *__SIMD32(pStateCurnt)++ = *__SIMD32(pSrc)++;
simon 0:1014af42efd9 104
simon 0:1014af42efd9 105 /* Set all accumulators to zero */
simon 0:1014af42efd9 106 acc0 = 0;
simon 0:1014af42efd9 107 acc1 = 0;
simon 0:1014af42efd9 108 acc2 = 0;
simon 0:1014af42efd9 109 acc3 = 0;
simon 0:1014af42efd9 110
simon 0:1014af42efd9 111 /* Initialize state pointer of type q15 */
simon 0:1014af42efd9 112 px1 = pState;
simon 0:1014af42efd9 113
simon 0:1014af42efd9 114 /* Initialize coeff pointer of type q31 */
simon 0:1014af42efd9 115 pb = (q31_t *) (pCoeffs);
simon 0:1014af42efd9 116
simon 0:1014af42efd9 117 /* Read the first two samples from the state buffer: x[n-N], x[n-N-1] */
simon 0:1014af42efd9 118 x0 = *(q31_t *) (px1++);
simon 0:1014af42efd9 119
simon 0:1014af42efd9 120 /* Read the third and forth samples from the state buffer: x[n-N-1], x[n-N-2] */
simon 0:1014af42efd9 121 x1 = *(q31_t *) (px1++);
simon 0:1014af42efd9 122
simon 0:1014af42efd9 123 /* Loop over the number of taps. Unroll by a factor of 4.
simon 0:1014af42efd9 124 ** Repeat until we've computed numTaps-4 coefficients. */
simon 0:1014af42efd9 125 tapCnt = numTaps >> 2;
simon 0:1014af42efd9 126 do
simon 0:1014af42efd9 127 {
simon 0:1014af42efd9 128 /* Read the first two coefficients using SIMD: b[N] and b[N-1] coefficients */
simon 0:1014af42efd9 129 c0 = *(pb++);
simon 0:1014af42efd9 130
simon 0:1014af42efd9 131 /* acc0 += b[N] * x[n-N] + b[N-1] * x[n-N-1] */
simon 0:1014af42efd9 132 acc0 = __SMLALD(x0, c0, acc0);
simon 0:1014af42efd9 133
simon 0:1014af42efd9 134 /* acc1 += b[N] * x[n-N-1] + b[N-1] * x[n-N-2] */
simon 0:1014af42efd9 135 acc1 = __SMLALD(x1, c0, acc1);
simon 0:1014af42efd9 136
simon 0:1014af42efd9 137 /* Read state x[n-N-2], x[n-N-3] */
simon 0:1014af42efd9 138 x2 = *(q31_t *) (px1++);
simon 0:1014af42efd9 139
simon 0:1014af42efd9 140 /* Read state x[n-N-3], x[n-N-4] */
simon 0:1014af42efd9 141 x3 = *(q31_t *) (px1++);
simon 0:1014af42efd9 142
simon 0:1014af42efd9 143 /* acc2 += b[N] * x[n-N-2] + b[N-1] * x[n-N-3] */
simon 0:1014af42efd9 144 acc2 = __SMLALD(x2, c0, acc2);
simon 0:1014af42efd9 145
simon 0:1014af42efd9 146 /* acc3 += b[N] * x[n-N-3] + b[N-1] * x[n-N-4] */
simon 0:1014af42efd9 147 acc3 = __SMLALD(x3, c0, acc3);
simon 0:1014af42efd9 148
simon 0:1014af42efd9 149 /* Read coefficients b[N-2], b[N-3] */
simon 0:1014af42efd9 150 c0 = *(pb++);
simon 0:1014af42efd9 151
simon 0:1014af42efd9 152 /* acc0 += b[N-2] * x[n-N-2] + b[N-3] * x[n-N-3] */
simon 0:1014af42efd9 153 acc0 = __SMLALD(x2, c0, acc0);
simon 0:1014af42efd9 154
simon 0:1014af42efd9 155 /* acc1 += b[N-2] * x[n-N-3] + b[N-3] * x[n-N-4] */
simon 0:1014af42efd9 156 acc1 = __SMLALD(x3, c0, acc1);
simon 0:1014af42efd9 157
simon 0:1014af42efd9 158 /* Read state x[n-N-4], x[n-N-5] */
simon 0:1014af42efd9 159 x0 = *(q31_t *) (px1++);
simon 0:1014af42efd9 160
simon 0:1014af42efd9 161 /* Read state x[n-N-5], x[n-N-6] */
simon 0:1014af42efd9 162 x1 = *(q31_t *) (px1++);
simon 0:1014af42efd9 163
simon 0:1014af42efd9 164 /* acc2 += b[N-2] * x[n-N-4] + b[N-3] * x[n-N-5] */
simon 0:1014af42efd9 165 acc2 = __SMLALD(x0, c0, acc2);
simon 0:1014af42efd9 166
simon 0:1014af42efd9 167 /* acc3 += b[N-2] * x[n-N-5] + b[N-3] * x[n-N-6] */
simon 0:1014af42efd9 168 acc3 = __SMLALD(x1, c0, acc3);
simon 0:1014af42efd9 169 tapCnt--;
simon 0:1014af42efd9 170
simon 0:1014af42efd9 171 }
simon 0:1014af42efd9 172 while(tapCnt > 0u);
simon 0:1014af42efd9 173
simon 0:1014af42efd9 174 /* If the filter length is not a multiple of 4, compute the remaining filter taps.
simon 0:1014af42efd9 175 ** This is always be 2 taps since the filter length is even. */
simon 0:1014af42efd9 176 if((numTaps & 0x3u) != 0u)
simon 0:1014af42efd9 177 {
simon 0:1014af42efd9 178 /* Read 2 coefficients */
simon 0:1014af42efd9 179 c0 = *(pb++);
simon 0:1014af42efd9 180 /* Fetch 4 state variables */
simon 0:1014af42efd9 181 x2 = *(q31_t *) (px1++);
simon 0:1014af42efd9 182 x3 = *(q31_t *) (px1++);
simon 0:1014af42efd9 183
simon 0:1014af42efd9 184 /* Perform the multiply-accumulates */
simon 0:1014af42efd9 185 acc0 = __SMLALD(x0, c0, acc0);
simon 0:1014af42efd9 186 acc1 = __SMLALD(x1, c0, acc1);
simon 0:1014af42efd9 187 acc2 = __SMLALD(x2, c0, acc2);
simon 0:1014af42efd9 188 acc3 = __SMLALD(x3, c0, acc3);
simon 0:1014af42efd9 189 }
simon 0:1014af42efd9 190
simon 0:1014af42efd9 191 /* The results in the 4 accumulators are in 2.30 format. Convert to 1.15 with saturation.
simon 0:1014af42efd9 192 ** Then store the 4 outputs in the destination buffer. */
simon 0:1014af42efd9 193 *__SIMD32(pDst)++ =
simon 0:1014af42efd9 194 __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
simon 0:1014af42efd9 195 *__SIMD32(pDst)++ =
simon 0:1014af42efd9 196 __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
simon 0:1014af42efd9 197
simon 0:1014af42efd9 198
simon 0:1014af42efd9 199 /* Advance the state pointer by 4 to process the next group of 4 samples */
simon 0:1014af42efd9 200 pState = pState + 4;
simon 0:1014af42efd9 201
simon 0:1014af42efd9 202 /* Decrement the loop counter */
simon 0:1014af42efd9 203 blkCnt--;
simon 0:1014af42efd9 204 }
simon 0:1014af42efd9 205
simon 0:1014af42efd9 206 /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
simon 0:1014af42efd9 207 ** No loop unrolling is used. */
simon 0:1014af42efd9 208 blkCnt = blockSize % 0x4u;
simon 0:1014af42efd9 209 while(blkCnt > 0u)
simon 0:1014af42efd9 210 {
simon 0:1014af42efd9 211 /* Copy two samples into state buffer */
simon 0:1014af42efd9 212 *pStateCurnt++ = *pSrc++;
simon 0:1014af42efd9 213
simon 0:1014af42efd9 214 /* Set the accumulator to zero */
simon 0:1014af42efd9 215 acc0 = 0;
simon 0:1014af42efd9 216
simon 0:1014af42efd9 217 /* Use SIMD to hold states and coefficients */
simon 0:1014af42efd9 218 px2 = (q31_t *) pState;
simon 0:1014af42efd9 219 pb = (q31_t *) (pCoeffs);
simon 0:1014af42efd9 220 tapCnt = numTaps >> 1;
simon 0:1014af42efd9 221
simon 0:1014af42efd9 222 do
simon 0:1014af42efd9 223 {
simon 0:1014af42efd9 224 acc0 = __SMLALD(*px2++, *(pb++), acc0);
simon 0:1014af42efd9 225 tapCnt--;
simon 0:1014af42efd9 226 }
simon 0:1014af42efd9 227 while(tapCnt > 0u);
simon 0:1014af42efd9 228
simon 0:1014af42efd9 229 /* The result is in 2.30 format. Convert to 1.15 with saturation.
simon 0:1014af42efd9 230 ** Then store the output in the destination buffer. */
simon 0:1014af42efd9 231 *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
simon 0:1014af42efd9 232
simon 0:1014af42efd9 233 /* Advance state pointer by 1 for the next sample */
simon 0:1014af42efd9 234 pState = pState + 1;
simon 0:1014af42efd9 235
simon 0:1014af42efd9 236 /* Decrement the loop counter */
simon 0:1014af42efd9 237 blkCnt--;
simon 0:1014af42efd9 238 }
simon 0:1014af42efd9 239
simon 0:1014af42efd9 240 /* Processing is complete.
simon 0:1014af42efd9 241 ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
simon 0:1014af42efd9 242 ** This prepares the state buffer for the next function call. */
simon 0:1014af42efd9 243
simon 0:1014af42efd9 244 /* Points to the start of the state buffer */
simon 0:1014af42efd9 245 pStateCurnt = S->pState;
simon 0:1014af42efd9 246
simon 0:1014af42efd9 247 /* Calculation of count for copying integer writes */
simon 0:1014af42efd9 248 tapCnt = (numTaps - 1u) >> 2;
simon 0:1014af42efd9 249
simon 0:1014af42efd9 250 while(tapCnt > 0u)
simon 0:1014af42efd9 251 {
simon 0:1014af42efd9 252 *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
simon 0:1014af42efd9 253 *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
simon 0:1014af42efd9 254
simon 0:1014af42efd9 255 tapCnt--;
simon 0:1014af42efd9 256
simon 0:1014af42efd9 257 }
simon 0:1014af42efd9 258
simon 0:1014af42efd9 259 /* Calculation of count for remaining q15_t data */
simon 0:1014af42efd9 260 tapCnt = (numTaps - 1u) % 0x4u;
simon 0:1014af42efd9 261
simon 0:1014af42efd9 262 /* copy remaining data */
simon 0:1014af42efd9 263 while(tapCnt > 0u)
simon 0:1014af42efd9 264 {
simon 0:1014af42efd9 265 *pStateCurnt++ = *pState++;
simon 0:1014af42efd9 266
simon 0:1014af42efd9 267 /* Decrement the loop counter */
simon 0:1014af42efd9 268 tapCnt--;
simon 0:1014af42efd9 269 }
simon 0:1014af42efd9 270 }
simon 0:1014af42efd9 271
simon 0:1014af42efd9 272 /**
simon 0:1014af42efd9 273 * @} end of FIR group
simon 0:1014af42efd9 274 */