V4.0.1 of the ARM CMSIS DSP libraries. Note that arm_bitreversal2.s, arm_cfft_f32.c and arm_rfft_fast_f32.c had to be removed. arm_bitreversal2.s will not assemble with the online tools. So, the fast f32 FFT functions are not yet available. All the other FFT functions are available.

Dependents:   MPU9150_Example fir_f32 fir_f32 MPU9150_nucleo_noni2cdev ... more

Committer:
emh203
Date:
Mon Jul 28 15:03:15 2014 +0000
Revision:
0:3d9c67d97d6f
1st working commit. Had to remove arm_bitreversal2.s, arm_cfft_f32.c and arm_rfft_fast_f32.c because the .s file will not assemble. For now I removed these functions so we could at least have a library for the other functions.

Who changed what in which revision?

User | Revision | Line number | New contents of line
emh203 0:3d9c67d97d6f 1 /* ----------------------------------------------------------------------
emh203 0:3d9c67d97d6f 2 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
emh203 0:3d9c67d97d6f 3 *
emh203 0:3d9c67d97d6f 4 * $Date: 12. March 2014
emh203 0:3d9c67d97d6f 5 * $Revision: V1.4.3
emh203 0:3d9c67d97d6f 6 *
emh203 0:3d9c67d97d6f 7 * Project: CMSIS DSP Library
emh203 0:3d9c67d97d6f 8 * Title: arm_conv_partial_fast_q15.c
emh203 0:3d9c67d97d6f 9 *
emh203 0:3d9c67d97d6f 10 * Description: Fast Q15 Partial convolution.
emh203 0:3d9c67d97d6f 11 *
emh203 0:3d9c67d97d6f 12 * Target Processor: Cortex-M4/Cortex-M3
emh203 0:3d9c67d97d6f 13 *
emh203 0:3d9c67d97d6f 14 * Redistribution and use in source and binary forms, with or without
emh203 0:3d9c67d97d6f 15 * modification, are permitted provided that the following conditions
emh203 0:3d9c67d97d6f 16 * are met:
emh203 0:3d9c67d97d6f 17 * - Redistributions of source code must retain the above copyright
emh203 0:3d9c67d97d6f 18 * notice, this list of conditions and the following disclaimer.
emh203 0:3d9c67d97d6f 19 * - Redistributions in binary form must reproduce the above copyright
emh203 0:3d9c67d97d6f 20 * notice, this list of conditions and the following disclaimer in
emh203 0:3d9c67d97d6f 21 * the documentation and/or other materials provided with the
emh203 0:3d9c67d97d6f 22 * distribution.
emh203 0:3d9c67d97d6f 23 * - Neither the name of ARM LIMITED nor the names of its contributors
emh203 0:3d9c67d97d6f 24 * may be used to endorse or promote products derived from this
emh203 0:3d9c67d97d6f 25 * software without specific prior written permission.
emh203 0:3d9c67d97d6f 26 *
emh203 0:3d9c67d97d6f 27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
emh203 0:3d9c67d97d6f 28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
emh203 0:3d9c67d97d6f 29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
emh203 0:3d9c67d97d6f 30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
emh203 0:3d9c67d97d6f 31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
emh203 0:3d9c67d97d6f 32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
emh203 0:3d9c67d97d6f 33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
emh203 0:3d9c67d97d6f 34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
emh203 0:3d9c67d97d6f 35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
emh203 0:3d9c67d97d6f 36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
emh203 0:3d9c67d97d6f 37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
emh203 0:3d9c67d97d6f 38 * POSSIBILITY OF SUCH DAMAGE.
emh203 0:3d9c67d97d6f 39 * -------------------------------------------------------------------- */
emh203 0:3d9c67d97d6f 40
emh203 0:3d9c67d97d6f 41 #include "arm_math.h"
emh203 0:3d9c67d97d6f 42
emh203 0:3d9c67d97d6f 43 /**
emh203 0:3d9c67d97d6f 44 * @ingroup groupFilters
emh203 0:3d9c67d97d6f 45 */
emh203 0:3d9c67d97d6f 46
emh203 0:3d9c67d97d6f 47 /**
emh203 0:3d9c67d97d6f 48 * @addtogroup PartialConv
emh203 0:3d9c67d97d6f 49 * @{
emh203 0:3d9c67d97d6f 50 */
emh203 0:3d9c67d97d6f 51
emh203 0:3d9c67d97d6f 52 /**
emh203 0:3d9c67d97d6f 53 * @brief Partial convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
emh203 0:3d9c67d97d6f 54 * @param[in] *pSrcA points to the first input sequence.
emh203 0:3d9c67d97d6f 55 * @param[in] srcALen length of the first input sequence.
emh203 0:3d9c67d97d6f 56 * @param[in] *pSrcB points to the second input sequence.
emh203 0:3d9c67d97d6f 57 * @param[in] srcBLen length of the second input sequence.
emh203 0:3d9c67d97d6f 58 * @param[out] *pDst points to the location where the output result is written.
emh203 0:3d9c67d97d6f 59 * @param[in] firstIndex is the first output sample to start with.
emh203 0:3d9c67d97d6f 60 * @param[in] numPoints is the number of output points to be computed.
emh203 0:3d9c67d97d6f 61 * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0, srcALen+srcBLen-2].
emh203 0:3d9c67d97d6f 62 *
emh203 0:3d9c67d97d6f 63 * See <code>arm_conv_partial_q15()</code> for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.
emh203 0:3d9c67d97d6f 64 */
emh203 0:3d9c67d97d6f 65
emh203 0:3d9c67d97d6f 66
emh203 0:3d9c67d97d6f 67 arm_status arm_conv_partial_fast_q15(
emh203 0:3d9c67d97d6f 68 q15_t * pSrcA,
emh203 0:3d9c67d97d6f 69 uint32_t srcALen,
emh203 0:3d9c67d97d6f 70 q15_t * pSrcB,
emh203 0:3d9c67d97d6f 71 uint32_t srcBLen,
emh203 0:3d9c67d97d6f 72 q15_t * pDst,
emh203 0:3d9c67d97d6f 73 uint32_t firstIndex,
emh203 0:3d9c67d97d6f 74 uint32_t numPoints)
emh203 0:3d9c67d97d6f 75 {
emh203 0:3d9c67d97d6f 76 #ifndef UNALIGNED_SUPPORT_DISABLE
emh203 0:3d9c67d97d6f 77
emh203 0:3d9c67d97d6f 78 q15_t *pIn1; /* inputA pointer */
emh203 0:3d9c67d97d6f 79 q15_t *pIn2; /* inputB pointer */
emh203 0:3d9c67d97d6f 80 q15_t *pOut = pDst; /* output pointer */
emh203 0:3d9c67d97d6f 81 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
emh203 0:3d9c67d97d6f 82 q15_t *px; /* Intermediate inputA pointer */
emh203 0:3d9c67d97d6f 83 q15_t *py; /* Intermediate inputB pointer */
emh203 0:3d9c67d97d6f 84 q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
emh203 0:3d9c67d97d6f 85 q31_t x0, x1, x2, x3, c0;
emh203 0:3d9c67d97d6f 86 uint32_t j, k, count, check, blkCnt;
emh203 0:3d9c67d97d6f 87 int32_t blockSize1, blockSize2, blockSize3; /* loop counters */
emh203 0:3d9c67d97d6f 88 arm_status status; /* status of Partial convolution */
emh203 0:3d9c67d97d6f 89
emh203 0:3d9c67d97d6f 90 /* Check for range of output samples to be calculated */
emh203 0:3d9c67d97d6f 91 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
emh203 0:3d9c67d97d6f 92 {
emh203 0:3d9c67d97d6f 93 /* Set status as ARM_MATH_ARGUMENT_ERROR */
emh203 0:3d9c67d97d6f 94 status = ARM_MATH_ARGUMENT_ERROR;
emh203 0:3d9c67d97d6f 95 }
emh203 0:3d9c67d97d6f 96 else
emh203 0:3d9c67d97d6f 97 {
emh203 0:3d9c67d97d6f 98
emh203 0:3d9c67d97d6f 99 /* The algorithm implementation is based on the lengths of the inputs. */
emh203 0:3d9c67d97d6f 100 /* srcB is always made to slide across srcA. */
emh203 0:3d9c67d97d6f 101 /* So srcBLen is always considered as shorter or equal to srcALen */
emh203 0:3d9c67d97d6f 102 if(srcALen >=srcBLen)
emh203 0:3d9c67d97d6f 103 {
emh203 0:3d9c67d97d6f 104 /* Initialization of inputA pointer */
emh203 0:3d9c67d97d6f 105 pIn1 = pSrcA;
emh203 0:3d9c67d97d6f 106
emh203 0:3d9c67d97d6f 107 /* Initialization of inputB pointer */
emh203 0:3d9c67d97d6f 108 pIn2 = pSrcB;
emh203 0:3d9c67d97d6f 109 }
emh203 0:3d9c67d97d6f 110 else
emh203 0:3d9c67d97d6f 111 {
emh203 0:3d9c67d97d6f 112 /* Initialization of inputA pointer */
emh203 0:3d9c67d97d6f 113 pIn1 = pSrcB;
emh203 0:3d9c67d97d6f 114
emh203 0:3d9c67d97d6f 115 /* Initialization of inputB pointer */
emh203 0:3d9c67d97d6f 116 pIn2 = pSrcA;
emh203 0:3d9c67d97d6f 117
emh203 0:3d9c67d97d6f 118 /* srcBLen is always considered as shorter or equal to srcALen */
emh203 0:3d9c67d97d6f 119 j = srcBLen;
emh203 0:3d9c67d97d6f 120 srcBLen = srcALen;
emh203 0:3d9c67d97d6f 121 srcALen = j;
emh203 0:3d9c67d97d6f 122 }
emh203 0:3d9c67d97d6f 123
emh203 0:3d9c67d97d6f 124 /* Conditions to check which loopCounter holds
emh203 0:3d9c67d97d6f 125 * the first and last indices of the output samples to be calculated. */
emh203 0:3d9c67d97d6f 126 check = firstIndex + numPoints;
emh203 0:3d9c67d97d6f 127 blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
emh203 0:3d9c67d97d6f 128 blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
emh203 0:3d9c67d97d6f 129 blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
emh203 0:3d9c67d97d6f 130 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
emh203 0:3d9c67d97d6f 131 (int32_t) numPoints) : 0;
emh203 0:3d9c67d97d6f 132 blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
emh203 0:3d9c67d97d6f 133 (int32_t) firstIndex);
emh203 0:3d9c67d97d6f 134 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
emh203 0:3d9c67d97d6f 135
emh203 0:3d9c67d97d6f 136 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
emh203 0:3d9c67d97d6f 137 /* The function is internally
emh203 0:3d9c67d97d6f 138 * divided into three stages according to the number of multiplications that has to be
emh203 0:3d9c67d97d6f 139 * taken place between inputA samples and inputB samples. In the first stage of the
emh203 0:3d9c67d97d6f 140 * algorithm, the multiplications increase by one for every iteration.
emh203 0:3d9c67d97d6f 141 * In the second stage of the algorithm, srcBLen number of multiplications are done.
emh203 0:3d9c67d97d6f 142 * In the third stage of the algorithm, the multiplications decrease by one
emh203 0:3d9c67d97d6f 143 * for every iteration. */
emh203 0:3d9c67d97d6f 144
emh203 0:3d9c67d97d6f 145 /* Set the output pointer to point to the firstIndex
emh203 0:3d9c67d97d6f 146 * of the output sample to be calculated. */
emh203 0:3d9c67d97d6f 147 pOut = pDst + firstIndex;
emh203 0:3d9c67d97d6f 148
emh203 0:3d9c67d97d6f 149 /* --------------------------
emh203 0:3d9c67d97d6f 150 * Initializations of stage1
emh203 0:3d9c67d97d6f 151 * -------------------------*/
emh203 0:3d9c67d97d6f 152
emh203 0:3d9c67d97d6f 153 /* sum = x[0] * y[0]
emh203 0:3d9c67d97d6f 154 * sum = x[0] * y[1] + x[1] * y[0]
emh203 0:3d9c67d97d6f 155 * ....
emh203 0:3d9c67d97d6f 156 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
emh203 0:3d9c67d97d6f 157 */
emh203 0:3d9c67d97d6f 158
emh203 0:3d9c67d97d6f 159 /* In this stage the MAC operations are increased by 1 for every iteration.
emh203 0:3d9c67d97d6f 160 The count variable holds the number of MAC operations performed.
emh203 0:3d9c67d97d6f 161 Since the partial convolution starts from firstIndex
emh203 0:3d9c67d97d6f 162 Number of Macs to be performed is firstIndex + 1 */
emh203 0:3d9c67d97d6f 163 count = 1u + firstIndex;
emh203 0:3d9c67d97d6f 164
emh203 0:3d9c67d97d6f 165 /* Working pointer of inputA */
emh203 0:3d9c67d97d6f 166 px = pIn1;
emh203 0:3d9c67d97d6f 167
emh203 0:3d9c67d97d6f 168 /* Working pointer of inputB */
emh203 0:3d9c67d97d6f 169 pSrc2 = pIn2 + firstIndex;
emh203 0:3d9c67d97d6f 170 py = pSrc2;
emh203 0:3d9c67d97d6f 171
emh203 0:3d9c67d97d6f 172 /* ------------------------
emh203 0:3d9c67d97d6f 173 * Stage1 process
emh203 0:3d9c67d97d6f 174 * ----------------------*/
emh203 0:3d9c67d97d6f 175
emh203 0:3d9c67d97d6f 176 /* For loop unrolling by 4, this stage is divided into two. */
emh203 0:3d9c67d97d6f 177 /* First part of this stage computes the MAC operations less than 4 */
emh203 0:3d9c67d97d6f 178 /* Second part of this stage computes the MAC operations greater than or equal to 4 */
emh203 0:3d9c67d97d6f 179
emh203 0:3d9c67d97d6f 180 /* The first part of the stage starts here */
emh203 0:3d9c67d97d6f 181 while((count < 4u) && (blockSize1 > 0))
emh203 0:3d9c67d97d6f 182 {
emh203 0:3d9c67d97d6f 183 /* Accumulator is made zero for every iteration */
emh203 0:3d9c67d97d6f 184 sum = 0;
emh203 0:3d9c67d97d6f 185
emh203 0:3d9c67d97d6f 186 /* Loop over number of MAC operations between
emh203 0:3d9c67d97d6f 187 * inputA samples and inputB samples */
emh203 0:3d9c67d97d6f 188 k = count;
emh203 0:3d9c67d97d6f 189
emh203 0:3d9c67d97d6f 190 while(k > 0u)
emh203 0:3d9c67d97d6f 191 {
emh203 0:3d9c67d97d6f 192 /* Perform the multiply-accumulates */
emh203 0:3d9c67d97d6f 193 sum = __SMLAD(*px++, *py--, sum);
emh203 0:3d9c67d97d6f 194
emh203 0:3d9c67d97d6f 195 /* Decrement the loop counter */
emh203 0:3d9c67d97d6f 196 k--;
emh203 0:3d9c67d97d6f 197 }
emh203 0:3d9c67d97d6f 198
emh203 0:3d9c67d97d6f 199 /* Store the result in the accumulator in the destination buffer. */
emh203 0:3d9c67d97d6f 200 *pOut++ = (q15_t) (sum >> 15);
emh203 0:3d9c67d97d6f 201
emh203 0:3d9c67d97d6f 202 /* Update the inputA and inputB pointers for next MAC calculation */
emh203 0:3d9c67d97d6f 203 py = ++pSrc2;
emh203 0:3d9c67d97d6f 204 px = pIn1;
emh203 0:3d9c67d97d6f 205
emh203 0:3d9c67d97d6f 206 /* Increment the MAC count */
emh203 0:3d9c67d97d6f 207 count++;
emh203 0:3d9c67d97d6f 208
emh203 0:3d9c67d97d6f 209 /* Decrement the loop counter */
emh203 0:3d9c67d97d6f 210 blockSize1--;
emh203 0:3d9c67d97d6f 211 }
emh203 0:3d9c67d97d6f 212
emh203 0:3d9c67d97d6f 213 /* The second part of the stage starts here */
emh203 0:3d9c67d97d6f 214 /* The internal loop, over count, is unrolled by 4 */
emh203 0:3d9c67d97d6f 215 /* To, read the last two inputB samples using SIMD:
emh203 0:3d9c67d97d6f 216 * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
emh203 0:3d9c67d97d6f 217 py = py - 1;
emh203 0:3d9c67d97d6f 218
emh203 0:3d9c67d97d6f 219 while(blockSize1 > 0)
emh203 0:3d9c67d97d6f 220 {
emh203 0:3d9c67d97d6f 221 /* Accumulator is made zero for every iteration */
emh203 0:3d9c67d97d6f 222 sum = 0;
emh203 0:3d9c67d97d6f 223
emh203 0:3d9c67d97d6f 224 /* Apply loop unrolling and compute 4 MACs simultaneously. */
emh203 0:3d9c67d97d6f 225 k = count >> 2u;
emh203 0:3d9c67d97d6f 226
emh203 0:3d9c67d97d6f 227 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
emh203 0:3d9c67d97d6f 228 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
emh203 0:3d9c67d97d6f 229 while(k > 0u)
emh203 0:3d9c67d97d6f 230 {
emh203 0:3d9c67d97d6f 231 /* Perform the multiply-accumulates */
emh203 0:3d9c67d97d6f 232 /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
emh203 0:3d9c67d97d6f 233 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
emh203 0:3d9c67d97d6f 234 /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
emh203 0:3d9c67d97d6f 235 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
emh203 0:3d9c67d97d6f 236
emh203 0:3d9c67d97d6f 237 /* Decrement the loop counter */
emh203 0:3d9c67d97d6f 238 k--;
emh203 0:3d9c67d97d6f 239 }
emh203 0:3d9c67d97d6f 240
emh203 0:3d9c67d97d6f 241 /* For the next MAC operations, the pointer py is used without SIMD
emh203 0:3d9c67d97d6f 242 * So, py is incremented by 1 */
emh203 0:3d9c67d97d6f 243 py = py + 1u;
emh203 0:3d9c67d97d6f 244
emh203 0:3d9c67d97d6f 245 /* If the count is not a multiple of 4, compute any remaining MACs here.
emh203 0:3d9c67d97d6f 246 ** No loop unrolling is used. */
emh203 0:3d9c67d97d6f 247 k = count % 0x4u;
emh203 0:3d9c67d97d6f 248
emh203 0:3d9c67d97d6f 249 while(k > 0u)
emh203 0:3d9c67d97d6f 250 {
emh203 0:3d9c67d97d6f 251 /* Perform the multiply-accumulates */
emh203 0:3d9c67d97d6f 252 sum = __SMLAD(*px++, *py--, sum);
emh203 0:3d9c67d97d6f 253
emh203 0:3d9c67d97d6f 254 /* Decrement the loop counter */
emh203 0:3d9c67d97d6f 255 k--;
emh203 0:3d9c67d97d6f 256 }
emh203 0:3d9c67d97d6f 257
emh203 0:3d9c67d97d6f 258 /* Store the result in the accumulator in the destination buffer. */
emh203 0:3d9c67d97d6f 259 *pOut++ = (q15_t) (sum >> 15);
emh203 0:3d9c67d97d6f 260
emh203 0:3d9c67d97d6f 261 /* Update the inputA and inputB pointers for next MAC calculation */
emh203 0:3d9c67d97d6f 262 py = ++pSrc2 - 1u;
emh203 0:3d9c67d97d6f 263 px = pIn1;
emh203 0:3d9c67d97d6f 264
emh203 0:3d9c67d97d6f 265 /* Increment the MAC count */
emh203 0:3d9c67d97d6f 266 count++;
emh203 0:3d9c67d97d6f 267
emh203 0:3d9c67d97d6f 268 /* Decrement the loop counter */
emh203 0:3d9c67d97d6f 269 blockSize1--;
emh203 0:3d9c67d97d6f 270 }
emh203 0:3d9c67d97d6f 271
emh203 0:3d9c67d97d6f 272 /* --------------------------
emh203 0:3d9c67d97d6f 273 * Initializations of stage2
emh203 0:3d9c67d97d6f 274 * ------------------------*/
emh203 0:3d9c67d97d6f 275
emh203 0:3d9c67d97d6f 276 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
emh203 0:3d9c67d97d6f 277 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
emh203 0:3d9c67d97d6f 278 * ....
emh203 0:3d9c67d97d6f 279 * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
emh203 0:3d9c67d97d6f 280 */
emh203 0:3d9c67d97d6f 281
emh203 0:3d9c67d97d6f 282 /* Working pointer of inputA */
emh203 0:3d9c67d97d6f 283 if((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
emh203 0:3d9c67d97d6f 284 {
emh203 0:3d9c67d97d6f 285 px = pIn1 + firstIndex - srcBLen + 1;
emh203 0:3d9c67d97d6f 286 }
emh203 0:3d9c67d97d6f 287 else
emh203 0:3d9c67d97d6f 288 {
emh203 0:3d9c67d97d6f 289 px = pIn1;
emh203 0:3d9c67d97d6f 290 }
emh203 0:3d9c67d97d6f 291
emh203 0:3d9c67d97d6f 292 /* Working pointer of inputB */
emh203 0:3d9c67d97d6f 293 pSrc2 = pIn2 + (srcBLen - 1u);
emh203 0:3d9c67d97d6f 294 py = pSrc2;
emh203 0:3d9c67d97d6f 295
emh203 0:3d9c67d97d6f 296 /* count is the index by which the pointer pIn1 to be incremented */
emh203 0:3d9c67d97d6f 297 count = 0u;
emh203 0:3d9c67d97d6f 298
emh203 0:3d9c67d97d6f 299
emh203 0:3d9c67d97d6f 300 /* --------------------
emh203 0:3d9c67d97d6f 301 * Stage2 process
emh203 0:3d9c67d97d6f 302 * -------------------*/
emh203 0:3d9c67d97d6f 303
emh203 0:3d9c67d97d6f 304 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
emh203 0:3d9c67d97d6f 305 * So, to loop unroll over blockSize2,
emh203 0:3d9c67d97d6f 306 * srcBLen should be greater than or equal to 4 */
emh203 0:3d9c67d97d6f 307 if(srcBLen >= 4u)
emh203 0:3d9c67d97d6f 308 {
emh203 0:3d9c67d97d6f 309 /* Loop unroll over blockSize2, by 4 */
emh203 0:3d9c67d97d6f 310 blkCnt = ((uint32_t) blockSize2 >> 2u);
emh203 0:3d9c67d97d6f 311
emh203 0:3d9c67d97d6f 312 while(blkCnt > 0u)
emh203 0:3d9c67d97d6f 313 {
emh203 0:3d9c67d97d6f 314 py = py - 1u;
emh203 0:3d9c67d97d6f 315
emh203 0:3d9c67d97d6f 316 /* Set all accumulators to zero */
emh203 0:3d9c67d97d6f 317 acc0 = 0;
emh203 0:3d9c67d97d6f 318 acc1 = 0;
emh203 0:3d9c67d97d6f 319 acc2 = 0;
emh203 0:3d9c67d97d6f 320 acc3 = 0;
emh203 0:3d9c67d97d6f 321
emh203 0:3d9c67d97d6f 322
emh203 0:3d9c67d97d6f 323 /* read x[0], x[1] samples */
emh203 0:3d9c67d97d6f 324 x0 = *__SIMD32(px);
emh203 0:3d9c67d97d6f 325 /* read x[1], x[2] samples */
emh203 0:3d9c67d97d6f 326 x1 = _SIMD32_OFFSET(px+1);
emh203 0:3d9c67d97d6f 327 px+= 2u;
emh203 0:3d9c67d97d6f 328
emh203 0:3d9c67d97d6f 329
emh203 0:3d9c67d97d6f 330 /* Apply loop unrolling and compute 4 MACs simultaneously. */
emh203 0:3d9c67d97d6f 331 k = srcBLen >> 2u;
emh203 0:3d9c67d97d6f 332
emh203 0:3d9c67d97d6f 333 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
emh203 0:3d9c67d97d6f 334 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
emh203 0:3d9c67d97d6f 335 do
emh203 0:3d9c67d97d6f 336 {
emh203 0:3d9c67d97d6f 337 /* Read the last two inputB samples using SIMD:
emh203 0:3d9c67d97d6f 338 * y[srcBLen - 1] and y[srcBLen - 2] */
emh203 0:3d9c67d97d6f 339 c0 = *__SIMD32(py)--;
emh203 0:3d9c67d97d6f 340
emh203 0:3d9c67d97d6f 341 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
emh203 0:3d9c67d97d6f 342 acc0 = __SMLADX(x0, c0, acc0);
emh203 0:3d9c67d97d6f 343
emh203 0:3d9c67d97d6f 344 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
emh203 0:3d9c67d97d6f 345 acc1 = __SMLADX(x1, c0, acc1);
emh203 0:3d9c67d97d6f 346
emh203 0:3d9c67d97d6f 347 /* Read x[2], x[3] */
emh203 0:3d9c67d97d6f 348 x2 = *__SIMD32(px);
emh203 0:3d9c67d97d6f 349
emh203 0:3d9c67d97d6f 350 /* Read x[3], x[4] */
emh203 0:3d9c67d97d6f 351 x3 = _SIMD32_OFFSET(px+1);
emh203 0:3d9c67d97d6f 352
emh203 0:3d9c67d97d6f 353 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
emh203 0:3d9c67d97d6f 354 acc2 = __SMLADX(x2, c0, acc2);
emh203 0:3d9c67d97d6f 355
emh203 0:3d9c67d97d6f 356 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
emh203 0:3d9c67d97d6f 357 acc3 = __SMLADX(x3, c0, acc3);
emh203 0:3d9c67d97d6f 358
emh203 0:3d9c67d97d6f 359 /* Read y[srcBLen - 3] and y[srcBLen - 4] */
emh203 0:3d9c67d97d6f 360 c0 = *__SIMD32(py)--;
emh203 0:3d9c67d97d6f 361
emh203 0:3d9c67d97d6f 362 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
emh203 0:3d9c67d97d6f 363 acc0 = __SMLADX(x2, c0, acc0);
emh203 0:3d9c67d97d6f 364
emh203 0:3d9c67d97d6f 365 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
emh203 0:3d9c67d97d6f 366 acc1 = __SMLADX(x3, c0, acc1);
emh203 0:3d9c67d97d6f 367
emh203 0:3d9c67d97d6f 368 /* Read x[4], x[5] */
emh203 0:3d9c67d97d6f 369 x0 = _SIMD32_OFFSET(px+2);
emh203 0:3d9c67d97d6f 370
emh203 0:3d9c67d97d6f 371 /* Read x[5], x[6] */
emh203 0:3d9c67d97d6f 372 x1 = _SIMD32_OFFSET(px+3);
emh203 0:3d9c67d97d6f 373 px += 4u;
emh203 0:3d9c67d97d6f 374
emh203 0:3d9c67d97d6f 375 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
emh203 0:3d9c67d97d6f 376 acc2 = __SMLADX(x0, c0, acc2);
emh203 0:3d9c67d97d6f 377
emh203 0:3d9c67d97d6f 378 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
emh203 0:3d9c67d97d6f 379 acc3 = __SMLADX(x1, c0, acc3);
emh203 0:3d9c67d97d6f 380
emh203 0:3d9c67d97d6f 381 } while(--k);
emh203 0:3d9c67d97d6f 382
emh203 0:3d9c67d97d6f 383 /* For the next MAC operations, SIMD is not used
emh203 0:3d9c67d97d6f 384 * So, the 16 bit pointer of inputB, py is updated */
emh203 0:3d9c67d97d6f 385
emh203 0:3d9c67d97d6f 386 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
emh203 0:3d9c67d97d6f 387 ** No loop unrolling is used. */
emh203 0:3d9c67d97d6f 388 k = srcBLen % 0x4u;
emh203 0:3d9c67d97d6f 389
emh203 0:3d9c67d97d6f 390 if(k == 1u)
emh203 0:3d9c67d97d6f 391 {
emh203 0:3d9c67d97d6f 392 /* Read y[srcBLen - 5] */
emh203 0:3d9c67d97d6f 393 c0 = *(py+1);
emh203 0:3d9c67d97d6f 394 #ifdef ARM_MATH_BIG_ENDIAN
emh203 0:3d9c67d97d6f 395
emh203 0:3d9c67d97d6f 396 c0 = c0 << 16u;
emh203 0:3d9c67d97d6f 397
emh203 0:3d9c67d97d6f 398 #else
emh203 0:3d9c67d97d6f 399
emh203 0:3d9c67d97d6f 400 c0 = c0 & 0x0000FFFF;
emh203 0:3d9c67d97d6f 401
emh203 0:3d9c67d97d6f 402 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
emh203 0:3d9c67d97d6f 403
emh203 0:3d9c67d97d6f 404 /* Read x[7] */
emh203 0:3d9c67d97d6f 405 x3 = *__SIMD32(px);
emh203 0:3d9c67d97d6f 406 px++;
emh203 0:3d9c67d97d6f 407
emh203 0:3d9c67d97d6f 408 /* Perform the multiply-accumulates */
emh203 0:3d9c67d97d6f 409 acc0 = __SMLAD(x0, c0, acc0);
emh203 0:3d9c67d97d6f 410 acc1 = __SMLAD(x1, c0, acc1);
emh203 0:3d9c67d97d6f 411 acc2 = __SMLADX(x1, c0, acc2);
emh203 0:3d9c67d97d6f 412 acc3 = __SMLADX(x3, c0, acc3);
emh203 0:3d9c67d97d6f 413 }
emh203 0:3d9c67d97d6f 414
emh203 0:3d9c67d97d6f 415 if(k == 2u)
emh203 0:3d9c67d97d6f 416 {
emh203 0:3d9c67d97d6f 417 /* Read y[srcBLen - 5], y[srcBLen - 6] */
emh203 0:3d9c67d97d6f 418 c0 = _SIMD32_OFFSET(py);
emh203 0:3d9c67d97d6f 419
emh203 0:3d9c67d97d6f 420 /* Read x[7], x[8] */
emh203 0:3d9c67d97d6f 421 x3 = *__SIMD32(px);
emh203 0:3d9c67d97d6f 422
emh203 0:3d9c67d97d6f 423 /* Read x[9] */
emh203 0:3d9c67d97d6f 424 x2 = _SIMD32_OFFSET(px+1);
emh203 0:3d9c67d97d6f 425 px += 2u;
emh203 0:3d9c67d97d6f 426
emh203 0:3d9c67d97d6f 427 /* Perform the multiply-accumulates */
emh203 0:3d9c67d97d6f 428 acc0 = __SMLADX(x0, c0, acc0);
emh203 0:3d9c67d97d6f 429 acc1 = __SMLADX(x1, c0, acc1);
emh203 0:3d9c67d97d6f 430 acc2 = __SMLADX(x3, c0, acc2);
emh203 0:3d9c67d97d6f 431 acc3 = __SMLADX(x2, c0, acc3);
emh203 0:3d9c67d97d6f 432 }
emh203 0:3d9c67d97d6f 433
emh203 0:3d9c67d97d6f 434 if(k == 3u)
emh203 0:3d9c67d97d6f 435 {
emh203 0:3d9c67d97d6f 436 /* Read y[srcBLen - 5], y[srcBLen - 6] */
emh203 0:3d9c67d97d6f 437 c0 = _SIMD32_OFFSET(py);
emh203 0:3d9c67d97d6f 438
emh203 0:3d9c67d97d6f 439 /* Read x[7], x[8] */
emh203 0:3d9c67d97d6f 440 x3 = *__SIMD32(px);
emh203 0:3d9c67d97d6f 441
emh203 0:3d9c67d97d6f 442 /* Read x[9] */
emh203 0:3d9c67d97d6f 443 x2 = _SIMD32_OFFSET(px+1);
emh203 0:3d9c67d97d6f 444
emh203 0:3d9c67d97d6f 445 /* Perform the multiply-accumulates */
emh203 0:3d9c67d97d6f 446 acc0 = __SMLADX(x0, c0, acc0);
emh203 0:3d9c67d97d6f 447 acc1 = __SMLADX(x1, c0, acc1);
emh203 0:3d9c67d97d6f 448 acc2 = __SMLADX(x3, c0, acc2);
emh203 0:3d9c67d97d6f 449 acc3 = __SMLADX(x2, c0, acc3);
emh203 0:3d9c67d97d6f 450
emh203 0:3d9c67d97d6f 451 c0 = *(py-1);
emh203 0:3d9c67d97d6f 452 #ifdef ARM_MATH_BIG_ENDIAN
emh203 0:3d9c67d97d6f 453
emh203 0:3d9c67d97d6f 454 c0 = c0 << 16u;
emh203 0:3d9c67d97d6f 455 #else
emh203 0:3d9c67d97d6f 456
emh203 0:3d9c67d97d6f 457 c0 = c0 & 0x0000FFFF;
emh203 0:3d9c67d97d6f 458 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
emh203 0:3d9c67d97d6f 459
emh203 0:3d9c67d97d6f 460 /* Read x[10] */
emh203 0:3d9c67d97d6f 461 x3 = _SIMD32_OFFSET(px+2);
emh203 0:3d9c67d97d6f 462 px += 3u;
emh203 0:3d9c67d97d6f 463
emh203 0:3d9c67d97d6f 464 /* Perform the multiply-accumulates */
emh203 0:3d9c67d97d6f 465 acc0 = __SMLADX(x1, c0, acc0);
emh203 0:3d9c67d97d6f 466 acc1 = __SMLAD(x2, c0, acc1);
emh203 0:3d9c67d97d6f 467 acc2 = __SMLADX(x2, c0, acc2);
emh203 0:3d9c67d97d6f 468 acc3 = __SMLADX(x3, c0, acc3);
emh203 0:3d9c67d97d6f 469 }
emh203 0:3d9c67d97d6f 470
emh203 0:3d9c67d97d6f 471 /* Store the results in the accumulators in the destination buffer. */
emh203 0:3d9c67d97d6f 472 #ifndef ARM_MATH_BIG_ENDIAN
emh203 0:3d9c67d97d6f 473
emh203 0:3d9c67d97d6f 474 *__SIMD32(pOut)++ = __PKHBT(acc0 >> 15, acc1 >> 15, 16);
emh203 0:3d9c67d97d6f 475 *__SIMD32(pOut)++ = __PKHBT(acc2 >> 15, acc3 >> 15, 16);
emh203 0:3d9c67d97d6f 476
emh203 0:3d9c67d97d6f 477 #else
emh203 0:3d9c67d97d6f 478
emh203 0:3d9c67d97d6f 479 *__SIMD32(pOut)++ = __PKHBT(acc1 >> 15, acc0 >> 15, 16);
emh203 0:3d9c67d97d6f 480 *__SIMD32(pOut)++ = __PKHBT(acc3 >> 15, acc2 >> 15, 16);
emh203 0:3d9c67d97d6f 481
emh203 0:3d9c67d97d6f 482 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
emh203 0:3d9c67d97d6f 483
emh203 0:3d9c67d97d6f 484 /* Increment the pointer pIn1 index, count by 4 */
emh203 0:3d9c67d97d6f 485 count += 4u;
emh203 0:3d9c67d97d6f 486
emh203 0:3d9c67d97d6f 487 /* Update the inputA and inputB pointers for next MAC calculation */
emh203 0:3d9c67d97d6f 488 px = pIn1 + count;
emh203 0:3d9c67d97d6f 489 py = pSrc2;
emh203 0:3d9c67d97d6f 490
emh203 0:3d9c67d97d6f 491 /* Decrement the loop counter */
emh203 0:3d9c67d97d6f 492 blkCnt--;
emh203 0:3d9c67d97d6f 493 }
emh203 0:3d9c67d97d6f 494
emh203 0:3d9c67d97d6f 495 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
emh203 0:3d9c67d97d6f 496 ** No loop unrolling is used. */
emh203 0:3d9c67d97d6f 497 blkCnt = (uint32_t) blockSize2 % 0x4u;
emh203 0:3d9c67d97d6f 498
emh203 0:3d9c67d97d6f 499 while(blkCnt > 0u)
emh203 0:3d9c67d97d6f 500 {
emh203 0:3d9c67d97d6f 501 /* Accumulator is made zero for every iteration */
emh203 0:3d9c67d97d6f 502 sum = 0;
emh203 0:3d9c67d97d6f 503
emh203 0:3d9c67d97d6f 504 /* Apply loop unrolling and compute 4 MACs simultaneously. */
emh203 0:3d9c67d97d6f 505 k = srcBLen >> 2u;
emh203 0:3d9c67d97d6f 506
emh203 0:3d9c67d97d6f 507 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
emh203 0:3d9c67d97d6f 508 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
emh203 0:3d9c67d97d6f 509 while(k > 0u)
emh203 0:3d9c67d97d6f 510 {
emh203 0:3d9c67d97d6f 511 /* Perform the multiply-accumulates */
emh203 0:3d9c67d97d6f 512 sum += ((q31_t) * px++ * *py--);
emh203 0:3d9c67d97d6f 513 sum += ((q31_t) * px++ * *py--);
emh203 0:3d9c67d97d6f 514 sum += ((q31_t) * px++ * *py--);
emh203 0:3d9c67d97d6f 515 sum += ((q31_t) * px++ * *py--);
emh203 0:3d9c67d97d6f 516
emh203 0:3d9c67d97d6f 517 /* Decrement the loop counter */
emh203 0:3d9c67d97d6f 518 k--;
emh203 0:3d9c67d97d6f 519 }
emh203 0:3d9c67d97d6f 520
emh203 0:3d9c67d97d6f 521 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
emh203 0:3d9c67d97d6f 522 ** No loop unrolling is used. */
emh203 0:3d9c67d97d6f 523 k = srcBLen % 0x4u;
emh203 0:3d9c67d97d6f 524
emh203 0:3d9c67d97d6f 525 while(k > 0u)
emh203 0:3d9c67d97d6f 526 {
emh203 0:3d9c67d97d6f 527 /* Perform the multiply-accumulates */
emh203 0:3d9c67d97d6f 528 sum += ((q31_t) * px++ * *py--);
emh203 0:3d9c67d97d6f 529
emh203 0:3d9c67d97d6f 530 /* Decrement the loop counter */
emh203 0:3d9c67d97d6f 531 k--;
emh203 0:3d9c67d97d6f 532 }
emh203 0:3d9c67d97d6f 533
emh203 0:3d9c67d97d6f 534 /* Store the result in the accumulator in the destination buffer. */
emh203 0:3d9c67d97d6f 535 *pOut++ = (q15_t) (sum >> 15);
emh203 0:3d9c67d97d6f 536
emh203 0:3d9c67d97d6f 537 /* Increment the pointer pIn1 index, count by 1 */
emh203 0:3d9c67d97d6f 538 count++;
emh203 0:3d9c67d97d6f 539
emh203 0:3d9c67d97d6f 540 /* Update the inputA and inputB pointers for next MAC calculation */
emh203 0:3d9c67d97d6f 541 px = pIn1 + count;
emh203 0:3d9c67d97d6f 542 py = pSrc2;
emh203 0:3d9c67d97d6f 543
emh203 0:3d9c67d97d6f 544 /* Decrement the loop counter */
emh203 0:3d9c67d97d6f 545 blkCnt--;
emh203 0:3d9c67d97d6f 546 }
emh203 0:3d9c67d97d6f 547 }
emh203 0:3d9c67d97d6f 548 else
emh203 0:3d9c67d97d6f 549 {
emh203 0:3d9c67d97d6f 550 /* If the srcBLen is not a multiple of 4,
emh203 0:3d9c67d97d6f 551 * the blockSize2 loop cannot be unrolled by 4 */
emh203 0:3d9c67d97d6f 552 blkCnt = (uint32_t) blockSize2;
emh203 0:3d9c67d97d6f 553
emh203 0:3d9c67d97d6f 554 while(blkCnt > 0u)
emh203 0:3d9c67d97d6f 555 {
emh203 0:3d9c67d97d6f 556 /* Accumulator is made zero for every iteration */
emh203 0:3d9c67d97d6f 557 sum = 0;
emh203 0:3d9c67d97d6f 558
emh203 0:3d9c67d97d6f 559 /* srcBLen number of MACS should be performed */
emh203 0:3d9c67d97d6f 560 k = srcBLen;
emh203 0:3d9c67d97d6f 561
emh203 0:3d9c67d97d6f 562 while(k > 0u)
emh203 0:3d9c67d97d6f 563 {
emh203 0:3d9c67d97d6f 564 /* Perform the multiply-accumulate */
emh203 0:3d9c67d97d6f 565 sum += ((q31_t) * px++ * *py--);
emh203 0:3d9c67d97d6f 566
emh203 0:3d9c67d97d6f 567 /* Decrement the loop counter */
emh203 0:3d9c67d97d6f 568 k--;
emh203 0:3d9c67d97d6f 569 }
emh203 0:3d9c67d97d6f 570
emh203 0:3d9c67d97d6f 571 /* Store the result in the accumulator in the destination buffer. */
emh203 0:3d9c67d97d6f 572 *pOut++ = (q15_t) (sum >> 15);
emh203 0:3d9c67d97d6f 573
emh203 0:3d9c67d97d6f 574 /* Increment the MAC count */
emh203 0:3d9c67d97d6f 575 count++;
emh203 0:3d9c67d97d6f 576
emh203 0:3d9c67d97d6f 577 /* Update the inputA and inputB pointers for next MAC calculation */
emh203 0:3d9c67d97d6f 578 px = pIn1 + count;
emh203 0:3d9c67d97d6f 579 py = pSrc2;
emh203 0:3d9c67d97d6f 580
emh203 0:3d9c67d97d6f 581 /* Decrement the loop counter */
emh203 0:3d9c67d97d6f 582 blkCnt--;
emh203 0:3d9c67d97d6f 583 }
emh203 0:3d9c67d97d6f 584 }
emh203 0:3d9c67d97d6f 585
emh203 0:3d9c67d97d6f 586
emh203 0:3d9c67d97d6f 587 /* --------------------------
emh203 0:3d9c67d97d6f 588 * Initializations of stage3
emh203 0:3d9c67d97d6f 589 * -------------------------*/
emh203 0:3d9c67d97d6f 590
emh203 0:3d9c67d97d6f 591 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
emh203 0:3d9c67d97d6f 592 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
emh203 0:3d9c67d97d6f 593 * ....
emh203 0:3d9c67d97d6f 594 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
emh203 0:3d9c67d97d6f 595 * sum += x[srcALen-1] * y[srcBLen-1]
emh203 0:3d9c67d97d6f 596 */
emh203 0:3d9c67d97d6f 597
emh203 0:3d9c67d97d6f 598 /* In this stage the MAC operations are decreased by 1 for every iteration.
emh203 0:3d9c67d97d6f 599 The count variable holds the number of MAC operations performed */
emh203 0:3d9c67d97d6f 600 count = srcBLen - 1u;
emh203 0:3d9c67d97d6f 601
emh203 0:3d9c67d97d6f 602 /* Working pointer of inputA */
emh203 0:3d9c67d97d6f 603 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
emh203 0:3d9c67d97d6f 604 px = pSrc1;
emh203 0:3d9c67d97d6f 605
emh203 0:3d9c67d97d6f 606 /* Working pointer of inputB */
emh203 0:3d9c67d97d6f 607 pSrc2 = pIn2 + (srcBLen - 1u);
emh203 0:3d9c67d97d6f 608 pIn2 = pSrc2 - 1u;
emh203 0:3d9c67d97d6f 609 py = pIn2;
emh203 0:3d9c67d97d6f 610
emh203 0:3d9c67d97d6f 611 /* -------------------
emh203 0:3d9c67d97d6f 612 * Stage3 process
emh203 0:3d9c67d97d6f 613 * ------------------*/
emh203 0:3d9c67d97d6f 614
emh203 0:3d9c67d97d6f 615 /* For loop unrolling by 4, this stage is divided into two. */
emh203 0:3d9c67d97d6f 616 /* First part of this stage computes the MAC operations greater than 4 */
emh203 0:3d9c67d97d6f 617 /* Second part of this stage computes the MAC operations less than or equal to 4 */
emh203 0:3d9c67d97d6f 618
emh203 0:3d9c67d97d6f 619 /* The first part of the stage starts here */
emh203 0:3d9c67d97d6f 620 j = count >> 2u;
emh203 0:3d9c67d97d6f 621
emh203 0:3d9c67d97d6f 622 while((j > 0u) && (blockSize3 > 0))
emh203 0:3d9c67d97d6f 623 {
emh203 0:3d9c67d97d6f 624 /* Accumulator is made zero for every iteration */
emh203 0:3d9c67d97d6f 625 sum = 0;
emh203 0:3d9c67d97d6f 626
emh203 0:3d9c67d97d6f 627 /* Apply loop unrolling and compute 4 MACs simultaneously. */
emh203 0:3d9c67d97d6f 628 k = count >> 2u;
emh203 0:3d9c67d97d6f 629
emh203 0:3d9c67d97d6f 630 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
emh203 0:3d9c67d97d6f 631 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
emh203 0:3d9c67d97d6f 632 while(k > 0u)
emh203 0:3d9c67d97d6f 633 {
emh203 0:3d9c67d97d6f 634 /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
emh203 0:3d9c67d97d6f 635 * with y[srcBLen - 1], y[srcBLen - 2] respectively */
emh203 0:3d9c67d97d6f 636 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
emh203 0:3d9c67d97d6f 637 /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
emh203 0:3d9c67d97d6f 638 * with y[srcBLen - 3], y[srcBLen - 4] respectively */
emh203 0:3d9c67d97d6f 639 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
emh203 0:3d9c67d97d6f 640
emh203 0:3d9c67d97d6f 641 /* Decrement the loop counter */
emh203 0:3d9c67d97d6f 642 k--;
emh203 0:3d9c67d97d6f 643 }
emh203 0:3d9c67d97d6f 644
emh203 0:3d9c67d97d6f 645 /* For the next MAC operations, the pointer py is used without SIMD
emh203 0:3d9c67d97d6f 646 * So, py is incremented by 1 */
emh203 0:3d9c67d97d6f 647 py = py + 1u;
emh203 0:3d9c67d97d6f 648
emh203 0:3d9c67d97d6f 649 /* If the count is not a multiple of 4, compute any remaining MACs here.
emh203 0:3d9c67d97d6f 650 ** No loop unrolling is used. */
emh203 0:3d9c67d97d6f 651 k = count % 0x4u;
emh203 0:3d9c67d97d6f 652
emh203 0:3d9c67d97d6f 653 while(k > 0u)
emh203 0:3d9c67d97d6f 654 {
emh203 0:3d9c67d97d6f 655 /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
emh203 0:3d9c67d97d6f 656 sum = __SMLAD(*px++, *py--, sum);
emh203 0:3d9c67d97d6f 657
emh203 0:3d9c67d97d6f 658 /* Decrement the loop counter */
emh203 0:3d9c67d97d6f 659 k--;
emh203 0:3d9c67d97d6f 660 }
emh203 0:3d9c67d97d6f 661
emh203 0:3d9c67d97d6f 662 /* Store the result in the accumulator in the destination buffer. */
emh203 0:3d9c67d97d6f 663 *pOut++ = (q15_t) (sum >> 15);
emh203 0:3d9c67d97d6f 664
emh203 0:3d9c67d97d6f 665 /* Update the inputA and inputB pointers for next MAC calculation */
emh203 0:3d9c67d97d6f 666 px = ++pSrc1;
emh203 0:3d9c67d97d6f 667 py = pIn2;
emh203 0:3d9c67d97d6f 668
emh203 0:3d9c67d97d6f 669 /* Decrement the MAC count */
emh203 0:3d9c67d97d6f 670 count--;
emh203 0:3d9c67d97d6f 671
emh203 0:3d9c67d97d6f 672 /* Decrement the loop counter */
emh203 0:3d9c67d97d6f 673 blockSize3--;
emh203 0:3d9c67d97d6f 674
emh203 0:3d9c67d97d6f 675 j--;
emh203 0:3d9c67d97d6f 676 }
emh203 0:3d9c67d97d6f 677
emh203 0:3d9c67d97d6f 678 /* The second part of the stage starts here */
emh203 0:3d9c67d97d6f 679 /* SIMD is not used for the next MAC operations,
emh203 0:3d9c67d97d6f 680 * so pointer py is updated to read only one sample at a time */
emh203 0:3d9c67d97d6f 681 py = py + 1u;
emh203 0:3d9c67d97d6f 682
emh203 0:3d9c67d97d6f 683 while(blockSize3 > 0)
emh203 0:3d9c67d97d6f 684 {
emh203 0:3d9c67d97d6f 685 /* Accumulator is made zero for every iteration */
emh203 0:3d9c67d97d6f 686 sum = 0;
emh203 0:3d9c67d97d6f 687
emh203 0:3d9c67d97d6f 688 /* No loop unrolling is used here: the remaining count MACs are computed one at a time. */
emh203 0:3d9c67d97d6f 689 k = count;
emh203 0:3d9c67d97d6f 690
emh203 0:3d9c67d97d6f 691 while(k > 0u)
emh203 0:3d9c67d97d6f 692 {
emh203 0:3d9c67d97d6f 693 /* Perform the multiply-accumulates */
emh203 0:3d9c67d97d6f 694 /* sum += x[srcALen-1] * y[srcBLen-1] */
emh203 0:3d9c67d97d6f 695 sum = __SMLAD(*px++, *py--, sum);
emh203 0:3d9c67d97d6f 696
emh203 0:3d9c67d97d6f 697 /* Decrement the loop counter */
emh203 0:3d9c67d97d6f 698 k--;
emh203 0:3d9c67d97d6f 699 }
emh203 0:3d9c67d97d6f 700
emh203 0:3d9c67d97d6f 701 /* Store the result in the accumulator in the destination buffer. */
emh203 0:3d9c67d97d6f 702 *pOut++ = (q15_t) (sum >> 15);
emh203 0:3d9c67d97d6f 703
emh203 0:3d9c67d97d6f 704 /* Update the inputA and inputB pointers for next MAC calculation */
emh203 0:3d9c67d97d6f 705 px = ++pSrc1;
emh203 0:3d9c67d97d6f 706 py = pSrc2;
emh203 0:3d9c67d97d6f 707
emh203 0:3d9c67d97d6f 708 /* Decrement the MAC count */
emh203 0:3d9c67d97d6f 709 count--;
emh203 0:3d9c67d97d6f 710
emh203 0:3d9c67d97d6f 711 /* Decrement the loop counter */
emh203 0:3d9c67d97d6f 712 blockSize3--;
emh203 0:3d9c67d97d6f 713 }
emh203 0:3d9c67d97d6f 714
emh203 0:3d9c67d97d6f 715 /* set status as ARM_MATH_SUCCESS */
emh203 0:3d9c67d97d6f 716 status = ARM_MATH_SUCCESS;
emh203 0:3d9c67d97d6f 717 }
emh203 0:3d9c67d97d6f 718
emh203 0:3d9c67d97d6f 719 /* Return to application */
emh203 0:3d9c67d97d6f 720 return (status);
emh203 0:3d9c67d97d6f 721
emh203 0:3d9c67d97d6f 722 #else
emh203 0:3d9c67d97d6f 723
emh203 0:3d9c67d97d6f 724 q15_t *pIn1; /* inputA pointer */
emh203 0:3d9c67d97d6f 725 q15_t *pIn2; /* inputB pointer */
emh203 0:3d9c67d97d6f 726 q15_t *pOut = pDst; /* output pointer */
emh203 0:3d9c67d97d6f 727 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
emh203 0:3d9c67d97d6f 728 q15_t *px; /* Intermediate inputA pointer */
emh203 0:3d9c67d97d6f 729 q15_t *py; /* Intermediate inputB pointer */
emh203 0:3d9c67d97d6f 730 q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
emh203 0:3d9c67d97d6f 731 q31_t x0, x1, x2, x3, c0;
emh203 0:3d9c67d97d6f 732 uint32_t j, k, count, check, blkCnt;
emh203 0:3d9c67d97d6f 733 int32_t blockSize1, blockSize2, blockSize3; /* loop counters */
emh203 0:3d9c67d97d6f 734 arm_status status; /* status of Partial convolution */
emh203 0:3d9c67d97d6f 735 q15_t a, b;
emh203 0:3d9c67d97d6f 736
emh203 0:3d9c67d97d6f 737 /* Check for range of output samples to be calculated */
emh203 0:3d9c67d97d6f 738 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
emh203 0:3d9c67d97d6f 739 {
emh203 0:3d9c67d97d6f 740 /* Set status as ARM_MATH_ARGUMENT_ERROR */
emh203 0:3d9c67d97d6f 741 status = ARM_MATH_ARGUMENT_ERROR;
emh203 0:3d9c67d97d6f 742 }
emh203 0:3d9c67d97d6f 743 else
emh203 0:3d9c67d97d6f 744 {
emh203 0:3d9c67d97d6f 745
emh203 0:3d9c67d97d6f 746 /* The algorithm implementation is based on the lengths of the inputs. */
emh203 0:3d9c67d97d6f 747 /* srcB is always made to slide across srcA. */
emh203 0:3d9c67d97d6f 748 /* So srcBLen is always considered as shorter or equal to srcALen */
emh203 0:3d9c67d97d6f 749 if(srcALen >=srcBLen)
emh203 0:3d9c67d97d6f 750 {
emh203 0:3d9c67d97d6f 751 /* Initialization of inputA pointer */
emh203 0:3d9c67d97d6f 752 pIn1 = pSrcA;
emh203 0:3d9c67d97d6f 753
emh203 0:3d9c67d97d6f 754 /* Initialization of inputB pointer */
emh203 0:3d9c67d97d6f 755 pIn2 = pSrcB;
emh203 0:3d9c67d97d6f 756 }
emh203 0:3d9c67d97d6f 757 else
emh203 0:3d9c67d97d6f 758 {
emh203 0:3d9c67d97d6f 759 /* Initialization of inputA pointer */
emh203 0:3d9c67d97d6f 760 pIn1 = pSrcB;
emh203 0:3d9c67d97d6f 761
emh203 0:3d9c67d97d6f 762 /* Initialization of inputB pointer */
emh203 0:3d9c67d97d6f 763 pIn2 = pSrcA;
emh203 0:3d9c67d97d6f 764
emh203 0:3d9c67d97d6f 765 /* srcBLen is always considered as shorter or equal to srcALen */
emh203 0:3d9c67d97d6f 766 j = srcBLen;
emh203 0:3d9c67d97d6f 767 srcBLen = srcALen;
emh203 0:3d9c67d97d6f 768 srcALen = j;
emh203 0:3d9c67d97d6f 769 }
emh203 0:3d9c67d97d6f 770
emh203 0:3d9c67d97d6f 771 /* Conditions to check which loopCounter holds
emh203 0:3d9c67d97d6f 772 * the first and last indices of the output samples to be calculated. */
emh203 0:3d9c67d97d6f 773 check = firstIndex + numPoints;
emh203 0:3d9c67d97d6f 774 blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
emh203 0:3d9c67d97d6f 775 blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
emh203 0:3d9c67d97d6f 776 blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
emh203 0:3d9c67d97d6f 777 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
emh203 0:3d9c67d97d6f 778 (int32_t) numPoints) : 0;
emh203 0:3d9c67d97d6f 779 blockSize2 = ((int32_t) check - blockSize3) -
emh203 0:3d9c67d97d6f 780 (blockSize1 + (int32_t) firstIndex);
emh203 0:3d9c67d97d6f 781 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
emh203 0:3d9c67d97d6f 782
emh203 0:3d9c67d97d6f 783 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
emh203 0:3d9c67d97d6f 784 /* The function is internally
emh203 0:3d9c67d97d6f 785 * divided into three stages according to the number of multiplications that has to be
emh203 0:3d9c67d97d6f 786 * taken place between inputA samples and inputB samples. In the first stage of the
emh203 0:3d9c67d97d6f 787 * algorithm, the multiplications increase by one for every iteration.
emh203 0:3d9c67d97d6f 788 * In the second stage of the algorithm, srcBLen number of multiplications are done.
emh203 0:3d9c67d97d6f 789 * In the third stage of the algorithm, the multiplications decrease by one
emh203 0:3d9c67d97d6f 790 * for every iteration. */
emh203 0:3d9c67d97d6f 791
emh203 0:3d9c67d97d6f 792 /* Set the output pointer to point to the firstIndex
emh203 0:3d9c67d97d6f 793 * of the output sample to be calculated. */
emh203 0:3d9c67d97d6f 794 pOut = pDst + firstIndex;
emh203 0:3d9c67d97d6f 795
emh203 0:3d9c67d97d6f 796 /* --------------------------
emh203 0:3d9c67d97d6f 797 * Initializations of stage1
emh203 0:3d9c67d97d6f 798 * -------------------------*/
emh203 0:3d9c67d97d6f 799
emh203 0:3d9c67d97d6f 800 /* sum = x[0] * y[0]
emh203 0:3d9c67d97d6f 801 * sum = x[0] * y[1] + x[1] * y[0]
emh203 0:3d9c67d97d6f 802 * ....
emh203 0:3d9c67d97d6f 803 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
emh203 0:3d9c67d97d6f 804 */
emh203 0:3d9c67d97d6f 805
emh203 0:3d9c67d97d6f 806 /* In this stage the MAC operations are increased by 1 for every iteration.
emh203 0:3d9c67d97d6f 807 The count variable holds the number of MAC operations performed.
emh203 0:3d9c67d97d6f 808 Since the partial convolution starts from firstIndex
emh203 0:3d9c67d97d6f 809 Number of Macs to be performed is firstIndex + 1 */
emh203 0:3d9c67d97d6f 810 count = 1u + firstIndex;
emh203 0:3d9c67d97d6f 811
emh203 0:3d9c67d97d6f 812 /* Working pointer of inputA */
emh203 0:3d9c67d97d6f 813 px = pIn1;
emh203 0:3d9c67d97d6f 814
emh203 0:3d9c67d97d6f 815 /* Working pointer of inputB */
emh203 0:3d9c67d97d6f 816 pSrc2 = pIn2 + firstIndex;
emh203 0:3d9c67d97d6f 817 py = pSrc2;
emh203 0:3d9c67d97d6f 818
emh203 0:3d9c67d97d6f 819 /* ------------------------
emh203 0:3d9c67d97d6f 820 * Stage1 process
emh203 0:3d9c67d97d6f 821 * ----------------------*/
emh203 0:3d9c67d97d6f 822
emh203 0:3d9c67d97d6f 823 /* For loop unrolling by 4, this stage is divided into two. */
emh203 0:3d9c67d97d6f 824 /* First part of this stage computes the MAC operations less than 4 */
emh203 0:3d9c67d97d6f 825 /* Second part of this stage computes the MAC operations greater than or equal to 4 */
emh203 0:3d9c67d97d6f 826
emh203 0:3d9c67d97d6f 827 /* The first part of the stage starts here */
emh203 0:3d9c67d97d6f 828 while((count < 4u) && (blockSize1 > 0))
emh203 0:3d9c67d97d6f 829 {
emh203 0:3d9c67d97d6f 830 /* Accumulator is made zero for every iteration */
emh203 0:3d9c67d97d6f 831 sum = 0;
emh203 0:3d9c67d97d6f 832
emh203 0:3d9c67d97d6f 833 /* Loop over number of MAC operations between
emh203 0:3d9c67d97d6f 834 * inputA samples and inputB samples */
emh203 0:3d9c67d97d6f 835 k = count;
emh203 0:3d9c67d97d6f 836
emh203 0:3d9c67d97d6f 837 while(k > 0u)
emh203 0:3d9c67d97d6f 838 {
emh203 0:3d9c67d97d6f 839 /* Perform the multiply-accumulates */
emh203 0:3d9c67d97d6f 840 sum += ((q31_t) * px++ * *py--);
emh203 0:3d9c67d97d6f 841
emh203 0:3d9c67d97d6f 842 /* Decrement the loop counter */
emh203 0:3d9c67d97d6f 843 k--;
emh203 0:3d9c67d97d6f 844 }
emh203 0:3d9c67d97d6f 845
emh203 0:3d9c67d97d6f 846 /* Store the result in the accumulator in the destination buffer. */
emh203 0:3d9c67d97d6f 847 *pOut++ = (q15_t) (sum >> 15);
emh203 0:3d9c67d97d6f 848
emh203 0:3d9c67d97d6f 849 /* Update the inputA and inputB pointers for next MAC calculation */
emh203 0:3d9c67d97d6f 850 py = ++pSrc2;
emh203 0:3d9c67d97d6f 851 px = pIn1;
emh203 0:3d9c67d97d6f 852
emh203 0:3d9c67d97d6f 853 /* Increment the MAC count */
emh203 0:3d9c67d97d6f 854 count++;
emh203 0:3d9c67d97d6f 855
emh203 0:3d9c67d97d6f 856 /* Decrement the loop counter */
emh203 0:3d9c67d97d6f 857 blockSize1--;
emh203 0:3d9c67d97d6f 858 }
emh203 0:3d9c67d97d6f 859
emh203 0:3d9c67d97d6f 860 /* The second part of the stage starts here */
emh203 0:3d9c67d97d6f 861 /* The internal loop, over count, is unrolled by 4 */
emh203 0:3d9c67d97d6f 862 /* To read the last two inputB samples using SIMD:
emh203 0:3d9c67d97d6f 863 * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
emh203 0:3d9c67d97d6f 864 py = py - 1;
emh203 0:3d9c67d97d6f 865
emh203 0:3d9c67d97d6f 866 while(blockSize1 > 0)
emh203 0:3d9c67d97d6f 867 {
emh203 0:3d9c67d97d6f 868 /* Accumulator is made zero for every iteration */
emh203 0:3d9c67d97d6f 869 sum = 0;
emh203 0:3d9c67d97d6f 870
emh203 0:3d9c67d97d6f 871 /* Apply loop unrolling and compute 4 MACs simultaneously. */
emh203 0:3d9c67d97d6f 872 k = count >> 2u;
emh203 0:3d9c67d97d6f 873
emh203 0:3d9c67d97d6f 874 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
emh203 0:3d9c67d97d6f 875 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
emh203 0:3d9c67d97d6f 876 py++;
emh203 0:3d9c67d97d6f 877
emh203 0:3d9c67d97d6f 878 while(k > 0u)
emh203 0:3d9c67d97d6f 879 {
emh203 0:3d9c67d97d6f 880 /* Perform the multiply-accumulates */
emh203 0:3d9c67d97d6f 881 sum += ((q31_t) * px++ * *py--);
emh203 0:3d9c67d97d6f 882 sum += ((q31_t) * px++ * *py--);
emh203 0:3d9c67d97d6f 883 sum += ((q31_t) * px++ * *py--);
emh203 0:3d9c67d97d6f 884 sum += ((q31_t) * px++ * *py--);
emh203 0:3d9c67d97d6f 885
emh203 0:3d9c67d97d6f 886 /* Decrement the loop counter */
emh203 0:3d9c67d97d6f 887 k--;
emh203 0:3d9c67d97d6f 888 }
emh203 0:3d9c67d97d6f 889
emh203 0:3d9c67d97d6f 890 /* If the count is not a multiple of 4, compute any remaining MACs here.
emh203 0:3d9c67d97d6f 891 ** No loop unrolling is used. */
emh203 0:3d9c67d97d6f 892 k = count % 0x4u;
emh203 0:3d9c67d97d6f 893
emh203 0:3d9c67d97d6f 894 while(k > 0u)
emh203 0:3d9c67d97d6f 895 {
emh203 0:3d9c67d97d6f 896 /* Perform the multiply-accumulates */
emh203 0:3d9c67d97d6f 897 sum += ((q31_t) * px++ * *py--);
emh203 0:3d9c67d97d6f 898
emh203 0:3d9c67d97d6f 899 /* Decrement the loop counter */
emh203 0:3d9c67d97d6f 900 k--;
emh203 0:3d9c67d97d6f 901 }
emh203 0:3d9c67d97d6f 902
emh203 0:3d9c67d97d6f 903 /* Store the result in the accumulator in the destination buffer. */
emh203 0:3d9c67d97d6f 904 *pOut++ = (q15_t) (sum >> 15);
emh203 0:3d9c67d97d6f 905
emh203 0:3d9c67d97d6f 906 /* Update the inputA and inputB pointers for next MAC calculation */
emh203 0:3d9c67d97d6f 907 py = ++pSrc2 - 1u;
emh203 0:3d9c67d97d6f 908 px = pIn1;
emh203 0:3d9c67d97d6f 909
emh203 0:3d9c67d97d6f 910 /* Increment the MAC count */
emh203 0:3d9c67d97d6f 911 count++;
emh203 0:3d9c67d97d6f 912
emh203 0:3d9c67d97d6f 913 /* Decrement the loop counter */
emh203 0:3d9c67d97d6f 914 blockSize1--;
emh203 0:3d9c67d97d6f 915 }
emh203 0:3d9c67d97d6f 916
emh203 0:3d9c67d97d6f 917 /* --------------------------
emh203 0:3d9c67d97d6f 918 * Initializations of stage2
emh203 0:3d9c67d97d6f 919 * ------------------------*/
emh203 0:3d9c67d97d6f 920
emh203 0:3d9c67d97d6f 921 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
emh203 0:3d9c67d97d6f 922 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
emh203 0:3d9c67d97d6f 923 * ....
emh203 0:3d9c67d97d6f 924 * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
emh203 0:3d9c67d97d6f 925 */
emh203 0:3d9c67d97d6f 926
emh203 0:3d9c67d97d6f 927 /* Working pointer of inputA */
emh203 0:3d9c67d97d6f 928 if((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
emh203 0:3d9c67d97d6f 929 {
emh203 0:3d9c67d97d6f 930 px = pIn1 + firstIndex - srcBLen + 1;
emh203 0:3d9c67d97d6f 931 }
emh203 0:3d9c67d97d6f 932 else
emh203 0:3d9c67d97d6f 933 {
emh203 0:3d9c67d97d6f 934 px = pIn1;
emh203 0:3d9c67d97d6f 935 }
emh203 0:3d9c67d97d6f 936
emh203 0:3d9c67d97d6f 937 /* Working pointer of inputB */
emh203 0:3d9c67d97d6f 938 pSrc2 = pIn2 + (srcBLen - 1u);
emh203 0:3d9c67d97d6f 939 py = pSrc2;
emh203 0:3d9c67d97d6f 940
emh203 0:3d9c67d97d6f 941 /* count is the index by which the pointer pIn1 to be incremented */
emh203 0:3d9c67d97d6f 942 count = 0u;
emh203 0:3d9c67d97d6f 943
emh203 0:3d9c67d97d6f 944
emh203 0:3d9c67d97d6f 945 /* --------------------
emh203 0:3d9c67d97d6f 946 * Stage2 process
emh203 0:3d9c67d97d6f 947 * -------------------*/
emh203 0:3d9c67d97d6f 948
emh203 0:3d9c67d97d6f 949 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
emh203 0:3d9c67d97d6f 950 * So, to loop unroll over blockSize2,
emh203 0:3d9c67d97d6f 951 * srcBLen should be greater than or equal to 4 */
emh203 0:3d9c67d97d6f 952 if(srcBLen >= 4u)
emh203 0:3d9c67d97d6f 953 {
emh203 0:3d9c67d97d6f 954 /* Loop unroll over blockSize2, by 4 */
emh203 0:3d9c67d97d6f 955 blkCnt = ((uint32_t) blockSize2 >> 2u);
emh203 0:3d9c67d97d6f 956
emh203 0:3d9c67d97d6f 957 while(blkCnt > 0u)
emh203 0:3d9c67d97d6f 958 {
emh203 0:3d9c67d97d6f 959 py = py - 1u;
emh203 0:3d9c67d97d6f 960
emh203 0:3d9c67d97d6f 961 /* Set all accumulators to zero */
emh203 0:3d9c67d97d6f 962 acc0 = 0;
emh203 0:3d9c67d97d6f 963 acc1 = 0;
emh203 0:3d9c67d97d6f 964 acc2 = 0;
emh203 0:3d9c67d97d6f 965 acc3 = 0;
emh203 0:3d9c67d97d6f 966
emh203 0:3d9c67d97d6f 967 /* read x[0], x[1] samples */
emh203 0:3d9c67d97d6f 968 a = *px++;
emh203 0:3d9c67d97d6f 969 b = *px++;
emh203 0:3d9c67d97d6f 970
emh203 0:3d9c67d97d6f 971 #ifndef ARM_MATH_BIG_ENDIAN
emh203 0:3d9c67d97d6f 972
emh203 0:3d9c67d97d6f 973 x0 = __PKHBT(a, b, 16);
emh203 0:3d9c67d97d6f 974 a = *px;
emh203 0:3d9c67d97d6f 975 x1 = __PKHBT(b, a, 16);
emh203 0:3d9c67d97d6f 976
emh203 0:3d9c67d97d6f 977 #else
emh203 0:3d9c67d97d6f 978
emh203 0:3d9c67d97d6f 979 x0 = __PKHBT(b, a, 16);
emh203 0:3d9c67d97d6f 980 a = *px;
emh203 0:3d9c67d97d6f 981 x1 = __PKHBT(a, b, 16);
emh203 0:3d9c67d97d6f 982
emh203 0:3d9c67d97d6f 983 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
emh203 0:3d9c67d97d6f 984
emh203 0:3d9c67d97d6f 985 /* Apply loop unrolling and compute 4 MACs simultaneously. */
emh203 0:3d9c67d97d6f 986 k = srcBLen >> 2u;
emh203 0:3d9c67d97d6f 987
emh203 0:3d9c67d97d6f 988 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
emh203 0:3d9c67d97d6f 989 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
emh203 0:3d9c67d97d6f 990 do
emh203 0:3d9c67d97d6f 991 {
emh203 0:3d9c67d97d6f 992 /* Read the last two inputB samples using SIMD:
emh203 0:3d9c67d97d6f 993 * y[srcBLen - 1] and y[srcBLen - 2] */
emh203 0:3d9c67d97d6f 994 a = *py;
emh203 0:3d9c67d97d6f 995 b = *(py+1);
emh203 0:3d9c67d97d6f 996 py -= 2;
emh203 0:3d9c67d97d6f 997
emh203 0:3d9c67d97d6f 998 #ifndef ARM_MATH_BIG_ENDIAN
emh203 0:3d9c67d97d6f 999
emh203 0:3d9c67d97d6f 1000 c0 = __PKHBT(a, b, 16);
emh203 0:3d9c67d97d6f 1001
emh203 0:3d9c67d97d6f 1002 #else
emh203 0:3d9c67d97d6f 1003
emh203 0:3d9c67d97d6f 1004 c0 = __PKHBT(b, a, 16);;
emh203 0:3d9c67d97d6f 1005
emh203 0:3d9c67d97d6f 1006 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
emh203 0:3d9c67d97d6f 1007
emh203 0:3d9c67d97d6f 1008 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
emh203 0:3d9c67d97d6f 1009 acc0 = __SMLADX(x0, c0, acc0);
emh203 0:3d9c67d97d6f 1010
emh203 0:3d9c67d97d6f 1011 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
emh203 0:3d9c67d97d6f 1012 acc1 = __SMLADX(x1, c0, acc1);
emh203 0:3d9c67d97d6f 1013
emh203 0:3d9c67d97d6f 1014 a = *px;
emh203 0:3d9c67d97d6f 1015 b = *(px + 1);
emh203 0:3d9c67d97d6f 1016
emh203 0:3d9c67d97d6f 1017 #ifndef ARM_MATH_BIG_ENDIAN
emh203 0:3d9c67d97d6f 1018
emh203 0:3d9c67d97d6f 1019 x2 = __PKHBT(a, b, 16);
emh203 0:3d9c67d97d6f 1020 a = *(px + 2);
emh203 0:3d9c67d97d6f 1021 x3 = __PKHBT(b, a, 16);
emh203 0:3d9c67d97d6f 1022
emh203 0:3d9c67d97d6f 1023 #else
emh203 0:3d9c67d97d6f 1024
emh203 0:3d9c67d97d6f 1025 x2 = __PKHBT(b, a, 16);
emh203 0:3d9c67d97d6f 1026 a = *(px + 2);
emh203 0:3d9c67d97d6f 1027 x3 = __PKHBT(a, b, 16);
emh203 0:3d9c67d97d6f 1028
emh203 0:3d9c67d97d6f 1029 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
emh203 0:3d9c67d97d6f 1030
emh203 0:3d9c67d97d6f 1031 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
emh203 0:3d9c67d97d6f 1032 acc2 = __SMLADX(x2, c0, acc2);
emh203 0:3d9c67d97d6f 1033
emh203 0:3d9c67d97d6f 1034 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
emh203 0:3d9c67d97d6f 1035 acc3 = __SMLADX(x3, c0, acc3);
emh203 0:3d9c67d97d6f 1036
emh203 0:3d9c67d97d6f 1037 /* Read y[srcBLen - 3] and y[srcBLen - 4] */
emh203 0:3d9c67d97d6f 1038 a = *py;
emh203 0:3d9c67d97d6f 1039 b = *(py+1);
emh203 0:3d9c67d97d6f 1040 py -= 2;
emh203 0:3d9c67d97d6f 1041
emh203 0:3d9c67d97d6f 1042 #ifndef ARM_MATH_BIG_ENDIAN
emh203 0:3d9c67d97d6f 1043
emh203 0:3d9c67d97d6f 1044 c0 = __PKHBT(a, b, 16);
emh203 0:3d9c67d97d6f 1045
emh203 0:3d9c67d97d6f 1046 #else
emh203 0:3d9c67d97d6f 1047
emh203 0:3d9c67d97d6f 1048 c0 = __PKHBT(b, a, 16);;
emh203 0:3d9c67d97d6f 1049
emh203 0:3d9c67d97d6f 1050 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
emh203 0:3d9c67d97d6f 1051
emh203 0:3d9c67d97d6f 1052 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
emh203 0:3d9c67d97d6f 1053 acc0 = __SMLADX(x2, c0, acc0);
emh203 0:3d9c67d97d6f 1054
emh203 0:3d9c67d97d6f 1055 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
emh203 0:3d9c67d97d6f 1056 acc1 = __SMLADX(x3, c0, acc1);
emh203 0:3d9c67d97d6f 1057
emh203 0:3d9c67d97d6f 1058 /* Read x[4], x[5], x[6] */
emh203 0:3d9c67d97d6f 1059 a = *(px + 2);
emh203 0:3d9c67d97d6f 1060 b = *(px + 3);
emh203 0:3d9c67d97d6f 1061
emh203 0:3d9c67d97d6f 1062 #ifndef ARM_MATH_BIG_ENDIAN
emh203 0:3d9c67d97d6f 1063
emh203 0:3d9c67d97d6f 1064 x0 = __PKHBT(a, b, 16);
emh203 0:3d9c67d97d6f 1065 a = *(px + 4);
emh203 0:3d9c67d97d6f 1066 x1 = __PKHBT(b, a, 16);
emh203 0:3d9c67d97d6f 1067
emh203 0:3d9c67d97d6f 1068 #else
emh203 0:3d9c67d97d6f 1069
emh203 0:3d9c67d97d6f 1070 x0 = __PKHBT(b, a, 16);
emh203 0:3d9c67d97d6f 1071 a = *(px + 4);
emh203 0:3d9c67d97d6f 1072 x1 = __PKHBT(a, b, 16);
emh203 0:3d9c67d97d6f 1073
emh203 0:3d9c67d97d6f 1074 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
emh203 0:3d9c67d97d6f 1075
emh203 0:3d9c67d97d6f 1076 px += 4u;
emh203 0:3d9c67d97d6f 1077
emh203 0:3d9c67d97d6f 1078 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
emh203 0:3d9c67d97d6f 1079 acc2 = __SMLADX(x0, c0, acc2);
emh203 0:3d9c67d97d6f 1080
emh203 0:3d9c67d97d6f 1081 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
emh203 0:3d9c67d97d6f 1082 acc3 = __SMLADX(x1, c0, acc3);
emh203 0:3d9c67d97d6f 1083
emh203 0:3d9c67d97d6f 1084 } while(--k);
emh203 0:3d9c67d97d6f 1085
emh203 0:3d9c67d97d6f 1086 /* For the next MAC operations, SIMD is not used
emh203 0:3d9c67d97d6f 1087 * So, the 16 bit pointer of inputB, py is updated */
emh203 0:3d9c67d97d6f 1088
emh203 0:3d9c67d97d6f 1089 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
emh203 0:3d9c67d97d6f 1090 ** No loop unrolling is used. */
emh203 0:3d9c67d97d6f 1091 k = srcBLen % 0x4u;
emh203 0:3d9c67d97d6f 1092
emh203 0:3d9c67d97d6f 1093 if(k == 1u)
emh203 0:3d9c67d97d6f 1094 {
emh203 0:3d9c67d97d6f 1095 /* Read y[srcBLen - 5] */
emh203 0:3d9c67d97d6f 1096 c0 = *(py+1);
emh203 0:3d9c67d97d6f 1097
emh203 0:3d9c67d97d6f 1098 #ifdef ARM_MATH_BIG_ENDIAN
emh203 0:3d9c67d97d6f 1099
emh203 0:3d9c67d97d6f 1100 c0 = c0 << 16u;
emh203 0:3d9c67d97d6f 1101
emh203 0:3d9c67d97d6f 1102 #else
emh203 0:3d9c67d97d6f 1103
emh203 0:3d9c67d97d6f 1104 c0 = c0 & 0x0000FFFF;
emh203 0:3d9c67d97d6f 1105
emh203 0:3d9c67d97d6f 1106 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
emh203 0:3d9c67d97d6f 1107
emh203 0:3d9c67d97d6f 1108 /* Read x[7] */
emh203 0:3d9c67d97d6f 1109 a = *px;
emh203 0:3d9c67d97d6f 1110 b = *(px+1);
emh203 0:3d9c67d97d6f 1111 px++;
emh203 0:3d9c67d97d6f 1112
emh203 0:3d9c67d97d6f 1113 #ifndef ARM_MATH_BIG_ENDIAN
emh203 0:3d9c67d97d6f 1114
emh203 0:3d9c67d97d6f 1115 x3 = __PKHBT(a, b, 16);
emh203 0:3d9c67d97d6f 1116
emh203 0:3d9c67d97d6f 1117 #else
emh203 0:3d9c67d97d6f 1118
emh203 0:3d9c67d97d6f 1119 x3 = __PKHBT(b, a, 16);;
emh203 0:3d9c67d97d6f 1120
emh203 0:3d9c67d97d6f 1121 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
emh203 0:3d9c67d97d6f 1122
emh203 0:3d9c67d97d6f 1123
emh203 0:3d9c67d97d6f 1124 /* Perform the multiply-accumulates */
emh203 0:3d9c67d97d6f 1125 acc0 = __SMLAD(x0, c0, acc0);
emh203 0:3d9c67d97d6f 1126 acc1 = __SMLAD(x1, c0, acc1);
emh203 0:3d9c67d97d6f 1127 acc2 = __SMLADX(x1, c0, acc2);
emh203 0:3d9c67d97d6f 1128 acc3 = __SMLADX(x3, c0, acc3);
emh203 0:3d9c67d97d6f 1129 }
emh203 0:3d9c67d97d6f 1130
emh203 0:3d9c67d97d6f 1131 if(k == 2u)
emh203 0:3d9c67d97d6f 1132 {
emh203 0:3d9c67d97d6f 1133 /* Read y[srcBLen - 5], y[srcBLen - 6] */
emh203 0:3d9c67d97d6f 1134 a = *py;
emh203 0:3d9c67d97d6f 1135 b = *(py+1);
emh203 0:3d9c67d97d6f 1136
emh203 0:3d9c67d97d6f 1137 #ifndef ARM_MATH_BIG_ENDIAN
emh203 0:3d9c67d97d6f 1138
emh203 0:3d9c67d97d6f 1139 c0 = __PKHBT(a, b, 16);
emh203 0:3d9c67d97d6f 1140
emh203 0:3d9c67d97d6f 1141 #else
emh203 0:3d9c67d97d6f 1142
emh203 0:3d9c67d97d6f 1143 c0 = __PKHBT(b, a, 16);;
emh203 0:3d9c67d97d6f 1144
emh203 0:3d9c67d97d6f 1145 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
emh203 0:3d9c67d97d6f 1146
emh203 0:3d9c67d97d6f 1147 /* Read x[7], x[8], x[9] */
emh203 0:3d9c67d97d6f 1148 a = *px;
emh203 0:3d9c67d97d6f 1149 b = *(px + 1);
emh203 0:3d9c67d97d6f 1150
emh203 0:3d9c67d97d6f 1151 #ifndef ARM_MATH_BIG_ENDIAN
emh203 0:3d9c67d97d6f 1152
emh203 0:3d9c67d97d6f 1153 x3 = __PKHBT(a, b, 16);
emh203 0:3d9c67d97d6f 1154 a = *(px + 2);
emh203 0:3d9c67d97d6f 1155 x2 = __PKHBT(b, a, 16);
emh203 0:3d9c67d97d6f 1156
emh203 0:3d9c67d97d6f 1157 #else
emh203 0:3d9c67d97d6f 1158
emh203 0:3d9c67d97d6f 1159 x3 = __PKHBT(b, a, 16);
emh203 0:3d9c67d97d6f 1160 a = *(px + 2);
emh203 0:3d9c67d97d6f 1161 x2 = __PKHBT(a, b, 16);
emh203 0:3d9c67d97d6f 1162
emh203 0:3d9c67d97d6f 1163 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
emh203 0:3d9c67d97d6f 1164 px += 2u;
emh203 0:3d9c67d97d6f 1165
emh203 0:3d9c67d97d6f 1166 /* Perform the multiply-accumulates */
emh203 0:3d9c67d97d6f 1167 acc0 = __SMLADX(x0, c0, acc0);
emh203 0:3d9c67d97d6f 1168 acc1 = __SMLADX(x1, c0, acc1);
emh203 0:3d9c67d97d6f 1169 acc2 = __SMLADX(x3, c0, acc2);
emh203 0:3d9c67d97d6f 1170 acc3 = __SMLADX(x2, c0, acc3);
emh203 0:3d9c67d97d6f 1171 }
emh203 0:3d9c67d97d6f 1172
emh203 0:3d9c67d97d6f 1173 if(k == 3u)
emh203 0:3d9c67d97d6f 1174 {
emh203 0:3d9c67d97d6f 1175 /* Read y[srcBLen - 5], y[srcBLen - 6] */
emh203 0:3d9c67d97d6f 1176 a = *py;
emh203 0:3d9c67d97d6f 1177 b = *(py+1);
emh203 0:3d9c67d97d6f 1178
emh203 0:3d9c67d97d6f 1179 #ifndef ARM_MATH_BIG_ENDIAN
emh203 0:3d9c67d97d6f 1180
emh203 0:3d9c67d97d6f 1181 c0 = __PKHBT(a, b, 16);
emh203 0:3d9c67d97d6f 1182
emh203 0:3d9c67d97d6f 1183 #else
emh203 0:3d9c67d97d6f 1184
emh203 0:3d9c67d97d6f 1185 c0 = __PKHBT(b, a, 16);;
emh203 0:3d9c67d97d6f 1186
emh203 0:3d9c67d97d6f 1187 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
emh203 0:3d9c67d97d6f 1188
emh203 0:3d9c67d97d6f 1189 /* Read x[7], x[8], x[9] */
emh203 0:3d9c67d97d6f 1190 a = *px;
emh203 0:3d9c67d97d6f 1191 b = *(px + 1);
emh203 0:3d9c67d97d6f 1192
emh203 0:3d9c67d97d6f 1193 #ifndef ARM_MATH_BIG_ENDIAN
emh203 0:3d9c67d97d6f 1194
emh203 0:3d9c67d97d6f 1195 x3 = __PKHBT(a, b, 16);
emh203 0:3d9c67d97d6f 1196 a = *(px + 2);
emh203 0:3d9c67d97d6f 1197 x2 = __PKHBT(b, a, 16);
emh203 0:3d9c67d97d6f 1198
emh203 0:3d9c67d97d6f 1199 #else
emh203 0:3d9c67d97d6f 1200
emh203 0:3d9c67d97d6f 1201 x3 = __PKHBT(b, a, 16);
emh203 0:3d9c67d97d6f 1202 a = *(px + 2);
emh203 0:3d9c67d97d6f 1203 x2 = __PKHBT(a, b, 16);
emh203 0:3d9c67d97d6f 1204
emh203 0:3d9c67d97d6f 1205 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
emh203 0:3d9c67d97d6f 1206
emh203 0:3d9c67d97d6f 1207 /* Perform the multiply-accumulates */
emh203 0:3d9c67d97d6f 1208 acc0 = __SMLADX(x0, c0, acc0);
emh203 0:3d9c67d97d6f 1209 acc1 = __SMLADX(x1, c0, acc1);
emh203 0:3d9c67d97d6f 1210 acc2 = __SMLADX(x3, c0, acc2);
emh203 0:3d9c67d97d6f 1211 acc3 = __SMLADX(x2, c0, acc3);
emh203 0:3d9c67d97d6f 1212
emh203 0:3d9c67d97d6f 1213 /* Read y[srcBLen - 7] */
emh203 0:3d9c67d97d6f 1214 c0 = *(py-1);
emh203 0:3d9c67d97d6f 1215 #ifdef ARM_MATH_BIG_ENDIAN
emh203 0:3d9c67d97d6f 1216
emh203 0:3d9c67d97d6f 1217 c0 = c0 << 16u;
emh203 0:3d9c67d97d6f 1218 #else
emh203 0:3d9c67d97d6f 1219
emh203 0:3d9c67d97d6f 1220 c0 = c0 & 0x0000FFFF;
emh203 0:3d9c67d97d6f 1221 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
emh203 0:3d9c67d97d6f 1222
emh203 0:3d9c67d97d6f 1223 /* Read x[10] */
emh203 0:3d9c67d97d6f 1224 a = *(px+2);
emh203 0:3d9c67d97d6f 1225 b = *(px+3);
emh203 0:3d9c67d97d6f 1226
emh203 0:3d9c67d97d6f 1227 #ifndef ARM_MATH_BIG_ENDIAN
emh203 0:3d9c67d97d6f 1228
emh203 0:3d9c67d97d6f 1229 x3 = __PKHBT(a, b, 16);
emh203 0:3d9c67d97d6f 1230
emh203 0:3d9c67d97d6f 1231 #else
emh203 0:3d9c67d97d6f 1232
emh203 0:3d9c67d97d6f 1233 x3 = __PKHBT(b, a, 16);;
emh203 0:3d9c67d97d6f 1234
emh203 0:3d9c67d97d6f 1235 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
emh203 0:3d9c67d97d6f 1236
emh203 0:3d9c67d97d6f 1237 px += 3u;
emh203 0:3d9c67d97d6f 1238
emh203 0:3d9c67d97d6f 1239 /* Perform the multiply-accumulates */
emh203 0:3d9c67d97d6f 1240 acc0 = __SMLADX(x1, c0, acc0);
emh203 0:3d9c67d97d6f 1241 acc1 = __SMLAD(x2, c0, acc1);
emh203 0:3d9c67d97d6f 1242 acc2 = __SMLADX(x2, c0, acc2);
emh203 0:3d9c67d97d6f 1243 acc3 = __SMLADX(x3, c0, acc3);
emh203 0:3d9c67d97d6f 1244 }
emh203 0:3d9c67d97d6f 1245
emh203 0:3d9c67d97d6f 1246 /* Store the results in the accumulators in the destination buffer. */
emh203 0:3d9c67d97d6f 1247 *pOut++ = (q15_t)(acc0 >> 15);
emh203 0:3d9c67d97d6f 1248 *pOut++ = (q15_t)(acc1 >> 15);
emh203 0:3d9c67d97d6f 1249 *pOut++ = (q15_t)(acc2 >> 15);
emh203 0:3d9c67d97d6f 1250 *pOut++ = (q15_t)(acc3 >> 15);
emh203 0:3d9c67d97d6f 1251
emh203 0:3d9c67d97d6f 1252 /* Increment the pointer pIn1 index, count by 4 */
emh203 0:3d9c67d97d6f 1253 count += 4u;
emh203 0:3d9c67d97d6f 1254
emh203 0:3d9c67d97d6f 1255 /* Update the inputA and inputB pointers for next MAC calculation */
emh203 0:3d9c67d97d6f 1256 px = pIn1 + count;
emh203 0:3d9c67d97d6f 1257 py = pSrc2;
emh203 0:3d9c67d97d6f 1258
emh203 0:3d9c67d97d6f 1259 /* Decrement the loop counter */
emh203 0:3d9c67d97d6f 1260 blkCnt--;
emh203 0:3d9c67d97d6f 1261 }
emh203 0:3d9c67d97d6f 1262
emh203 0:3d9c67d97d6f 1263 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
emh203 0:3d9c67d97d6f 1264 ** No loop unrolling is used. */
emh203 0:3d9c67d97d6f 1265 blkCnt = (uint32_t) blockSize2 % 0x4u;
emh203 0:3d9c67d97d6f 1266
emh203 0:3d9c67d97d6f 1267 while(blkCnt > 0u)
emh203 0:3d9c67d97d6f 1268 {
emh203 0:3d9c67d97d6f 1269 /* Accumulator is made zero for every iteration */
emh203 0:3d9c67d97d6f 1270 sum = 0;
emh203 0:3d9c67d97d6f 1271
emh203 0:3d9c67d97d6f 1272 /* Apply loop unrolling and compute 4 MACs simultaneously. */
emh203 0:3d9c67d97d6f 1273 k = srcBLen >> 2u;
emh203 0:3d9c67d97d6f 1274
emh203 0:3d9c67d97d6f 1275 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
emh203 0:3d9c67d97d6f 1276 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
emh203 0:3d9c67d97d6f 1277 while(k > 0u)
emh203 0:3d9c67d97d6f 1278 {
emh203 0:3d9c67d97d6f 1279 /* Perform the multiply-accumulates */
emh203 0:3d9c67d97d6f 1280 sum += ((q31_t) * px++ * *py--);
emh203 0:3d9c67d97d6f 1281 sum += ((q31_t) * px++ * *py--);
emh203 0:3d9c67d97d6f 1282 sum += ((q31_t) * px++ * *py--);
emh203 0:3d9c67d97d6f 1283 sum += ((q31_t) * px++ * *py--);
emh203 0:3d9c67d97d6f 1284
emh203 0:3d9c67d97d6f 1285 /* Decrement the loop counter */
emh203 0:3d9c67d97d6f 1286 k--;
emh203 0:3d9c67d97d6f 1287 }
emh203 0:3d9c67d97d6f 1288
emh203 0:3d9c67d97d6f 1289 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
emh203 0:3d9c67d97d6f 1290 ** No loop unrolling is used. */
emh203 0:3d9c67d97d6f 1291 k = srcBLen % 0x4u;
emh203 0:3d9c67d97d6f 1292
emh203 0:3d9c67d97d6f 1293 while(k > 0u)
emh203 0:3d9c67d97d6f 1294 {
emh203 0:3d9c67d97d6f 1295 /* Perform the multiply-accumulates */
emh203 0:3d9c67d97d6f 1296 sum += ((q31_t) * px++ * *py--);
emh203 0:3d9c67d97d6f 1297
emh203 0:3d9c67d97d6f 1298 /* Decrement the loop counter */
emh203 0:3d9c67d97d6f 1299 k--;
emh203 0:3d9c67d97d6f 1300 }
emh203 0:3d9c67d97d6f 1301
emh203 0:3d9c67d97d6f 1302 /* Store the result in the accumulator in the destination buffer. */
emh203 0:3d9c67d97d6f 1303 *pOut++ = (q15_t) (sum >> 15);
emh203 0:3d9c67d97d6f 1304
emh203 0:3d9c67d97d6f 1305 /* Increment the pointer pIn1 index, count by 1 */
emh203 0:3d9c67d97d6f 1306 count++;
emh203 0:3d9c67d97d6f 1307
emh203 0:3d9c67d97d6f 1308 /* Update the inputA and inputB pointers for next MAC calculation */
emh203 0:3d9c67d97d6f 1309 px = pIn1 + count;
emh203 0:3d9c67d97d6f 1310 py = pSrc2;
emh203 0:3d9c67d97d6f 1311
emh203 0:3d9c67d97d6f 1312 /* Decrement the loop counter */
emh203 0:3d9c67d97d6f 1313 blkCnt--;
emh203 0:3d9c67d97d6f 1314 }
emh203 0:3d9c67d97d6f 1315 }
emh203 0:3d9c67d97d6f 1316 else
emh203 0:3d9c67d97d6f 1317 {
emh203 0:3d9c67d97d6f 1318 /* If the srcBLen is not a multiple of 4,
emh203 0:3d9c67d97d6f 1319 * the blockSize2 loop cannot be unrolled by 4 */
emh203 0:3d9c67d97d6f 1320 blkCnt = (uint32_t) blockSize2;
emh203 0:3d9c67d97d6f 1321
emh203 0:3d9c67d97d6f 1322 while(blkCnt > 0u)
emh203 0:3d9c67d97d6f 1323 {
emh203 0:3d9c67d97d6f 1324 /* Accumulator is made zero for every iteration */
emh203 0:3d9c67d97d6f 1325 sum = 0;
emh203 0:3d9c67d97d6f 1326
emh203 0:3d9c67d97d6f 1327 /* srcBLen number of MACS should be performed */
emh203 0:3d9c67d97d6f 1328 k = srcBLen;
emh203 0:3d9c67d97d6f 1329
emh203 0:3d9c67d97d6f 1330 while(k > 0u)
emh203 0:3d9c67d97d6f 1331 {
emh203 0:3d9c67d97d6f 1332 /* Perform the multiply-accumulate */
emh203 0:3d9c67d97d6f 1333 sum += ((q31_t) * px++ * *py--);
emh203 0:3d9c67d97d6f 1334
emh203 0:3d9c67d97d6f 1335 /* Decrement the loop counter */
emh203 0:3d9c67d97d6f 1336 k--;
emh203 0:3d9c67d97d6f 1337 }
emh203 0:3d9c67d97d6f 1338
emh203 0:3d9c67d97d6f 1339 /* Store the result in the accumulator in the destination buffer. */
emh203 0:3d9c67d97d6f 1340 *pOut++ = (q15_t) (sum >> 15);
emh203 0:3d9c67d97d6f 1341
emh203 0:3d9c67d97d6f 1342 /* Increment the pointer pIn1 index, count by 1 */
emh203 0:3d9c67d97d6f 1343 count++;
emh203 0:3d9c67d97d6f 1344
emh203 0:3d9c67d97d6f 1345 /* Update the inputA and inputB pointers for next MAC calculation */
emh203 0:3d9c67d97d6f 1346 px = pIn1 + count;
emh203 0:3d9c67d97d6f 1347 py = pSrc2;
emh203 0:3d9c67d97d6f 1348
emh203 0:3d9c67d97d6f 1349 /* Decrement the loop counter */
emh203 0:3d9c67d97d6f 1350 blkCnt--;
emh203 0:3d9c67d97d6f 1351 }
emh203 0:3d9c67d97d6f 1352 }
emh203 0:3d9c67d97d6f 1353
emh203 0:3d9c67d97d6f 1354
emh203 0:3d9c67d97d6f 1355 /* --------------------------
emh203 0:3d9c67d97d6f 1356 * Initializations of stage3
emh203 0:3d9c67d97d6f 1357 * -------------------------*/
emh203 0:3d9c67d97d6f 1358
emh203 0:3d9c67d97d6f 1359 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
emh203 0:3d9c67d97d6f 1360 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
emh203 0:3d9c67d97d6f 1361 * ....
emh203 0:3d9c67d97d6f 1362 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
emh203 0:3d9c67d97d6f 1363 * sum += x[srcALen-1] * y[srcBLen-1]
emh203 0:3d9c67d97d6f 1364 */
emh203 0:3d9c67d97d6f 1365
emh203 0:3d9c67d97d6f 1366 /* In this stage the MAC operations are decreased by 1 for every iteration.
emh203 0:3d9c67d97d6f 1367 The count variable holds the number of MAC operations performed */
emh203 0:3d9c67d97d6f 1368 count = srcBLen - 1u;
emh203 0:3d9c67d97d6f 1369
emh203 0:3d9c67d97d6f 1370 /* Working pointer of inputA */
emh203 0:3d9c67d97d6f 1371 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
emh203 0:3d9c67d97d6f 1372 px = pSrc1;
emh203 0:3d9c67d97d6f 1373
emh203 0:3d9c67d97d6f 1374 /* Working pointer of inputB */
emh203 0:3d9c67d97d6f 1375 pSrc2 = pIn2 + (srcBLen - 1u);
emh203 0:3d9c67d97d6f 1376 pIn2 = pSrc2 - 1u;
emh203 0:3d9c67d97d6f 1377 py = pIn2;
emh203 0:3d9c67d97d6f 1378
emh203 0:3d9c67d97d6f 1379 /* -------------------
emh203 0:3d9c67d97d6f 1380 * Stage3 process
emh203 0:3d9c67d97d6f 1381 * ------------------*/
emh203 0:3d9c67d97d6f 1382
emh203 0:3d9c67d97d6f 1383 /* For loop unrolling by 4, this stage is divided into two. */
emh203 0:3d9c67d97d6f 1384 /* First part of this stage computes the MAC operations greater than 4 */
emh203 0:3d9c67d97d6f 1385 /* Second part of this stage computes the MAC operations less than or equal to 4 */
emh203 0:3d9c67d97d6f 1386
emh203 0:3d9c67d97d6f 1387 /* The first part of the stage starts here */
emh203 0:3d9c67d97d6f 1388 j = count >> 2u;
emh203 0:3d9c67d97d6f 1389
emh203 0:3d9c67d97d6f 1390 while((j > 0u) && (blockSize3 > 0))
emh203 0:3d9c67d97d6f 1391 {
emh203 0:3d9c67d97d6f 1392 /* Accumulator is made zero for every iteration */
emh203 0:3d9c67d97d6f 1393 sum = 0;
emh203 0:3d9c67d97d6f 1394
emh203 0:3d9c67d97d6f 1395 /* Apply loop unrolling and compute 4 MACs simultaneously. */
emh203 0:3d9c67d97d6f 1396 k = count >> 2u;
emh203 0:3d9c67d97d6f 1397
emh203 0:3d9c67d97d6f 1398 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
emh203 0:3d9c67d97d6f 1399 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
emh203 0:3d9c67d97d6f 1400 py++;
emh203 0:3d9c67d97d6f 1401
emh203 0:3d9c67d97d6f 1402 while(k > 0u)
emh203 0:3d9c67d97d6f 1403 {
emh203 0:3d9c67d97d6f 1404 /* Perform the multiply-accumulates */
emh203 0:3d9c67d97d6f 1405 sum += ((q31_t) * px++ * *py--);
emh203 0:3d9c67d97d6f 1406 sum += ((q31_t) * px++ * *py--);
emh203 0:3d9c67d97d6f 1407 sum += ((q31_t) * px++ * *py--);
emh203 0:3d9c67d97d6f 1408 sum += ((q31_t) * px++ * *py--);
emh203 0:3d9c67d97d6f 1409 /* Decrement the loop counter */
emh203 0:3d9c67d97d6f 1410 k--;
emh203 0:3d9c67d97d6f 1411 }
emh203 0:3d9c67d97d6f 1412
emh203 0:3d9c67d97d6f 1413
emh203 0:3d9c67d97d6f 1414 /* If the count is not a multiple of 4, compute any remaining MACs here.
emh203 0:3d9c67d97d6f 1415 ** No loop unrolling is used. */
emh203 0:3d9c67d97d6f 1416 k = count % 0x4u;
emh203 0:3d9c67d97d6f 1417
emh203 0:3d9c67d97d6f 1418 while(k > 0u)
emh203 0:3d9c67d97d6f 1419 {
emh203 0:3d9c67d97d6f 1420 /* Perform the multiply-accumulates */
emh203 0:3d9c67d97d6f 1421 sum += ((q31_t) * px++ * *py--);
emh203 0:3d9c67d97d6f 1422
emh203 0:3d9c67d97d6f 1423 /* Decrement the loop counter */
emh203 0:3d9c67d97d6f 1424 k--;
emh203 0:3d9c67d97d6f 1425 }
emh203 0:3d9c67d97d6f 1426
emh203 0:3d9c67d97d6f 1427 /* Store the result in the accumulator in the destination buffer. */
emh203 0:3d9c67d97d6f 1428 *pOut++ = (q15_t) (sum >> 15);
emh203 0:3d9c67d97d6f 1429
emh203 0:3d9c67d97d6f 1430 /* Update the inputA and inputB pointers for next MAC calculation */
emh203 0:3d9c67d97d6f 1431 px = ++pSrc1;
emh203 0:3d9c67d97d6f 1432 py = pIn2;
emh203 0:3d9c67d97d6f 1433
emh203 0:3d9c67d97d6f 1434 /* Decrement the MAC count */
emh203 0:3d9c67d97d6f 1435 count--;
emh203 0:3d9c67d97d6f 1436
emh203 0:3d9c67d97d6f 1437 /* Decrement the loop counter */
emh203 0:3d9c67d97d6f 1438 blockSize3--;
emh203 0:3d9c67d97d6f 1439
emh203 0:3d9c67d97d6f 1440 j--;
emh203 0:3d9c67d97d6f 1441 }
emh203 0:3d9c67d97d6f 1442
emh203 0:3d9c67d97d6f 1443 /* The second part of the stage starts here */
emh203 0:3d9c67d97d6f 1444 /* SIMD is not used for the next MAC operations,
emh203 0:3d9c67d97d6f 1445 * so pointer py is updated to read only one sample at a time */
emh203 0:3d9c67d97d6f 1446 py = py + 1u;
emh203 0:3d9c67d97d6f 1447
emh203 0:3d9c67d97d6f 1448 while(blockSize3 > 0)
emh203 0:3d9c67d97d6f 1449 {
emh203 0:3d9c67d97d6f 1450 /* Accumulator is made zero for every iteration */
emh203 0:3d9c67d97d6f 1451 sum = 0;
emh203 0:3d9c67d97d6f 1452
emh203 0:3d9c67d97d6f 1453 /* Perform count MACs one at a time; no loop unrolling is used here */
emh203 0:3d9c67d97d6f 1454 k = count;
emh203 0:3d9c67d97d6f 1455
emh203 0:3d9c67d97d6f 1456 while(k > 0u)
emh203 0:3d9c67d97d6f 1457 {
emh203 0:3d9c67d97d6f 1458 /* Perform the multiply-accumulates */
emh203 0:3d9c67d97d6f 1459 /* sum += x[srcALen-1] * y[srcBLen-1] */
emh203 0:3d9c67d97d6f 1460 sum += ((q31_t) * px++ * *py--);
emh203 0:3d9c67d97d6f 1461
emh203 0:3d9c67d97d6f 1462 /* Decrement the loop counter */
emh203 0:3d9c67d97d6f 1463 k--;
emh203 0:3d9c67d97d6f 1464 }
emh203 0:3d9c67d97d6f 1465
emh203 0:3d9c67d97d6f 1466 /* Store the result in the accumulator in the destination buffer. */
emh203 0:3d9c67d97d6f 1467 *pOut++ = (q15_t) (sum >> 15);
emh203 0:3d9c67d97d6f 1468
emh203 0:3d9c67d97d6f 1469 /* Update the inputA and inputB pointers for next MAC calculation */
emh203 0:3d9c67d97d6f 1470 px = ++pSrc1;
emh203 0:3d9c67d97d6f 1471 py = pSrc2;
emh203 0:3d9c67d97d6f 1472
emh203 0:3d9c67d97d6f 1473 /* Decrement the MAC count */
emh203 0:3d9c67d97d6f 1474 count--;
emh203 0:3d9c67d97d6f 1475
emh203 0:3d9c67d97d6f 1476 /* Decrement the loop counter */
emh203 0:3d9c67d97d6f 1477 blockSize3--;
emh203 0:3d9c67d97d6f 1478 }
emh203 0:3d9c67d97d6f 1479
emh203 0:3d9c67d97d6f 1480 /* set status as ARM_MATH_SUCCESS */
emh203 0:3d9c67d97d6f 1481 status = ARM_MATH_SUCCESS;
emh203 0:3d9c67d97d6f 1482 }
emh203 0:3d9c67d97d6f 1483
emh203 0:3d9c67d97d6f 1484 /* Return to application */
emh203 0:3d9c67d97d6f 1485 return (status);
emh203 0:3d9c67d97d6f 1486
emh203 0:3d9c67d97d6f 1487 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
emh203 0:3d9c67d97d6f 1488 }
emh203 0:3d9c67d97d6f 1489
emh203 0:3d9c67d97d6f 1490 /**
emh203 0:3d9c67d97d6f 1491 * @} end of PartialConv group
emh203 0:3d9c67d97d6f 1492 */