The CMSIS DSP 5 library

Dependents:   Nucleo-Heart-Rate ejercicioVrms2 PROYECTOFINAL ejercicioVrms ... more

Committer:
xorjoep
Date:
Thu Jun 21 11:56:27 2018 +0000
Revision:
3:4098b9d3d571
Parent:
1:24714b45cd1b
headers is a folder not a library

Who changed what in which revision?

User | Revision | Line number | New contents of line
xorjoep 1:24714b45cd1b 1 /* ----------------------------------------------------------------------
xorjoep 1:24714b45cd1b 2 * Project: CMSIS DSP Library
xorjoep 1:24714b45cd1b 3 * Title: arm_conv_partial_q15.c
xorjoep 1:24714b45cd1b 4 * Description: Partial convolution of Q15 sequences
xorjoep 1:24714b45cd1b 5 *
xorjoep 1:24714b45cd1b 6 * $Date: 27. January 2017
xorjoep 1:24714b45cd1b 7 * $Revision: V.1.5.1
xorjoep 1:24714b45cd1b 8 *
xorjoep 1:24714b45cd1b 9 * Target Processor: Cortex-M cores
xorjoep 1:24714b45cd1b 10 * -------------------------------------------------------------------- */
xorjoep 1:24714b45cd1b 11 /*
xorjoep 1:24714b45cd1b 12 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
xorjoep 1:24714b45cd1b 13 *
xorjoep 1:24714b45cd1b 14 * SPDX-License-Identifier: Apache-2.0
xorjoep 1:24714b45cd1b 15 *
xorjoep 1:24714b45cd1b 16 * Licensed under the Apache License, Version 2.0 (the License); you may
xorjoep 1:24714b45cd1b 17 * not use this file except in compliance with the License.
xorjoep 1:24714b45cd1b 18 * You may obtain a copy of the License at
xorjoep 1:24714b45cd1b 19 *
xorjoep 1:24714b45cd1b 20 * www.apache.org/licenses/LICENSE-2.0
xorjoep 1:24714b45cd1b 21 *
xorjoep 1:24714b45cd1b 22 * Unless required by applicable law or agreed to in writing, software
xorjoep 1:24714b45cd1b 23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
xorjoep 1:24714b45cd1b 24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
xorjoep 1:24714b45cd1b 25 * See the License for the specific language governing permissions and
xorjoep 1:24714b45cd1b 26 * limitations under the License.
xorjoep 1:24714b45cd1b 27 */
xorjoep 1:24714b45cd1b 28
xorjoep 1:24714b45cd1b 29 #include "arm_math.h"
xorjoep 1:24714b45cd1b 30
xorjoep 1:24714b45cd1b 31 /**
xorjoep 1:24714b45cd1b 32 * @ingroup groupFilters
xorjoep 1:24714b45cd1b 33 */
xorjoep 1:24714b45cd1b 34
xorjoep 1:24714b45cd1b 35 /**
xorjoep 1:24714b45cd1b 36 * @addtogroup PartialConv
xorjoep 1:24714b45cd1b 37 * @{
xorjoep 1:24714b45cd1b 38 */
xorjoep 1:24714b45cd1b 39
xorjoep 1:24714b45cd1b 40 /**
xorjoep 1:24714b45cd1b 41 * @brief Partial convolution of Q15 sequences.
xorjoep 1:24714b45cd1b 42 * @param[in] *pSrcA points to the first input sequence.
xorjoep 1:24714b45cd1b 43 * @param[in] srcALen length of the first input sequence.
xorjoep 1:24714b45cd1b 44 * @param[in] *pSrcB points to the second input sequence.
xorjoep 1:24714b45cd1b 45 * @param[in] srcBLen length of the second input sequence.
xorjoep 1:24714b45cd1b 46 * @param[out] *pDst points to the location where the output result is written.
xorjoep 1:24714b45cd1b 47 * @param[in] firstIndex is the first output sample to start with.
xorjoep 1:24714b45cd1b 48 * @param[in] numPoints is the number of output points to be computed.
xorjoep 1:24714b45cd1b 49 * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0, srcALen+srcBLen-2].
xorjoep 1:24714b45cd1b 50 *
xorjoep 1:24714b45cd1b 51 * Refer to <code>arm_conv_partial_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
xorjoep 1:24714b45cd1b 52 *
xorjoep 1:24714b45cd1b 53 * \par
xorjoep 1:24714b45cd1b 54 * Refer to the function <code>arm_conv_partial_opt_q15()</code> for a faster implementation of this function using scratch buffers.
xorjoep 1:24714b45cd1b 55 *
xorjoep 1:24714b45cd1b 56 */
xorjoep 1:24714b45cd1b 57
xorjoep 1:24714b45cd1b 58 arm_status arm_conv_partial_q15(
xorjoep 1:24714b45cd1b 59 q15_t * pSrcA,
xorjoep 1:24714b45cd1b 60 uint32_t srcALen,
xorjoep 1:24714b45cd1b 61 q15_t * pSrcB,
xorjoep 1:24714b45cd1b 62 uint32_t srcBLen,
xorjoep 1:24714b45cd1b 63 q15_t * pDst,
xorjoep 1:24714b45cd1b 64 uint32_t firstIndex,
xorjoep 1:24714b45cd1b 65 uint32_t numPoints)
xorjoep 1:24714b45cd1b 66 {
xorjoep 1:24714b45cd1b 67
xorjoep 1:24714b45cd1b 68
xorjoep 1:24714b45cd1b 69 #if (defined(ARM_MATH_CM7) || defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE)
xorjoep 1:24714b45cd1b 70
xorjoep 1:24714b45cd1b 71 /* Run the below code for Cortex-M4 and Cortex-M3 */
xorjoep 1:24714b45cd1b 72
xorjoep 1:24714b45cd1b 73 q15_t *pIn1; /* inputA pointer */
xorjoep 1:24714b45cd1b 74 q15_t *pIn2; /* inputB pointer */
xorjoep 1:24714b45cd1b 75 q15_t *pOut = pDst; /* output pointer */
xorjoep 1:24714b45cd1b 76 q63_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
xorjoep 1:24714b45cd1b 77 q15_t *px; /* Intermediate inputA pointer */
xorjoep 1:24714b45cd1b 78 q15_t *py; /* Intermediate inputB pointer */
xorjoep 1:24714b45cd1b 79 q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
xorjoep 1:24714b45cd1b 80 q31_t x0, x1, x2, x3, c0; /* Temporary input variables */
xorjoep 1:24714b45cd1b 81 uint32_t j, k, count, check, blkCnt;
xorjoep 1:24714b45cd1b 82 int32_t blockSize1, blockSize2, blockSize3; /* loop counter */
xorjoep 1:24714b45cd1b 83 arm_status status; /* status of Partial convolution */
xorjoep 1:24714b45cd1b 84
xorjoep 1:24714b45cd1b 85 /* Check for range of output samples to be calculated */
xorjoep 1:24714b45cd1b 86 if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
xorjoep 1:24714b45cd1b 87 {
xorjoep 1:24714b45cd1b 88 /* Set status as ARM_MATH_ARGUMENT_ERROR */
xorjoep 1:24714b45cd1b 89 status = ARM_MATH_ARGUMENT_ERROR;
xorjoep 1:24714b45cd1b 90 }
xorjoep 1:24714b45cd1b 91 else
xorjoep 1:24714b45cd1b 92 {
xorjoep 1:24714b45cd1b 93
xorjoep 1:24714b45cd1b 94 /* The algorithm implementation is based on the lengths of the inputs. */
xorjoep 1:24714b45cd1b 95 /* srcB is always made to slide across srcA. */
xorjoep 1:24714b45cd1b 96 /* So srcBLen is always considered as shorter or equal to srcALen */
xorjoep 1:24714b45cd1b 97 if (srcALen >= srcBLen)
xorjoep 1:24714b45cd1b 98 {
xorjoep 1:24714b45cd1b 99 /* Initialization of inputA pointer */
xorjoep 1:24714b45cd1b 100 pIn1 = pSrcA;
xorjoep 1:24714b45cd1b 101
xorjoep 1:24714b45cd1b 102 /* Initialization of inputB pointer */
xorjoep 1:24714b45cd1b 103 pIn2 = pSrcB;
xorjoep 1:24714b45cd1b 104 }
xorjoep 1:24714b45cd1b 105 else
xorjoep 1:24714b45cd1b 106 {
xorjoep 1:24714b45cd1b 107 /* Initialization of inputA pointer */
xorjoep 1:24714b45cd1b 108 pIn1 = pSrcB;
xorjoep 1:24714b45cd1b 109
xorjoep 1:24714b45cd1b 110 /* Initialization of inputB pointer */
xorjoep 1:24714b45cd1b 111 pIn2 = pSrcA;
xorjoep 1:24714b45cd1b 112
xorjoep 1:24714b45cd1b 113 /* srcBLen is always considered as shorter or equal to srcALen */
xorjoep 1:24714b45cd1b 114 j = srcBLen;
xorjoep 1:24714b45cd1b 115 srcBLen = srcALen;
xorjoep 1:24714b45cd1b 116 srcALen = j;
xorjoep 1:24714b45cd1b 117 }
xorjoep 1:24714b45cd1b 118
xorjoep 1:24714b45cd1b 119 /* Conditions to check which loopCounter holds
xorjoep 1:24714b45cd1b 120 * the first and last indices of the output samples to be calculated. */
xorjoep 1:24714b45cd1b 121 check = firstIndex + numPoints;
xorjoep 1:24714b45cd1b 122 blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
xorjoep 1:24714b45cd1b 123 blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
xorjoep 1:24714b45cd1b 124 blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
xorjoep 1:24714b45cd1b 125 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 :
xorjoep 1:24714b45cd1b 126 (int32_t) numPoints) : 0;
xorjoep 1:24714b45cd1b 127 blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
xorjoep 1:24714b45cd1b 128 (int32_t) firstIndex);
xorjoep 1:24714b45cd1b 129 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
xorjoep 1:24714b45cd1b 130
xorjoep 1:24714b45cd1b 131 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
xorjoep 1:24714b45cd1b 132 /* The function is internally
xorjoep 1:24714b45cd1b 133 * divided into three stages according to the number of multiplications that has to be
xorjoep 1:24714b45cd1b 134 * taken place between inputA samples and inputB samples. In the first stage of the
xorjoep 1:24714b45cd1b 135 * algorithm, the multiplications increase by one for every iteration.
xorjoep 1:24714b45cd1b 136 * In the second stage of the algorithm, srcBLen number of multiplications are done.
xorjoep 1:24714b45cd1b 137 * In the third stage of the algorithm, the multiplications decrease by one
xorjoep 1:24714b45cd1b 138 * for every iteration. */
xorjoep 1:24714b45cd1b 139
xorjoep 1:24714b45cd1b 140 /* Set the output pointer to point to the firstIndex
xorjoep 1:24714b45cd1b 141 * of the output sample to be calculated. */
xorjoep 1:24714b45cd1b 142 pOut = pDst + firstIndex;
xorjoep 1:24714b45cd1b 143
xorjoep 1:24714b45cd1b 144 /* --------------------------
xorjoep 1:24714b45cd1b 145 * Initializations of stage1
xorjoep 1:24714b45cd1b 146 * -------------------------*/
xorjoep 1:24714b45cd1b 147
xorjoep 1:24714b45cd1b 148 /* sum = x[0] * y[0]
xorjoep 1:24714b45cd1b 149 * sum = x[0] * y[1] + x[1] * y[0]
xorjoep 1:24714b45cd1b 150 * ....
xorjoep 1:24714b45cd1b 151 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
xorjoep 1:24714b45cd1b 152 */
xorjoep 1:24714b45cd1b 153
xorjoep 1:24714b45cd1b 154 /* In this stage the MAC operations are increased by 1 for every iteration.
xorjoep 1:24714b45cd1b 155 The count variable holds the number of MAC operations performed.
xorjoep 1:24714b45cd1b 156 Since the partial convolution starts from firstIndex
xorjoep 1:24714b45cd1b 157 Number of Macs to be performed is firstIndex + 1 */
xorjoep 1:24714b45cd1b 158 count = 1U + firstIndex;
xorjoep 1:24714b45cd1b 159
xorjoep 1:24714b45cd1b 160 /* Working pointer of inputA */
xorjoep 1:24714b45cd1b 161 px = pIn1;
xorjoep 1:24714b45cd1b 162
xorjoep 1:24714b45cd1b 163 /* Working pointer of inputB */
xorjoep 1:24714b45cd1b 164 pSrc2 = pIn2 + firstIndex;
xorjoep 1:24714b45cd1b 165 py = pSrc2;
xorjoep 1:24714b45cd1b 166
xorjoep 1:24714b45cd1b 167 /* ------------------------
xorjoep 1:24714b45cd1b 168 * Stage1 process
xorjoep 1:24714b45cd1b 169 * ----------------------*/
xorjoep 1:24714b45cd1b 170
xorjoep 1:24714b45cd1b 171 /* For loop unrolling by 4, this stage is divided into two. */
xorjoep 1:24714b45cd1b 172 /* First part of this stage computes the MAC operations less than 4 */
xorjoep 1:24714b45cd1b 173 /* Second part of this stage computes the MAC operations greater than or equal to 4 */
xorjoep 1:24714b45cd1b 174
xorjoep 1:24714b45cd1b 175 /* The first part of the stage starts here */
xorjoep 1:24714b45cd1b 176 while ((count < 4U) && (blockSize1 > 0))
xorjoep 1:24714b45cd1b 177 {
xorjoep 1:24714b45cd1b 178 /* Accumulator is made zero for every iteration */
xorjoep 1:24714b45cd1b 179 sum = 0;
xorjoep 1:24714b45cd1b 180
xorjoep 1:24714b45cd1b 181 /* Loop over number of MAC operations between
xorjoep 1:24714b45cd1b 182 * inputA samples and inputB samples */
xorjoep 1:24714b45cd1b 183 k = count;
xorjoep 1:24714b45cd1b 184
xorjoep 1:24714b45cd1b 185 while (k > 0U)
xorjoep 1:24714b45cd1b 186 {
xorjoep 1:24714b45cd1b 187 /* Perform the multiply-accumulates */
xorjoep 1:24714b45cd1b 188 sum = __SMLALD(*px++, *py--, sum);
xorjoep 1:24714b45cd1b 189
xorjoep 1:24714b45cd1b 190 /* Decrement the loop counter */
xorjoep 1:24714b45cd1b 191 k--;
xorjoep 1:24714b45cd1b 192 }
xorjoep 1:24714b45cd1b 193
xorjoep 1:24714b45cd1b 194 /* Store the result in the accumulator in the destination buffer. */
xorjoep 1:24714b45cd1b 195 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
xorjoep 1:24714b45cd1b 196
xorjoep 1:24714b45cd1b 197 /* Update the inputA and inputB pointers for next MAC calculation */
xorjoep 1:24714b45cd1b 198 py = ++pSrc2;
xorjoep 1:24714b45cd1b 199 px = pIn1;
xorjoep 1:24714b45cd1b 200
xorjoep 1:24714b45cd1b 201 /* Increment the MAC count */
xorjoep 1:24714b45cd1b 202 count++;
xorjoep 1:24714b45cd1b 203
xorjoep 1:24714b45cd1b 204 /* Decrement the loop counter */
xorjoep 1:24714b45cd1b 205 blockSize1--;
xorjoep 1:24714b45cd1b 206 }
xorjoep 1:24714b45cd1b 207
xorjoep 1:24714b45cd1b 208 /* The second part of the stage starts here */
xorjoep 1:24714b45cd1b 209 /* The internal loop, over count, is unrolled by 4 */
xorjoep 1:24714b45cd1b 210 /* To, read the last two inputB samples using SIMD:
xorjoep 1:24714b45cd1b 211 * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
xorjoep 1:24714b45cd1b 212 py = py - 1;
xorjoep 1:24714b45cd1b 213
xorjoep 1:24714b45cd1b 214 while (blockSize1 > 0)
xorjoep 1:24714b45cd1b 215 {
xorjoep 1:24714b45cd1b 216 /* Accumulator is made zero for every iteration */
xorjoep 1:24714b45cd1b 217 sum = 0;
xorjoep 1:24714b45cd1b 218
xorjoep 1:24714b45cd1b 219 /* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep 1:24714b45cd1b 220 k = count >> 2U;
xorjoep 1:24714b45cd1b 221
xorjoep 1:24714b45cd1b 222 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep 1:24714b45cd1b 223 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep 1:24714b45cd1b 224 while (k > 0U)
xorjoep 1:24714b45cd1b 225 {
xorjoep 1:24714b45cd1b 226 /* Perform the multiply-accumulates */
xorjoep 1:24714b45cd1b 227 /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
xorjoep 1:24714b45cd1b 228 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
xorjoep 1:24714b45cd1b 229 /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
xorjoep 1:24714b45cd1b 230 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
xorjoep 1:24714b45cd1b 231
xorjoep 1:24714b45cd1b 232 /* Decrement the loop counter */
xorjoep 1:24714b45cd1b 233 k--;
xorjoep 1:24714b45cd1b 234 }
xorjoep 1:24714b45cd1b 235
xorjoep 1:24714b45cd1b 236 /* For the next MAC operations, the pointer py is used without SIMD
xorjoep 1:24714b45cd1b 237 * So, py is incremented by 1 */
xorjoep 1:24714b45cd1b 238 py = py + 1U;
xorjoep 1:24714b45cd1b 239
xorjoep 1:24714b45cd1b 240 /* If the count is not a multiple of 4, compute any remaining MACs here.
xorjoep 1:24714b45cd1b 241 ** No loop unrolling is used. */
xorjoep 1:24714b45cd1b 242 k = count % 0x4U;
xorjoep 1:24714b45cd1b 243
xorjoep 1:24714b45cd1b 244 while (k > 0U)
xorjoep 1:24714b45cd1b 245 {
xorjoep 1:24714b45cd1b 246 /* Perform the multiply-accumulates */
xorjoep 1:24714b45cd1b 247 sum = __SMLALD(*px++, *py--, sum);
xorjoep 1:24714b45cd1b 248
xorjoep 1:24714b45cd1b 249 /* Decrement the loop counter */
xorjoep 1:24714b45cd1b 250 k--;
xorjoep 1:24714b45cd1b 251 }
xorjoep 1:24714b45cd1b 252
xorjoep 1:24714b45cd1b 253 /* Store the result in the accumulator in the destination buffer. */
xorjoep 1:24714b45cd1b 254 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
xorjoep 1:24714b45cd1b 255
xorjoep 1:24714b45cd1b 256 /* Update the inputA and inputB pointers for next MAC calculation */
xorjoep 1:24714b45cd1b 257 py = ++pSrc2 - 1U;
xorjoep 1:24714b45cd1b 258 px = pIn1;
xorjoep 1:24714b45cd1b 259
xorjoep 1:24714b45cd1b 260 /* Increment the MAC count */
xorjoep 1:24714b45cd1b 261 count++;
xorjoep 1:24714b45cd1b 262
xorjoep 1:24714b45cd1b 263 /* Decrement the loop counter */
xorjoep 1:24714b45cd1b 264 blockSize1--;
xorjoep 1:24714b45cd1b 265 }
xorjoep 1:24714b45cd1b 266
xorjoep 1:24714b45cd1b 267 /* --------------------------
xorjoep 1:24714b45cd1b 268 * Initializations of stage2
xorjoep 1:24714b45cd1b 269 * ------------------------*/
xorjoep 1:24714b45cd1b 270
xorjoep 1:24714b45cd1b 271 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
xorjoep 1:24714b45cd1b 272 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
xorjoep 1:24714b45cd1b 273 * ....
xorjoep 1:24714b45cd1b 274 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
xorjoep 1:24714b45cd1b 275 */
xorjoep 1:24714b45cd1b 276
xorjoep 1:24714b45cd1b 277 /* Working pointer of inputA */
xorjoep 1:24714b45cd1b 278 if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
xorjoep 1:24714b45cd1b 279 {
xorjoep 1:24714b45cd1b 280 px = pIn1 + firstIndex - srcBLen + 1;
xorjoep 1:24714b45cd1b 281 }
xorjoep 1:24714b45cd1b 282 else
xorjoep 1:24714b45cd1b 283 {
xorjoep 1:24714b45cd1b 284 px = pIn1;
xorjoep 1:24714b45cd1b 285 }
xorjoep 1:24714b45cd1b 286
xorjoep 1:24714b45cd1b 287 /* Working pointer of inputB */
xorjoep 1:24714b45cd1b 288 pSrc2 = pIn2 + (srcBLen - 1U);
xorjoep 1:24714b45cd1b 289 py = pSrc2;
xorjoep 1:24714b45cd1b 290
xorjoep 1:24714b45cd1b 291 /* count is the index by which the pointer pIn1 to be incremented */
xorjoep 1:24714b45cd1b 292 count = 0U;
xorjoep 1:24714b45cd1b 293
xorjoep 1:24714b45cd1b 294
xorjoep 1:24714b45cd1b 295 /* --------------------
xorjoep 1:24714b45cd1b 296 * Stage2 process
xorjoep 1:24714b45cd1b 297 * -------------------*/
xorjoep 1:24714b45cd1b 298
xorjoep 1:24714b45cd1b 299 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
xorjoep 1:24714b45cd1b 300 * So, to loop unroll over blockSize2,
xorjoep 1:24714b45cd1b 301 * srcBLen should be greater than or equal to 4 */
xorjoep 1:24714b45cd1b 302 if (srcBLen >= 4U)
xorjoep 1:24714b45cd1b 303 {
xorjoep 1:24714b45cd1b 304 /* Loop unroll over blockSize2, by 4 */
xorjoep 1:24714b45cd1b 305 blkCnt = blockSize2 >> 2U;
xorjoep 1:24714b45cd1b 306
xorjoep 1:24714b45cd1b 307 while (blkCnt > 0U)
xorjoep 1:24714b45cd1b 308 {
xorjoep 1:24714b45cd1b 309 py = py - 1U;
xorjoep 1:24714b45cd1b 310
xorjoep 1:24714b45cd1b 311 /* Set all accumulators to zero */
xorjoep 1:24714b45cd1b 312 acc0 = 0;
xorjoep 1:24714b45cd1b 313 acc1 = 0;
xorjoep 1:24714b45cd1b 314 acc2 = 0;
xorjoep 1:24714b45cd1b 315 acc3 = 0;
xorjoep 1:24714b45cd1b 316
xorjoep 1:24714b45cd1b 317
xorjoep 1:24714b45cd1b 318 /* read x[0], x[1] samples */
xorjoep 1:24714b45cd1b 319 x0 = *__SIMD32(px);
xorjoep 1:24714b45cd1b 320 /* read x[1], x[2] samples */
xorjoep 1:24714b45cd1b 321 x1 = _SIMD32_OFFSET(px+1);
xorjoep 1:24714b45cd1b 322 px+= 2U;
xorjoep 1:24714b45cd1b 323
xorjoep 1:24714b45cd1b 324
xorjoep 1:24714b45cd1b 325 /* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep 1:24714b45cd1b 326 k = srcBLen >> 2U;
xorjoep 1:24714b45cd1b 327
xorjoep 1:24714b45cd1b 328 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep 1:24714b45cd1b 329 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep 1:24714b45cd1b 330 do
xorjoep 1:24714b45cd1b 331 {
xorjoep 1:24714b45cd1b 332 /* Read the last two inputB samples using SIMD:
xorjoep 1:24714b45cd1b 333 * y[srcBLen - 1] and y[srcBLen - 2] */
xorjoep 1:24714b45cd1b 334 c0 = *__SIMD32(py)--;
xorjoep 1:24714b45cd1b 335
xorjoep 1:24714b45cd1b 336 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
xorjoep 1:24714b45cd1b 337 acc0 = __SMLALDX(x0, c0, acc0);
xorjoep 1:24714b45cd1b 338
xorjoep 1:24714b45cd1b 339 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
xorjoep 1:24714b45cd1b 340 acc1 = __SMLALDX(x1, c0, acc1);
xorjoep 1:24714b45cd1b 341
xorjoep 1:24714b45cd1b 342 /* Read x[2], x[3] */
xorjoep 1:24714b45cd1b 343 x2 = *__SIMD32(px);
xorjoep 1:24714b45cd1b 344
xorjoep 1:24714b45cd1b 345 /* Read x[3], x[4] */
xorjoep 1:24714b45cd1b 346 x3 = _SIMD32_OFFSET(px+1);
xorjoep 1:24714b45cd1b 347
xorjoep 1:24714b45cd1b 348 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
xorjoep 1:24714b45cd1b 349 acc2 = __SMLALDX(x2, c0, acc2);
xorjoep 1:24714b45cd1b 350
xorjoep 1:24714b45cd1b 351 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
xorjoep 1:24714b45cd1b 352 acc3 = __SMLALDX(x3, c0, acc3);
xorjoep 1:24714b45cd1b 353
xorjoep 1:24714b45cd1b 354 /* Read y[srcBLen - 3] and y[srcBLen - 4] */
xorjoep 1:24714b45cd1b 355 c0 = *__SIMD32(py)--;
xorjoep 1:24714b45cd1b 356
xorjoep 1:24714b45cd1b 357 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
xorjoep 1:24714b45cd1b 358 acc0 = __SMLALDX(x2, c0, acc0);
xorjoep 1:24714b45cd1b 359
xorjoep 1:24714b45cd1b 360 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
xorjoep 1:24714b45cd1b 361 acc1 = __SMLALDX(x3, c0, acc1);
xorjoep 1:24714b45cd1b 362
xorjoep 1:24714b45cd1b 363 /* Read x[4], x[5] */
xorjoep 1:24714b45cd1b 364 x0 = _SIMD32_OFFSET(px+2);
xorjoep 1:24714b45cd1b 365
xorjoep 1:24714b45cd1b 366 /* Read x[5], x[6] */
xorjoep 1:24714b45cd1b 367 x1 = _SIMD32_OFFSET(px+3);
xorjoep 1:24714b45cd1b 368 px += 4U;
xorjoep 1:24714b45cd1b 369
xorjoep 1:24714b45cd1b 370 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
xorjoep 1:24714b45cd1b 371 acc2 = __SMLALDX(x0, c0, acc2);
xorjoep 1:24714b45cd1b 372
xorjoep 1:24714b45cd1b 373 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
xorjoep 1:24714b45cd1b 374 acc3 = __SMLALDX(x1, c0, acc3);
xorjoep 1:24714b45cd1b 375
xorjoep 1:24714b45cd1b 376 } while (--k);
xorjoep 1:24714b45cd1b 377
xorjoep 1:24714b45cd1b 378 /* For the next MAC operations, SIMD is not used
xorjoep 1:24714b45cd1b 379 * So, the 16 bit pointer if inputB, py is updated */
xorjoep 1:24714b45cd1b 380
xorjoep 1:24714b45cd1b 381 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
xorjoep 1:24714b45cd1b 382 ** No loop unrolling is used. */
xorjoep 1:24714b45cd1b 383 k = srcBLen % 0x4U;
xorjoep 1:24714b45cd1b 384
xorjoep 1:24714b45cd1b 385 if (k == 1U)
xorjoep 1:24714b45cd1b 386 {
xorjoep 1:24714b45cd1b 387 /* Read y[srcBLen - 5] */
xorjoep 1:24714b45cd1b 388 c0 = *(py+1);
xorjoep 1:24714b45cd1b 389
xorjoep 1:24714b45cd1b 390 #ifdef ARM_MATH_BIG_ENDIAN
xorjoep 1:24714b45cd1b 391
xorjoep 1:24714b45cd1b 392 c0 = c0 << 16U;
xorjoep 1:24714b45cd1b 393
xorjoep 1:24714b45cd1b 394 #else
xorjoep 1:24714b45cd1b 395
xorjoep 1:24714b45cd1b 396 c0 = c0 & 0x0000FFFF;
xorjoep 1:24714b45cd1b 397
xorjoep 1:24714b45cd1b 398 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
xorjoep 1:24714b45cd1b 399
xorjoep 1:24714b45cd1b 400 /* Read x[7] */
xorjoep 1:24714b45cd1b 401 x3 = *__SIMD32(px);
xorjoep 1:24714b45cd1b 402 px++;
xorjoep 1:24714b45cd1b 403
xorjoep 1:24714b45cd1b 404 /* Perform the multiply-accumulates */
xorjoep 1:24714b45cd1b 405 acc0 = __SMLALD(x0, c0, acc0);
xorjoep 1:24714b45cd1b 406 acc1 = __SMLALD(x1, c0, acc1);
xorjoep 1:24714b45cd1b 407 acc2 = __SMLALDX(x1, c0, acc2);
xorjoep 1:24714b45cd1b 408 acc3 = __SMLALDX(x3, c0, acc3);
xorjoep 1:24714b45cd1b 409 }
xorjoep 1:24714b45cd1b 410
xorjoep 1:24714b45cd1b 411 if (k == 2U)
xorjoep 1:24714b45cd1b 412 {
xorjoep 1:24714b45cd1b 413 /* Read y[srcBLen - 5], y[srcBLen - 6] */
xorjoep 1:24714b45cd1b 414 c0 = _SIMD32_OFFSET(py);
xorjoep 1:24714b45cd1b 415
xorjoep 1:24714b45cd1b 416 /* Read x[7], x[8] */
xorjoep 1:24714b45cd1b 417 x3 = *__SIMD32(px);
xorjoep 1:24714b45cd1b 418
xorjoep 1:24714b45cd1b 419 /* Read x[9] */
xorjoep 1:24714b45cd1b 420 x2 = _SIMD32_OFFSET(px+1);
xorjoep 1:24714b45cd1b 421 px += 2U;
xorjoep 1:24714b45cd1b 422
xorjoep 1:24714b45cd1b 423 /* Perform the multiply-accumulates */
xorjoep 1:24714b45cd1b 424 acc0 = __SMLALDX(x0, c0, acc0);
xorjoep 1:24714b45cd1b 425 acc1 = __SMLALDX(x1, c0, acc1);
xorjoep 1:24714b45cd1b 426 acc2 = __SMLALDX(x3, c0, acc2);
xorjoep 1:24714b45cd1b 427 acc3 = __SMLALDX(x2, c0, acc3);
xorjoep 1:24714b45cd1b 428 }
xorjoep 1:24714b45cd1b 429
xorjoep 1:24714b45cd1b 430 if (k == 3U)
xorjoep 1:24714b45cd1b 431 {
xorjoep 1:24714b45cd1b 432 /* Read y[srcBLen - 5], y[srcBLen - 6] */
xorjoep 1:24714b45cd1b 433 c0 = _SIMD32_OFFSET(py);
xorjoep 1:24714b45cd1b 434
xorjoep 1:24714b45cd1b 435 /* Read x[7], x[8] */
xorjoep 1:24714b45cd1b 436 x3 = *__SIMD32(px);
xorjoep 1:24714b45cd1b 437
xorjoep 1:24714b45cd1b 438 /* Read x[9] */
xorjoep 1:24714b45cd1b 439 x2 = _SIMD32_OFFSET(px+1);
xorjoep 1:24714b45cd1b 440
xorjoep 1:24714b45cd1b 441 /* Perform the multiply-accumulates */
xorjoep 1:24714b45cd1b 442 acc0 = __SMLALDX(x0, c0, acc0);
xorjoep 1:24714b45cd1b 443 acc1 = __SMLALDX(x1, c0, acc1);
xorjoep 1:24714b45cd1b 444 acc2 = __SMLALDX(x3, c0, acc2);
xorjoep 1:24714b45cd1b 445 acc3 = __SMLALDX(x2, c0, acc3);
xorjoep 1:24714b45cd1b 446
xorjoep 1:24714b45cd1b 447 c0 = *(py-1);
xorjoep 1:24714b45cd1b 448
xorjoep 1:24714b45cd1b 449 #ifdef ARM_MATH_BIG_ENDIAN
xorjoep 1:24714b45cd1b 450
xorjoep 1:24714b45cd1b 451 c0 = c0 << 16U;
xorjoep 1:24714b45cd1b 452 #else
xorjoep 1:24714b45cd1b 453
xorjoep 1:24714b45cd1b 454 c0 = c0 & 0x0000FFFF;
xorjoep 1:24714b45cd1b 455 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
xorjoep 1:24714b45cd1b 456
xorjoep 1:24714b45cd1b 457 /* Read x[10] */
xorjoep 1:24714b45cd1b 458 x3 = _SIMD32_OFFSET(px+2);
xorjoep 1:24714b45cd1b 459 px += 3U;
xorjoep 1:24714b45cd1b 460
xorjoep 1:24714b45cd1b 461 /* Perform the multiply-accumulates */
xorjoep 1:24714b45cd1b 462 acc0 = __SMLALDX(x1, c0, acc0);
xorjoep 1:24714b45cd1b 463 acc1 = __SMLALD(x2, c0, acc1);
xorjoep 1:24714b45cd1b 464 acc2 = __SMLALDX(x2, c0, acc2);
xorjoep 1:24714b45cd1b 465 acc3 = __SMLALDX(x3, c0, acc3);
xorjoep 1:24714b45cd1b 466 }
xorjoep 1:24714b45cd1b 467
xorjoep 1:24714b45cd1b 468
xorjoep 1:24714b45cd1b 469 /* Store the results in the accumulators in the destination buffer. */
xorjoep 1:24714b45cd1b 470
xorjoep 1:24714b45cd1b 471 #ifndef ARM_MATH_BIG_ENDIAN
xorjoep 1:24714b45cd1b 472
xorjoep 1:24714b45cd1b 473 *__SIMD32(pOut)++ =
xorjoep 1:24714b45cd1b 474 __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
xorjoep 1:24714b45cd1b 475 *__SIMD32(pOut)++ =
xorjoep 1:24714b45cd1b 476 __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
xorjoep 1:24714b45cd1b 477
xorjoep 1:24714b45cd1b 478 #else
xorjoep 1:24714b45cd1b 479
xorjoep 1:24714b45cd1b 480 *__SIMD32(pOut)++ =
xorjoep 1:24714b45cd1b 481 __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
xorjoep 1:24714b45cd1b 482 *__SIMD32(pOut)++ =
xorjoep 1:24714b45cd1b 483 __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
xorjoep 1:24714b45cd1b 484
xorjoep 1:24714b45cd1b 485 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
xorjoep 1:24714b45cd1b 486
xorjoep 1:24714b45cd1b 487 /* Increment the pointer pIn1 index, count by 4 */
xorjoep 1:24714b45cd1b 488 count += 4U;
xorjoep 1:24714b45cd1b 489
xorjoep 1:24714b45cd1b 490 /* Update the inputA and inputB pointers for next MAC calculation */
xorjoep 1:24714b45cd1b 491 if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
xorjoep 1:24714b45cd1b 492 {
xorjoep 1:24714b45cd1b 493 px = pIn1 + firstIndex - srcBLen + 1 + count;
xorjoep 1:24714b45cd1b 494 }
xorjoep 1:24714b45cd1b 495 else
xorjoep 1:24714b45cd1b 496 {
xorjoep 1:24714b45cd1b 497 px = pIn1 + count;
xorjoep 1:24714b45cd1b 498 }
xorjoep 1:24714b45cd1b 499 py = pSrc2;
xorjoep 1:24714b45cd1b 500
xorjoep 1:24714b45cd1b 501 /* Decrement the loop counter */
xorjoep 1:24714b45cd1b 502 blkCnt--;
xorjoep 1:24714b45cd1b 503 }
xorjoep 1:24714b45cd1b 504
xorjoep 1:24714b45cd1b 505 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
xorjoep 1:24714b45cd1b 506 ** No loop unrolling is used. */
xorjoep 1:24714b45cd1b 507 blkCnt = (uint32_t) blockSize2 % 0x4U;
xorjoep 1:24714b45cd1b 508
xorjoep 1:24714b45cd1b 509 while (blkCnt > 0U)
xorjoep 1:24714b45cd1b 510 {
xorjoep 1:24714b45cd1b 511 /* Accumulator is made zero for every iteration */
xorjoep 1:24714b45cd1b 512 sum = 0;
xorjoep 1:24714b45cd1b 513
xorjoep 1:24714b45cd1b 514 /* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep 1:24714b45cd1b 515 k = srcBLen >> 2U;
xorjoep 1:24714b45cd1b 516
xorjoep 1:24714b45cd1b 517 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep 1:24714b45cd1b 518 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep 1:24714b45cd1b 519 while (k > 0U)
xorjoep 1:24714b45cd1b 520 {
xorjoep 1:24714b45cd1b 521 /* Perform the multiply-accumulates */
xorjoep 1:24714b45cd1b 522 sum += (q63_t) ((q31_t) * px++ * *py--);
xorjoep 1:24714b45cd1b 523 sum += (q63_t) ((q31_t) * px++ * *py--);
xorjoep 1:24714b45cd1b 524 sum += (q63_t) ((q31_t) * px++ * *py--);
xorjoep 1:24714b45cd1b 525 sum += (q63_t) ((q31_t) * px++ * *py--);
xorjoep 1:24714b45cd1b 526
xorjoep 1:24714b45cd1b 527 /* Decrement the loop counter */
xorjoep 1:24714b45cd1b 528 k--;
xorjoep 1:24714b45cd1b 529 }
xorjoep 1:24714b45cd1b 530
xorjoep 1:24714b45cd1b 531 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
xorjoep 1:24714b45cd1b 532 ** No loop unrolling is used. */
xorjoep 1:24714b45cd1b 533 k = srcBLen % 0x4U;
xorjoep 1:24714b45cd1b 534
xorjoep 1:24714b45cd1b 535 while (k > 0U)
xorjoep 1:24714b45cd1b 536 {
xorjoep 1:24714b45cd1b 537 /* Perform the multiply-accumulates */
xorjoep 1:24714b45cd1b 538 sum += (q63_t) ((q31_t) * px++ * *py--);
xorjoep 1:24714b45cd1b 539
xorjoep 1:24714b45cd1b 540 /* Decrement the loop counter */
xorjoep 1:24714b45cd1b 541 k--;
xorjoep 1:24714b45cd1b 542 }
xorjoep 1:24714b45cd1b 543
xorjoep 1:24714b45cd1b 544 /* Store the result in the accumulator in the destination buffer. */
xorjoep 1:24714b45cd1b 545 *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
xorjoep 1:24714b45cd1b 546
xorjoep 1:24714b45cd1b 547 /* Increment the pointer pIn1 index, count by 1 */
xorjoep 1:24714b45cd1b 548 count++;
xorjoep 1:24714b45cd1b 549
xorjoep 1:24714b45cd1b 550 /* Update the inputA and inputB pointers for next MAC calculation */
xorjoep 1:24714b45cd1b 551 if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
xorjoep 1:24714b45cd1b 552 {
xorjoep 1:24714b45cd1b 553 px = pIn1 + firstIndex - srcBLen + 1 + count;
xorjoep 1:24714b45cd1b 554 }
xorjoep 1:24714b45cd1b 555 else
xorjoep 1:24714b45cd1b 556 {
xorjoep 1:24714b45cd1b 557 px = pIn1 + count;
xorjoep 1:24714b45cd1b 558 }
xorjoep 1:24714b45cd1b 559 py = pSrc2;
xorjoep 1:24714b45cd1b 560
xorjoep 1:24714b45cd1b 561 /* Decrement the loop counter */
xorjoep 1:24714b45cd1b 562 blkCnt--;
xorjoep 1:24714b45cd1b 563 }
xorjoep 1:24714b45cd1b 564 }
xorjoep 1:24714b45cd1b 565 else
xorjoep 1:24714b45cd1b 566 {
xorjoep 1:24714b45cd1b 567 /* If the srcBLen is not a multiple of 4,
xorjoep 1:24714b45cd1b 568 * the blockSize2 loop cannot be unrolled by 4 */
xorjoep 1:24714b45cd1b 569 blkCnt = (uint32_t) blockSize2;
xorjoep 1:24714b45cd1b 570
xorjoep 1:24714b45cd1b 571 while (blkCnt > 0U)
xorjoep 1:24714b45cd1b 572 {
xorjoep 1:24714b45cd1b 573 /* Accumulator is made zero for every iteration */
xorjoep 1:24714b45cd1b 574 sum = 0;
xorjoep 1:24714b45cd1b 575
xorjoep 1:24714b45cd1b 576 /* srcBLen number of MACS should be performed */
xorjoep 1:24714b45cd1b 577 k = srcBLen;
xorjoep 1:24714b45cd1b 578
xorjoep 1:24714b45cd1b 579 while (k > 0U)
xorjoep 1:24714b45cd1b 580 {
xorjoep 1:24714b45cd1b 581 /* Perform the multiply-accumulate */
xorjoep 1:24714b45cd1b 582 sum += (q63_t) ((q31_t) * px++ * *py--);
xorjoep 1:24714b45cd1b 583
xorjoep 1:24714b45cd1b 584 /* Decrement the loop counter */
xorjoep 1:24714b45cd1b 585 k--;
xorjoep 1:24714b45cd1b 586 }
xorjoep 1:24714b45cd1b 587
xorjoep 1:24714b45cd1b 588 /* Store the result in the accumulator in the destination buffer. */
xorjoep 1:24714b45cd1b 589 *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
xorjoep 1:24714b45cd1b 590
xorjoep 1:24714b45cd1b 591 /* Increment the MAC count */
xorjoep 1:24714b45cd1b 592 count++;
xorjoep 1:24714b45cd1b 593
xorjoep 1:24714b45cd1b 594 /* Update the inputA and inputB pointers for next MAC calculation */
xorjoep 1:24714b45cd1b 595 if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
xorjoep 1:24714b45cd1b 596 {
xorjoep 1:24714b45cd1b 597 px = pIn1 + firstIndex - srcBLen + 1 + count;
xorjoep 1:24714b45cd1b 598 }
xorjoep 1:24714b45cd1b 599 else
xorjoep 1:24714b45cd1b 600 {
xorjoep 1:24714b45cd1b 601 px = pIn1 + count;
xorjoep 1:24714b45cd1b 602 }
xorjoep 1:24714b45cd1b 603 py = pSrc2;
xorjoep 1:24714b45cd1b 604
xorjoep 1:24714b45cd1b 605 /* Decrement the loop counter */
xorjoep 1:24714b45cd1b 606 blkCnt--;
xorjoep 1:24714b45cd1b 607 }
xorjoep 1:24714b45cd1b 608 }
xorjoep 1:24714b45cd1b 609
xorjoep 1:24714b45cd1b 610
xorjoep 1:24714b45cd1b 611 /* --------------------------
xorjoep 1:24714b45cd1b 612 * Initializations of stage3
xorjoep 1:24714b45cd1b 613 * -------------------------*/
xorjoep 1:24714b45cd1b 614
xorjoep 1:24714b45cd1b 615 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
xorjoep 1:24714b45cd1b 616 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
xorjoep 1:24714b45cd1b 617 * ....
xorjoep 1:24714b45cd1b 618 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
xorjoep 1:24714b45cd1b 619 * sum += x[srcALen-1] * y[srcBLen-1]
xorjoep 1:24714b45cd1b 620 */
xorjoep 1:24714b45cd1b 621
xorjoep 1:24714b45cd1b 622 /* In this stage the MAC operations are decreased by 1 for every iteration.
xorjoep 1:24714b45cd1b 623 The count variable holds the number of MAC operations performed */
xorjoep 1:24714b45cd1b 624 count = srcBLen - 1U;
xorjoep 1:24714b45cd1b 625
xorjoep 1:24714b45cd1b 626 /* Working pointer of inputA */
xorjoep 1:24714b45cd1b 627 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
xorjoep 1:24714b45cd1b 628 px = pSrc1;
xorjoep 1:24714b45cd1b 629
xorjoep 1:24714b45cd1b 630 /* Working pointer of inputB */
xorjoep 1:24714b45cd1b 631 pSrc2 = pIn2 + (srcBLen - 1U);
xorjoep 1:24714b45cd1b 632 pIn2 = pSrc2 - 1U;
xorjoep 1:24714b45cd1b 633 py = pIn2;
xorjoep 1:24714b45cd1b 634
xorjoep 1:24714b45cd1b 635 /* -------------------
xorjoep 1:24714b45cd1b 636 * Stage3 process
xorjoep 1:24714b45cd1b 637 * ------------------*/
xorjoep 1:24714b45cd1b 638
xorjoep 1:24714b45cd1b 639 /* For loop unrolling by 4, this stage is divided into two. */
xorjoep 1:24714b45cd1b 640 /* First part of this stage computes the MAC operations greater than 4 */
xorjoep 1:24714b45cd1b 641 /* Second part of this stage computes the MAC operations less than or equal to 4 */
xorjoep 1:24714b45cd1b 642
xorjoep 1:24714b45cd1b 643 /* The first part of the stage starts here */
xorjoep 1:24714b45cd1b 644 j = count >> 2U;
xorjoep 1:24714b45cd1b 645
xorjoep 1:24714b45cd1b 646 while ((j > 0U) && (blockSize3 > 0))
xorjoep 1:24714b45cd1b 647 {
xorjoep 1:24714b45cd1b 648 /* Accumulator is made zero for every iteration */
xorjoep 1:24714b45cd1b 649 sum = 0;
xorjoep 1:24714b45cd1b 650
xorjoep 1:24714b45cd1b 651 /* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep 1:24714b45cd1b 652 k = count >> 2U;
xorjoep 1:24714b45cd1b 653
xorjoep 1:24714b45cd1b 654 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep 1:24714b45cd1b 655 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep 1:24714b45cd1b 656 while (k > 0U)
xorjoep 1:24714b45cd1b 657 {
xorjoep 1:24714b45cd1b 658 /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
xorjoep 1:24714b45cd1b 659 * with y[srcBLen - 1], y[srcBLen - 2] respectively */
xorjoep 1:24714b45cd1b 660 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
xorjoep 1:24714b45cd1b 661 /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
xorjoep 1:24714b45cd1b 662 * with y[srcBLen - 3], y[srcBLen - 4] respectively */
xorjoep 1:24714b45cd1b 663 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
xorjoep 1:24714b45cd1b 664
xorjoep 1:24714b45cd1b 665 /* Decrement the loop counter */
xorjoep 1:24714b45cd1b 666 k--;
xorjoep 1:24714b45cd1b 667 }
xorjoep 1:24714b45cd1b 668
xorjoep 1:24714b45cd1b 669 /* For the next MAC operations, the pointer py is used without SIMD
xorjoep 1:24714b45cd1b 670 * So, py is incremented by 1 */
xorjoep 1:24714b45cd1b 671 py = py + 1U;
xorjoep 1:24714b45cd1b 672
xorjoep 1:24714b45cd1b 673 /* If the count is not a multiple of 4, compute any remaining MACs here.
xorjoep 1:24714b45cd1b 674 ** No loop unrolling is used. */
xorjoep 1:24714b45cd1b 675 k = count % 0x4U;
xorjoep 1:24714b45cd1b 676
xorjoep 1:24714b45cd1b 677 while (k > 0U)
xorjoep 1:24714b45cd1b 678 {
xorjoep 1:24714b45cd1b 679 /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
xorjoep 1:24714b45cd1b 680 sum = __SMLALD(*px++, *py--, sum);
xorjoep 1:24714b45cd1b 681
xorjoep 1:24714b45cd1b 682 /* Decrement the loop counter */
xorjoep 1:24714b45cd1b 683 k--;
xorjoep 1:24714b45cd1b 684 }
xorjoep 1:24714b45cd1b 685
xorjoep 1:24714b45cd1b 686 /* Store the result in the accumulator in the destination buffer. */
xorjoep 1:24714b45cd1b 687 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
xorjoep 1:24714b45cd1b 688
xorjoep 1:24714b45cd1b 689 /* Update the inputA and inputB pointers for next MAC calculation */
xorjoep 1:24714b45cd1b 690 px = ++pSrc1;
xorjoep 1:24714b45cd1b 691 py = pIn2;
xorjoep 1:24714b45cd1b 692
xorjoep 1:24714b45cd1b 693 /* Decrement the MAC count */
xorjoep 1:24714b45cd1b 694 count--;
xorjoep 1:24714b45cd1b 695
xorjoep 1:24714b45cd1b 696 /* Decrement the loop counter */
xorjoep 1:24714b45cd1b 697 blockSize3--;
xorjoep 1:24714b45cd1b 698
xorjoep 1:24714b45cd1b 699 j--;
xorjoep 1:24714b45cd1b 700 }
xorjoep 1:24714b45cd1b 701
xorjoep 1:24714b45cd1b 702 /* The second part of the stage starts here */
xorjoep 1:24714b45cd1b 703 /* SIMD is not used for the next MAC operations,
xorjoep 1:24714b45cd1b 704 * so pointer py is updated to read only one sample at a time */
xorjoep 1:24714b45cd1b 705 py = py + 1U;
xorjoep 1:24714b45cd1b 706
xorjoep 1:24714b45cd1b 707 while (blockSize3 > 0)
xorjoep 1:24714b45cd1b 708 {
xorjoep 1:24714b45cd1b 709 /* Accumulator is made zero for every iteration */
xorjoep 1:24714b45cd1b 710 sum = 0;
xorjoep 1:24714b45cd1b 711
xorjoep 1:24714b45cd1b 712 /* No loop unrolling in this part: the remaining count MACs are computed one at a time. */
xorjoep 1:24714b45cd1b 713 k = count;
xorjoep 1:24714b45cd1b 714
xorjoep 1:24714b45cd1b 715 while (k > 0U)
xorjoep 1:24714b45cd1b 716 {
xorjoep 1:24714b45cd1b 717 /* Perform the multiply-accumulates */
xorjoep 1:24714b45cd1b 718 /* sum += x[srcALen-1] * y[srcBLen-1] */
xorjoep 1:24714b45cd1b 719 sum = __SMLALD(*px++, *py--, sum);
xorjoep 1:24714b45cd1b 720
xorjoep 1:24714b45cd1b 721 /* Decrement the loop counter */
xorjoep 1:24714b45cd1b 722 k--;
xorjoep 1:24714b45cd1b 723 }
xorjoep 1:24714b45cd1b 724
xorjoep 1:24714b45cd1b 725 /* Store the result in the accumulator in the destination buffer. */
xorjoep 1:24714b45cd1b 726 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
xorjoep 1:24714b45cd1b 727
xorjoep 1:24714b45cd1b 728 /* Update the inputA and inputB pointers for next MAC calculation */
xorjoep 1:24714b45cd1b 729 px = ++pSrc1;
xorjoep 1:24714b45cd1b 730 py = pSrc2;
xorjoep 1:24714b45cd1b 731
xorjoep 1:24714b45cd1b 732 /* Decrement the MAC count */
xorjoep 1:24714b45cd1b 733 count--;
xorjoep 1:24714b45cd1b 734
xorjoep 1:24714b45cd1b 735 /* Decrement the loop counter */
xorjoep 1:24714b45cd1b 736 blockSize3--;
xorjoep 1:24714b45cd1b 737 }
xorjoep 1:24714b45cd1b 738
xorjoep 1:24714b45cd1b 739 /* set status as ARM_MATH_SUCCESS */
xorjoep 1:24714b45cd1b 740 status = ARM_MATH_SUCCESS;
xorjoep 1:24714b45cd1b 741 }
xorjoep 1:24714b45cd1b 742
xorjoep 1:24714b45cd1b 743 /* Return to application */
xorjoep 1:24714b45cd1b 744 return (status);
xorjoep 1:24714b45cd1b 745
xorjoep 1:24714b45cd1b 746 #else
xorjoep 1:24714b45cd1b 747
xorjoep 1:24714b45cd1b 748 /* Run the below code for Cortex-M0 */
xorjoep 1:24714b45cd1b 749
xorjoep 1:24714b45cd1b 750 q15_t *pIn1 = pSrcA; /* inputA pointer */
xorjoep 1:24714b45cd1b 751 q15_t *pIn2 = pSrcB; /* inputB pointer */
xorjoep 1:24714b45cd1b 752 q63_t sum; /* Accumulator */
xorjoep 1:24714b45cd1b 753 uint32_t i, j; /* loop counters */
xorjoep 1:24714b45cd1b 754 arm_status status; /* status of Partial convolution */
xorjoep 1:24714b45cd1b 755
xorjoep 1:24714b45cd1b 756 /* Check for range of output samples to be calculated */
xorjoep 1:24714b45cd1b 757 if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
xorjoep 1:24714b45cd1b 758 {
xorjoep 1:24714b45cd1b 759 /* Set status as ARM_MATH_ARGUMENT_ERROR */
xorjoep 1:24714b45cd1b 760 status = ARM_MATH_ARGUMENT_ERROR;
xorjoep 1:24714b45cd1b 761 }
xorjoep 1:24714b45cd1b 762 else
xorjoep 1:24714b45cd1b 763 {
xorjoep 1:24714b45cd1b 764 /* Loop to calculate convolution for output length number of values */
xorjoep 1:24714b45cd1b 765 for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
xorjoep 1:24714b45cd1b 766 {
xorjoep 1:24714b45cd1b 767 /* Initialize sum with zero to carry on MAC operations */
xorjoep 1:24714b45cd1b 768 sum = 0;
xorjoep 1:24714b45cd1b 769
xorjoep 1:24714b45cd1b 770 /* Loop to perform MAC operations according to convolution equation */
xorjoep 1:24714b45cd1b 771 for (j = 0; j <= i; j++)
xorjoep 1:24714b45cd1b 772 {
xorjoep 1:24714b45cd1b 773 /* Check the array limitations */
xorjoep 1:24714b45cd1b 774 if (((i - j) < srcBLen) && (j < srcALen))
xorjoep 1:24714b45cd1b 775 {
xorjoep 1:24714b45cd1b 776 /* z[i] += x[j] * y[i-j] */
xorjoep 1:24714b45cd1b 777 sum += ((q31_t) pIn1[j] * (pIn2[i - j]));
xorjoep 1:24714b45cd1b 778 }
xorjoep 1:24714b45cd1b 779 }
xorjoep 1:24714b45cd1b 780
xorjoep 1:24714b45cd1b 781 /* Store the output in the destination buffer */
xorjoep 1:24714b45cd1b 782 pDst[i] = (q15_t) __SSAT((sum >> 15U), 16U);
xorjoep 1:24714b45cd1b 783 }
xorjoep 1:24714b45cd1b 784 /* Set status as ARM_MATH_SUCCESS as there are no argument errors */
xorjoep 1:24714b45cd1b 785 status = ARM_MATH_SUCCESS;
xorjoep 1:24714b45cd1b 786 }
xorjoep 1:24714b45cd1b 787 return (status);
xorjoep 1:24714b45cd1b 788
xorjoep 1:24714b45cd1b 789 #endif /* #if (defined(ARM_MATH_CM7) || defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) */
xorjoep 1:24714b45cd1b 790
xorjoep 1:24714b45cd1b 791 }
xorjoep 1:24714b45cd1b 792
xorjoep 1:24714b45cd1b 793 /**
xorjoep 1:24714b45cd1b 794 * @} end of PartialConv group
xorjoep 1:24714b45cd1b 795 */