The CMSIS DSP 5 library

Dependents:   Nucleo-Heart-Rate ejercicioVrms2 PROYECTOFINAL ejercicioVrms ... more

Committer:
xorjoep
Date:
Thu Jun 21 11:56:27 2018 +0000
Revision:
3:4098b9d3d571
Parent:
1:24714b45cd1b
headers is a folder not a library

Who changed what in which revision?

User | Revision | Line number | New contents of line
xorjoep 1:24714b45cd1b 1 /* ----------------------------------------------------------------------
xorjoep 1:24714b45cd1b 2 * Project: CMSIS DSP Library
xorjoep 1:24714b45cd1b 3 * Title: arm_conv_partial_f32.c
xorjoep 1:24714b45cd1b 4 * Description: Partial convolution of floating-point sequences
xorjoep 1:24714b45cd1b 5 *
xorjoep 1:24714b45cd1b 6 * $Date: 27. January 2017
xorjoep 1:24714b45cd1b 7 * $Revision: V.1.5.1
xorjoep 1:24714b45cd1b 8 *
xorjoep 1:24714b45cd1b 9 * Target Processor: Cortex-M cores
xorjoep 1:24714b45cd1b 10 * -------------------------------------------------------------------- */
xorjoep 1:24714b45cd1b 11 /*
xorjoep 1:24714b45cd1b 12 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
xorjoep 1:24714b45cd1b 13 *
xorjoep 1:24714b45cd1b 14 * SPDX-License-Identifier: Apache-2.0
xorjoep 1:24714b45cd1b 15 *
xorjoep 1:24714b45cd1b 16 * Licensed under the Apache License, Version 2.0 (the License); you may
xorjoep 1:24714b45cd1b 17 * not use this file except in compliance with the License.
xorjoep 1:24714b45cd1b 18 * You may obtain a copy of the License at
xorjoep 1:24714b45cd1b 19 *
xorjoep 1:24714b45cd1b 20 * www.apache.org/licenses/LICENSE-2.0
xorjoep 1:24714b45cd1b 21 *
xorjoep 1:24714b45cd1b 22 * Unless required by applicable law or agreed to in writing, software
xorjoep 1:24714b45cd1b 23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
xorjoep 1:24714b45cd1b 24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
xorjoep 1:24714b45cd1b 25 * See the License for the specific language governing permissions and
xorjoep 1:24714b45cd1b 26 * limitations under the License.
xorjoep 1:24714b45cd1b 27 */
xorjoep 1:24714b45cd1b 28
xorjoep 1:24714b45cd1b 29 #include "arm_math.h"
xorjoep 1:24714b45cd1b 30
xorjoep 1:24714b45cd1b 31 /**
xorjoep 1:24714b45cd1b 32 * @ingroup groupFilters
xorjoep 1:24714b45cd1b 33 */
xorjoep 1:24714b45cd1b 34
xorjoep 1:24714b45cd1b 35 /**
xorjoep 1:24714b45cd1b 36 * @defgroup PartialConv Partial Convolution
xorjoep 1:24714b45cd1b 37 *
xorjoep 1:24714b45cd1b 38 * Partial Convolution is equivalent to Convolution except that a subset of the output samples is generated.
xorjoep 1:24714b45cd1b 39 * Each function has two additional arguments.
xorjoep 1:24714b45cd1b 40 * <code>firstIndex</code> specifies the starting index of the subset of output samples.
xorjoep 1:24714b45cd1b 41 * <code>numPoints</code> is the number of output samples to compute.
xorjoep 1:24714b45cd1b 42 * The function computes the output in the range
xorjoep 1:24714b45cd1b 43 * <code>[firstIndex, ..., firstIndex+numPoints-1]</code>.
xorjoep 1:24714b45cd1b 44 * The output array <code>pDst</code> contains <code>numPoints</code> values.
xorjoep 1:24714b45cd1b 45 *
xorjoep 1:24714b45cd1b 46 * The allowable range of output indices is [0 srcALen+srcBLen-2].
xorjoep 1:24714b45cd1b 47 * If the requested subset does not fall in this range then the functions return ARM_MATH_ARGUMENT_ERROR.
xorjoep 1:24714b45cd1b 48 * Otherwise the functions return ARM_MATH_SUCCESS.
xorjoep 1:24714b45cd1b 49 * \note Refer to arm_conv_f32() for details on fixed point behavior.
xorjoep 1:24714b45cd1b 50 *
xorjoep 1:24714b45cd1b 51 *
xorjoep 1:24714b45cd1b 52 * <b>Fast Versions</b>
xorjoep 1:24714b45cd1b 53 *
xorjoep 1:24714b45cd1b 54 * \par
xorjoep 1:24714b45cd1b 55 * Fast versions of partial convolution are provided for Q31 and Q15. They take fewer cycles than the standard Q31 and Q15 versions, but
xorjoep 1:24714b45cd1b 56 * they require the input signals to be scaled down to avoid intermediate overflows.
xorjoep 1:24714b45cd1b 57 *
xorjoep 1:24714b45cd1b 58 *
xorjoep 1:24714b45cd1b 59 * <b>Opt Versions</b>
xorjoep 1:24714b45cd1b 60 *
xorjoep 1:24714b45cd1b 61 * \par
xorjoep 1:24714b45cd1b 62 * Opt versions are provided for Q15 and Q7. The design uses an internal scratch buffer to enable better optimisation.
xorjoep 1:24714b45cd1b 63 * These versions take fewer cycles but consume more memory (scratch memory) than the standard Q15 and Q7 versions of partial convolution.
xorjoep 1:24714b45cd1b 64 */
xorjoep 1:24714b45cd1b 65
xorjoep 1:24714b45cd1b 66 /**
xorjoep 1:24714b45cd1b 67 * @addtogroup PartialConv
xorjoep 1:24714b45cd1b 68 * @{
xorjoep 1:24714b45cd1b 69 */
xorjoep 1:24714b45cd1b 70
xorjoep 1:24714b45cd1b 71 /**
xorjoep 1:24714b45cd1b 72 * @brief Partial convolution of floating-point sequences.
xorjoep 1:24714b45cd1b 73 * @param[in] *pSrcA points to the first input sequence.
xorjoep 1:24714b45cd1b 74 * @param[in] srcALen length of the first input sequence.
xorjoep 1:24714b45cd1b 75 * @param[in] *pSrcB points to the second input sequence.
xorjoep 1:24714b45cd1b 76 * @param[in] srcBLen length of the second input sequence.
xorjoep 1:24714b45cd1b 77 * @param[out] *pDst points to the location where the output result is written.
xorjoep 1:24714b45cd1b 78 * @param[in] firstIndex is the first output sample to start with.
xorjoep 1:24714b45cd1b 79 * @param[in] numPoints is the number of output points to be computed.
xorjoep 1:24714b45cd1b 80 * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
xorjoep 1:24714b45cd1b 81 */
xorjoep 1:24714b45cd1b 82
xorjoep 1:24714b45cd1b 83 arm_status arm_conv_partial_f32(
xorjoep 1:24714b45cd1b 84 float32_t * pSrcA,
xorjoep 1:24714b45cd1b 85 uint32_t srcALen,
xorjoep 1:24714b45cd1b 86 float32_t * pSrcB,
xorjoep 1:24714b45cd1b 87 uint32_t srcBLen,
xorjoep 1:24714b45cd1b 88 float32_t * pDst,
xorjoep 1:24714b45cd1b 89 uint32_t firstIndex,
xorjoep 1:24714b45cd1b 90 uint32_t numPoints)
xorjoep 1:24714b45cd1b 91 {
xorjoep 1:24714b45cd1b 92
xorjoep 1:24714b45cd1b 93
xorjoep 1:24714b45cd1b 94 #if defined (ARM_MATH_DSP)
xorjoep 1:24714b45cd1b 95
xorjoep 1:24714b45cd1b 96 /* Run the below code for Cortex-M4 and Cortex-M3 */
xorjoep 1:24714b45cd1b 97
xorjoep 1:24714b45cd1b 98 float32_t *pIn1 = pSrcA; /* inputA pointer */
xorjoep 1:24714b45cd1b 99 float32_t *pIn2 = pSrcB; /* inputB pointer */
xorjoep 1:24714b45cd1b 100 float32_t *pOut = pDst; /* output pointer */
xorjoep 1:24714b45cd1b 101 float32_t *px; /* Intermediate inputA pointer */
xorjoep 1:24714b45cd1b 102 float32_t *py; /* Intermediate inputB pointer */
xorjoep 1:24714b45cd1b 103 float32_t *pSrc1, *pSrc2; /* Intermediate pointers */
xorjoep 1:24714b45cd1b 104 float32_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
xorjoep 1:24714b45cd1b 105 float32_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
xorjoep 1:24714b45cd1b 106 uint32_t j, k, count = 0U, blkCnt, check;
xorjoep 1:24714b45cd1b 107 int32_t blockSize1, blockSize2, blockSize3; /* loop counters */
xorjoep 1:24714b45cd1b 108 arm_status status; /* status of Partial convolution */
xorjoep 1:24714b45cd1b 109
xorjoep 1:24714b45cd1b 110
xorjoep 1:24714b45cd1b 111 /* Check for range of output samples to be calculated */
xorjoep 1:24714b45cd1b 112 if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
xorjoep 1:24714b45cd1b 113 {
xorjoep 1:24714b45cd1b 114 /* Set status as ARM_MATH_ARGUMENT_ERROR */
xorjoep 1:24714b45cd1b 115 status = ARM_MATH_ARGUMENT_ERROR;
xorjoep 1:24714b45cd1b 116 }
xorjoep 1:24714b45cd1b 117 else
xorjoep 1:24714b45cd1b 118 {
xorjoep 1:24714b45cd1b 119
xorjoep 1:24714b45cd1b 120 /* The algorithm implementation is based on the lengths of the inputs. */
xorjoep 1:24714b45cd1b 121 /* srcB is always made to slide across srcA. */
xorjoep 1:24714b45cd1b 122 /* So srcBLen is always considered as shorter or equal to srcALen */
xorjoep 1:24714b45cd1b 123 if (srcALen >= srcBLen)
xorjoep 1:24714b45cd1b 124 {
xorjoep 1:24714b45cd1b 125 /* Initialization of inputA pointer */
xorjoep 1:24714b45cd1b 126 pIn1 = pSrcA;
xorjoep 1:24714b45cd1b 127
xorjoep 1:24714b45cd1b 128 /* Initialization of inputB pointer */
xorjoep 1:24714b45cd1b 129 pIn2 = pSrcB;
xorjoep 1:24714b45cd1b 130 }
xorjoep 1:24714b45cd1b 131 else
xorjoep 1:24714b45cd1b 132 {
xorjoep 1:24714b45cd1b 133 /* Initialization of inputA pointer */
xorjoep 1:24714b45cd1b 134 pIn1 = pSrcB;
xorjoep 1:24714b45cd1b 135
xorjoep 1:24714b45cd1b 136 /* Initialization of inputB pointer */
xorjoep 1:24714b45cd1b 137 pIn2 = pSrcA;
xorjoep 1:24714b45cd1b 138
xorjoep 1:24714b45cd1b 139 /* srcBLen is always considered as shorter or equal to srcALen */
xorjoep 1:24714b45cd1b 140 j = srcBLen;
xorjoep 1:24714b45cd1b 141 srcBLen = srcALen;
xorjoep 1:24714b45cd1b 142 srcALen = j;
xorjoep 1:24714b45cd1b 143 }
xorjoep 1:24714b45cd1b 144
xorjoep 1:24714b45cd1b 145 /* Conditions to check which loopCounter holds
xorjoep 1:24714b45cd1b 146 * the first and last indices of the output samples to be calculated. */
xorjoep 1:24714b45cd1b 147 check = firstIndex + numPoints;
xorjoep 1:24714b45cd1b 148 blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
xorjoep 1:24714b45cd1b 149 blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
xorjoep 1:24714b45cd1b 150 blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
xorjoep 1:24714b45cd1b 151 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 :
xorjoep 1:24714b45cd1b 152 (int32_t) numPoints) : 0;
xorjoep 1:24714b45cd1b 153 blockSize2 = ((int32_t) check - blockSize3) -
xorjoep 1:24714b45cd1b 154 (blockSize1 + (int32_t) firstIndex);
xorjoep 1:24714b45cd1b 155 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
xorjoep 1:24714b45cd1b 156
xorjoep 1:24714b45cd1b 157 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
xorjoep 1:24714b45cd1b 158 /* The function is internally
xorjoep 1:24714b45cd1b 159 * divided into three stages according to the number of multiplications that has to be
xorjoep 1:24714b45cd1b 160 * taken place between inputA samples and inputB samples. In the first stage of the
xorjoep 1:24714b45cd1b 161 * algorithm, the multiplications increase by one for every iteration.
xorjoep 1:24714b45cd1b 162 * In the second stage of the algorithm, srcBLen number of multiplications are done.
xorjoep 1:24714b45cd1b 163 * In the third stage of the algorithm, the multiplications decrease by one
xorjoep 1:24714b45cd1b 164 * for every iteration. */
xorjoep 1:24714b45cd1b 165
xorjoep 1:24714b45cd1b 166 /* Set the output pointer to point to the firstIndex
xorjoep 1:24714b45cd1b 167 * of the output sample to be calculated. */
xorjoep 1:24714b45cd1b 168 pOut = pDst + firstIndex;
xorjoep 1:24714b45cd1b 169
xorjoep 1:24714b45cd1b 170 /* --------------------------
xorjoep 1:24714b45cd1b 171 * Initializations of stage1
xorjoep 1:24714b45cd1b 172 * -------------------------*/
xorjoep 1:24714b45cd1b 173
xorjoep 1:24714b45cd1b 174 /* sum = x[0] * y[0]
xorjoep 1:24714b45cd1b 175 * sum = x[0] * y[1] + x[1] * y[0]
xorjoep 1:24714b45cd1b 176 * ....
xorjoep 1:24714b45cd1b 177 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
xorjoep 1:24714b45cd1b 178 */
xorjoep 1:24714b45cd1b 179
xorjoep 1:24714b45cd1b 180 /* In this stage the MAC operations are increased by 1 for every iteration.
xorjoep 1:24714b45cd1b 181 The count variable holds the number of MAC operations performed.
xorjoep 1:24714b45cd1b 182 Since the partial convolution starts from from firstIndex
xorjoep 1:24714b45cd1b 183 Number of Macs to be performed is firstIndex + 1 */
xorjoep 1:24714b45cd1b 184 count = 1U + firstIndex;
xorjoep 1:24714b45cd1b 185
xorjoep 1:24714b45cd1b 186 /* Working pointer of inputA */
xorjoep 1:24714b45cd1b 187 px = pIn1;
xorjoep 1:24714b45cd1b 188
xorjoep 1:24714b45cd1b 189 /* Working pointer of inputB */
xorjoep 1:24714b45cd1b 190 pSrc1 = pIn2 + firstIndex;
xorjoep 1:24714b45cd1b 191 py = pSrc1;
xorjoep 1:24714b45cd1b 192
xorjoep 1:24714b45cd1b 193 /* ------------------------
xorjoep 1:24714b45cd1b 194 * Stage1 process
xorjoep 1:24714b45cd1b 195 * ----------------------*/
xorjoep 1:24714b45cd1b 196
xorjoep 1:24714b45cd1b 197 /* The first stage starts here */
xorjoep 1:24714b45cd1b 198 while (blockSize1 > 0)
xorjoep 1:24714b45cd1b 199 {
xorjoep 1:24714b45cd1b 200 /* Accumulator is made zero for every iteration */
xorjoep 1:24714b45cd1b 201 sum = 0.0f;
xorjoep 1:24714b45cd1b 202
xorjoep 1:24714b45cd1b 203 /* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep 1:24714b45cd1b 204 k = count >> 2U;
xorjoep 1:24714b45cd1b 205
xorjoep 1:24714b45cd1b 206 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep 1:24714b45cd1b 207 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep 1:24714b45cd1b 208 while (k > 0U)
xorjoep 1:24714b45cd1b 209 {
xorjoep 1:24714b45cd1b 210 /* x[0] * y[srcBLen - 1] */
xorjoep 1:24714b45cd1b 211 sum += *px++ * *py--;
xorjoep 1:24714b45cd1b 212
xorjoep 1:24714b45cd1b 213 /* x[1] * y[srcBLen - 2] */
xorjoep 1:24714b45cd1b 214 sum += *px++ * *py--;
xorjoep 1:24714b45cd1b 215
xorjoep 1:24714b45cd1b 216 /* x[2] * y[srcBLen - 3] */
xorjoep 1:24714b45cd1b 217 sum += *px++ * *py--;
xorjoep 1:24714b45cd1b 218
xorjoep 1:24714b45cd1b 219 /* x[3] * y[srcBLen - 4] */
xorjoep 1:24714b45cd1b 220 sum += *px++ * *py--;
xorjoep 1:24714b45cd1b 221
xorjoep 1:24714b45cd1b 222 /* Decrement the loop counter */
xorjoep 1:24714b45cd1b 223 k--;
xorjoep 1:24714b45cd1b 224 }
xorjoep 1:24714b45cd1b 225
xorjoep 1:24714b45cd1b 226 /* If the count is not a multiple of 4, compute any remaining MACs here.
xorjoep 1:24714b45cd1b 227 ** No loop unrolling is used. */
xorjoep 1:24714b45cd1b 228 k = count % 0x4U;
xorjoep 1:24714b45cd1b 229
xorjoep 1:24714b45cd1b 230 while (k > 0U)
xorjoep 1:24714b45cd1b 231 {
xorjoep 1:24714b45cd1b 232 /* Perform the multiply-accumulates */
xorjoep 1:24714b45cd1b 233 sum += *px++ * *py--;
xorjoep 1:24714b45cd1b 234
xorjoep 1:24714b45cd1b 235 /* Decrement the loop counter */
xorjoep 1:24714b45cd1b 236 k--;
xorjoep 1:24714b45cd1b 237 }
xorjoep 1:24714b45cd1b 238
xorjoep 1:24714b45cd1b 239 /* Store the result in the accumulator in the destination buffer. */
xorjoep 1:24714b45cd1b 240 *pOut++ = sum;
xorjoep 1:24714b45cd1b 241
xorjoep 1:24714b45cd1b 242 /* Update the inputA and inputB pointers for next MAC calculation */
xorjoep 1:24714b45cd1b 243 py = ++pSrc1;
xorjoep 1:24714b45cd1b 244 px = pIn1;
xorjoep 1:24714b45cd1b 245
xorjoep 1:24714b45cd1b 246 /* Increment the MAC count */
xorjoep 1:24714b45cd1b 247 count++;
xorjoep 1:24714b45cd1b 248
xorjoep 1:24714b45cd1b 249 /* Decrement the loop counter */
xorjoep 1:24714b45cd1b 250 blockSize1--;
xorjoep 1:24714b45cd1b 251 }
xorjoep 1:24714b45cd1b 252
xorjoep 1:24714b45cd1b 253 /* --------------------------
xorjoep 1:24714b45cd1b 254 * Initializations of stage2
xorjoep 1:24714b45cd1b 255 * ------------------------*/
xorjoep 1:24714b45cd1b 256
xorjoep 1:24714b45cd1b 257 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
xorjoep 1:24714b45cd1b 258 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
xorjoep 1:24714b45cd1b 259 * ....
xorjoep 1:24714b45cd1b 260 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
xorjoep 1:24714b45cd1b 261 */
xorjoep 1:24714b45cd1b 262
xorjoep 1:24714b45cd1b 263 /* Working pointer of inputA */
xorjoep 1:24714b45cd1b 264 if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
xorjoep 1:24714b45cd1b 265 {
xorjoep 1:24714b45cd1b 266 px = pIn1 + firstIndex - srcBLen + 1;
xorjoep 1:24714b45cd1b 267 }
xorjoep 1:24714b45cd1b 268 else
xorjoep 1:24714b45cd1b 269 {
xorjoep 1:24714b45cd1b 270 px = pIn1;
xorjoep 1:24714b45cd1b 271 }
xorjoep 1:24714b45cd1b 272
xorjoep 1:24714b45cd1b 273 /* Working pointer of inputB */
xorjoep 1:24714b45cd1b 274 pSrc2 = pIn2 + (srcBLen - 1U);
xorjoep 1:24714b45cd1b 275 py = pSrc2;
xorjoep 1:24714b45cd1b 276
xorjoep 1:24714b45cd1b 277 /* count is index by which the pointer pIn1 to be incremented */
xorjoep 1:24714b45cd1b 278 count = 0U;
xorjoep 1:24714b45cd1b 279
xorjoep 1:24714b45cd1b 280 /* -------------------
xorjoep 1:24714b45cd1b 281 * Stage2 process
xorjoep 1:24714b45cd1b 282 * ------------------*/
xorjoep 1:24714b45cd1b 283
xorjoep 1:24714b45cd1b 284 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
xorjoep 1:24714b45cd1b 285 * So, to loop unroll over blockSize2,
xorjoep 1:24714b45cd1b 286 * srcBLen should be greater than or equal to 4 */
xorjoep 1:24714b45cd1b 287 if (srcBLen >= 4U)
xorjoep 1:24714b45cd1b 288 {
xorjoep 1:24714b45cd1b 289 /* Loop unroll over blockSize2, by 4 */
xorjoep 1:24714b45cd1b 290 blkCnt = ((uint32_t) blockSize2 >> 2U);
xorjoep 1:24714b45cd1b 291
xorjoep 1:24714b45cd1b 292 while (blkCnt > 0U)
xorjoep 1:24714b45cd1b 293 {
xorjoep 1:24714b45cd1b 294 /* Set all accumulators to zero */
xorjoep 1:24714b45cd1b 295 acc0 = 0.0f;
xorjoep 1:24714b45cd1b 296 acc1 = 0.0f;
xorjoep 1:24714b45cd1b 297 acc2 = 0.0f;
xorjoep 1:24714b45cd1b 298 acc3 = 0.0f;
xorjoep 1:24714b45cd1b 299
xorjoep 1:24714b45cd1b 300 /* read x[0], x[1], x[2] samples */
xorjoep 1:24714b45cd1b 301 x0 = *(px++);
xorjoep 1:24714b45cd1b 302 x1 = *(px++);
xorjoep 1:24714b45cd1b 303 x2 = *(px++);
xorjoep 1:24714b45cd1b 304
xorjoep 1:24714b45cd1b 305 /* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep 1:24714b45cd1b 306 k = srcBLen >> 2U;
xorjoep 1:24714b45cd1b 307
xorjoep 1:24714b45cd1b 308 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep 1:24714b45cd1b 309 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep 1:24714b45cd1b 310 do
xorjoep 1:24714b45cd1b 311 {
xorjoep 1:24714b45cd1b 312 /* Read y[srcBLen - 1] sample */
xorjoep 1:24714b45cd1b 313 c0 = *(py--);
xorjoep 1:24714b45cd1b 314
xorjoep 1:24714b45cd1b 315 /* Read x[3] sample */
xorjoep 1:24714b45cd1b 316 x3 = *(px++);
xorjoep 1:24714b45cd1b 317
xorjoep 1:24714b45cd1b 318 /* Perform the multiply-accumulate */
xorjoep 1:24714b45cd1b 319 /* acc0 += x[0] * y[srcBLen - 1] */
xorjoep 1:24714b45cd1b 320 acc0 += x0 * c0;
xorjoep 1:24714b45cd1b 321
xorjoep 1:24714b45cd1b 322 /* acc1 += x[1] * y[srcBLen - 1] */
xorjoep 1:24714b45cd1b 323 acc1 += x1 * c0;
xorjoep 1:24714b45cd1b 324
xorjoep 1:24714b45cd1b 325 /* acc2 += x[2] * y[srcBLen - 1] */
xorjoep 1:24714b45cd1b 326 acc2 += x2 * c0;
xorjoep 1:24714b45cd1b 327
xorjoep 1:24714b45cd1b 328 /* acc3 += x[3] * y[srcBLen - 1] */
xorjoep 1:24714b45cd1b 329 acc3 += x3 * c0;
xorjoep 1:24714b45cd1b 330
xorjoep 1:24714b45cd1b 331 /* Read y[srcBLen - 2] sample */
xorjoep 1:24714b45cd1b 332 c0 = *(py--);
xorjoep 1:24714b45cd1b 333
xorjoep 1:24714b45cd1b 334 /* Read x[4] sample */
xorjoep 1:24714b45cd1b 335 x0 = *(px++);
xorjoep 1:24714b45cd1b 336
xorjoep 1:24714b45cd1b 337 /* Perform the multiply-accumulate */
xorjoep 1:24714b45cd1b 338 /* acc0 += x[1] * y[srcBLen - 2] */
xorjoep 1:24714b45cd1b 339 acc0 += x1 * c0;
xorjoep 1:24714b45cd1b 340 /* acc1 += x[2] * y[srcBLen - 2] */
xorjoep 1:24714b45cd1b 341 acc1 += x2 * c0;
xorjoep 1:24714b45cd1b 342 /* acc2 += x[3] * y[srcBLen - 2] */
xorjoep 1:24714b45cd1b 343 acc2 += x3 * c0;
xorjoep 1:24714b45cd1b 344 /* acc3 += x[4] * y[srcBLen - 2] */
xorjoep 1:24714b45cd1b 345 acc3 += x0 * c0;
xorjoep 1:24714b45cd1b 346
xorjoep 1:24714b45cd1b 347 /* Read y[srcBLen - 3] sample */
xorjoep 1:24714b45cd1b 348 c0 = *(py--);
xorjoep 1:24714b45cd1b 349
xorjoep 1:24714b45cd1b 350 /* Read x[5] sample */
xorjoep 1:24714b45cd1b 351 x1 = *(px++);
xorjoep 1:24714b45cd1b 352
xorjoep 1:24714b45cd1b 353 /* Perform the multiply-accumulates */
xorjoep 1:24714b45cd1b 354 /* acc0 += x[2] * y[srcBLen - 3] */
xorjoep 1:24714b45cd1b 355 acc0 += x2 * c0;
xorjoep 1:24714b45cd1b 356 /* acc1 += x[3] * y[srcBLen - 2] */
xorjoep 1:24714b45cd1b 357 acc1 += x3 * c0;
xorjoep 1:24714b45cd1b 358 /* acc2 += x[4] * y[srcBLen - 2] */
xorjoep 1:24714b45cd1b 359 acc2 += x0 * c0;
xorjoep 1:24714b45cd1b 360 /* acc3 += x[5] * y[srcBLen - 2] */
xorjoep 1:24714b45cd1b 361 acc3 += x1 * c0;
xorjoep 1:24714b45cd1b 362
xorjoep 1:24714b45cd1b 363 /* Read y[srcBLen - 4] sample */
xorjoep 1:24714b45cd1b 364 c0 = *(py--);
xorjoep 1:24714b45cd1b 365
xorjoep 1:24714b45cd1b 366 /* Read x[6] sample */
xorjoep 1:24714b45cd1b 367 x2 = *(px++);
xorjoep 1:24714b45cd1b 368
xorjoep 1:24714b45cd1b 369 /* Perform the multiply-accumulates */
xorjoep 1:24714b45cd1b 370 /* acc0 += x[3] * y[srcBLen - 4] */
xorjoep 1:24714b45cd1b 371 acc0 += x3 * c0;
xorjoep 1:24714b45cd1b 372 /* acc1 += x[4] * y[srcBLen - 4] */
xorjoep 1:24714b45cd1b 373 acc1 += x0 * c0;
xorjoep 1:24714b45cd1b 374 /* acc2 += x[5] * y[srcBLen - 4] */
xorjoep 1:24714b45cd1b 375 acc2 += x1 * c0;
xorjoep 1:24714b45cd1b 376 /* acc3 += x[6] * y[srcBLen - 4] */
xorjoep 1:24714b45cd1b 377 acc3 += x2 * c0;
xorjoep 1:24714b45cd1b 378
xorjoep 1:24714b45cd1b 379
xorjoep 1:24714b45cd1b 380 } while (--k);
xorjoep 1:24714b45cd1b 381
xorjoep 1:24714b45cd1b 382 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
xorjoep 1:24714b45cd1b 383 ** No loop unrolling is used. */
xorjoep 1:24714b45cd1b 384 k = srcBLen % 0x4U;
xorjoep 1:24714b45cd1b 385
xorjoep 1:24714b45cd1b 386 while (k > 0U)
xorjoep 1:24714b45cd1b 387 {
xorjoep 1:24714b45cd1b 388 /* Read y[srcBLen - 5] sample */
xorjoep 1:24714b45cd1b 389 c0 = *(py--);
xorjoep 1:24714b45cd1b 390
xorjoep 1:24714b45cd1b 391 /* Read x[7] sample */
xorjoep 1:24714b45cd1b 392 x3 = *(px++);
xorjoep 1:24714b45cd1b 393
xorjoep 1:24714b45cd1b 394 /* Perform the multiply-accumulates */
xorjoep 1:24714b45cd1b 395 /* acc0 += x[4] * y[srcBLen - 5] */
xorjoep 1:24714b45cd1b 396 acc0 += x0 * c0;
xorjoep 1:24714b45cd1b 397 /* acc1 += x[5] * y[srcBLen - 5] */
xorjoep 1:24714b45cd1b 398 acc1 += x1 * c0;
xorjoep 1:24714b45cd1b 399 /* acc2 += x[6] * y[srcBLen - 5] */
xorjoep 1:24714b45cd1b 400 acc2 += x2 * c0;
xorjoep 1:24714b45cd1b 401 /* acc3 += x[7] * y[srcBLen - 5] */
xorjoep 1:24714b45cd1b 402 acc3 += x3 * c0;
xorjoep 1:24714b45cd1b 403
xorjoep 1:24714b45cd1b 404 /* Reuse the present samples for the next MAC */
xorjoep 1:24714b45cd1b 405 x0 = x1;
xorjoep 1:24714b45cd1b 406 x1 = x2;
xorjoep 1:24714b45cd1b 407 x2 = x3;
xorjoep 1:24714b45cd1b 408
xorjoep 1:24714b45cd1b 409 /* Decrement the loop counter */
xorjoep 1:24714b45cd1b 410 k--;
xorjoep 1:24714b45cd1b 411 }
xorjoep 1:24714b45cd1b 412
xorjoep 1:24714b45cd1b 413 /* Store the result in the accumulator in the destination buffer. */
xorjoep 1:24714b45cd1b 414 *pOut++ = acc0;
xorjoep 1:24714b45cd1b 415 *pOut++ = acc1;
xorjoep 1:24714b45cd1b 416 *pOut++ = acc2;
xorjoep 1:24714b45cd1b 417 *pOut++ = acc3;
xorjoep 1:24714b45cd1b 418
xorjoep 1:24714b45cd1b 419 /* Increment the pointer pIn1 index, count by 1 */
xorjoep 1:24714b45cd1b 420 count += 4U;
xorjoep 1:24714b45cd1b 421
xorjoep 1:24714b45cd1b 422 /* Update the inputA and inputB pointers for next MAC calculation */
xorjoep 1:24714b45cd1b 423 if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
xorjoep 1:24714b45cd1b 424 {
xorjoep 1:24714b45cd1b 425 px = pIn1 + firstIndex - srcBLen + 1 + count;
xorjoep 1:24714b45cd1b 426 }
xorjoep 1:24714b45cd1b 427 else
xorjoep 1:24714b45cd1b 428 {
xorjoep 1:24714b45cd1b 429 px = pIn1 + count;
xorjoep 1:24714b45cd1b 430 }
xorjoep 1:24714b45cd1b 431 py = pSrc2;
xorjoep 1:24714b45cd1b 432
xorjoep 1:24714b45cd1b 433 /* Decrement the loop counter */
xorjoep 1:24714b45cd1b 434 blkCnt--;
xorjoep 1:24714b45cd1b 435 }
xorjoep 1:24714b45cd1b 436
xorjoep 1:24714b45cd1b 437 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
xorjoep 1:24714b45cd1b 438 ** No loop unrolling is used. */
xorjoep 1:24714b45cd1b 439 blkCnt = (uint32_t) blockSize2 % 0x4U;
xorjoep 1:24714b45cd1b 440
xorjoep 1:24714b45cd1b 441 while (blkCnt > 0U)
xorjoep 1:24714b45cd1b 442 {
xorjoep 1:24714b45cd1b 443 /* Accumulator is made zero for every iteration */
xorjoep 1:24714b45cd1b 444 sum = 0.0f;
xorjoep 1:24714b45cd1b 445
xorjoep 1:24714b45cd1b 446 /* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep 1:24714b45cd1b 447 k = srcBLen >> 2U;
xorjoep 1:24714b45cd1b 448
xorjoep 1:24714b45cd1b 449 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep 1:24714b45cd1b 450 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep 1:24714b45cd1b 451 while (k > 0U)
xorjoep 1:24714b45cd1b 452 {
xorjoep 1:24714b45cd1b 453 /* Perform the multiply-accumulates */
xorjoep 1:24714b45cd1b 454 sum += *px++ * *py--;
xorjoep 1:24714b45cd1b 455 sum += *px++ * *py--;
xorjoep 1:24714b45cd1b 456 sum += *px++ * *py--;
xorjoep 1:24714b45cd1b 457 sum += *px++ * *py--;
xorjoep 1:24714b45cd1b 458
xorjoep 1:24714b45cd1b 459 /* Decrement the loop counter */
xorjoep 1:24714b45cd1b 460 k--;
xorjoep 1:24714b45cd1b 461 }
xorjoep 1:24714b45cd1b 462
xorjoep 1:24714b45cd1b 463 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
xorjoep 1:24714b45cd1b 464 ** No loop unrolling is used. */
xorjoep 1:24714b45cd1b 465 k = srcBLen % 0x4U;
xorjoep 1:24714b45cd1b 466
xorjoep 1:24714b45cd1b 467 while (k > 0U)
xorjoep 1:24714b45cd1b 468 {
xorjoep 1:24714b45cd1b 469 /* Perform the multiply-accumulate */
xorjoep 1:24714b45cd1b 470 sum += *px++ * *py--;
xorjoep 1:24714b45cd1b 471
xorjoep 1:24714b45cd1b 472 /* Decrement the loop counter */
xorjoep 1:24714b45cd1b 473 k--;
xorjoep 1:24714b45cd1b 474 }
xorjoep 1:24714b45cd1b 475
xorjoep 1:24714b45cd1b 476 /* Store the result in the accumulator in the destination buffer. */
xorjoep 1:24714b45cd1b 477 *pOut++ = sum;
xorjoep 1:24714b45cd1b 478
xorjoep 1:24714b45cd1b 479 /* Increment the MAC count */
xorjoep 1:24714b45cd1b 480 count++;
xorjoep 1:24714b45cd1b 481
xorjoep 1:24714b45cd1b 482 /* Update the inputA and inputB pointers for next MAC calculation */
xorjoep 1:24714b45cd1b 483 if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
xorjoep 1:24714b45cd1b 484 {
xorjoep 1:24714b45cd1b 485 px = pIn1 + firstIndex - srcBLen + 1 + count;
xorjoep 1:24714b45cd1b 486 }
xorjoep 1:24714b45cd1b 487 else
xorjoep 1:24714b45cd1b 488 {
xorjoep 1:24714b45cd1b 489 px = pIn1 + count;
xorjoep 1:24714b45cd1b 490 }
xorjoep 1:24714b45cd1b 491 py = pSrc2;
xorjoep 1:24714b45cd1b 492
xorjoep 1:24714b45cd1b 493 /* Decrement the loop counter */
xorjoep 1:24714b45cd1b 494 blkCnt--;
xorjoep 1:24714b45cd1b 495 }
xorjoep 1:24714b45cd1b 496 }
xorjoep 1:24714b45cd1b 497 else
xorjoep 1:24714b45cd1b 498 {
xorjoep 1:24714b45cd1b 499 /* If the srcBLen is not a multiple of 4,
xorjoep 1:24714b45cd1b 500 * the blockSize2 loop cannot be unrolled by 4 */
xorjoep 1:24714b45cd1b 501 blkCnt = (uint32_t) blockSize2;
xorjoep 1:24714b45cd1b 502
xorjoep 1:24714b45cd1b 503 while (blkCnt > 0U)
xorjoep 1:24714b45cd1b 504 {
xorjoep 1:24714b45cd1b 505 /* Accumulator is made zero for every iteration */
xorjoep 1:24714b45cd1b 506 sum = 0.0f;
xorjoep 1:24714b45cd1b 507
xorjoep 1:24714b45cd1b 508 /* srcBLen number of MACS should be performed */
xorjoep 1:24714b45cd1b 509 k = srcBLen;
xorjoep 1:24714b45cd1b 510
xorjoep 1:24714b45cd1b 511 while (k > 0U)
xorjoep 1:24714b45cd1b 512 {
xorjoep 1:24714b45cd1b 513 /* Perform the multiply-accumulate */
xorjoep 1:24714b45cd1b 514 sum += *px++ * *py--;
xorjoep 1:24714b45cd1b 515
xorjoep 1:24714b45cd1b 516 /* Decrement the loop counter */
xorjoep 1:24714b45cd1b 517 k--;
xorjoep 1:24714b45cd1b 518 }
xorjoep 1:24714b45cd1b 519
xorjoep 1:24714b45cd1b 520 /* Store the result in the accumulator in the destination buffer. */
xorjoep 1:24714b45cd1b 521 *pOut++ = sum;
xorjoep 1:24714b45cd1b 522
xorjoep 1:24714b45cd1b 523 /* Increment the MAC count */
xorjoep 1:24714b45cd1b 524 count++;
xorjoep 1:24714b45cd1b 525
xorjoep 1:24714b45cd1b 526 /* Update the inputA and inputB pointers for next MAC calculation */
xorjoep 1:24714b45cd1b 527 if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
xorjoep 1:24714b45cd1b 528 {
xorjoep 1:24714b45cd1b 529 px = pIn1 + firstIndex - srcBLen + 1 + count;
xorjoep 1:24714b45cd1b 530 }
xorjoep 1:24714b45cd1b 531 else
xorjoep 1:24714b45cd1b 532 {
xorjoep 1:24714b45cd1b 533 px = pIn1 + count;
xorjoep 1:24714b45cd1b 534 }
xorjoep 1:24714b45cd1b 535 py = pSrc2;
xorjoep 1:24714b45cd1b 536
xorjoep 1:24714b45cd1b 537 /* Decrement the loop counter */
xorjoep 1:24714b45cd1b 538 blkCnt--;
xorjoep 1:24714b45cd1b 539 }
xorjoep 1:24714b45cd1b 540 }
xorjoep 1:24714b45cd1b 541
xorjoep 1:24714b45cd1b 542
xorjoep 1:24714b45cd1b 543 /* --------------------------
xorjoep 1:24714b45cd1b 544 * Initializations of stage3
xorjoep 1:24714b45cd1b 545 * -------------------------*/
xorjoep 1:24714b45cd1b 546
xorjoep 1:24714b45cd1b 547 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
xorjoep 1:24714b45cd1b 548 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
xorjoep 1:24714b45cd1b 549 * ....
xorjoep 1:24714b45cd1b 550 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
xorjoep 1:24714b45cd1b 551 * sum += x[srcALen-1] * y[srcBLen-1]
xorjoep 1:24714b45cd1b 552 */
xorjoep 1:24714b45cd1b 553
xorjoep 1:24714b45cd1b 554 /* In this stage the MAC operations are decreased by 1 for every iteration.
xorjoep 1:24714b45cd1b 555 The count variable holds the number of MAC operations performed */
xorjoep 1:24714b45cd1b 556 count = srcBLen - 1U;
xorjoep 1:24714b45cd1b 557
xorjoep 1:24714b45cd1b 558 /* Working pointer of inputA */
xorjoep 1:24714b45cd1b 559 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
xorjoep 1:24714b45cd1b 560 px = pSrc1;
xorjoep 1:24714b45cd1b 561
xorjoep 1:24714b45cd1b 562 /* Working pointer of inputB */
xorjoep 1:24714b45cd1b 563 pSrc2 = pIn2 + (srcBLen - 1U);
xorjoep 1:24714b45cd1b 564 py = pSrc2;
xorjoep 1:24714b45cd1b 565
xorjoep 1:24714b45cd1b 566 while (blockSize3 > 0)
xorjoep 1:24714b45cd1b 567 {
xorjoep 1:24714b45cd1b 568 /* Accumulator is made zero for every iteration */
xorjoep 1:24714b45cd1b 569 sum = 0.0f;
xorjoep 1:24714b45cd1b 570
xorjoep 1:24714b45cd1b 571 /* Apply loop unrolling and compute 4 MACs simultaneously. */
xorjoep 1:24714b45cd1b 572 k = count >> 2U;
xorjoep 1:24714b45cd1b 573
xorjoep 1:24714b45cd1b 574 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
xorjoep 1:24714b45cd1b 575 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
xorjoep 1:24714b45cd1b 576 while (k > 0U)
xorjoep 1:24714b45cd1b 577 {
xorjoep 1:24714b45cd1b 578 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
xorjoep 1:24714b45cd1b 579 sum += *px++ * *py--;
xorjoep 1:24714b45cd1b 580
xorjoep 1:24714b45cd1b 581 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
xorjoep 1:24714b45cd1b 582 sum += *px++ * *py--;
xorjoep 1:24714b45cd1b 583
xorjoep 1:24714b45cd1b 584 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
xorjoep 1:24714b45cd1b 585 sum += *px++ * *py--;
xorjoep 1:24714b45cd1b 586
xorjoep 1:24714b45cd1b 587 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
xorjoep 1:24714b45cd1b 588 sum += *px++ * *py--;
xorjoep 1:24714b45cd1b 589
xorjoep 1:24714b45cd1b 590 /* Decrement the loop counter */
xorjoep 1:24714b45cd1b 591 k--;
xorjoep 1:24714b45cd1b 592 }
xorjoep 1:24714b45cd1b 593
xorjoep 1:24714b45cd1b 594 /* If the count is not a multiple of 4, compute any remaining MACs here.
xorjoep 1:24714b45cd1b 595 ** No loop unrolling is used. */
xorjoep 1:24714b45cd1b 596 k = count % 0x4U;
xorjoep 1:24714b45cd1b 597
xorjoep 1:24714b45cd1b 598 while (k > 0U)
xorjoep 1:24714b45cd1b 599 {
xorjoep 1:24714b45cd1b 600 /* Perform the multiply-accumulates */
xorjoep 1:24714b45cd1b 601 /* sum += x[srcALen-1] * y[srcBLen-1] */
xorjoep 1:24714b45cd1b 602 sum += *px++ * *py--;
xorjoep 1:24714b45cd1b 603
xorjoep 1:24714b45cd1b 604 /* Decrement the loop counter */
xorjoep 1:24714b45cd1b 605 k--;
xorjoep 1:24714b45cd1b 606 }
xorjoep 1:24714b45cd1b 607
xorjoep 1:24714b45cd1b 608 /* Store the result in the accumulator in the destination buffer. */
xorjoep 1:24714b45cd1b 609 *pOut++ = sum;
xorjoep 1:24714b45cd1b 610
xorjoep 1:24714b45cd1b 611 /* Update the inputA and inputB pointers for next MAC calculation */
xorjoep 1:24714b45cd1b 612 px = ++pSrc1;
xorjoep 1:24714b45cd1b 613 py = pSrc2;
xorjoep 1:24714b45cd1b 614
xorjoep 1:24714b45cd1b 615 /* Decrement the MAC count */
xorjoep 1:24714b45cd1b 616 count--;
xorjoep 1:24714b45cd1b 617
xorjoep 1:24714b45cd1b 618 /* Decrement the loop counter */
xorjoep 1:24714b45cd1b 619 blockSize3--;
xorjoep 1:24714b45cd1b 620
xorjoep 1:24714b45cd1b 621 }
xorjoep 1:24714b45cd1b 622
xorjoep 1:24714b45cd1b 623 /* set status as ARM_MATH_SUCCESS */
xorjoep 1:24714b45cd1b 624 status = ARM_MATH_SUCCESS;
xorjoep 1:24714b45cd1b 625 }
xorjoep 1:24714b45cd1b 626
xorjoep 1:24714b45cd1b 627 /* Return to application */
xorjoep 1:24714b45cd1b 628 return (status);
xorjoep 1:24714b45cd1b 629
xorjoep 1:24714b45cd1b 630 #else
xorjoep 1:24714b45cd1b 631
xorjoep 1:24714b45cd1b 632 /* Run the below code for Cortex-M0 */
xorjoep 1:24714b45cd1b 633
xorjoep 1:24714b45cd1b 634 float32_t *pIn1 = pSrcA; /* inputA pointer */
xorjoep 1:24714b45cd1b 635 float32_t *pIn2 = pSrcB; /* inputB pointer */
xorjoep 1:24714b45cd1b 636 float32_t sum; /* Accumulator */
xorjoep 1:24714b45cd1b 637 uint32_t i, j; /* loop counters */
xorjoep 1:24714b45cd1b 638 arm_status status; /* status of Partial convolution */
xorjoep 1:24714b45cd1b 639
xorjoep 1:24714b45cd1b 640 /* Check for range of output samples to be calculated */
xorjoep 1:24714b45cd1b 641 if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
xorjoep 1:24714b45cd1b 642 {
xorjoep 1:24714b45cd1b 643 /* Set status as ARM_ARGUMENT_ERROR */
xorjoep 1:24714b45cd1b 644 status = ARM_MATH_ARGUMENT_ERROR;
xorjoep 1:24714b45cd1b 645 }
xorjoep 1:24714b45cd1b 646 else
xorjoep 1:24714b45cd1b 647 {
xorjoep 1:24714b45cd1b 648 /* Loop to calculate convolution for output length number of values */
xorjoep 1:24714b45cd1b 649 for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
xorjoep 1:24714b45cd1b 650 {
xorjoep 1:24714b45cd1b 651 /* Initialize sum with zero to carry on MAC operations */
xorjoep 1:24714b45cd1b 652 sum = 0.0f;
xorjoep 1:24714b45cd1b 653
xorjoep 1:24714b45cd1b 654 /* Loop to perform MAC operations according to convolution equation */
xorjoep 1:24714b45cd1b 655 for (j = 0U; j <= i; j++)
xorjoep 1:24714b45cd1b 656 {
xorjoep 1:24714b45cd1b 657 /* Check the array limitations for inputs */
xorjoep 1:24714b45cd1b 658 if ((((i - j) < srcBLen) && (j < srcALen)))
xorjoep 1:24714b45cd1b 659 {
xorjoep 1:24714b45cd1b 660 /* z[i] += x[i-j] * y[j] */
xorjoep 1:24714b45cd1b 661 sum += pIn1[j] * pIn2[i - j];
xorjoep 1:24714b45cd1b 662 }
xorjoep 1:24714b45cd1b 663 }
xorjoep 1:24714b45cd1b 664 /* Store the output in the destination buffer */
xorjoep 1:24714b45cd1b 665 pDst[i] = sum;
xorjoep 1:24714b45cd1b 666 }
xorjoep 1:24714b45cd1b 667 /* set status as ARM_SUCCESS as there are no argument errors */
xorjoep 1:24714b45cd1b 668 status = ARM_MATH_SUCCESS;
xorjoep 1:24714b45cd1b 669 }
xorjoep 1:24714b45cd1b 670 return (status);
xorjoep 1:24714b45cd1b 671
xorjoep 1:24714b45cd1b 672 #endif /* #if defined (ARM_MATH_DSP) */
xorjoep 1:24714b45cd1b 673
xorjoep 1:24714b45cd1b 674 }
xorjoep 1:24714b45cd1b 675
xorjoep 1:24714b45cd1b 676 /**
xorjoep 1:24714b45cd1b 677 * @} end of PartialConv group
xorjoep 1:24714b45cd1b 678 */