Important changes to repositories hosted on mbed.com
Mbed-hosted Mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software, download the repository Zip archive or clone it locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
arm_conv_partial_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Project: CMSIS DSP Library 00003 * Title: arm_conv_partial_q15.c 00004 * Description: Partial convolution of Q15 sequences 00005 * 00006 * $Date: 27. January 2017 00007 * $Revision: V.1.5.1 00008 * 00009 * Target Processor: Cortex-M cores 00010 * -------------------------------------------------------------------- */ 00011 /* 00012 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. 00013 * 00014 * SPDX-License-Identifier: Apache-2.0 00015 * 00016 * Licensed under the Apache License, Version 2.0 (the License); you may 00017 * not use this file except in compliance with the License. 00018 * You may obtain a copy of the License at 00019 * 00020 * www.apache.org/licenses/LICENSE-2.0 00021 * 00022 * Unless required by applicable law or agreed to in writing, software 00023 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 00024 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00025 * See the License for the specific language governing permissions and 00026 * limitations under the License. 00027 */ 00028 00029 #include "arm_math.h" 00030 00031 /** 00032 * @ingroup groupFilters 00033 */ 00034 00035 /** 00036 * @addtogroup PartialConv 00037 * @{ 00038 */ 00039 00040 /** 00041 * @brief Partial convolution of Q15 sequences. 00042 * @param[in] *pSrcA points to the first input sequence. 00043 * @param[in] srcALen length of the first input sequence. 00044 * @param[in] *pSrcB points to the second input sequence. 00045 * @param[in] srcBLen length of the second input sequence. 00046 * @param[out] *pDst points to the location where the output result is written. 00047 * @param[in] firstIndex is the first output sample to start with. 00048 * @param[in] numPoints is the number of output points to be computed. 
00049 * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. 00050 * 00051 * Refer to <code>arm_conv_partial_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4. 00052 * 00053 * \par 00054 * Refer the function <code>arm_conv_partial_opt_q15()</code> for a faster implementation of this function using scratch buffers. 00055 * 00056 */ 00057 00058 arm_status arm_conv_partial_q15( 00059 q15_t * pSrcA, 00060 uint32_t srcALen, 00061 q15_t * pSrcB, 00062 uint32_t srcBLen, 00063 q15_t * pDst, 00064 uint32_t firstIndex, 00065 uint32_t numPoints) 00066 { 00067 00068 00069 #if (defined(ARM_MATH_CM7) || defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) 00070 00071 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00072 00073 q15_t *pIn1; /* inputA pointer */ 00074 q15_t *pIn2; /* inputB pointer */ 00075 q15_t *pOut = pDst; /* output pointer */ 00076 q63_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00077 q15_t *px; /* Intermediate inputA pointer */ 00078 q15_t *py; /* Intermediate inputB pointer */ 00079 q15_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00080 q31_t x0, x1, x2, x3, c0; /* Temporary input variables */ 00081 uint32_t j, k, count, check, blkCnt; 00082 int32_t blockSize1, blockSize2, blockSize3; /* loop counter */ 00083 arm_status status; /* status of Partial convolution */ 00084 00085 /* Check for range of output samples to be calculated */ 00086 if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U)))) 00087 { 00088 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00089 status = ARM_MATH_ARGUMENT_ERROR; 00090 } 00091 else 00092 { 00093 00094 /* The algorithm implementation is based on the lengths of the inputs. */ 00095 /* srcB is always made to slide across srcA. 
*/ 00096 /* So srcBLen is always considered as shorter or equal to srcALen */ 00097 if (srcALen >= srcBLen) 00098 { 00099 /* Initialization of inputA pointer */ 00100 pIn1 = pSrcA; 00101 00102 /* Initialization of inputB pointer */ 00103 pIn2 = pSrcB; 00104 } 00105 else 00106 { 00107 /* Initialization of inputA pointer */ 00108 pIn1 = pSrcB; 00109 00110 /* Initialization of inputB pointer */ 00111 pIn2 = pSrcA; 00112 00113 /* srcBLen is always considered as shorter or equal to srcALen */ 00114 j = srcBLen; 00115 srcBLen = srcALen; 00116 srcALen = j; 00117 } 00118 00119 /* Conditions to check which loopCounter holds 00120 * the first and last indices of the output samples to be calculated. */ 00121 check = firstIndex + numPoints; 00122 blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0; 00123 blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3; 00124 blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); 00125 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : 00126 (int32_t) numPoints) : 0; 00127 blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + 00128 (int32_t) firstIndex); 00129 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; 00130 00131 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00132 /* The function is internally 00133 * divided into three stages according to the number of multiplications that has to be 00134 * taken place between inputA samples and inputB samples. In the first stage of the 00135 * algorithm, the multiplications increase by one for every iteration. 00136 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00137 * In the third stage of the algorithm, the multiplications decrease by one 00138 * for every iteration. 
*/ 00139 00140 /* Set the output pointer to point to the firstIndex 00141 * of the output sample to be calculated. */ 00142 pOut = pDst + firstIndex; 00143 00144 /* -------------------------- 00145 * Initializations of stage1 00146 * -------------------------*/ 00147 00148 /* sum = x[0] * y[0] 00149 * sum = x[0] * y[1] + x[1] * y[0] 00150 * .... 00151 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00152 */ 00153 00154 /* In this stage the MAC operations are increased by 1 for every iteration. 00155 The count variable holds the number of MAC operations performed. 00156 Since the partial convolution starts from firstIndex 00157 Number of Macs to be performed is firstIndex + 1 */ 00158 count = 1U + firstIndex; 00159 00160 /* Working pointer of inputA */ 00161 px = pIn1; 00162 00163 /* Working pointer of inputB */ 00164 pSrc2 = pIn2 + firstIndex; 00165 py = pSrc2; 00166 00167 /* ------------------------ 00168 * Stage1 process 00169 * ----------------------*/ 00170 00171 /* For loop unrolling by 4, this stage is divided into two. */ 00172 /* First part of this stage computes the MAC operations less than 4 */ 00173 /* Second part of this stage computes the MAC operations greater than or equal to 4 */ 00174 00175 /* The first part of the stage starts here */ 00176 while ((count < 4U) && (blockSize1 > 0)) 00177 { 00178 /* Accumulator is made zero for every iteration */ 00179 sum = 0; 00180 00181 /* Loop over number of MAC operations between 00182 * inputA samples and inputB samples */ 00183 k = count; 00184 00185 while (k > 0U) 00186 { 00187 /* Perform the multiply-accumulates */ 00188 sum = __SMLALD(*px++, *py--, sum); 00189 00190 /* Decrement the loop counter */ 00191 k--; 00192 } 00193 00194 /* Store the result in the accumulator in the destination buffer. 
*/ 00195 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 00196 00197 /* Update the inputA and inputB pointers for next MAC calculation */ 00198 py = ++pSrc2; 00199 px = pIn1; 00200 00201 /* Increment the MAC count */ 00202 count++; 00203 00204 /* Decrement the loop counter */ 00205 blockSize1--; 00206 } 00207 00208 /* The second part of the stage starts here */ 00209 /* The internal loop, over count, is unrolled by 4 */ 00210 /* To, read the last two inputB samples using SIMD: 00211 * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ 00212 py = py - 1; 00213 00214 while (blockSize1 > 0) 00215 { 00216 /* Accumulator is made zero for every iteration */ 00217 sum = 0; 00218 00219 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00220 k = count >> 2U; 00221 00222 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00223 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00224 while (k > 0U) 00225 { 00226 /* Perform the multiply-accumulates */ 00227 /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */ 00228 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00229 /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */ 00230 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00231 00232 /* Decrement the loop counter */ 00233 k--; 00234 } 00235 00236 /* For the next MAC operations, the pointer py is used without SIMD 00237 * So, py is incremented by 1 */ 00238 py = py + 1U; 00239 00240 /* If the count is not a multiple of 4, compute any remaining MACs here. 00241 ** No loop unrolling is used. */ 00242 k = count % 0x4U; 00243 00244 while (k > 0U) 00245 { 00246 /* Perform the multiply-accumulates */ 00247 sum = __SMLALD(*px++, *py--, sum); 00248 00249 /* Decrement the loop counter */ 00250 k--; 00251 } 00252 00253 /* Store the result in the accumulator in the destination buffer. 
*/ 00254 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 00255 00256 /* Update the inputA and inputB pointers for next MAC calculation */ 00257 py = ++pSrc2 - 1U; 00258 px = pIn1; 00259 00260 /* Increment the MAC count */ 00261 count++; 00262 00263 /* Decrement the loop counter */ 00264 blockSize1--; 00265 } 00266 00267 /* -------------------------- 00268 * Initializations of stage2 00269 * ------------------------*/ 00270 00271 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00272 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00273 * .... 00274 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00275 */ 00276 00277 /* Working pointer of inputA */ 00278 if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) 00279 { 00280 px = pIn1 + firstIndex - srcBLen + 1; 00281 } 00282 else 00283 { 00284 px = pIn1; 00285 } 00286 00287 /* Working pointer of inputB */ 00288 pSrc2 = pIn2 + (srcBLen - 1U); 00289 py = pSrc2; 00290 00291 /* count is the index by which the pointer pIn1 to be incremented */ 00292 count = 0U; 00293 00294 00295 /* -------------------- 00296 * Stage2 process 00297 * -------------------*/ 00298 00299 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00300 * So, to loop unroll over blockSize2, 00301 * srcBLen should be greater than or equal to 4 */ 00302 if (srcBLen >= 4U) 00303 { 00304 /* Loop unroll over blockSize2, by 4 */ 00305 blkCnt = blockSize2 >> 2U; 00306 00307 while (blkCnt > 0U) 00308 { 00309 py = py - 1U; 00310 00311 /* Set all accumulators to zero */ 00312 acc0 = 0; 00313 acc1 = 0; 00314 acc2 = 0; 00315 acc3 = 0; 00316 00317 00318 /* read x[0], x[1] samples */ 00319 x0 = *__SIMD32(px); 00320 /* read x[1], x[2] samples */ 00321 x1 = _SIMD32_OFFSET(px+1); 00322 px+= 2U; 00323 00324 00325 /* Apply loop unrolling and compute 4 MACs simultaneously. 
*/ 00326 k = srcBLen >> 2U; 00327 00328 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00329 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00330 do 00331 { 00332 /* Read the last two inputB samples using SIMD: 00333 * y[srcBLen - 1] and y[srcBLen - 2] */ 00334 c0 = *__SIMD32(py)--; 00335 00336 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ 00337 acc0 = __SMLALDX(x0, c0, acc0); 00338 00339 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ 00340 acc1 = __SMLALDX(x1, c0, acc1); 00341 00342 /* Read x[2], x[3] */ 00343 x2 = *__SIMD32(px); 00344 00345 /* Read x[3], x[4] */ 00346 x3 = _SIMD32_OFFSET(px+1); 00347 00348 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ 00349 acc2 = __SMLALDX(x2, c0, acc2); 00350 00351 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ 00352 acc3 = __SMLALDX(x3, c0, acc3); 00353 00354 /* Read y[srcBLen - 3] and y[srcBLen - 4] */ 00355 c0 = *__SIMD32(py)--; 00356 00357 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ 00358 acc0 = __SMLALDX(x2, c0, acc0); 00359 00360 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ 00361 acc1 = __SMLALDX(x3, c0, acc1); 00362 00363 /* Read x[4], x[5] */ 00364 x0 = _SIMD32_OFFSET(px+2); 00365 00366 /* Read x[5], x[6] */ 00367 x1 = _SIMD32_OFFSET(px+3); 00368 px += 4U; 00369 00370 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ 00371 acc2 = __SMLALDX(x0, c0, acc2); 00372 00373 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ 00374 acc3 = __SMLALDX(x1, c0, acc3); 00375 00376 } while (--k); 00377 00378 /* For the next MAC operations, SIMD is not used 00379 * So, the 16 bit pointer if inputB, py is updated */ 00380 00381 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00382 ** No loop unrolling is used. 
*/ 00383 k = srcBLen % 0x4U; 00384 00385 if (k == 1U) 00386 { 00387 /* Read y[srcBLen - 5] */ 00388 c0 = *(py+1); 00389 00390 #ifdef ARM_MATH_BIG_ENDIAN 00391 00392 c0 = c0 << 16U; 00393 00394 #else 00395 00396 c0 = c0 & 0x0000FFFF; 00397 00398 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 00399 00400 /* Read x[7] */ 00401 x3 = *__SIMD32(px); 00402 px++; 00403 00404 /* Perform the multiply-accumulates */ 00405 acc0 = __SMLALD(x0, c0, acc0); 00406 acc1 = __SMLALD(x1, c0, acc1); 00407 acc2 = __SMLALDX(x1, c0, acc2); 00408 acc3 = __SMLALDX(x3, c0, acc3); 00409 } 00410 00411 if (k == 2U) 00412 { 00413 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 00414 c0 = _SIMD32_OFFSET(py); 00415 00416 /* Read x[7], x[8] */ 00417 x3 = *__SIMD32(px); 00418 00419 /* Read x[9] */ 00420 x2 = _SIMD32_OFFSET(px+1); 00421 px += 2U; 00422 00423 /* Perform the multiply-accumulates */ 00424 acc0 = __SMLALDX(x0, c0, acc0); 00425 acc1 = __SMLALDX(x1, c0, acc1); 00426 acc2 = __SMLALDX(x3, c0, acc2); 00427 acc3 = __SMLALDX(x2, c0, acc3); 00428 } 00429 00430 if (k == 3U) 00431 { 00432 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 00433 c0 = _SIMD32_OFFSET(py); 00434 00435 /* Read x[7], x[8] */ 00436 x3 = *__SIMD32(px); 00437 00438 /* Read x[9] */ 00439 x2 = _SIMD32_OFFSET(px+1); 00440 00441 /* Perform the multiply-accumulates */ 00442 acc0 = __SMLALDX(x0, c0, acc0); 00443 acc1 = __SMLALDX(x1, c0, acc1); 00444 acc2 = __SMLALDX(x3, c0, acc2); 00445 acc3 = __SMLALDX(x2, c0, acc3); 00446 00447 c0 = *(py-1); 00448 00449 #ifdef ARM_MATH_BIG_ENDIAN 00450 00451 c0 = c0 << 16U; 00452 #else 00453 00454 c0 = c0 & 0x0000FFFF; 00455 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 00456 00457 /* Read x[10] */ 00458 x3 = _SIMD32_OFFSET(px+2); 00459 px += 3U; 00460 00461 /* Perform the multiply-accumulates */ 00462 acc0 = __SMLALDX(x1, c0, acc0); 00463 acc1 = __SMLALD(x2, c0, acc1); 00464 acc2 = __SMLALDX(x2, c0, acc2); 00465 acc3 = __SMLALDX(x3, c0, acc3); 00466 } 00467 00468 00469 /* Store the results in the accumulators in the 
destination buffer. */ 00470 00471 #ifndef ARM_MATH_BIG_ENDIAN 00472 00473 *__SIMD32(pOut)++ = 00474 __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); 00475 *__SIMD32(pOut)++ = 00476 __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); 00477 00478 #else 00479 00480 *__SIMD32(pOut)++ = 00481 __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16); 00482 *__SIMD32(pOut)++ = 00483 __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16); 00484 00485 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00486 00487 /* Increment the pointer pIn1 index, count by 4 */ 00488 count += 4U; 00489 00490 /* Update the inputA and inputB pointers for next MAC calculation */ 00491 if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) 00492 { 00493 px = pIn1 + firstIndex - srcBLen + 1 + count; 00494 } 00495 else 00496 { 00497 px = pIn1 + count; 00498 } 00499 py = pSrc2; 00500 00501 /* Decrement the loop counter */ 00502 blkCnt--; 00503 } 00504 00505 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00506 ** No loop unrolling is used. */ 00507 blkCnt = (uint32_t) blockSize2 % 0x4U; 00508 00509 while (blkCnt > 0U) 00510 { 00511 /* Accumulator is made zero for every iteration */ 00512 sum = 0; 00513 00514 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00515 k = srcBLen >> 2U; 00516 00517 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00518 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00519 while (k > 0U) 00520 { 00521 /* Perform the multiply-accumulates */ 00522 sum += (q63_t) ((q31_t) * px++ * *py--); 00523 sum += (q63_t) ((q31_t) * px++ * *py--); 00524 sum += (q63_t) ((q31_t) * px++ * *py--); 00525 sum += (q63_t) ((q31_t) * px++ * *py--); 00526 00527 /* Decrement the loop counter */ 00528 k--; 00529 } 00530 00531 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00532 ** No loop unrolling is used. 
*/ 00533 k = srcBLen % 0x4U; 00534 00535 while (k > 0U) 00536 { 00537 /* Perform the multiply-accumulates */ 00538 sum += (q63_t) ((q31_t) * px++ * *py--); 00539 00540 /* Decrement the loop counter */ 00541 k--; 00542 } 00543 00544 /* Store the result in the accumulator in the destination buffer. */ 00545 *pOut++ = (q15_t) (__SSAT(sum >> 15, 16)); 00546 00547 /* Increment the pointer pIn1 index, count by 1 */ 00548 count++; 00549 00550 /* Update the inputA and inputB pointers for next MAC calculation */ 00551 if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) 00552 { 00553 px = pIn1 + firstIndex - srcBLen + 1 + count; 00554 } 00555 else 00556 { 00557 px = pIn1 + count; 00558 } 00559 py = pSrc2; 00560 00561 /* Decrement the loop counter */ 00562 blkCnt--; 00563 } 00564 } 00565 else 00566 { 00567 /* If the srcBLen is not a multiple of 4, 00568 * the blockSize2 loop cannot be unrolled by 4 */ 00569 blkCnt = (uint32_t) blockSize2; 00570 00571 while (blkCnt > 0U) 00572 { 00573 /* Accumulator is made zero for every iteration */ 00574 sum = 0; 00575 00576 /* srcBLen number of MACS should be performed */ 00577 k = srcBLen; 00578 00579 while (k > 0U) 00580 { 00581 /* Perform the multiply-accumulate */ 00582 sum += (q63_t) ((q31_t) * px++ * *py--); 00583 00584 /* Decrement the loop counter */ 00585 k--; 00586 } 00587 00588 /* Store the result in the accumulator in the destination buffer. 
*/ 00589 *pOut++ = (q15_t) (__SSAT(sum >> 15, 16)); 00590 00591 /* Increment the MAC count */ 00592 count++; 00593 00594 /* Update the inputA and inputB pointers for next MAC calculation */ 00595 if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) 00596 { 00597 px = pIn1 + firstIndex - srcBLen + 1 + count; 00598 } 00599 else 00600 { 00601 px = pIn1 + count; 00602 } 00603 py = pSrc2; 00604 00605 /* Decrement the loop counter */ 00606 blkCnt--; 00607 } 00608 } 00609 00610 00611 /* -------------------------- 00612 * Initializations of stage3 00613 * -------------------------*/ 00614 00615 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00616 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00617 * .... 00618 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00619 * sum += x[srcALen-1] * y[srcBLen-1] 00620 */ 00621 00622 /* In this stage the MAC operations are decreased by 1 for every iteration. 00623 The count variable holds the number of MAC operations performed */ 00624 count = srcBLen - 1U; 00625 00626 /* Working pointer of inputA */ 00627 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U); 00628 px = pSrc1; 00629 00630 /* Working pointer of inputB */ 00631 pSrc2 = pIn2 + (srcBLen - 1U); 00632 pIn2 = pSrc2 - 1U; 00633 py = pIn2; 00634 00635 /* ------------------- 00636 * Stage3 process 00637 * ------------------*/ 00638 00639 /* For loop unrolling by 4, this stage is divided into two. */ 00640 /* First part of this stage computes the MAC operations greater than 4 */ 00641 /* Second part of this stage computes the MAC operations less than or equal to 4 */ 00642 00643 /* The first part of the stage starts here */ 00644 j = count >> 2U; 00645 00646 while ((j > 0U) && (blockSize3 > 0)) 00647 { 00648 /* Accumulator is made zero for every iteration */ 00649 sum = 0; 00650 00651 /* Apply loop unrolling and compute 4 MACs simultaneously. 
*/ 00652 k = count >> 2U; 00653 00654 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00655 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00656 while (k > 0U) 00657 { 00658 /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied 00659 * with y[srcBLen - 1], y[srcBLen - 2] respectively */ 00660 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00661 /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied 00662 * with y[srcBLen - 3], y[srcBLen - 4] respectively */ 00663 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00664 00665 /* Decrement the loop counter */ 00666 k--; 00667 } 00668 00669 /* For the next MAC operations, the pointer py is used without SIMD 00670 * So, py is incremented by 1 */ 00671 py = py + 1U; 00672 00673 /* If the count is not a multiple of 4, compute any remaining MACs here. 00674 ** No loop unrolling is used. */ 00675 k = count % 0x4U; 00676 00677 while (k > 0U) 00678 { 00679 /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */ 00680 sum = __SMLALD(*px++, *py--, sum); 00681 00682 /* Decrement the loop counter */ 00683 k--; 00684 } 00685 00686 /* Store the result in the accumulator in the destination buffer. */ 00687 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 00688 00689 /* Update the inputA and inputB pointers for next MAC calculation */ 00690 px = ++pSrc1; 00691 py = pIn2; 00692 00693 /* Decrement the MAC count */ 00694 count--; 00695 00696 /* Decrement the loop counter */ 00697 blockSize3--; 00698 00699 j--; 00700 } 00701 00702 /* The second part of the stage starts here */ 00703 /* SIMD is not used for the next MAC operations, 00704 * so pointer py is updated to read only one sample at a time */ 00705 py = py + 1U; 00706 00707 while (blockSize3 > 0) 00708 { 00709 /* Accumulator is made zero for every iteration */ 00710 sum = 0; 00711 00712 /* Apply loop unrolling and compute 4 MACs simultaneously. 
*/ 00713 k = count; 00714 00715 while (k > 0U) 00716 { 00717 /* Perform the multiply-accumulates */ 00718 /* sum += x[srcALen-1] * y[srcBLen-1] */ 00719 sum = __SMLALD(*px++, *py--, sum); 00720 00721 /* Decrement the loop counter */ 00722 k--; 00723 } 00724 00725 /* Store the result in the accumulator in the destination buffer. */ 00726 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 00727 00728 /* Update the inputA and inputB pointers for next MAC calculation */ 00729 px = ++pSrc1; 00730 py = pSrc2; 00731 00732 /* Decrement the MAC count */ 00733 count--; 00734 00735 /* Decrement the loop counter */ 00736 blockSize3--; 00737 } 00738 00739 /* set status as ARM_MATH_SUCCESS */ 00740 status = ARM_MATH_SUCCESS; 00741 } 00742 00743 /* Return to application */ 00744 return (status); 00745 00746 #else 00747 00748 /* Run the below code for Cortex-M0 */ 00749 00750 q15_t *pIn1 = pSrcA; /* inputA pointer */ 00751 q15_t *pIn2 = pSrcB; /* inputB pointer */ 00752 q63_t sum; /* Accumulator */ 00753 uint32_t i, j; /* loop counters */ 00754 arm_status status; /* status of Partial convolution */ 00755 00756 /* Check for range of output samples to be calculated */ 00757 if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U)))) 00758 { 00759 /* Set status as ARM_ARGUMENT_ERROR */ 00760 status = ARM_MATH_ARGUMENT_ERROR; 00761 } 00762 else 00763 { 00764 /* Loop to calculate convolution for output length number of values */ 00765 for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++) 00766 { 00767 /* Initialize sum with zero to carry on MAC operations */ 00768 sum = 0; 00769 00770 /* Loop to perform MAC operations according to convolution equation */ 00771 for (j = 0; j <= i; j++) 00772 { 00773 /* Check the array limitations */ 00774 if (((i - j) < srcBLen) && (j < srcALen)) 00775 { 00776 /* z[i] += x[i-j] * y[j] */ 00777 sum += ((q31_t) pIn1[j] * (pIn2[i - j])); 00778 } 00779 } 00780 00781 /* Store the output in the destination buffer */ 00782 pDst[i] = (q15_t) __SSAT((sum >> 
15U), 16U); 00783 } 00784 /* set status as ARM_SUCCESS as there are no argument errors */ 00785 status = ARM_MATH_SUCCESS; 00786 } 00787 return (status); 00788 00789 #endif /* #if (defined(ARM_MATH_CM7) || defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) */ 00790 00791 } 00792 00793 /** 00794 * @} end of PartialConv group 00795 */ 00796
Generated on Tue Jul 12 2022 16:46:23 by
