Important changes to repositories hosted on mbed.com
Mbed-hosted Mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software, download the repository ZIP archive or clone it locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
arm_conv_partial_fast_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Project: CMSIS DSP Library 00003 * Title: arm_conv_partial_fast_q15.c 00004 * Description: Fast Q15 Partial convolution 00005 * 00006 * $Date: 27. January 2017 00007 * $Revision: V.1.5.1 00008 * 00009 * Target Processor: Cortex-M cores 00010 * -------------------------------------------------------------------- */ 00011 /* 00012 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. 00013 * 00014 * SPDX-License-Identifier: Apache-2.0 00015 * 00016 * Licensed under the Apache License, Version 2.0 (the License); you may 00017 * not use this file except in compliance with the License. 00018 * You may obtain a copy of the License at 00019 * 00020 * www.apache.org/licenses/LICENSE-2.0 00021 * 00022 * Unless required by applicable law or agreed to in writing, software 00023 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 00024 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00025 * See the License for the specific language governing permissions and 00026 * limitations under the License. 00027 */ 00028 00029 #include "arm_math.h" 00030 00031 /** 00032 * @ingroup groupFilters 00033 */ 00034 00035 /** 00036 * @addtogroup PartialConv 00037 * @{ 00038 */ 00039 00040 /** 00041 * @brief Partial convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4. 00042 * @param[in] *pSrcA points to the first input sequence. 00043 * @param[in] srcALen length of the first input sequence. 00044 * @param[in] *pSrcB points to the second input sequence. 00045 * @param[in] srcBLen length of the second input sequence. 00046 * @param[out] *pDst points to the location where the output result is written. 00047 * @param[in] firstIndex is the first output sample to start with. 00048 * @param[in] numPoints is the number of output points to be computed. 
00049 * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. 00050 * 00051 * See <code>arm_conv_partial_q15()</code> for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion. 00052 */ 00053 00054 00055 arm_status arm_conv_partial_fast_q15( 00056 q15_t * pSrcA, 00057 uint32_t srcALen, 00058 q15_t * pSrcB, 00059 uint32_t srcBLen, 00060 q15_t * pDst, 00061 uint32_t firstIndex, 00062 uint32_t numPoints) 00063 { 00064 #ifndef UNALIGNED_SUPPORT_DISABLE 00065 00066 q15_t *pIn1; /* inputA pointer */ 00067 q15_t *pIn2; /* inputB pointer */ 00068 q15_t *pOut = pDst; /* output pointer */ 00069 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00070 q15_t *px; /* Intermediate inputA pointer */ 00071 q15_t *py; /* Intermediate inputB pointer */ 00072 q15_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00073 q31_t x0, x1, x2, x3, c0; 00074 uint32_t j, k, count, check, blkCnt; 00075 int32_t blockSize1, blockSize2, blockSize3; /* loop counters */ 00076 arm_status status; /* status of Partial convolution */ 00077 00078 /* Check for range of output samples to be calculated */ 00079 if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U)))) 00080 { 00081 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00082 status = ARM_MATH_ARGUMENT_ERROR; 00083 } 00084 else 00085 { 00086 00087 /* The algorithm implementation is based on the lengths of the inputs. */ 00088 /* srcB is always made to slide across srcA. 
*/ 00089 /* So srcBLen is always considered as shorter or equal to srcALen */ 00090 if (srcALen >=srcBLen) 00091 { 00092 /* Initialization of inputA pointer */ 00093 pIn1 = pSrcA; 00094 00095 /* Initialization of inputB pointer */ 00096 pIn2 = pSrcB; 00097 } 00098 else 00099 { 00100 /* Initialization of inputA pointer */ 00101 pIn1 = pSrcB; 00102 00103 /* Initialization of inputB pointer */ 00104 pIn2 = pSrcA; 00105 00106 /* srcBLen is always considered as shorter or equal to srcALen */ 00107 j = srcBLen; 00108 srcBLen = srcALen; 00109 srcALen = j; 00110 } 00111 00112 /* Conditions to check which loopCounter holds 00113 * the first and last indices of the output samples to be calculated. */ 00114 check = firstIndex + numPoints; 00115 blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0; 00116 blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3; 00117 blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); 00118 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : 00119 (int32_t) numPoints) : 0; 00120 blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + 00121 (int32_t) firstIndex); 00122 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; 00123 00124 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00125 /* The function is internally 00126 * divided into three stages according to the number of multiplications that has to be 00127 * taken place between inputA samples and inputB samples. In the first stage of the 00128 * algorithm, the multiplications increase by one for every iteration. 00129 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00130 * In the third stage of the algorithm, the multiplications decrease by one 00131 * for every iteration. 
*/ 00132 00133 /* Set the output pointer to point to the firstIndex 00134 * of the output sample to be calculated. */ 00135 pOut = pDst + firstIndex; 00136 00137 /* -------------------------- 00138 * Initializations of stage1 00139 * -------------------------*/ 00140 00141 /* sum = x[0] * y[0] 00142 * sum = x[0] * y[1] + x[1] * y[0] 00143 * .... 00144 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00145 */ 00146 00147 /* In this stage the MAC operations are increased by 1 for every iteration. 00148 The count variable holds the number of MAC operations performed. 00149 Since the partial convolution starts from firstIndex 00150 Number of Macs to be performed is firstIndex + 1 */ 00151 count = 1U + firstIndex; 00152 00153 /* Working pointer of inputA */ 00154 px = pIn1; 00155 00156 /* Working pointer of inputB */ 00157 pSrc2 = pIn2 + firstIndex; 00158 py = pSrc2; 00159 00160 /* ------------------------ 00161 * Stage1 process 00162 * ----------------------*/ 00163 00164 /* For loop unrolling by 4, this stage is divided into two. */ 00165 /* First part of this stage computes the MAC operations less than 4 */ 00166 /* Second part of this stage computes the MAC operations greater than or equal to 4 */ 00167 00168 /* The first part of the stage starts here */ 00169 while ((count < 4U) && (blockSize1 > 0)) 00170 { 00171 /* Accumulator is made zero for every iteration */ 00172 sum = 0; 00173 00174 /* Loop over number of MAC operations between 00175 * inputA samples and inputB samples */ 00176 k = count; 00177 00178 while (k > 0U) 00179 { 00180 /* Perform the multiply-accumulates */ 00181 sum = __SMLAD(*px++, *py--, sum); 00182 00183 /* Decrement the loop counter */ 00184 k--; 00185 } 00186 00187 /* Store the result in the accumulator in the destination buffer. 
*/ 00188 *pOut++ = (q15_t) (sum >> 15); 00189 00190 /* Update the inputA and inputB pointers for next MAC calculation */ 00191 py = ++pSrc2; 00192 px = pIn1; 00193 00194 /* Increment the MAC count */ 00195 count++; 00196 00197 /* Decrement the loop counter */ 00198 blockSize1--; 00199 } 00200 00201 /* The second part of the stage starts here */ 00202 /* The internal loop, over count, is unrolled by 4 */ 00203 /* To, read the last two inputB samples using SIMD: 00204 * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ 00205 py = py - 1; 00206 00207 while (blockSize1 > 0) 00208 { 00209 /* Accumulator is made zero for every iteration */ 00210 sum = 0; 00211 00212 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00213 k = count >> 2U; 00214 00215 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00216 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00217 while (k > 0U) 00218 { 00219 /* Perform the multiply-accumulates */ 00220 /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */ 00221 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00222 /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */ 00223 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00224 00225 /* Decrement the loop counter */ 00226 k--; 00227 } 00228 00229 /* For the next MAC operations, the pointer py is used without SIMD 00230 * So, py is incremented by 1 */ 00231 py = py + 1U; 00232 00233 /* If the count is not a multiple of 4, compute any remaining MACs here. 00234 ** No loop unrolling is used. */ 00235 k = count % 0x4U; 00236 00237 while (k > 0U) 00238 { 00239 /* Perform the multiply-accumulates */ 00240 sum = __SMLAD(*px++, *py--, sum); 00241 00242 /* Decrement the loop counter */ 00243 k--; 00244 } 00245 00246 /* Store the result in the accumulator in the destination buffer. 
*/ 00247 *pOut++ = (q15_t) (sum >> 15); 00248 00249 /* Update the inputA and inputB pointers for next MAC calculation */ 00250 py = ++pSrc2 - 1U; 00251 px = pIn1; 00252 00253 /* Increment the MAC count */ 00254 count++; 00255 00256 /* Decrement the loop counter */ 00257 blockSize1--; 00258 } 00259 00260 /* -------------------------- 00261 * Initializations of stage2 00262 * ------------------------*/ 00263 00264 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00265 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00266 * .... 00267 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00268 */ 00269 00270 /* Working pointer of inputA */ 00271 if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) 00272 { 00273 px = pIn1 + firstIndex - srcBLen + 1; 00274 } 00275 else 00276 { 00277 px = pIn1; 00278 } 00279 00280 /* Working pointer of inputB */ 00281 pSrc2 = pIn2 + (srcBLen - 1U); 00282 py = pSrc2; 00283 00284 /* count is the index by which the pointer pIn1 to be incremented */ 00285 count = 0U; 00286 00287 00288 /* -------------------- 00289 * Stage2 process 00290 * -------------------*/ 00291 00292 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00293 * So, to loop unroll over blockSize2, 00294 * srcBLen should be greater than or equal to 4 */ 00295 if (srcBLen >= 4U) 00296 { 00297 /* Loop unroll over blockSize2, by 4 */ 00298 blkCnt = ((uint32_t) blockSize2 >> 2U); 00299 00300 while (blkCnt > 0U) 00301 { 00302 py = py - 1U; 00303 00304 /* Set all accumulators to zero */ 00305 acc0 = 0; 00306 acc1 = 0; 00307 acc2 = 0; 00308 acc3 = 0; 00309 00310 00311 /* read x[0], x[1] samples */ 00312 x0 = *__SIMD32(px); 00313 /* read x[1], x[2] samples */ 00314 x1 = _SIMD32_OFFSET(px+1); 00315 px+= 2U; 00316 00317 00318 /* Apply loop unrolling and compute 4 MACs simultaneously. 
*/ 00319 k = srcBLen >> 2U; 00320 00321 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00322 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00323 do 00324 { 00325 /* Read the last two inputB samples using SIMD: 00326 * y[srcBLen - 1] and y[srcBLen - 2] */ 00327 c0 = *__SIMD32(py)--; 00328 00329 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ 00330 acc0 = __SMLADX(x0, c0, acc0); 00331 00332 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ 00333 acc1 = __SMLADX(x1, c0, acc1); 00334 00335 /* Read x[2], x[3] */ 00336 x2 = *__SIMD32(px); 00337 00338 /* Read x[3], x[4] */ 00339 x3 = _SIMD32_OFFSET(px+1); 00340 00341 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ 00342 acc2 = __SMLADX(x2, c0, acc2); 00343 00344 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ 00345 acc3 = __SMLADX(x3, c0, acc3); 00346 00347 /* Read y[srcBLen - 3] and y[srcBLen - 4] */ 00348 c0 = *__SIMD32(py)--; 00349 00350 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ 00351 acc0 = __SMLADX(x2, c0, acc0); 00352 00353 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ 00354 acc1 = __SMLADX(x3, c0, acc1); 00355 00356 /* Read x[4], x[5] */ 00357 x0 = _SIMD32_OFFSET(px+2); 00358 00359 /* Read x[5], x[6] */ 00360 x1 = _SIMD32_OFFSET(px+3); 00361 px += 4U; 00362 00363 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ 00364 acc2 = __SMLADX(x0, c0, acc2); 00365 00366 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ 00367 acc3 = __SMLADX(x1, c0, acc3); 00368 00369 } while (--k); 00370 00371 /* For the next MAC operations, SIMD is not used 00372 * So, the 16 bit pointer if inputB, py is updated */ 00373 00374 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00375 ** No loop unrolling is used. 
*/ 00376 k = srcBLen % 0x4U; 00377 00378 if (k == 1U) 00379 { 00380 /* Read y[srcBLen - 5] */ 00381 c0 = *(py+1); 00382 #ifdef ARM_MATH_BIG_ENDIAN 00383 00384 c0 = c0 << 16U; 00385 00386 #else 00387 00388 c0 = c0 & 0x0000FFFF; 00389 00390 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 00391 00392 /* Read x[7] */ 00393 x3 = *__SIMD32(px); 00394 px++; 00395 00396 /* Perform the multiply-accumulates */ 00397 acc0 = __SMLAD(x0, c0, acc0); 00398 acc1 = __SMLAD(x1, c0, acc1); 00399 acc2 = __SMLADX(x1, c0, acc2); 00400 acc3 = __SMLADX(x3, c0, acc3); 00401 } 00402 00403 if (k == 2U) 00404 { 00405 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 00406 c0 = _SIMD32_OFFSET(py); 00407 00408 /* Read x[7], x[8] */ 00409 x3 = *__SIMD32(px); 00410 00411 /* Read x[9] */ 00412 x2 = _SIMD32_OFFSET(px+1); 00413 px += 2U; 00414 00415 /* Perform the multiply-accumulates */ 00416 acc0 = __SMLADX(x0, c0, acc0); 00417 acc1 = __SMLADX(x1, c0, acc1); 00418 acc2 = __SMLADX(x3, c0, acc2); 00419 acc3 = __SMLADX(x2, c0, acc3); 00420 } 00421 00422 if (k == 3U) 00423 { 00424 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 00425 c0 = _SIMD32_OFFSET(py); 00426 00427 /* Read x[7], x[8] */ 00428 x3 = *__SIMD32(px); 00429 00430 /* Read x[9] */ 00431 x2 = _SIMD32_OFFSET(px+1); 00432 00433 /* Perform the multiply-accumulates */ 00434 acc0 = __SMLADX(x0, c0, acc0); 00435 acc1 = __SMLADX(x1, c0, acc1); 00436 acc2 = __SMLADX(x3, c0, acc2); 00437 acc3 = __SMLADX(x2, c0, acc3); 00438 00439 c0 = *(py-1); 00440 #ifdef ARM_MATH_BIG_ENDIAN 00441 00442 c0 = c0 << 16U; 00443 #else 00444 00445 c0 = c0 & 0x0000FFFF; 00446 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 00447 00448 /* Read x[10] */ 00449 x3 = _SIMD32_OFFSET(px+2); 00450 px += 3U; 00451 00452 /* Perform the multiply-accumulates */ 00453 acc0 = __SMLADX(x1, c0, acc0); 00454 acc1 = __SMLAD(x2, c0, acc1); 00455 acc2 = __SMLADX(x2, c0, acc2); 00456 acc3 = __SMLADX(x3, c0, acc3); 00457 } 00458 00459 /* Store the results in the accumulators in the destination buffer. 
*/ 00460 #ifndef ARM_MATH_BIG_ENDIAN 00461 00462 *__SIMD32(pOut)++ = __PKHBT(acc0 >> 15, acc1 >> 15, 16); 00463 *__SIMD32(pOut)++ = __PKHBT(acc2 >> 15, acc3 >> 15, 16); 00464 00465 #else 00466 00467 *__SIMD32(pOut)++ = __PKHBT(acc1 >> 15, acc0 >> 15, 16); 00468 *__SIMD32(pOut)++ = __PKHBT(acc3 >> 15, acc2 >> 15, 16); 00469 00470 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00471 00472 /* Increment the pointer pIn1 index, count by 4 */ 00473 count += 4U; 00474 00475 /* Update the inputA and inputB pointers for next MAC calculation */ 00476 px = pIn1 + count; 00477 py = pSrc2; 00478 00479 /* Decrement the loop counter */ 00480 blkCnt--; 00481 } 00482 00483 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00484 ** No loop unrolling is used. */ 00485 blkCnt = (uint32_t) blockSize2 % 0x4U; 00486 00487 while (blkCnt > 0U) 00488 { 00489 /* Accumulator is made zero for every iteration */ 00490 sum = 0; 00491 00492 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00493 k = srcBLen >> 2U; 00494 00495 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00496 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00497 while (k > 0U) 00498 { 00499 /* Perform the multiply-accumulates */ 00500 sum += ((q31_t) * px++ * *py--); 00501 sum += ((q31_t) * px++ * *py--); 00502 sum += ((q31_t) * px++ * *py--); 00503 sum += ((q31_t) * px++ * *py--); 00504 00505 /* Decrement the loop counter */ 00506 k--; 00507 } 00508 00509 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00510 ** No loop unrolling is used. */ 00511 k = srcBLen % 0x4U; 00512 00513 while (k > 0U) 00514 { 00515 /* Perform the multiply-accumulates */ 00516 sum += ((q31_t) * px++ * *py--); 00517 00518 /* Decrement the loop counter */ 00519 k--; 00520 } 00521 00522 /* Store the result in the accumulator in the destination buffer. 
*/ 00523 *pOut++ = (q15_t) (sum >> 15); 00524 00525 /* Increment the pointer pIn1 index, count by 1 */ 00526 count++; 00527 00528 /* Update the inputA and inputB pointers for next MAC calculation */ 00529 if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) 00530 { 00531 px = pIn1 + firstIndex - srcBLen + 1 + count; 00532 } 00533 else 00534 { 00535 px = pIn1 + count; 00536 } 00537 py = pSrc2; 00538 00539 /* Decrement the loop counter */ 00540 blkCnt--; 00541 } 00542 } 00543 else 00544 { 00545 /* If the srcBLen is not a multiple of 4, 00546 * the blockSize2 loop cannot be unrolled by 4 */ 00547 blkCnt = (uint32_t) blockSize2; 00548 00549 while (blkCnt > 0U) 00550 { 00551 /* Accumulator is made zero for every iteration */ 00552 sum = 0; 00553 00554 /* srcBLen number of MACS should be performed */ 00555 k = srcBLen; 00556 00557 while (k > 0U) 00558 { 00559 /* Perform the multiply-accumulate */ 00560 sum += ((q31_t) * px++ * *py--); 00561 00562 /* Decrement the loop counter */ 00563 k--; 00564 } 00565 00566 /* Store the result in the accumulator in the destination buffer. */ 00567 *pOut++ = (q15_t) (sum >> 15); 00568 00569 /* Increment the MAC count */ 00570 count++; 00571 00572 /* Update the inputA and inputB pointers for next MAC calculation */ 00573 if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) 00574 { 00575 px = pIn1 + firstIndex - srcBLen + 1 + count; 00576 } 00577 else 00578 { 00579 px = pIn1 + count; 00580 } 00581 py = pSrc2; 00582 00583 /* Decrement the loop counter */ 00584 blkCnt--; 00585 } 00586 } 00587 00588 00589 /* -------------------------- 00590 * Initializations of stage3 00591 * -------------------------*/ 00592 00593 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00594 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00595 * .... 
00596 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00597 * sum += x[srcALen-1] * y[srcBLen-1] 00598 */ 00599 00600 /* In this stage the MAC operations are decreased by 1 for every iteration. 00601 The count variable holds the number of MAC operations performed */ 00602 count = srcBLen - 1U; 00603 00604 /* Working pointer of inputA */ 00605 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U); 00606 px = pSrc1; 00607 00608 /* Working pointer of inputB */ 00609 pSrc2 = pIn2 + (srcBLen - 1U); 00610 pIn2 = pSrc2 - 1U; 00611 py = pIn2; 00612 00613 /* ------------------- 00614 * Stage3 process 00615 * ------------------*/ 00616 00617 /* For loop unrolling by 4, this stage is divided into two. */ 00618 /* First part of this stage computes the MAC operations greater than 4 */ 00619 /* Second part of this stage computes the MAC operations less than or equal to 4 */ 00620 00621 /* The first part of the stage starts here */ 00622 j = count >> 2U; 00623 00624 while ((j > 0U) && (blockSize3 > 0)) 00625 { 00626 /* Accumulator is made zero for every iteration */ 00627 sum = 0; 00628 00629 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00630 k = count >> 2U; 00631 00632 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00633 ** a second loop below computes MACs for the remaining 1 to 3 samples. 
*/ 00634 while (k > 0U) 00635 { 00636 /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied 00637 * with y[srcBLen - 1], y[srcBLen - 2] respectively */ 00638 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00639 /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied 00640 * with y[srcBLen - 3], y[srcBLen - 4] respectively */ 00641 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00642 00643 /* Decrement the loop counter */ 00644 k--; 00645 } 00646 00647 /* For the next MAC operations, the pointer py is used without SIMD 00648 * So, py is incremented by 1 */ 00649 py = py + 1U; 00650 00651 /* If the count is not a multiple of 4, compute any remaining MACs here. 00652 ** No loop unrolling is used. */ 00653 k = count % 0x4U; 00654 00655 while (k > 0U) 00656 { 00657 /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */ 00658 sum = __SMLAD(*px++, *py--, sum); 00659 00660 /* Decrement the loop counter */ 00661 k--; 00662 } 00663 00664 /* Store the result in the accumulator in the destination buffer. */ 00665 *pOut++ = (q15_t) (sum >> 15); 00666 00667 /* Update the inputA and inputB pointers for next MAC calculation */ 00668 px = ++pSrc1; 00669 py = pIn2; 00670 00671 /* Decrement the MAC count */ 00672 count--; 00673 00674 /* Decrement the loop counter */ 00675 blockSize3--; 00676 00677 j--; 00678 } 00679 00680 /* The second part of the stage starts here */ 00681 /* SIMD is not used for the next MAC operations, 00682 * so pointer py is updated to read only one sample at a time */ 00683 py = py + 1U; 00684 00685 while (blockSize3 > 0) 00686 { 00687 /* Accumulator is made zero for every iteration */ 00688 sum = 0; 00689 00690 /* Apply loop unrolling and compute 4 MACs simultaneously. 
*/ 00691 k = count; 00692 00693 while (k > 0U) 00694 { 00695 /* Perform the multiply-accumulates */ 00696 /* sum += x[srcALen-1] * y[srcBLen-1] */ 00697 sum = __SMLAD(*px++, *py--, sum); 00698 00699 /* Decrement the loop counter */ 00700 k--; 00701 } 00702 00703 /* Store the result in the accumulator in the destination buffer. */ 00704 *pOut++ = (q15_t) (sum >> 15); 00705 00706 /* Update the inputA and inputB pointers for next MAC calculation */ 00707 px = ++pSrc1; 00708 py = pSrc2; 00709 00710 /* Decrement the MAC count */ 00711 count--; 00712 00713 /* Decrement the loop counter */ 00714 blockSize3--; 00715 } 00716 00717 /* set status as ARM_MATH_SUCCESS */ 00718 status = ARM_MATH_SUCCESS; 00719 } 00720 00721 /* Return to application */ 00722 return (status); 00723 00724 #else 00725 00726 q15_t *pIn1; /* inputA pointer */ 00727 q15_t *pIn2; /* inputB pointer */ 00728 q15_t *pOut = pDst; /* output pointer */ 00729 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00730 q15_t *px; /* Intermediate inputA pointer */ 00731 q15_t *py; /* Intermediate inputB pointer */ 00732 q15_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00733 q31_t x0, x1, x2, x3, c0; 00734 uint32_t j, k, count, check, blkCnt; 00735 int32_t blockSize1, blockSize2, blockSize3; /* loop counters */ 00736 arm_status status; /* status of Partial convolution */ 00737 q15_t a, b; 00738 00739 /* Check for range of output samples to be calculated */ 00740 if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U)))) 00741 { 00742 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00743 status = ARM_MATH_ARGUMENT_ERROR; 00744 } 00745 else 00746 { 00747 00748 /* The algorithm implementation is based on the lengths of the inputs. */ 00749 /* srcB is always made to slide across srcA. 
*/ 00750 /* So srcBLen is always considered as shorter or equal to srcALen */ 00751 if (srcALen >=srcBLen) 00752 { 00753 /* Initialization of inputA pointer */ 00754 pIn1 = pSrcA; 00755 00756 /* Initialization of inputB pointer */ 00757 pIn2 = pSrcB; 00758 } 00759 else 00760 { 00761 /* Initialization of inputA pointer */ 00762 pIn1 = pSrcB; 00763 00764 /* Initialization of inputB pointer */ 00765 pIn2 = pSrcA; 00766 00767 /* srcBLen is always considered as shorter or equal to srcALen */ 00768 j = srcBLen; 00769 srcBLen = srcALen; 00770 srcALen = j; 00771 } 00772 00773 /* Conditions to check which loopCounter holds 00774 * the first and last indices of the output samples to be calculated. */ 00775 check = firstIndex + numPoints; 00776 blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0; 00777 blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3; 00778 blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex; 00779 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : 00780 (int32_t) numPoints) : 0; 00781 blockSize2 = ((int32_t) check - blockSize3) - 00782 (blockSize1 + (int32_t) firstIndex); 00783 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; 00784 00785 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00786 /* The function is internally 00787 * divided into three stages according to the number of multiplications that has to be 00788 * taken place between inputA samples and inputB samples. In the first stage of the 00789 * algorithm, the multiplications increase by one for every iteration. 00790 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00791 * In the third stage of the algorithm, the multiplications decrease by one 00792 * for every iteration. 
*/ 00793 00794 /* Set the output pointer to point to the firstIndex 00795 * of the output sample to be calculated. */ 00796 pOut = pDst + firstIndex; 00797 00798 /* -------------------------- 00799 * Initializations of stage1 00800 * -------------------------*/ 00801 00802 /* sum = x[0] * y[0] 00803 * sum = x[0] * y[1] + x[1] * y[0] 00804 * .... 00805 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00806 */ 00807 00808 /* In this stage the MAC operations are increased by 1 for every iteration. 00809 The count variable holds the number of MAC operations performed. 00810 Since the partial convolution starts from firstIndex 00811 Number of Macs to be performed is firstIndex + 1 */ 00812 count = 1U + firstIndex; 00813 00814 /* Working pointer of inputA */ 00815 px = pIn1; 00816 00817 /* Working pointer of inputB */ 00818 pSrc2 = pIn2 + firstIndex; 00819 py = pSrc2; 00820 00821 /* ------------------------ 00822 * Stage1 process 00823 * ----------------------*/ 00824 00825 /* For loop unrolling by 4, this stage is divided into two. */ 00826 /* First part of this stage computes the MAC operations less than 4 */ 00827 /* Second part of this stage computes the MAC operations greater than or equal to 4 */ 00828 00829 /* The first part of the stage starts here */ 00830 while ((count < 4U) && (blockSize1 > 0)) 00831 { 00832 /* Accumulator is made zero for every iteration */ 00833 sum = 0; 00834 00835 /* Loop over number of MAC operations between 00836 * inputA samples and inputB samples */ 00837 k = count; 00838 00839 while (k > 0U) 00840 { 00841 /* Perform the multiply-accumulates */ 00842 sum += ((q31_t) * px++ * *py--); 00843 00844 /* Decrement the loop counter */ 00845 k--; 00846 } 00847 00848 /* Store the result in the accumulator in the destination buffer. 
*/ 00849 *pOut++ = (q15_t) (sum >> 15); 00850 00851 /* Update the inputA and inputB pointers for next MAC calculation */ 00852 py = ++pSrc2; 00853 px = pIn1; 00854 00855 /* Increment the MAC count */ 00856 count++; 00857 00858 /* Decrement the loop counter */ 00859 blockSize1--; 00860 } 00861 00862 /* The second part of the stage starts here */ 00863 /* The internal loop, over count, is unrolled by 4 */ 00864 /* To, read the last two inputB samples using SIMD: 00865 * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ 00866 py = py - 1; 00867 00868 while (blockSize1 > 0) 00869 { 00870 /* Accumulator is made zero for every iteration */ 00871 sum = 0; 00872 00873 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00874 k = count >> 2U; 00875 00876 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00877 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00878 py++; 00879 00880 while (k > 0U) 00881 { 00882 /* Perform the multiply-accumulates */ 00883 sum += ((q31_t) * px++ * *py--); 00884 sum += ((q31_t) * px++ * *py--); 00885 sum += ((q31_t) * px++ * *py--); 00886 sum += ((q31_t) * px++ * *py--); 00887 00888 /* Decrement the loop counter */ 00889 k--; 00890 } 00891 00892 /* If the count is not a multiple of 4, compute any remaining MACs here. 00893 ** No loop unrolling is used. */ 00894 k = count % 0x4U; 00895 00896 while (k > 0U) 00897 { 00898 /* Perform the multiply-accumulates */ 00899 sum += ((q31_t) * px++ * *py--); 00900 00901 /* Decrement the loop counter */ 00902 k--; 00903 } 00904 00905 /* Store the result in the accumulator in the destination buffer. 
*/ 00906 *pOut++ = (q15_t) (sum >> 15); 00907 00908 /* Update the inputA and inputB pointers for next MAC calculation */ 00909 py = ++pSrc2 - 1U; 00910 px = pIn1; 00911 00912 /* Increment the MAC count */ 00913 count++; 00914 00915 /* Decrement the loop counter */ 00916 blockSize1--; 00917 } 00918 00919 /* -------------------------- 00920 * Initializations of stage2 00921 * ------------------------*/ 00922 00923 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00924 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00925 * .... 00926 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00927 */ 00928 00929 /* Working pointer of inputA */ 00930 if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) 00931 { 00932 px = pIn1 + firstIndex - srcBLen + 1; 00933 } 00934 else 00935 { 00936 px = pIn1; 00937 } 00938 00939 /* Working pointer of inputB */ 00940 pSrc2 = pIn2 + (srcBLen - 1U); 00941 py = pSrc2; 00942 00943 /* count is the index by which the pointer pIn1 to be incremented */ 00944 count = 0U; 00945 00946 00947 /* -------------------- 00948 * Stage2 process 00949 * -------------------*/ 00950 00951 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 
00952 * So, to loop unroll over blockSize2, 00953 * srcBLen should be greater than or equal to 4 */ 00954 if (srcBLen >= 4U) 00955 { 00956 /* Loop unroll over blockSize2, by 4 */ 00957 blkCnt = ((uint32_t) blockSize2 >> 2U); 00958 00959 while (blkCnt > 0U) 00960 { 00961 py = py - 1U; 00962 00963 /* Set all accumulators to zero */ 00964 acc0 = 0; 00965 acc1 = 0; 00966 acc2 = 0; 00967 acc3 = 0; 00968 00969 /* read x[0], x[1] samples */ 00970 a = *px++; 00971 b = *px++; 00972 00973 #ifndef ARM_MATH_BIG_ENDIAN 00974 00975 x0 = __PKHBT(a, b, 16); 00976 a = *px; 00977 x1 = __PKHBT(b, a, 16); 00978 00979 #else 00980 00981 x0 = __PKHBT(b, a, 16); 00982 a = *px; 00983 x1 = __PKHBT(a, b, 16); 00984 00985 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00986 00987 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00988 k = srcBLen >> 2U; 00989 00990 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00991 ** a second loop below computes MACs for the remaining 1 to 3 samples. 
*/ 00992 do 00993 { 00994 /* Read the last two inputB samples using SIMD: 00995 * y[srcBLen - 1] and y[srcBLen - 2] */ 00996 a = *py; 00997 b = *(py+1); 00998 py -= 2; 00999 01000 #ifndef ARM_MATH_BIG_ENDIAN 01001 01002 c0 = __PKHBT(a, b, 16); 01003 01004 #else 01005 01006 c0 = __PKHBT(b, a, 16);; 01007 01008 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01009 01010 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ 01011 acc0 = __SMLADX(x0, c0, acc0); 01012 01013 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ 01014 acc1 = __SMLADX(x1, c0, acc1); 01015 01016 a = *px; 01017 b = *(px + 1); 01018 01019 #ifndef ARM_MATH_BIG_ENDIAN 01020 01021 x2 = __PKHBT(a, b, 16); 01022 a = *(px + 2); 01023 x3 = __PKHBT(b, a, 16); 01024 01025 #else 01026 01027 x2 = __PKHBT(b, a, 16); 01028 a = *(px + 2); 01029 x3 = __PKHBT(a, b, 16); 01030 01031 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01032 01033 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ 01034 acc2 = __SMLADX(x2, c0, acc2); 01035 01036 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ 01037 acc3 = __SMLADX(x3, c0, acc3); 01038 01039 /* Read y[srcBLen - 3] and y[srcBLen - 4] */ 01040 a = *py; 01041 b = *(py+1); 01042 py -= 2; 01043 01044 #ifndef ARM_MATH_BIG_ENDIAN 01045 01046 c0 = __PKHBT(a, b, 16); 01047 01048 #else 01049 01050 c0 = __PKHBT(b, a, 16);; 01051 01052 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01053 01054 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ 01055 acc0 = __SMLADX(x2, c0, acc0); 01056 01057 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ 01058 acc1 = __SMLADX(x3, c0, acc1); 01059 01060 /* Read x[4], x[5], x[6] */ 01061 a = *(px + 2); 01062 b = *(px + 3); 01063 01064 #ifndef ARM_MATH_BIG_ENDIAN 01065 01066 x0 = __PKHBT(a, b, 16); 01067 a = *(px + 4); 01068 x1 = __PKHBT(b, a, 16); 01069 01070 #else 01071 01072 x0 = __PKHBT(b, a, 16); 01073 a = *(px + 4); 01074 x1 = __PKHBT(a, b, 16); 01075 01076 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01077 
01078 px += 4U; 01079 01080 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ 01081 acc2 = __SMLADX(x0, c0, acc2); 01082 01083 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ 01084 acc3 = __SMLADX(x1, c0, acc3); 01085 01086 } while (--k); 01087 01088 /* For the next MAC operations, SIMD is not used 01089 * So, the 16 bit pointer if inputB, py is updated */ 01090 01091 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 01092 ** No loop unrolling is used. */ 01093 k = srcBLen % 0x4U; 01094 01095 if (k == 1U) 01096 { 01097 /* Read y[srcBLen - 5] */ 01098 c0 = *(py+1); 01099 01100 #ifdef ARM_MATH_BIG_ENDIAN 01101 01102 c0 = c0 << 16U; 01103 01104 #else 01105 01106 c0 = c0 & 0x0000FFFF; 01107 01108 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 01109 01110 /* Read x[7] */ 01111 a = *px; 01112 b = *(px+1); 01113 px++; 01114 01115 #ifndef ARM_MATH_BIG_ENDIAN 01116 01117 x3 = __PKHBT(a, b, 16); 01118 01119 #else 01120 01121 x3 = __PKHBT(b, a, 16);; 01122 01123 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01124 01125 01126 /* Perform the multiply-accumulates */ 01127 acc0 = __SMLAD(x0, c0, acc0); 01128 acc1 = __SMLAD(x1, c0, acc1); 01129 acc2 = __SMLADX(x1, c0, acc2); 01130 acc3 = __SMLADX(x3, c0, acc3); 01131 } 01132 01133 if (k == 2U) 01134 { 01135 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 01136 a = *py; 01137 b = *(py+1); 01138 01139 #ifndef ARM_MATH_BIG_ENDIAN 01140 01141 c0 = __PKHBT(a, b, 16); 01142 01143 #else 01144 01145 c0 = __PKHBT(b, a, 16);; 01146 01147 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01148 01149 /* Read x[7], x[8], x[9] */ 01150 a = *px; 01151 b = *(px + 1); 01152 01153 #ifndef ARM_MATH_BIG_ENDIAN 01154 01155 x3 = __PKHBT(a, b, 16); 01156 a = *(px + 2); 01157 x2 = __PKHBT(b, a, 16); 01158 01159 #else 01160 01161 x3 = __PKHBT(b, a, 16); 01162 a = *(px + 2); 01163 x2 = __PKHBT(a, b, 16); 01164 01165 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01166 px += 2U; 01167 01168 /* Perform the multiply-accumulates */ 01169 acc0 = 
__SMLADX(x0, c0, acc0); 01170 acc1 = __SMLADX(x1, c0, acc1); 01171 acc2 = __SMLADX(x3, c0, acc2); 01172 acc3 = __SMLADX(x2, c0, acc3); 01173 } 01174 01175 if (k == 3U) 01176 { 01177 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 01178 a = *py; 01179 b = *(py+1); 01180 01181 #ifndef ARM_MATH_BIG_ENDIAN 01182 01183 c0 = __PKHBT(a, b, 16); 01184 01185 #else 01186 01187 c0 = __PKHBT(b, a, 16);; 01188 01189 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01190 01191 /* Read x[7], x[8], x[9] */ 01192 a = *px; 01193 b = *(px + 1); 01194 01195 #ifndef ARM_MATH_BIG_ENDIAN 01196 01197 x3 = __PKHBT(a, b, 16); 01198 a = *(px + 2); 01199 x2 = __PKHBT(b, a, 16); 01200 01201 #else 01202 01203 x3 = __PKHBT(b, a, 16); 01204 a = *(px + 2); 01205 x2 = __PKHBT(a, b, 16); 01206 01207 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01208 01209 /* Perform the multiply-accumulates */ 01210 acc0 = __SMLADX(x0, c0, acc0); 01211 acc1 = __SMLADX(x1, c0, acc1); 01212 acc2 = __SMLADX(x3, c0, acc2); 01213 acc3 = __SMLADX(x2, c0, acc3); 01214 01215 /* Read y[srcBLen - 7] */ 01216 c0 = *(py-1); 01217 #ifdef ARM_MATH_BIG_ENDIAN 01218 01219 c0 = c0 << 16U; 01220 #else 01221 01222 c0 = c0 & 0x0000FFFF; 01223 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 01224 01225 /* Read x[10] */ 01226 a = *(px+2); 01227 b = *(px+3); 01228 01229 #ifndef ARM_MATH_BIG_ENDIAN 01230 01231 x3 = __PKHBT(a, b, 16); 01232 01233 #else 01234 01235 x3 = __PKHBT(b, a, 16);; 01236 01237 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01238 01239 px += 3U; 01240 01241 /* Perform the multiply-accumulates */ 01242 acc0 = __SMLADX(x1, c0, acc0); 01243 acc1 = __SMLAD(x2, c0, acc1); 01244 acc2 = __SMLADX(x2, c0, acc2); 01245 acc3 = __SMLADX(x3, c0, acc3); 01246 } 01247 01248 /* Store the results in the accumulators in the destination buffer. 
*/ 01249 *pOut++ = (q15_t)(acc0 >> 15); 01250 *pOut++ = (q15_t)(acc1 >> 15); 01251 *pOut++ = (q15_t)(acc2 >> 15); 01252 *pOut++ = (q15_t)(acc3 >> 15); 01253 01254 /* Increment the pointer pIn1 index, count by 4 */ 01255 count += 4U; 01256 01257 /* Update the inputA and inputB pointers for next MAC calculation */ 01258 px = pIn1 + count; 01259 py = pSrc2; 01260 01261 /* Decrement the loop counter */ 01262 blkCnt--; 01263 } 01264 01265 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 01266 ** No loop unrolling is used. */ 01267 blkCnt = (uint32_t) blockSize2 % 0x4U; 01268 01269 while (blkCnt > 0U) 01270 { 01271 /* Accumulator is made zero for every iteration */ 01272 sum = 0; 01273 01274 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 01275 k = srcBLen >> 2U; 01276 01277 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 01278 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 01279 while (k > 0U) 01280 { 01281 /* Perform the multiply-accumulates */ 01282 sum += ((q31_t) * px++ * *py--); 01283 sum += ((q31_t) * px++ * *py--); 01284 sum += ((q31_t) * px++ * *py--); 01285 sum += ((q31_t) * px++ * *py--); 01286 01287 /* Decrement the loop counter */ 01288 k--; 01289 } 01290 01291 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 01292 ** No loop unrolling is used. */ 01293 k = srcBLen % 0x4U; 01294 01295 while (k > 0U) 01296 { 01297 /* Perform the multiply-accumulates */ 01298 sum += ((q31_t) * px++ * *py--); 01299 01300 /* Decrement the loop counter */ 01301 k--; 01302 } 01303 01304 /* Store the result in the accumulator in the destination buffer. 
*/ 01305 *pOut++ = (q15_t) (sum >> 15); 01306 01307 /* Increment the pointer pIn1 index, count by 1 */ 01308 count++; 01309 01310 /* Update the inputA and inputB pointers for next MAC calculation */ 01311 px = pIn1 + count; 01312 py = pSrc2; 01313 01314 /* Decrement the loop counter */ 01315 blkCnt--; 01316 } 01317 } 01318 else 01319 { 01320 /* If the srcBLen is not a multiple of 4, 01321 * the blockSize2 loop cannot be unrolled by 4 */ 01322 blkCnt = (uint32_t) blockSize2; 01323 01324 while (blkCnt > 0U) 01325 { 01326 /* Accumulator is made zero for every iteration */ 01327 sum = 0; 01328 01329 /* srcBLen number of MACS should be performed */ 01330 k = srcBLen; 01331 01332 while (k > 0U) 01333 { 01334 /* Perform the multiply-accumulate */ 01335 sum += ((q31_t) * px++ * *py--); 01336 01337 /* Decrement the loop counter */ 01338 k--; 01339 } 01340 01341 /* Store the result in the accumulator in the destination buffer. */ 01342 *pOut++ = (q15_t) (sum >> 15); 01343 01344 /* Increment the MAC count */ 01345 count++; 01346 01347 /* Update the inputA and inputB pointers for next MAC calculation */ 01348 px = pIn1 + count; 01349 py = pSrc2; 01350 01351 /* Decrement the loop counter */ 01352 blkCnt--; 01353 } 01354 } 01355 01356 01357 /* -------------------------- 01358 * Initializations of stage3 01359 * -------------------------*/ 01360 01361 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 01362 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 01363 * .... 01364 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 01365 * sum += x[srcALen-1] * y[srcBLen-1] 01366 */ 01367 01368 /* In this stage the MAC operations are decreased by 1 for every iteration. 
01369 The count variable holds the number of MAC operations performed */ 01370 count = srcBLen - 1U; 01371 01372 /* Working pointer of inputA */ 01373 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U); 01374 px = pSrc1; 01375 01376 /* Working pointer of inputB */ 01377 pSrc2 = pIn2 + (srcBLen - 1U); 01378 pIn2 = pSrc2 - 1U; 01379 py = pIn2; 01380 01381 /* ------------------- 01382 * Stage3 process 01383 * ------------------*/ 01384 01385 /* For loop unrolling by 4, this stage is divided into two. */ 01386 /* First part of this stage computes the MAC operations greater than 4 */ 01387 /* Second part of this stage computes the MAC operations less than or equal to 4 */ 01388 01389 /* The first part of the stage starts here */ 01390 j = count >> 2U; 01391 01392 while ((j > 0U) && (blockSize3 > 0)) 01393 { 01394 /* Accumulator is made zero for every iteration */ 01395 sum = 0; 01396 01397 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 01398 k = count >> 2U; 01399 01400 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 01401 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 01402 py++; 01403 01404 while (k > 0U) 01405 { 01406 /* Perform the multiply-accumulates */ 01407 sum += ((q31_t) * px++ * *py--); 01408 sum += ((q31_t) * px++ * *py--); 01409 sum += ((q31_t) * px++ * *py--); 01410 sum += ((q31_t) * px++ * *py--); 01411 /* Decrement the loop counter */ 01412 k--; 01413 } 01414 01415 01416 /* If the count is not a multiple of 4, compute any remaining MACs here. 01417 ** No loop unrolling is used. */ 01418 k = count % 0x4U; 01419 01420 while (k > 0U) 01421 { 01422 /* Perform the multiply-accumulates */ 01423 sum += ((q31_t) * px++ * *py--); 01424 01425 /* Decrement the loop counter */ 01426 k--; 01427 } 01428 01429 /* Store the result in the accumulator in the destination buffer. 
*/ 01430 *pOut++ = (q15_t) (sum >> 15); 01431 01432 /* Update the inputA and inputB pointers for next MAC calculation */ 01433 px = ++pSrc1; 01434 py = pIn2; 01435 01436 /* Decrement the MAC count */ 01437 count--; 01438 01439 /* Decrement the loop counter */ 01440 blockSize3--; 01441 01442 j--; 01443 } 01444 01445 /* The second part of the stage starts here */ 01446 /* SIMD is not used for the next MAC operations, 01447 * so pointer py is updated to read only one sample at a time */ 01448 py = py + 1U; 01449 01450 while (blockSize3 > 0) 01451 { 01452 /* Accumulator is made zero for every iteration */ 01453 sum = 0; 01454 01455 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 01456 k = count; 01457 01458 while (k > 0U) 01459 { 01460 /* Perform the multiply-accumulates */ 01461 /* sum += x[srcALen-1] * y[srcBLen-1] */ 01462 sum += ((q31_t) * px++ * *py--); 01463 01464 /* Decrement the loop counter */ 01465 k--; 01466 } 01467 01468 /* Store the result in the accumulator in the destination buffer. */ 01469 *pOut++ = (q15_t) (sum >> 15); 01470 01471 /* Update the inputA and inputB pointers for next MAC calculation */ 01472 px = ++pSrc1; 01473 py = pSrc2; 01474 01475 /* Decrement the MAC count */ 01476 count--; 01477 01478 /* Decrement the loop counter */ 01479 blockSize3--; 01480 } 01481 01482 /* set status as ARM_MATH_SUCCESS */ 01483 status = ARM_MATH_SUCCESS; 01484 } 01485 01486 /* Return to application */ 01487 return (status); 01488 01489 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 01490 } 01491 01492 /** 01493 * @} end of PartialConv group 01494 */ 01495
Generated on Tue Jul 12 2022 16:46:23 by Doxygen 1.7.2