CMSIS DSP library
Dependents: performance_timer Surfboard_ gps2rtty Capstone ... more
arm_conv_partial_fast_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 19. March 2015 00005 * $Revision: V.1.4.5 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_partial_fast_q15.c 00009 * 00010 * Description: Fast Q15 Partial convolution. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupFilters 00045 */ 00046 00047 /** 00048 * @addtogroup PartialConv 00049 * @{ 00050 */ 00051 00052 /** 00053 * @brief Partial convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4. 00054 * @param[in] *pSrcA points to the first input sequence. 00055 * @param[in] srcALen length of the first input sequence. 00056 * @param[in] *pSrcB points to the second input sequence. 00057 * @param[in] srcBLen length of the second input sequence. 00058 * @param[out] *pDst points to the location where the output result is written. 00059 * @param[in] firstIndex is the first output sample to start with. 00060 * @param[in] numPoints is the number of output points to be computed. 00061 * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0, srcALen+srcBLen-2]. 00062 * 00063 * See <code>arm_conv_partial_q15()</code> for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap-around distortion. 
00064 */ 00065 00066 00067 arm_status arm_conv_partial_fast_q15( 00068 q15_t * pSrcA, 00069 uint32_t srcALen, 00070 q15_t * pSrcB, 00071 uint32_t srcBLen, 00072 q15_t * pDst, 00073 uint32_t firstIndex, 00074 uint32_t numPoints) 00075 { 00076 #ifndef UNALIGNED_SUPPORT_DISABLE 00077 00078 q15_t *pIn1; /* inputA pointer */ 00079 q15_t *pIn2; /* inputB pointer */ 00080 q15_t *pOut = pDst; /* output pointer */ 00081 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00082 q15_t *px; /* Intermediate inputA pointer */ 00083 q15_t *py; /* Intermediate inputB pointer */ 00084 q15_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00085 q31_t x0, x1, x2, x3, c0; 00086 uint32_t j, k, count, check, blkCnt; 00087 int32_t blockSize1, blockSize2, blockSize3; /* loop counters */ 00088 arm_status status; /* status of Partial convolution */ 00089 00090 /* Check for range of output samples to be calculated */ 00091 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00092 { 00093 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00094 status = ARM_MATH_ARGUMENT_ERROR; 00095 } 00096 else 00097 { 00098 00099 /* The algorithm implementation is based on the lengths of the inputs. */ 00100 /* srcB is always made to slide across srcA. */ 00101 /* So srcBLen is always considered as shorter or equal to srcALen */ 00102 if(srcALen >=srcBLen) 00103 { 00104 /* Initialization of inputA pointer */ 00105 pIn1 = pSrcA; 00106 00107 /* Initialization of inputB pointer */ 00108 pIn2 = pSrcB; 00109 } 00110 else 00111 { 00112 /* Initialization of inputA pointer */ 00113 pIn1 = pSrcB; 00114 00115 /* Initialization of inputB pointer */ 00116 pIn2 = pSrcA; 00117 00118 /* srcBLen is always considered as shorter or equal to srcALen */ 00119 j = srcBLen; 00120 srcBLen = srcALen; 00121 srcALen = j; 00122 } 00123 00124 /* Conditions to check which loopCounter holds 00125 * the first and last indices of the output samples to be calculated. 
*/ 00126 check = firstIndex + numPoints; 00127 blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0; 00128 blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3; 00129 blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); 00130 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 : 00131 (int32_t) numPoints) : 0; 00132 blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + 00133 (int32_t) firstIndex); 00134 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; 00135 00136 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00137 /* The function is internally 00138 * divided into three stages according to the number of multiplications that has to be 00139 * taken place between inputA samples and inputB samples. In the first stage of the 00140 * algorithm, the multiplications increase by one for every iteration. 00141 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00142 * In the third stage of the algorithm, the multiplications decrease by one 00143 * for every iteration. */ 00144 00145 /* Set the output pointer to point to the firstIndex 00146 * of the output sample to be calculated. */ 00147 pOut = pDst + firstIndex; 00148 00149 /* -------------------------- 00150 * Initializations of stage1 00151 * -------------------------*/ 00152 00153 /* sum = x[0] * y[0] 00154 * sum = x[0] * y[1] + x[1] * y[0] 00155 * .... 00156 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00157 */ 00158 00159 /* In this stage the MAC operations are increased by 1 for every iteration. 00160 The count variable holds the number of MAC operations performed. 
00161 Since the partial convolution starts from firstIndex 00162 Number of Macs to be performed is firstIndex + 1 */ 00163 count = 1u + firstIndex; 00164 00165 /* Working pointer of inputA */ 00166 px = pIn1; 00167 00168 /* Working pointer of inputB */ 00169 pSrc2 = pIn2 + firstIndex; 00170 py = pSrc2; 00171 00172 /* ------------------------ 00173 * Stage1 process 00174 * ----------------------*/ 00175 00176 /* For loop unrolling by 4, this stage is divided into two. */ 00177 /* First part of this stage computes the MAC operations less than 4 */ 00178 /* Second part of this stage computes the MAC operations greater than or equal to 4 */ 00179 00180 /* The first part of the stage starts here */ 00181 while((count < 4u) && (blockSize1 > 0)) 00182 { 00183 /* Accumulator is made zero for every iteration */ 00184 sum = 0; 00185 00186 /* Loop over number of MAC operations between 00187 * inputA samples and inputB samples */ 00188 k = count; 00189 00190 while(k > 0u) 00191 { 00192 /* Perform the multiply-accumulates */ 00193 sum = __SMLAD(*px++, *py--, sum); 00194 00195 /* Decrement the loop counter */ 00196 k--; 00197 } 00198 00199 /* Store the result in the accumulator in the destination buffer. */ 00200 *pOut++ = (q15_t) (sum >> 15); 00201 00202 /* Update the inputA and inputB pointers for next MAC calculation */ 00203 py = ++pSrc2; 00204 px = pIn1; 00205 00206 /* Increment the MAC count */ 00207 count++; 00208 00209 /* Decrement the loop counter */ 00210 blockSize1--; 00211 } 00212 00213 /* The second part of the stage starts here */ 00214 /* The internal loop, over count, is unrolled by 4 */ 00215 /* To, read the last two inputB samples using SIMD: 00216 * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ 00217 py = py - 1; 00218 00219 while(blockSize1 > 0) 00220 { 00221 /* Accumulator is made zero for every iteration */ 00222 sum = 0; 00223 00224 /* Apply loop unrolling and compute 4 MACs simultaneously. 
*/ 00225 k = count >> 2u; 00226 00227 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00228 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00229 while(k > 0u) 00230 { 00231 /* Perform the multiply-accumulates */ 00232 /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */ 00233 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00234 /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */ 00235 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00236 00237 /* Decrement the loop counter */ 00238 k--; 00239 } 00240 00241 /* For the next MAC operations, the pointer py is used without SIMD 00242 * So, py is incremented by 1 */ 00243 py = py + 1u; 00244 00245 /* If the count is not a multiple of 4, compute any remaining MACs here. 00246 ** No loop unrolling is used. */ 00247 k = count % 0x4u; 00248 00249 while(k > 0u) 00250 { 00251 /* Perform the multiply-accumulates */ 00252 sum = __SMLAD(*px++, *py--, sum); 00253 00254 /* Decrement the loop counter */ 00255 k--; 00256 } 00257 00258 /* Store the result in the accumulator in the destination buffer. */ 00259 *pOut++ = (q15_t) (sum >> 15); 00260 00261 /* Update the inputA and inputB pointers for next MAC calculation */ 00262 py = ++pSrc2 - 1u; 00263 px = pIn1; 00264 00265 /* Increment the MAC count */ 00266 count++; 00267 00268 /* Decrement the loop counter */ 00269 blockSize1--; 00270 } 00271 00272 /* -------------------------- 00273 * Initializations of stage2 00274 * ------------------------*/ 00275 00276 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00277 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00278 * .... 
00279 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00280 */ 00281 00282 /* Working pointer of inputA */ 00283 if((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) 00284 { 00285 px = pIn1 + firstIndex - srcBLen + 1; 00286 } 00287 else 00288 { 00289 px = pIn1; 00290 } 00291 00292 /* Working pointer of inputB */ 00293 pSrc2 = pIn2 + (srcBLen - 1u); 00294 py = pSrc2; 00295 00296 /* count is the index by which the pointer pIn1 to be incremented */ 00297 count = 0u; 00298 00299 00300 /* -------------------- 00301 * Stage2 process 00302 * -------------------*/ 00303 00304 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00305 * So, to loop unroll over blockSize2, 00306 * srcBLen should be greater than or equal to 4 */ 00307 if(srcBLen >= 4u) 00308 { 00309 /* Loop unroll over blockSize2, by 4 */ 00310 blkCnt = ((uint32_t) blockSize2 >> 2u); 00311 00312 while(blkCnt > 0u) 00313 { 00314 py = py - 1u; 00315 00316 /* Set all accumulators to zero */ 00317 acc0 = 0; 00318 acc1 = 0; 00319 acc2 = 0; 00320 acc3 = 0; 00321 00322 00323 /* read x[0], x[1] samples */ 00324 x0 = *__SIMD32(px); 00325 /* read x[1], x[2] samples */ 00326 x1 = _SIMD32_OFFSET(px+1); 00327 px+= 2u; 00328 00329 00330 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00331 k = srcBLen >> 2u; 00332 00333 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00334 ** a second loop below computes MACs for the remaining 1 to 3 samples. 
*/ 00335 do 00336 { 00337 /* Read the last two inputB samples using SIMD: 00338 * y[srcBLen - 1] and y[srcBLen - 2] */ 00339 c0 = *__SIMD32(py)--; 00340 00341 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ 00342 acc0 = __SMLADX(x0, c0, acc0); 00343 00344 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ 00345 acc1 = __SMLADX(x1, c0, acc1); 00346 00347 /* Read x[2], x[3] */ 00348 x2 = *__SIMD32(px); 00349 00350 /* Read x[3], x[4] */ 00351 x3 = _SIMD32_OFFSET(px+1); 00352 00353 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ 00354 acc2 = __SMLADX(x2, c0, acc2); 00355 00356 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ 00357 acc3 = __SMLADX(x3, c0, acc3); 00358 00359 /* Read y[srcBLen - 3] and y[srcBLen - 4] */ 00360 c0 = *__SIMD32(py)--; 00361 00362 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ 00363 acc0 = __SMLADX(x2, c0, acc0); 00364 00365 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ 00366 acc1 = __SMLADX(x3, c0, acc1); 00367 00368 /* Read x[4], x[5] */ 00369 x0 = _SIMD32_OFFSET(px+2); 00370 00371 /* Read x[5], x[6] */ 00372 x1 = _SIMD32_OFFSET(px+3); 00373 px += 4u; 00374 00375 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ 00376 acc2 = __SMLADX(x0, c0, acc2); 00377 00378 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ 00379 acc3 = __SMLADX(x1, c0, acc3); 00380 00381 } while(--k); 00382 00383 /* For the next MAC operations, SIMD is not used 00384 * So, the 16 bit pointer if inputB, py is updated */ 00385 00386 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00387 ** No loop unrolling is used. 
*/ 00388 k = srcBLen % 0x4u; 00389 00390 if(k == 1u) 00391 { 00392 /* Read y[srcBLen - 5] */ 00393 c0 = *(py+1); 00394 #ifdef ARM_MATH_BIG_ENDIAN 00395 00396 c0 = c0 << 16u; 00397 00398 #else 00399 00400 c0 = c0 & 0x0000FFFF; 00401 00402 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 00403 00404 /* Read x[7] */ 00405 x3 = *__SIMD32(px); 00406 px++; 00407 00408 /* Perform the multiply-accumulates */ 00409 acc0 = __SMLAD(x0, c0, acc0); 00410 acc1 = __SMLAD(x1, c0, acc1); 00411 acc2 = __SMLADX(x1, c0, acc2); 00412 acc3 = __SMLADX(x3, c0, acc3); 00413 } 00414 00415 if(k == 2u) 00416 { 00417 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 00418 c0 = _SIMD32_OFFSET(py); 00419 00420 /* Read x[7], x[8] */ 00421 x3 = *__SIMD32(px); 00422 00423 /* Read x[9] */ 00424 x2 = _SIMD32_OFFSET(px+1); 00425 px += 2u; 00426 00427 /* Perform the multiply-accumulates */ 00428 acc0 = __SMLADX(x0, c0, acc0); 00429 acc1 = __SMLADX(x1, c0, acc1); 00430 acc2 = __SMLADX(x3, c0, acc2); 00431 acc3 = __SMLADX(x2, c0, acc3); 00432 } 00433 00434 if(k == 3u) 00435 { 00436 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 00437 c0 = _SIMD32_OFFSET(py); 00438 00439 /* Read x[7], x[8] */ 00440 x3 = *__SIMD32(px); 00441 00442 /* Read x[9] */ 00443 x2 = _SIMD32_OFFSET(px+1); 00444 00445 /* Perform the multiply-accumulates */ 00446 acc0 = __SMLADX(x0, c0, acc0); 00447 acc1 = __SMLADX(x1, c0, acc1); 00448 acc2 = __SMLADX(x3, c0, acc2); 00449 acc3 = __SMLADX(x2, c0, acc3); 00450 00451 c0 = *(py-1); 00452 #ifdef ARM_MATH_BIG_ENDIAN 00453 00454 c0 = c0 << 16u; 00455 #else 00456 00457 c0 = c0 & 0x0000FFFF; 00458 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 00459 00460 /* Read x[10] */ 00461 x3 = _SIMD32_OFFSET(px+2); 00462 px += 3u; 00463 00464 /* Perform the multiply-accumulates */ 00465 acc0 = __SMLADX(x1, c0, acc0); 00466 acc1 = __SMLAD(x2, c0, acc1); 00467 acc2 = __SMLADX(x2, c0, acc2); 00468 acc3 = __SMLADX(x3, c0, acc3); 00469 } 00470 00471 /* Store the results in the accumulators in the destination buffer. 
*/ 00472 #ifndef ARM_MATH_BIG_ENDIAN 00473 00474 *__SIMD32(pOut)++ = __PKHBT(acc0 >> 15, acc1 >> 15, 16); 00475 *__SIMD32(pOut)++ = __PKHBT(acc2 >> 15, acc3 >> 15, 16); 00476 00477 #else 00478 00479 *__SIMD32(pOut)++ = __PKHBT(acc1 >> 15, acc0 >> 15, 16); 00480 *__SIMD32(pOut)++ = __PKHBT(acc3 >> 15, acc2 >> 15, 16); 00481 00482 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00483 00484 /* Increment the pointer pIn1 index, count by 4 */ 00485 count += 4u; 00486 00487 /* Update the inputA and inputB pointers for next MAC calculation */ 00488 px = pIn1 + count; 00489 py = pSrc2; 00490 00491 /* Decrement the loop counter */ 00492 blkCnt--; 00493 } 00494 00495 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00496 ** No loop unrolling is used. */ 00497 blkCnt = (uint32_t) blockSize2 % 0x4u; 00498 00499 while(blkCnt > 0u) 00500 { 00501 /* Accumulator is made zero for every iteration */ 00502 sum = 0; 00503 00504 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00505 k = srcBLen >> 2u; 00506 00507 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00508 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00509 while(k > 0u) 00510 { 00511 /* Perform the multiply-accumulates */ 00512 sum += ((q31_t) * px++ * *py--); 00513 sum += ((q31_t) * px++ * *py--); 00514 sum += ((q31_t) * px++ * *py--); 00515 sum += ((q31_t) * px++ * *py--); 00516 00517 /* Decrement the loop counter */ 00518 k--; 00519 } 00520 00521 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00522 ** No loop unrolling is used. */ 00523 k = srcBLen % 0x4u; 00524 00525 while(k > 0u) 00526 { 00527 /* Perform the multiply-accumulates */ 00528 sum += ((q31_t) * px++ * *py--); 00529 00530 /* Decrement the loop counter */ 00531 k--; 00532 } 00533 00534 /* Store the result in the accumulator in the destination buffer. 
*/ 00535 *pOut++ = (q15_t) (sum >> 15); 00536 00537 /* Increment the pointer pIn1 index, count by 1 */ 00538 count++; 00539 00540 /* Update the inputA and inputB pointers for next MAC calculation */ 00541 px = pIn1 + count; 00542 py = pSrc2; 00543 00544 /* Decrement the loop counter */ 00545 blkCnt--; 00546 } 00547 } 00548 else 00549 { 00550 /* If the srcBLen is not a multiple of 4, 00551 * the blockSize2 loop cannot be unrolled by 4 */ 00552 blkCnt = (uint32_t) blockSize2; 00553 00554 while(blkCnt > 0u) 00555 { 00556 /* Accumulator is made zero for every iteration */ 00557 sum = 0; 00558 00559 /* srcBLen number of MACS should be performed */ 00560 k = srcBLen; 00561 00562 while(k > 0u) 00563 { 00564 /* Perform the multiply-accumulate */ 00565 sum += ((q31_t) * px++ * *py--); 00566 00567 /* Decrement the loop counter */ 00568 k--; 00569 } 00570 00571 /* Store the result in the accumulator in the destination buffer. */ 00572 *pOut++ = (q15_t) (sum >> 15); 00573 00574 /* Increment the MAC count */ 00575 count++; 00576 00577 /* Update the inputA and inputB pointers for next MAC calculation */ 00578 px = pIn1 + count; 00579 py = pSrc2; 00580 00581 /* Decrement the loop counter */ 00582 blkCnt--; 00583 } 00584 } 00585 00586 00587 /* -------------------------- 00588 * Initializations of stage3 00589 * -------------------------*/ 00590 00591 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00592 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00593 * .... 00594 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00595 * sum += x[srcALen-1] * y[srcBLen-1] 00596 */ 00597 00598 /* In this stage the MAC operations are decreased by 1 for every iteration. 
00599 The count variable holds the number of MAC operations performed */ 00600 count = srcBLen - 1u; 00601 00602 /* Working pointer of inputA */ 00603 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00604 px = pSrc1; 00605 00606 /* Working pointer of inputB */ 00607 pSrc2 = pIn2 + (srcBLen - 1u); 00608 pIn2 = pSrc2 - 1u; 00609 py = pIn2; 00610 00611 /* ------------------- 00612 * Stage3 process 00613 * ------------------*/ 00614 00615 /* For loop unrolling by 4, this stage is divided into two. */ 00616 /* First part of this stage computes the MAC operations greater than 4 */ 00617 /* Second part of this stage computes the MAC operations less than or equal to 4 */ 00618 00619 /* The first part of the stage starts here */ 00620 j = count >> 2u; 00621 00622 while((j > 0u) && (blockSize3 > 0)) 00623 { 00624 /* Accumulator is made zero for every iteration */ 00625 sum = 0; 00626 00627 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00628 k = count >> 2u; 00629 00630 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00631 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00632 while(k > 0u) 00633 { 00634 /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied 00635 * with y[srcBLen - 1], y[srcBLen - 2] respectively */ 00636 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00637 /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied 00638 * with y[srcBLen - 3], y[srcBLen - 4] respectively */ 00639 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00640 00641 /* Decrement the loop counter */ 00642 k--; 00643 } 00644 00645 /* For the next MAC operations, the pointer py is used without SIMD 00646 * So, py is incremented by 1 */ 00647 py = py + 1u; 00648 00649 /* If the count is not a multiple of 4, compute any remaining MACs here. 00650 ** No loop unrolling is used. 
*/ 00651 k = count % 0x4u; 00652 00653 while(k > 0u) 00654 { 00655 /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */ 00656 sum = __SMLAD(*px++, *py--, sum); 00657 00658 /* Decrement the loop counter */ 00659 k--; 00660 } 00661 00662 /* Store the result in the accumulator in the destination buffer. */ 00663 *pOut++ = (q15_t) (sum >> 15); 00664 00665 /* Update the inputA and inputB pointers for next MAC calculation */ 00666 px = ++pSrc1; 00667 py = pIn2; 00668 00669 /* Decrement the MAC count */ 00670 count--; 00671 00672 /* Decrement the loop counter */ 00673 blockSize3--; 00674 00675 j--; 00676 } 00677 00678 /* The second part of the stage starts here */ 00679 /* SIMD is not used for the next MAC operations, 00680 * so pointer py is updated to read only one sample at a time */ 00681 py = py + 1u; 00682 00683 while(blockSize3 > 0) 00684 { 00685 /* Accumulator is made zero for every iteration */ 00686 sum = 0; 00687 00688 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00689 k = count; 00690 00691 while(k > 0u) 00692 { 00693 /* Perform the multiply-accumulates */ 00694 /* sum += x[srcALen-1] * y[srcBLen-1] */ 00695 sum = __SMLAD(*px++, *py--, sum); 00696 00697 /* Decrement the loop counter */ 00698 k--; 00699 } 00700 00701 /* Store the result in the accumulator in the destination buffer. 
*/ 00702 *pOut++ = (q15_t) (sum >> 15); 00703 00704 /* Update the inputA and inputB pointers for next MAC calculation */ 00705 px = ++pSrc1; 00706 py = pSrc2; 00707 00708 /* Decrement the MAC count */ 00709 count--; 00710 00711 /* Decrement the loop counter */ 00712 blockSize3--; 00713 } 00714 00715 /* set status as ARM_MATH_SUCCESS */ 00716 status = ARM_MATH_SUCCESS; 00717 } 00718 00719 /* Return to application */ 00720 return (status); 00721 00722 #else 00723 00724 q15_t *pIn1; /* inputA pointer */ 00725 q15_t *pIn2; /* inputB pointer */ 00726 q15_t *pOut = pDst; /* output pointer */ 00727 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00728 q15_t *px; /* Intermediate inputA pointer */ 00729 q15_t *py; /* Intermediate inputB pointer */ 00730 q15_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00731 q31_t x0, x1, x2, x3, c0; 00732 uint32_t j, k, count, check, blkCnt; 00733 int32_t blockSize1, blockSize2, blockSize3; /* loop counters */ 00734 arm_status status; /* status of Partial convolution */ 00735 q15_t a, b; 00736 00737 /* Check for range of output samples to be calculated */ 00738 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00739 { 00740 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00741 status = ARM_MATH_ARGUMENT_ERROR; 00742 } 00743 else 00744 { 00745 00746 /* The algorithm implementation is based on the lengths of the inputs. */ 00747 /* srcB is always made to slide across srcA. 
*/ 00748 /* So srcBLen is always considered as shorter or equal to srcALen */ 00749 if(srcALen >=srcBLen) 00750 { 00751 /* Initialization of inputA pointer */ 00752 pIn1 = pSrcA; 00753 00754 /* Initialization of inputB pointer */ 00755 pIn2 = pSrcB; 00756 } 00757 else 00758 { 00759 /* Initialization of inputA pointer */ 00760 pIn1 = pSrcB; 00761 00762 /* Initialization of inputB pointer */ 00763 pIn2 = pSrcA; 00764 00765 /* srcBLen is always considered as shorter or equal to srcALen */ 00766 j = srcBLen; 00767 srcBLen = srcALen; 00768 srcALen = j; 00769 } 00770 00771 /* Conditions to check which loopCounter holds 00772 * the first and last indices of the output samples to be calculated. */ 00773 check = firstIndex + numPoints; 00774 blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0; 00775 blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3; 00776 blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex; 00777 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 : 00778 (int32_t) numPoints) : 0; 00779 blockSize2 = ((int32_t) check - blockSize3) - 00780 (blockSize1 + (int32_t) firstIndex); 00781 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; 00782 00783 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00784 /* The function is internally 00785 * divided into three stages according to the number of multiplications that has to be 00786 * taken place between inputA samples and inputB samples. In the first stage of the 00787 * algorithm, the multiplications increase by one for every iteration. 00788 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00789 * In the third stage of the algorithm, the multiplications decrease by one 00790 * for every iteration. 
*/ 00791 00792 /* Set the output pointer to point to the firstIndex 00793 * of the output sample to be calculated. */ 00794 pOut = pDst + firstIndex; 00795 00796 /* -------------------------- 00797 * Initializations of stage1 00798 * -------------------------*/ 00799 00800 /* sum = x[0] * y[0] 00801 * sum = x[0] * y[1] + x[1] * y[0] 00802 * .... 00803 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00804 */ 00805 00806 /* In this stage the MAC operations are increased by 1 for every iteration. 00807 The count variable holds the number of MAC operations performed. 00808 Since the partial convolution starts from firstIndex 00809 Number of Macs to be performed is firstIndex + 1 */ 00810 count = 1u + firstIndex; 00811 00812 /* Working pointer of inputA */ 00813 px = pIn1; 00814 00815 /* Working pointer of inputB */ 00816 pSrc2 = pIn2 + firstIndex; 00817 py = pSrc2; 00818 00819 /* ------------------------ 00820 * Stage1 process 00821 * ----------------------*/ 00822 00823 /* For loop unrolling by 4, this stage is divided into two. */ 00824 /* First part of this stage computes the MAC operations less than 4 */ 00825 /* Second part of this stage computes the MAC operations greater than or equal to 4 */ 00826 00827 /* The first part of the stage starts here */ 00828 while((count < 4u) && (blockSize1 > 0)) 00829 { 00830 /* Accumulator is made zero for every iteration */ 00831 sum = 0; 00832 00833 /* Loop over number of MAC operations between 00834 * inputA samples and inputB samples */ 00835 k = count; 00836 00837 while(k > 0u) 00838 { 00839 /* Perform the multiply-accumulates */ 00840 sum += ((q31_t) * px++ * *py--); 00841 00842 /* Decrement the loop counter */ 00843 k--; 00844 } 00845 00846 /* Store the result in the accumulator in the destination buffer. 
*/ 00847 *pOut++ = (q15_t) (sum >> 15); 00848 00849 /* Update the inputA and inputB pointers for next MAC calculation */ 00850 py = ++pSrc2; 00851 px = pIn1; 00852 00853 /* Increment the MAC count */ 00854 count++; 00855 00856 /* Decrement the loop counter */ 00857 blockSize1--; 00858 } 00859 00860 /* The second part of the stage starts here */ 00861 /* The internal loop, over count, is unrolled by 4 */ 00862 /* To, read the last two inputB samples using SIMD: 00863 * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ 00864 py = py - 1; 00865 00866 while(blockSize1 > 0) 00867 { 00868 /* Accumulator is made zero for every iteration */ 00869 sum = 0; 00870 00871 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00872 k = count >> 2u; 00873 00874 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00875 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00876 py++; 00877 00878 while(k > 0u) 00879 { 00880 /* Perform the multiply-accumulates */ 00881 sum += ((q31_t) * px++ * *py--); 00882 sum += ((q31_t) * px++ * *py--); 00883 sum += ((q31_t) * px++ * *py--); 00884 sum += ((q31_t) * px++ * *py--); 00885 00886 /* Decrement the loop counter */ 00887 k--; 00888 } 00889 00890 /* If the count is not a multiple of 4, compute any remaining MACs here. 00891 ** No loop unrolling is used. */ 00892 k = count % 0x4u; 00893 00894 while(k > 0u) 00895 { 00896 /* Perform the multiply-accumulates */ 00897 sum += ((q31_t) * px++ * *py--); 00898 00899 /* Decrement the loop counter */ 00900 k--; 00901 } 00902 00903 /* Store the result in the accumulator in the destination buffer. 
*/ 00904 *pOut++ = (q15_t) (sum >> 15); 00905 00906 /* Update the inputA and inputB pointers for next MAC calculation */ 00907 py = ++pSrc2 - 1u; 00908 px = pIn1; 00909 00910 /* Increment the MAC count */ 00911 count++; 00912 00913 /* Decrement the loop counter */ 00914 blockSize1--; 00915 } 00916 00917 /* -------------------------- 00918 * Initializations of stage2 00919 * ------------------------*/ 00920 00921 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00922 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00923 * .... 00924 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00925 */ 00926 00927 /* Working pointer of inputA */ 00928 if((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) 00929 { 00930 px = pIn1 + firstIndex - srcBLen + 1; 00931 } 00932 else 00933 { 00934 px = pIn1; 00935 } 00936 00937 /* Working pointer of inputB */ 00938 pSrc2 = pIn2 + (srcBLen - 1u); 00939 py = pSrc2; 00940 00941 /* count is the index by which the pointer pIn1 to be incremented */ 00942 count = 0u; 00943 00944 00945 /* -------------------- 00946 * Stage2 process 00947 * -------------------*/ 00948 00949 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 
00950 * So, to loop unroll over blockSize2, 00951 * srcBLen should be greater than or equal to 4 */ 00952 if(srcBLen >= 4u) 00953 { 00954 /* Loop unroll over blockSize2, by 4 */ 00955 blkCnt = ((uint32_t) blockSize2 >> 2u); 00956 00957 while(blkCnt > 0u) 00958 { 00959 py = py - 1u; 00960 00961 /* Set all accumulators to zero */ 00962 acc0 = 0; 00963 acc1 = 0; 00964 acc2 = 0; 00965 acc3 = 0; 00966 00967 /* read x[0], x[1] samples */ 00968 a = *px++; 00969 b = *px++; 00970 00971 #ifndef ARM_MATH_BIG_ENDIAN 00972 00973 x0 = __PKHBT(a, b, 16); 00974 a = *px; 00975 x1 = __PKHBT(b, a, 16); 00976 00977 #else 00978 00979 x0 = __PKHBT(b, a, 16); 00980 a = *px; 00981 x1 = __PKHBT(a, b, 16); 00982 00983 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00984 00985 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00986 k = srcBLen >> 2u; 00987 00988 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00989 ** a second loop below computes MACs for the remaining 1 to 3 samples. 
*/ 00990 do 00991 { 00992 /* Read the last two inputB samples using SIMD: 00993 * y[srcBLen - 1] and y[srcBLen - 2] */ 00994 a = *py; 00995 b = *(py+1); 00996 py -= 2; 00997 00998 #ifndef ARM_MATH_BIG_ENDIAN 00999 01000 c0 = __PKHBT(a, b, 16); 01001 01002 #else 01003 01004 c0 = __PKHBT(b, a, 16);; 01005 01006 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01007 01008 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ 01009 acc0 = __SMLADX(x0, c0, acc0); 01010 01011 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ 01012 acc1 = __SMLADX(x1, c0, acc1); 01013 01014 a = *px; 01015 b = *(px + 1); 01016 01017 #ifndef ARM_MATH_BIG_ENDIAN 01018 01019 x2 = __PKHBT(a, b, 16); 01020 a = *(px + 2); 01021 x3 = __PKHBT(b, a, 16); 01022 01023 #else 01024 01025 x2 = __PKHBT(b, a, 16); 01026 a = *(px + 2); 01027 x3 = __PKHBT(a, b, 16); 01028 01029 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01030 01031 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ 01032 acc2 = __SMLADX(x2, c0, acc2); 01033 01034 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ 01035 acc3 = __SMLADX(x3, c0, acc3); 01036 01037 /* Read y[srcBLen - 3] and y[srcBLen - 4] */ 01038 a = *py; 01039 b = *(py+1); 01040 py -= 2; 01041 01042 #ifndef ARM_MATH_BIG_ENDIAN 01043 01044 c0 = __PKHBT(a, b, 16); 01045 01046 #else 01047 01048 c0 = __PKHBT(b, a, 16);; 01049 01050 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01051 01052 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ 01053 acc0 = __SMLADX(x2, c0, acc0); 01054 01055 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ 01056 acc1 = __SMLADX(x3, c0, acc1); 01057 01058 /* Read x[4], x[5], x[6] */ 01059 a = *(px + 2); 01060 b = *(px + 3); 01061 01062 #ifndef ARM_MATH_BIG_ENDIAN 01063 01064 x0 = __PKHBT(a, b, 16); 01065 a = *(px + 4); 01066 x1 = __PKHBT(b, a, 16); 01067 01068 #else 01069 01070 x0 = __PKHBT(b, a, 16); 01071 a = *(px + 4); 01072 x1 = __PKHBT(a, b, 16); 01073 01074 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01075 
01076 px += 4u; 01077 01078 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ 01079 acc2 = __SMLADX(x0, c0, acc2); 01080 01081 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ 01082 acc3 = __SMLADX(x1, c0, acc3); 01083 01084 } while(--k); 01085 01086 /* For the next MAC operations, SIMD is not used 01087 * So, the 16 bit pointer if inputB, py is updated */ 01088 01089 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 01090 ** No loop unrolling is used. */ 01091 k = srcBLen % 0x4u; 01092 01093 if(k == 1u) 01094 { 01095 /* Read y[srcBLen - 5] */ 01096 c0 = *(py+1); 01097 01098 #ifdef ARM_MATH_BIG_ENDIAN 01099 01100 c0 = c0 << 16u; 01101 01102 #else 01103 01104 c0 = c0 & 0x0000FFFF; 01105 01106 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 01107 01108 /* Read x[7] */ 01109 a = *px; 01110 b = *(px+1); 01111 px++; 01112 01113 #ifndef ARM_MATH_BIG_ENDIAN 01114 01115 x3 = __PKHBT(a, b, 16); 01116 01117 #else 01118 01119 x3 = __PKHBT(b, a, 16);; 01120 01121 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01122 01123 01124 /* Perform the multiply-accumulates */ 01125 acc0 = __SMLAD(x0, c0, acc0); 01126 acc1 = __SMLAD(x1, c0, acc1); 01127 acc2 = __SMLADX(x1, c0, acc2); 01128 acc3 = __SMLADX(x3, c0, acc3); 01129 } 01130 01131 if(k == 2u) 01132 { 01133 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 01134 a = *py; 01135 b = *(py+1); 01136 01137 #ifndef ARM_MATH_BIG_ENDIAN 01138 01139 c0 = __PKHBT(a, b, 16); 01140 01141 #else 01142 01143 c0 = __PKHBT(b, a, 16);; 01144 01145 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01146 01147 /* Read x[7], x[8], x[9] */ 01148 a = *px; 01149 b = *(px + 1); 01150 01151 #ifndef ARM_MATH_BIG_ENDIAN 01152 01153 x3 = __PKHBT(a, b, 16); 01154 a = *(px + 2); 01155 x2 = __PKHBT(b, a, 16); 01156 01157 #else 01158 01159 x3 = __PKHBT(b, a, 16); 01160 a = *(px + 2); 01161 x2 = __PKHBT(a, b, 16); 01162 01163 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01164 px += 2u; 01165 01166 /* Perform the multiply-accumulates */ 01167 acc0 = 
__SMLADX(x0, c0, acc0); 01168 acc1 = __SMLADX(x1, c0, acc1); 01169 acc2 = __SMLADX(x3, c0, acc2); 01170 acc3 = __SMLADX(x2, c0, acc3); 01171 } 01172 01173 if(k == 3u) 01174 { 01175 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 01176 a = *py; 01177 b = *(py+1); 01178 01179 #ifndef ARM_MATH_BIG_ENDIAN 01180 01181 c0 = __PKHBT(a, b, 16); 01182 01183 #else 01184 01185 c0 = __PKHBT(b, a, 16);; 01186 01187 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01188 01189 /* Read x[7], x[8], x[9] */ 01190 a = *px; 01191 b = *(px + 1); 01192 01193 #ifndef ARM_MATH_BIG_ENDIAN 01194 01195 x3 = __PKHBT(a, b, 16); 01196 a = *(px + 2); 01197 x2 = __PKHBT(b, a, 16); 01198 01199 #else 01200 01201 x3 = __PKHBT(b, a, 16); 01202 a = *(px + 2); 01203 x2 = __PKHBT(a, b, 16); 01204 01205 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01206 01207 /* Perform the multiply-accumulates */ 01208 acc0 = __SMLADX(x0, c0, acc0); 01209 acc1 = __SMLADX(x1, c0, acc1); 01210 acc2 = __SMLADX(x3, c0, acc2); 01211 acc3 = __SMLADX(x2, c0, acc3); 01212 01213 /* Read y[srcBLen - 7] */ 01214 c0 = *(py-1); 01215 #ifdef ARM_MATH_BIG_ENDIAN 01216 01217 c0 = c0 << 16u; 01218 #else 01219 01220 c0 = c0 & 0x0000FFFF; 01221 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 01222 01223 /* Read x[10] */ 01224 a = *(px+2); 01225 b = *(px+3); 01226 01227 #ifndef ARM_MATH_BIG_ENDIAN 01228 01229 x3 = __PKHBT(a, b, 16); 01230 01231 #else 01232 01233 x3 = __PKHBT(b, a, 16);; 01234 01235 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01236 01237 px += 3u; 01238 01239 /* Perform the multiply-accumulates */ 01240 acc0 = __SMLADX(x1, c0, acc0); 01241 acc1 = __SMLAD(x2, c0, acc1); 01242 acc2 = __SMLADX(x2, c0, acc2); 01243 acc3 = __SMLADX(x3, c0, acc3); 01244 } 01245 01246 /* Store the results in the accumulators in the destination buffer. 
*/ 01247 *pOut++ = (q15_t)(acc0 >> 15); 01248 *pOut++ = (q15_t)(acc1 >> 15); 01249 *pOut++ = (q15_t)(acc2 >> 15); 01250 *pOut++ = (q15_t)(acc3 >> 15); 01251 01252 /* Increment the pointer pIn1 index, count by 4 */ 01253 count += 4u; 01254 01255 /* Update the inputA and inputB pointers for next MAC calculation */ 01256 px = pIn1 + count; 01257 py = pSrc2; 01258 01259 /* Decrement the loop counter */ 01260 blkCnt--; 01261 } 01262 01263 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 01264 ** No loop unrolling is used. */ 01265 blkCnt = (uint32_t) blockSize2 % 0x4u; 01266 01267 while(blkCnt > 0u) 01268 { 01269 /* Accumulator is made zero for every iteration */ 01270 sum = 0; 01271 01272 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 01273 k = srcBLen >> 2u; 01274 01275 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 01276 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 01277 while(k > 0u) 01278 { 01279 /* Perform the multiply-accumulates */ 01280 sum += ((q31_t) * px++ * *py--); 01281 sum += ((q31_t) * px++ * *py--); 01282 sum += ((q31_t) * px++ * *py--); 01283 sum += ((q31_t) * px++ * *py--); 01284 01285 /* Decrement the loop counter */ 01286 k--; 01287 } 01288 01289 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 01290 ** No loop unrolling is used. */ 01291 k = srcBLen % 0x4u; 01292 01293 while(k > 0u) 01294 { 01295 /* Perform the multiply-accumulates */ 01296 sum += ((q31_t) * px++ * *py--); 01297 01298 /* Decrement the loop counter */ 01299 k--; 01300 } 01301 01302 /* Store the result in the accumulator in the destination buffer. 
*/ 01303 *pOut++ = (q15_t) (sum >> 15); 01304 01305 /* Increment the pointer pIn1 index, count by 1 */ 01306 count++; 01307 01308 /* Update the inputA and inputB pointers for next MAC calculation */ 01309 px = pIn1 + count; 01310 py = pSrc2; 01311 01312 /* Decrement the loop counter */ 01313 blkCnt--; 01314 } 01315 } 01316 else 01317 { 01318 /* If the srcBLen is not a multiple of 4, 01319 * the blockSize2 loop cannot be unrolled by 4 */ 01320 blkCnt = (uint32_t) blockSize2; 01321 01322 while(blkCnt > 0u) 01323 { 01324 /* Accumulator is made zero for every iteration */ 01325 sum = 0; 01326 01327 /* srcBLen number of MACS should be performed */ 01328 k = srcBLen; 01329 01330 while(k > 0u) 01331 { 01332 /* Perform the multiply-accumulate */ 01333 sum += ((q31_t) * px++ * *py--); 01334 01335 /* Decrement the loop counter */ 01336 k--; 01337 } 01338 01339 /* Store the result in the accumulator in the destination buffer. */ 01340 *pOut++ = (q15_t) (sum >> 15); 01341 01342 /* Increment the MAC count */ 01343 count++; 01344 01345 /* Update the inputA and inputB pointers for next MAC calculation */ 01346 px = pIn1 + count; 01347 py = pSrc2; 01348 01349 /* Decrement the loop counter */ 01350 blkCnt--; 01351 } 01352 } 01353 01354 01355 /* -------------------------- 01356 * Initializations of stage3 01357 * -------------------------*/ 01358 01359 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 01360 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 01361 * .... 01362 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 01363 * sum += x[srcALen-1] * y[srcBLen-1] 01364 */ 01365 01366 /* In this stage the MAC operations are decreased by 1 for every iteration. 
01367 The count variable holds the number of MAC operations performed */ 01368 count = srcBLen - 1u; 01369 01370 /* Working pointer of inputA */ 01371 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 01372 px = pSrc1; 01373 01374 /* Working pointer of inputB */ 01375 pSrc2 = pIn2 + (srcBLen - 1u); 01376 pIn2 = pSrc2 - 1u; 01377 py = pIn2; 01378 01379 /* ------------------- 01380 * Stage3 process 01381 * ------------------*/ 01382 01383 /* For loop unrolling by 4, this stage is divided into two. */ 01384 /* First part of this stage computes the MAC operations greater than 4 */ 01385 /* Second part of this stage computes the MAC operations less than or equal to 4 */ 01386 01387 /* The first part of the stage starts here */ 01388 j = count >> 2u; 01389 01390 while((j > 0u) && (blockSize3 > 0)) 01391 { 01392 /* Accumulator is made zero for every iteration */ 01393 sum = 0; 01394 01395 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 01396 k = count >> 2u; 01397 01398 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 01399 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 01400 py++; 01401 01402 while(k > 0u) 01403 { 01404 /* Perform the multiply-accumulates */ 01405 sum += ((q31_t) * px++ * *py--); 01406 sum += ((q31_t) * px++ * *py--); 01407 sum += ((q31_t) * px++ * *py--); 01408 sum += ((q31_t) * px++ * *py--); 01409 /* Decrement the loop counter */ 01410 k--; 01411 } 01412 01413 01414 /* If the count is not a multiple of 4, compute any remaining MACs here. 01415 ** No loop unrolling is used. */ 01416 k = count % 0x4u; 01417 01418 while(k > 0u) 01419 { 01420 /* Perform the multiply-accumulates */ 01421 sum += ((q31_t) * px++ * *py--); 01422 01423 /* Decrement the loop counter */ 01424 k--; 01425 } 01426 01427 /* Store the result in the accumulator in the destination buffer. 
*/ 01428 *pOut++ = (q15_t) (sum >> 15); 01429 01430 /* Update the inputA and inputB pointers for next MAC calculation */ 01431 px = ++pSrc1; 01432 py = pIn2; 01433 01434 /* Decrement the MAC count */ 01435 count--; 01436 01437 /* Decrement the loop counter */ 01438 blockSize3--; 01439 01440 j--; 01441 } 01442 01443 /* The second part of the stage starts here */ 01444 /* SIMD is not used for the next MAC operations, 01445 * so pointer py is updated to read only one sample at a time */ 01446 py = py + 1u; 01447 01448 while(blockSize3 > 0) 01449 { 01450 /* Accumulator is made zero for every iteration */ 01451 sum = 0; 01452 01453 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 01454 k = count; 01455 01456 while(k > 0u) 01457 { 01458 /* Perform the multiply-accumulates */ 01459 /* sum += x[srcALen-1] * y[srcBLen-1] */ 01460 sum += ((q31_t) * px++ * *py--); 01461 01462 /* Decrement the loop counter */ 01463 k--; 01464 } 01465 01466 /* Store the result in the accumulator in the destination buffer. */ 01467 *pOut++ = (q15_t) (sum >> 15); 01468 01469 /* Update the inputA and inputB pointers for next MAC calculation */ 01470 px = ++pSrc1; 01471 py = pSrc2; 01472 01473 /* Decrement the MAC count */ 01474 count--; 01475 01476 /* Decrement the loop counter */ 01477 blockSize3--; 01478 } 01479 01480 /* set status as ARM_MATH_SUCCESS */ 01481 status = ARM_MATH_SUCCESS; 01482 } 01483 01484 /* Return to application */ 01485 return (status); 01486 01487 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 01488 } 01489 01490 /** 01491 * @} end of PartialConv group 01492 */
Generated on Tue Jul 12 2022 11:59:16 by Doxygen 1.7.2