CMSIS DSP library
Dependents: KL25Z_FFT_Demo Hat_Board_v5_1 KL25Z_FFT_Demo_tony KL25Z_FFT_Demo_tony ... more
Fork of mbed-dsp by
arm_conv_partial_fast_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 17. January 2013 00005 * $Revision: V1.4.1 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_partial_fast_q15.c 00009 * 00010 * Description: Fast Q15 Partial convolution. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupFilters 00045 */ 00046 00047 /** 00048 * @addtogroup PartialConv 00049 * @{ 00050 */ 00051 00052 /** 00053 * @brief Partial convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4. 00054 * @param[in] *pSrcA points to the first input sequence. 00055 * @param[in] srcALen length of the first input sequence. 00056 * @param[in] *pSrcB points to the second input sequence. 00057 * @param[in] srcBLen length of the second input sequence. 00058 * @param[out] *pDst points to the location where the output result is written. 00059 * @param[in] firstIndex is the first output sample to start with. 00060 * @param[in] numPoints is the number of output points to be computed. 00061 * @return Returns ARM_MATH_SUCCESS if the function completed correctly, or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0, srcALen+srcBLen-2]. 00062 * 00063 * See <code>arm_conv_partial_q15()</code> for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap-around distortion. 
00064 */ 00065 00066 00067 arm_status arm_conv_partial_fast_q15( 00068 q15_t * pSrcA, 00069 uint32_t srcALen, 00070 q15_t * pSrcB, 00071 uint32_t srcBLen, 00072 q15_t * pDst, 00073 uint32_t firstIndex, 00074 uint32_t numPoints) 00075 { 00076 #ifndef UNALIGNED_SUPPORT_DISABLE 00077 00078 q15_t *pIn1; /* inputA pointer */ 00079 q15_t *pIn2; /* inputB pointer */ 00080 q15_t *pOut = pDst; /* output pointer */ 00081 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00082 q15_t *px; /* Intermediate inputA pointer */ 00083 q15_t *py; /* Intermediate inputB pointer */ 00084 q15_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00085 q31_t x0, x1, x2, x3, c0; 00086 uint32_t j, k, count, check, blkCnt; 00087 int32_t blockSize1, blockSize2, blockSize3; /* loop counters */ 00088 arm_status status; /* status of Partial convolution */ 00089 00090 /* Check for range of output samples to be calculated */ 00091 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00092 { 00093 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00094 status = ARM_MATH_ARGUMENT_ERROR; 00095 } 00096 else 00097 { 00098 00099 /* The algorithm implementation is based on the lengths of the inputs. */ 00100 /* srcB is always made to slide across srcA. */ 00101 /* So srcBLen is always considered as shorter or equal to srcALen */ 00102 if(srcALen >=srcBLen) 00103 { 00104 /* Initialization of inputA pointer */ 00105 pIn1 = pSrcA; 00106 00107 /* Initialization of inputB pointer */ 00108 pIn2 = pSrcB; 00109 } 00110 else 00111 { 00112 /* Initialization of inputA pointer */ 00113 pIn1 = pSrcB; 00114 00115 /* Initialization of inputB pointer */ 00116 pIn2 = pSrcA; 00117 00118 /* srcBLen is always considered as shorter or equal to srcALen */ 00119 j = srcBLen; 00120 srcBLen = srcALen; 00121 srcALen = j; 00122 } 00123 00124 /* Conditions to check which loopCounter holds 00125 * the first and last indices of the output samples to be calculated. 
*/ 00126 check = firstIndex + numPoints; 00127 blockSize3 = ((int32_t) check - (int32_t) srcALen); 00128 blockSize3 = (blockSize3 > 0) ? blockSize3 : 0; 00129 blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); 00130 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 : 00131 (int32_t) numPoints) : 0; 00132 blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + 00133 (int32_t) firstIndex); 00134 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; 00135 00136 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00137 /* The function is internally 00138 * divided into three stages according to the number of multiplications that has to be 00139 * taken place between inputA samples and inputB samples. In the first stage of the 00140 * algorithm, the multiplications increase by one for every iteration. 00141 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00142 * In the third stage of the algorithm, the multiplications decrease by one 00143 * for every iteration. */ 00144 00145 /* Set the output pointer to point to the firstIndex 00146 * of the output sample to be calculated. */ 00147 pOut = pDst + firstIndex; 00148 00149 /* -------------------------- 00150 * Initializations of stage1 00151 * -------------------------*/ 00152 00153 /* sum = x[0] * y[0] 00154 * sum = x[0] * y[1] + x[1] * y[0] 00155 * .... 00156 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00157 */ 00158 00159 /* In this stage the MAC operations are increased by 1 for every iteration. 00160 The count variable holds the number of MAC operations performed. 
00161 Since the partial convolution starts from firstIndex 00162 Number of Macs to be performed is firstIndex + 1 */ 00163 count = 1u + firstIndex; 00164 00165 /* Working pointer of inputA */ 00166 px = pIn1; 00167 00168 /* Working pointer of inputB */ 00169 pSrc2 = pIn2 + firstIndex; 00170 py = pSrc2; 00171 00172 /* ------------------------ 00173 * Stage1 process 00174 * ----------------------*/ 00175 00176 /* For loop unrolling by 4, this stage is divided into two. */ 00177 /* First part of this stage computes the MAC operations less than 4 */ 00178 /* Second part of this stage computes the MAC operations greater than or equal to 4 */ 00179 00180 /* The first part of the stage starts here */ 00181 while((count < 4u) && (blockSize1 > 0)) 00182 { 00183 /* Accumulator is made zero for every iteration */ 00184 sum = 0; 00185 00186 /* Loop over number of MAC operations between 00187 * inputA samples and inputB samples */ 00188 k = count; 00189 00190 while(k > 0u) 00191 { 00192 /* Perform the multiply-accumulates */ 00193 sum = __SMLAD(*px++, *py--, sum); 00194 00195 /* Decrement the loop counter */ 00196 k--; 00197 } 00198 00199 /* Store the result in the accumulator in the destination buffer. */ 00200 *pOut++ = (q15_t) (sum >> 15); 00201 00202 /* Update the inputA and inputB pointers for next MAC calculation */ 00203 py = ++pSrc2; 00204 px = pIn1; 00205 00206 /* Increment the MAC count */ 00207 count++; 00208 00209 /* Decrement the loop counter */ 00210 blockSize1--; 00211 } 00212 00213 /* The second part of the stage starts here */ 00214 /* The internal loop, over count, is unrolled by 4 */ 00215 /* To, read the last two inputB samples using SIMD: 00216 * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ 00217 py = py - 1; 00218 00219 while(blockSize1 > 0) 00220 { 00221 /* Accumulator is made zero for every iteration */ 00222 sum = 0; 00223 00224 /* Apply loop unrolling and compute 4 MACs simultaneously. 
*/ 00225 k = count >> 2u; 00226 00227 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00228 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00229 while(k > 0u) 00230 { 00231 /* Perform the multiply-accumulates */ 00232 /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */ 00233 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00234 /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */ 00235 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00236 00237 /* Decrement the loop counter */ 00238 k--; 00239 } 00240 00241 /* For the next MAC operations, the pointer py is used without SIMD 00242 * So, py is incremented by 1 */ 00243 py = py + 1u; 00244 00245 /* If the count is not a multiple of 4, compute any remaining MACs here. 00246 ** No loop unrolling is used. */ 00247 k = count % 0x4u; 00248 00249 while(k > 0u) 00250 { 00251 /* Perform the multiply-accumulates */ 00252 sum = __SMLAD(*px++, *py--, sum); 00253 00254 /* Decrement the loop counter */ 00255 k--; 00256 } 00257 00258 /* Store the result in the accumulator in the destination buffer. */ 00259 *pOut++ = (q15_t) (sum >> 15); 00260 00261 /* Update the inputA and inputB pointers for next MAC calculation */ 00262 py = ++pSrc2 - 1u; 00263 px = pIn1; 00264 00265 /* Increment the MAC count */ 00266 count++; 00267 00268 /* Decrement the loop counter */ 00269 blockSize1--; 00270 } 00271 00272 /* -------------------------- 00273 * Initializations of stage2 00274 * ------------------------*/ 00275 00276 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00277 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00278 * .... 
00279 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00280 */ 00281 00282 /* Working pointer of inputA */ 00283 px = pIn1; 00284 00285 /* Working pointer of inputB */ 00286 pSrc2 = pIn2 + (srcBLen - 1u); 00287 py = pSrc2; 00288 00289 /* count is the index by which the pointer pIn1 to be incremented */ 00290 count = 0u; 00291 00292 00293 /* -------------------- 00294 * Stage2 process 00295 * -------------------*/ 00296 00297 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00298 * So, to loop unroll over blockSize2, 00299 * srcBLen should be greater than or equal to 4 */ 00300 if(srcBLen >= 4u) 00301 { 00302 /* Loop unroll over blockSize2, by 4 */ 00303 blkCnt = ((uint32_t) blockSize2 >> 2u); 00304 00305 while(blkCnt > 0u) 00306 { 00307 py = py - 1u; 00308 00309 /* Set all accumulators to zero */ 00310 acc0 = 0; 00311 acc1 = 0; 00312 acc2 = 0; 00313 acc3 = 0; 00314 00315 00316 /* read x[0], x[1] samples */ 00317 x0 = *__SIMD32(px); 00318 /* read x[1], x[2] samples */ 00319 x1 = _SIMD32_OFFSET(px+1); 00320 px+= 2u; 00321 00322 00323 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00324 k = srcBLen >> 2u; 00325 00326 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00327 ** a second loop below computes MACs for the remaining 1 to 3 samples. 
*/ 00328 do 00329 { 00330 /* Read the last two inputB samples using SIMD: 00331 * y[srcBLen - 1] and y[srcBLen - 2] */ 00332 c0 = *__SIMD32(py)--; 00333 00334 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ 00335 acc0 = __SMLADX(x0, c0, acc0); 00336 00337 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ 00338 acc1 = __SMLADX(x1, c0, acc1); 00339 00340 /* Read x[2], x[3] */ 00341 x2 = *__SIMD32(px); 00342 00343 /* Read x[3], x[4] */ 00344 x3 = _SIMD32_OFFSET(px+1); 00345 00346 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ 00347 acc2 = __SMLADX(x2, c0, acc2); 00348 00349 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ 00350 acc3 = __SMLADX(x3, c0, acc3); 00351 00352 /* Read y[srcBLen - 3] and y[srcBLen - 4] */ 00353 c0 = *__SIMD32(py)--; 00354 00355 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ 00356 acc0 = __SMLADX(x2, c0, acc0); 00357 00358 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ 00359 acc1 = __SMLADX(x3, c0, acc1); 00360 00361 /* Read x[4], x[5] */ 00362 x0 = _SIMD32_OFFSET(px+2); 00363 00364 /* Read x[5], x[6] */ 00365 x1 = _SIMD32_OFFSET(px+3); 00366 px += 4u; 00367 00368 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ 00369 acc2 = __SMLADX(x0, c0, acc2); 00370 00371 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ 00372 acc3 = __SMLADX(x1, c0, acc3); 00373 00374 } while(--k); 00375 00376 /* For the next MAC operations, SIMD is not used 00377 * So, the 16 bit pointer if inputB, py is updated */ 00378 00379 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00380 ** No loop unrolling is used. 
*/ 00381 k = srcBLen % 0x4u; 00382 00383 if(k == 1u) 00384 { 00385 /* Read y[srcBLen - 5] */ 00386 c0 = *(py+1); 00387 #ifdef ARM_MATH_BIG_ENDIAN 00388 00389 c0 = c0 << 16u; 00390 00391 #else 00392 00393 c0 = c0 & 0x0000FFFF; 00394 00395 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 00396 00397 /* Read x[7] */ 00398 x3 = *__SIMD32(px); 00399 px++; 00400 00401 /* Perform the multiply-accumulates */ 00402 acc0 = __SMLAD(x0, c0, acc0); 00403 acc1 = __SMLAD(x1, c0, acc1); 00404 acc2 = __SMLADX(x1, c0, acc2); 00405 acc3 = __SMLADX(x3, c0, acc3); 00406 } 00407 00408 if(k == 2u) 00409 { 00410 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 00411 c0 = _SIMD32_OFFSET(py); 00412 00413 /* Read x[7], x[8] */ 00414 x3 = *__SIMD32(px); 00415 00416 /* Read x[9] */ 00417 x2 = _SIMD32_OFFSET(px+1); 00418 px += 2u; 00419 00420 /* Perform the multiply-accumulates */ 00421 acc0 = __SMLADX(x0, c0, acc0); 00422 acc1 = __SMLADX(x1, c0, acc1); 00423 acc2 = __SMLADX(x3, c0, acc2); 00424 acc3 = __SMLADX(x2, c0, acc3); 00425 } 00426 00427 if(k == 3u) 00428 { 00429 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 00430 c0 = _SIMD32_OFFSET(py); 00431 00432 /* Read x[7], x[8] */ 00433 x3 = *__SIMD32(px); 00434 00435 /* Read x[9] */ 00436 x2 = _SIMD32_OFFSET(px+1); 00437 00438 /* Perform the multiply-accumulates */ 00439 acc0 = __SMLADX(x0, c0, acc0); 00440 acc1 = __SMLADX(x1, c0, acc1); 00441 acc2 = __SMLADX(x3, c0, acc2); 00442 acc3 = __SMLADX(x2, c0, acc3); 00443 00444 c0 = *(py-1); 00445 #ifdef ARM_MATH_BIG_ENDIAN 00446 00447 c0 = c0 << 16u; 00448 #else 00449 00450 c0 = c0 & 0x0000FFFF; 00451 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 00452 00453 /* Read x[10] */ 00454 x3 = _SIMD32_OFFSET(px+2); 00455 px += 3u; 00456 00457 /* Perform the multiply-accumulates */ 00458 acc0 = __SMLADX(x1, c0, acc0); 00459 acc1 = __SMLAD(x2, c0, acc1); 00460 acc2 = __SMLADX(x2, c0, acc2); 00461 acc3 = __SMLADX(x3, c0, acc3); 00462 } 00463 00464 /* Store the results in the accumulators in the destination buffer. 
*/ 00465 #ifndef ARM_MATH_BIG_ENDIAN 00466 00467 *__SIMD32(pOut)++ = __PKHBT(acc0 >> 15, acc1 >> 15, 16); 00468 *__SIMD32(pOut)++ = __PKHBT(acc2 >> 15, acc3 >> 15, 16); 00469 00470 #else 00471 00472 *__SIMD32(pOut)++ = __PKHBT(acc1 >> 15, acc0 >> 15, 16); 00473 *__SIMD32(pOut)++ = __PKHBT(acc3 >> 15, acc2 >> 15, 16); 00474 00475 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00476 00477 /* Increment the pointer pIn1 index, count by 4 */ 00478 count += 4u; 00479 00480 /* Update the inputA and inputB pointers for next MAC calculation */ 00481 px = pIn1 + count; 00482 py = pSrc2; 00483 00484 /* Decrement the loop counter */ 00485 blkCnt--; 00486 } 00487 00488 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00489 ** No loop unrolling is used. */ 00490 blkCnt = (uint32_t) blockSize2 % 0x4u; 00491 00492 while(blkCnt > 0u) 00493 { 00494 /* Accumulator is made zero for every iteration */ 00495 sum = 0; 00496 00497 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00498 k = srcBLen >> 2u; 00499 00500 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00501 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00502 while(k > 0u) 00503 { 00504 /* Perform the multiply-accumulates */ 00505 sum += ((q31_t) * px++ * *py--); 00506 sum += ((q31_t) * px++ * *py--); 00507 sum += ((q31_t) * px++ * *py--); 00508 sum += ((q31_t) * px++ * *py--); 00509 00510 /* Decrement the loop counter */ 00511 k--; 00512 } 00513 00514 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00515 ** No loop unrolling is used. */ 00516 k = srcBLen % 0x4u; 00517 00518 while(k > 0u) 00519 { 00520 /* Perform the multiply-accumulates */ 00521 sum += ((q31_t) * px++ * *py--); 00522 00523 /* Decrement the loop counter */ 00524 k--; 00525 } 00526 00527 /* Store the result in the accumulator in the destination buffer. 
*/ 00528 *pOut++ = (q15_t) (sum >> 15); 00529 00530 /* Increment the pointer pIn1 index, count by 1 */ 00531 count++; 00532 00533 /* Update the inputA and inputB pointers for next MAC calculation */ 00534 px = pIn1 + count; 00535 py = pSrc2; 00536 00537 /* Decrement the loop counter */ 00538 blkCnt--; 00539 } 00540 } 00541 else 00542 { 00543 /* If the srcBLen is not a multiple of 4, 00544 * the blockSize2 loop cannot be unrolled by 4 */ 00545 blkCnt = (uint32_t) blockSize2; 00546 00547 while(blkCnt > 0u) 00548 { 00549 /* Accumulator is made zero for every iteration */ 00550 sum = 0; 00551 00552 /* srcBLen number of MACS should be performed */ 00553 k = srcBLen; 00554 00555 while(k > 0u) 00556 { 00557 /* Perform the multiply-accumulate */ 00558 sum += ((q31_t) * px++ * *py--); 00559 00560 /* Decrement the loop counter */ 00561 k--; 00562 } 00563 00564 /* Store the result in the accumulator in the destination buffer. */ 00565 *pOut++ = (q15_t) (sum >> 15); 00566 00567 /* Increment the MAC count */ 00568 count++; 00569 00570 /* Update the inputA and inputB pointers for next MAC calculation */ 00571 px = pIn1 + count; 00572 py = pSrc2; 00573 00574 /* Decrement the loop counter */ 00575 blkCnt--; 00576 } 00577 } 00578 00579 00580 /* -------------------------- 00581 * Initializations of stage3 00582 * -------------------------*/ 00583 00584 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00585 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00586 * .... 00587 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00588 * sum += x[srcALen-1] * y[srcBLen-1] 00589 */ 00590 00591 /* In this stage the MAC operations are decreased by 1 for every iteration. 
00592 The count variable holds the number of MAC operations performed */ 00593 count = srcBLen - 1u; 00594 00595 /* Working pointer of inputA */ 00596 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00597 px = pSrc1; 00598 00599 /* Working pointer of inputB */ 00600 pSrc2 = pIn2 + (srcBLen - 1u); 00601 pIn2 = pSrc2 - 1u; 00602 py = pIn2; 00603 00604 /* ------------------- 00605 * Stage3 process 00606 * ------------------*/ 00607 00608 /* For loop unrolling by 4, this stage is divided into two. */ 00609 /* First part of this stage computes the MAC operations greater than 4 */ 00610 /* Second part of this stage computes the MAC operations less than or equal to 4 */ 00611 00612 /* The first part of the stage starts here */ 00613 j = count >> 2u; 00614 00615 while((j > 0u) && (blockSize3 > 0)) 00616 { 00617 /* Accumulator is made zero for every iteration */ 00618 sum = 0; 00619 00620 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00621 k = count >> 2u; 00622 00623 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00624 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00625 while(k > 0u) 00626 { 00627 /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied 00628 * with y[srcBLen - 1], y[srcBLen - 2] respectively */ 00629 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00630 /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied 00631 * with y[srcBLen - 3], y[srcBLen - 4] respectively */ 00632 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00633 00634 /* Decrement the loop counter */ 00635 k--; 00636 } 00637 00638 /* For the next MAC operations, the pointer py is used without SIMD 00639 * So, py is incremented by 1 */ 00640 py = py + 1u; 00641 00642 /* If the count is not a multiple of 4, compute any remaining MACs here. 00643 ** No loop unrolling is used. 
*/ 00644 k = count % 0x4u; 00645 00646 while(k > 0u) 00647 { 00648 /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */ 00649 sum = __SMLAD(*px++, *py--, sum); 00650 00651 /* Decrement the loop counter */ 00652 k--; 00653 } 00654 00655 /* Store the result in the accumulator in the destination buffer. */ 00656 *pOut++ = (q15_t) (sum >> 15); 00657 00658 /* Update the inputA and inputB pointers for next MAC calculation */ 00659 px = ++pSrc1; 00660 py = pIn2; 00661 00662 /* Decrement the MAC count */ 00663 count--; 00664 00665 /* Decrement the loop counter */ 00666 blockSize3--; 00667 00668 j--; 00669 } 00670 00671 /* The second part of the stage starts here */ 00672 /* SIMD is not used for the next MAC operations, 00673 * so pointer py is updated to read only one sample at a time */ 00674 py = py + 1u; 00675 00676 while(blockSize3 > 0) 00677 { 00678 /* Accumulator is made zero for every iteration */ 00679 sum = 0; 00680 00681 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00682 k = count; 00683 00684 while(k > 0u) 00685 { 00686 /* Perform the multiply-accumulates */ 00687 /* sum += x[srcALen-1] * y[srcBLen-1] */ 00688 sum = __SMLAD(*px++, *py--, sum); 00689 00690 /* Decrement the loop counter */ 00691 k--; 00692 } 00693 00694 /* Store the result in the accumulator in the destination buffer. 
*/ 00695 *pOut++ = (q15_t) (sum >> 15); 00696 00697 /* Update the inputA and inputB pointers for next MAC calculation */ 00698 px = ++pSrc1; 00699 py = pSrc2; 00700 00701 /* Decrement the MAC count */ 00702 count--; 00703 00704 /* Decrement the loop counter */ 00705 blockSize3--; 00706 } 00707 00708 /* set status as ARM_MATH_SUCCESS */ 00709 status = ARM_MATH_SUCCESS; 00710 } 00711 00712 /* Return to application */ 00713 return (status); 00714 00715 #else 00716 00717 q15_t *pIn1; /* inputA pointer */ 00718 q15_t *pIn2; /* inputB pointer */ 00719 q15_t *pOut = pDst; /* output pointer */ 00720 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00721 q15_t *px; /* Intermediate inputA pointer */ 00722 q15_t *py; /* Intermediate inputB pointer */ 00723 q15_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00724 q31_t x0, x1, x2, x3, c0; 00725 uint32_t j, k, count, check, blkCnt; 00726 int32_t blockSize1, blockSize2, blockSize3; /* loop counters */ 00727 arm_status status; /* status of Partial convolution */ 00728 q15_t a, b; 00729 00730 /* Check for range of output samples to be calculated */ 00731 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00732 { 00733 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00734 status = ARM_MATH_ARGUMENT_ERROR; 00735 } 00736 else 00737 { 00738 00739 /* The algorithm implementation is based on the lengths of the inputs. */ 00740 /* srcB is always made to slide across srcA. 
*/ 00741 /* So srcBLen is always considered as shorter or equal to srcALen */ 00742 if(srcALen >=srcBLen) 00743 { 00744 /* Initialization of inputA pointer */ 00745 pIn1 = pSrcA; 00746 00747 /* Initialization of inputB pointer */ 00748 pIn2 = pSrcB; 00749 } 00750 else 00751 { 00752 /* Initialization of inputA pointer */ 00753 pIn1 = pSrcB; 00754 00755 /* Initialization of inputB pointer */ 00756 pIn2 = pSrcA; 00757 00758 /* srcBLen is always considered as shorter or equal to srcALen */ 00759 j = srcBLen; 00760 srcBLen = srcALen; 00761 srcALen = j; 00762 } 00763 00764 /* Conditions to check which loopCounter holds 00765 * the first and last indices of the output samples to be calculated. */ 00766 check = firstIndex + numPoints; 00767 blockSize3 = ((int32_t) check - (int32_t) srcALen); 00768 blockSize3 = (blockSize3 > 0) ? blockSize3 : 0; 00769 blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); 00770 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 : 00771 (int32_t) numPoints) : 0; 00772 blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + 00773 (int32_t) firstIndex); 00774 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; 00775 00776 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00777 /* The function is internally 00778 * divided into three stages according to the number of multiplications that has to be 00779 * taken place between inputA samples and inputB samples. In the first stage of the 00780 * algorithm, the multiplications increase by one for every iteration. 00781 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00782 * In the third stage of the algorithm, the multiplications decrease by one 00783 * for every iteration. */ 00784 00785 /* Set the output pointer to point to the firstIndex 00786 * of the output sample to be calculated. 
*/ 00787 pOut = pDst + firstIndex; 00788 00789 /* -------------------------- 00790 * Initializations of stage1 00791 * -------------------------*/ 00792 00793 /* sum = x[0] * y[0] 00794 * sum = x[0] * y[1] + x[1] * y[0] 00795 * .... 00796 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00797 */ 00798 00799 /* In this stage the MAC operations are increased by 1 for every iteration. 00800 The count variable holds the number of MAC operations performed. 00801 Since the partial convolution starts from firstIndex 00802 Number of Macs to be performed is firstIndex + 1 */ 00803 count = 1u + firstIndex; 00804 00805 /* Working pointer of inputA */ 00806 px = pIn1; 00807 00808 /* Working pointer of inputB */ 00809 pSrc2 = pIn2 + firstIndex; 00810 py = pSrc2; 00811 00812 /* ------------------------ 00813 * Stage1 process 00814 * ----------------------*/ 00815 00816 /* For loop unrolling by 4, this stage is divided into two. */ 00817 /* First part of this stage computes the MAC operations less than 4 */ 00818 /* Second part of this stage computes the MAC operations greater than or equal to 4 */ 00819 00820 /* The first part of the stage starts here */ 00821 while((count < 4u) && (blockSize1 > 0u)) 00822 { 00823 /* Accumulator is made zero for every iteration */ 00824 sum = 0; 00825 00826 /* Loop over number of MAC operations between 00827 * inputA samples and inputB samples */ 00828 k = count; 00829 00830 while(k > 0u) 00831 { 00832 /* Perform the multiply-accumulates */ 00833 sum += ((q31_t) * px++ * *py--); 00834 00835 /* Decrement the loop counter */ 00836 k--; 00837 } 00838 00839 /* Store the result in the accumulator in the destination buffer. 
*/ 00840 *pOut++ = (q15_t) (sum >> 15); 00841 00842 /* Update the inputA and inputB pointers for next MAC calculation */ 00843 py = ++pSrc2; 00844 px = pIn1; 00845 00846 /* Increment the MAC count */ 00847 count++; 00848 00849 /* Decrement the loop counter */ 00850 blockSize1--; 00851 } 00852 00853 /* The second part of the stage starts here */ 00854 /* The internal loop, over count, is unrolled by 4 */ 00855 /* To, read the last two inputB samples using SIMD: 00856 * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ 00857 py = py - 1; 00858 00859 while(blockSize1 > 0u) 00860 { 00861 /* Accumulator is made zero for every iteration */ 00862 sum = 0; 00863 00864 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00865 k = count >> 2u; 00866 00867 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00868 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00869 py++; 00870 00871 while(k > 0u) 00872 { 00873 /* Perform the multiply-accumulates */ 00874 sum += ((q31_t) * px++ * *py--); 00875 sum += ((q31_t) * px++ * *py--); 00876 sum += ((q31_t) * px++ * *py--); 00877 sum += ((q31_t) * px++ * *py--); 00878 00879 /* Decrement the loop counter */ 00880 k--; 00881 } 00882 00883 /* If the count is not a multiple of 4, compute any remaining MACs here. 00884 ** No loop unrolling is used. */ 00885 k = count % 0x4u; 00886 00887 while(k > 0u) 00888 { 00889 /* Perform the multiply-accumulates */ 00890 sum += ((q31_t) * px++ * *py--); 00891 00892 /* Decrement the loop counter */ 00893 k--; 00894 } 00895 00896 /* Store the result in the accumulator in the destination buffer. 
*/ 00897 *pOut++ = (q15_t) (sum >> 15); 00898 00899 /* Update the inputA and inputB pointers for next MAC calculation */ 00900 py = ++pSrc2 - 1u; 00901 px = pIn1; 00902 00903 /* Increment the MAC count */ 00904 count++; 00905 00906 /* Decrement the loop counter */ 00907 blockSize1--; 00908 } 00909 00910 /* -------------------------- 00911 * Initializations of stage2 00912 * ------------------------*/ 00913 00914 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00915 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00916 * .... 00917 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00918 */ 00919 00920 /* Working pointer of inputA */ 00921 px = pIn1; 00922 00923 /* Working pointer of inputB */ 00924 pSrc2 = pIn2 + (srcBLen - 1u); 00925 py = pSrc2; 00926 00927 /* count is the index by which the pointer pIn1 to be incremented */ 00928 count = 0u; 00929 00930 00931 /* -------------------- 00932 * Stage2 process 00933 * -------------------*/ 00934 00935 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00936 * So, to loop unroll over blockSize2, 00937 * srcBLen should be greater than or equal to 4 */ 00938 if(srcBLen >= 4u) 00939 { 00940 /* Loop unroll over blockSize2, by 4 */ 00941 blkCnt = ((uint32_t) blockSize2 >> 2u); 00942 00943 while(blkCnt > 0u) 00944 { 00945 py = py - 1u; 00946 00947 /* Set all accumulators to zero */ 00948 acc0 = 0; 00949 acc1 = 0; 00950 acc2 = 0; 00951 acc3 = 0; 00952 00953 /* read x[0], x[1] samples */ 00954 a = *px++; 00955 b = *px++; 00956 00957 #ifndef ARM_MATH_BIG_ENDIAN 00958 00959 x0 = __PKHBT(a, b, 16); 00960 a = *px; 00961 x1 = __PKHBT(b, a, 16); 00962 00963 #else 00964 00965 x0 = __PKHBT(b, a, 16); 00966 a = *px; 00967 x1 = __PKHBT(a, b, 16); 00968 00969 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00970 00971 /* Apply loop unrolling and compute 4 MACs simultaneously. 
*/ 00972 k = srcBLen >> 2u; 00973 00974 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00975 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00976 do 00977 { 00978 /* Read the last two inputB samples using SIMD: 00979 * y[srcBLen - 1] and y[srcBLen - 2] */ 00980 a = *py; 00981 b = *(py+1); 00982 py -= 2; 00983 00984 #ifndef ARM_MATH_BIG_ENDIAN 00985 00986 c0 = __PKHBT(a, b, 16); 00987 00988 #else 00989 00990 c0 = __PKHBT(b, a, 16);; 00991 00992 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00993 00994 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ 00995 acc0 = __SMLADX(x0, c0, acc0); 00996 00997 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ 00998 acc1 = __SMLADX(x1, c0, acc1); 00999 01000 a = *px; 01001 b = *(px + 1); 01002 01003 #ifndef ARM_MATH_BIG_ENDIAN 01004 01005 x2 = __PKHBT(a, b, 16); 01006 a = *(px + 2); 01007 x3 = __PKHBT(b, a, 16); 01008 01009 #else 01010 01011 x2 = __PKHBT(b, a, 16); 01012 a = *(px + 2); 01013 x3 = __PKHBT(a, b, 16); 01014 01015 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01016 01017 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ 01018 acc2 = __SMLADX(x2, c0, acc2); 01019 01020 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ 01021 acc3 = __SMLADX(x3, c0, acc3); 01022 01023 /* Read y[srcBLen - 3] and y[srcBLen - 4] */ 01024 a = *py; 01025 b = *(py+1); 01026 py -= 2; 01027 01028 #ifndef ARM_MATH_BIG_ENDIAN 01029 01030 c0 = __PKHBT(a, b, 16); 01031 01032 #else 01033 01034 c0 = __PKHBT(b, a, 16);; 01035 01036 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01037 01038 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ 01039 acc0 = __SMLADX(x2, c0, acc0); 01040 01041 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ 01042 acc1 = __SMLADX(x3, c0, acc1); 01043 01044 /* Read x[4], x[5], x[6] */ 01045 a = *(px + 2); 01046 b = *(px + 3); 01047 01048 #ifndef ARM_MATH_BIG_ENDIAN 01049 01050 x0 = __PKHBT(a, b, 16); 01051 a = *(px + 4); 
01052 x1 = __PKHBT(b, a, 16); 01053 01054 #else 01055 01056 x0 = __PKHBT(b, a, 16); 01057 a = *(px + 4); 01058 x1 = __PKHBT(a, b, 16); 01059 01060 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01061 01062 px += 4u; 01063 01064 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ 01065 acc2 = __SMLADX(x0, c0, acc2); 01066 01067 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ 01068 acc3 = __SMLADX(x1, c0, acc3); 01069 01070 } while(--k); 01071 01072 /* For the next MAC operations, SIMD is not used 01073 * So, the 16 bit pointer if inputB, py is updated */ 01074 01075 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 01076 ** No loop unrolling is used. */ 01077 k = srcBLen % 0x4u; 01078 01079 if(k == 1u) 01080 { 01081 /* Read y[srcBLen - 5] */ 01082 c0 = *(py+1); 01083 01084 #ifdef ARM_MATH_BIG_ENDIAN 01085 01086 c0 = c0 << 16u; 01087 01088 #else 01089 01090 c0 = c0 & 0x0000FFFF; 01091 01092 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 01093 01094 /* Read x[7] */ 01095 a = *px; 01096 b = *(px+1); 01097 px++; 01098 01099 #ifndef ARM_MATH_BIG_ENDIAN 01100 01101 x3 = __PKHBT(a, b, 16); 01102 01103 #else 01104 01105 x3 = __PKHBT(b, a, 16);; 01106 01107 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01108 01109 01110 /* Perform the multiply-accumulates */ 01111 acc0 = __SMLAD(x0, c0, acc0); 01112 acc1 = __SMLAD(x1, c0, acc1); 01113 acc2 = __SMLADX(x1, c0, acc2); 01114 acc3 = __SMLADX(x3, c0, acc3); 01115 } 01116 01117 if(k == 2u) 01118 { 01119 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 01120 a = *py; 01121 b = *(py+1); 01122 01123 #ifndef ARM_MATH_BIG_ENDIAN 01124 01125 c0 = __PKHBT(a, b, 16); 01126 01127 #else 01128 01129 c0 = __PKHBT(b, a, 16);; 01130 01131 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01132 01133 /* Read x[7], x[8], x[9] */ 01134 a = *px; 01135 b = *(px + 1); 01136 01137 #ifndef ARM_MATH_BIG_ENDIAN 01138 01139 x3 = __PKHBT(a, b, 16); 01140 a = *(px + 2); 01141 x2 = __PKHBT(b, a, 16); 01142 01143 #else 01144 01145 x3 = __PKHBT(b, a, 
16); 01146 a = *(px + 2); 01147 x2 = __PKHBT(a, b, 16); 01148 01149 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01150 px += 2u; 01151 01152 /* Perform the multiply-accumulates */ 01153 acc0 = __SMLADX(x0, c0, acc0); 01154 acc1 = __SMLADX(x1, c0, acc1); 01155 acc2 = __SMLADX(x3, c0, acc2); 01156 acc3 = __SMLADX(x2, c0, acc3); 01157 } 01158 01159 if(k == 3u) 01160 { 01161 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 01162 a = *py; 01163 b = *(py+1); 01164 01165 #ifndef ARM_MATH_BIG_ENDIAN 01166 01167 c0 = __PKHBT(a, b, 16); 01168 01169 #else 01170 01171 c0 = __PKHBT(b, a, 16);; 01172 01173 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01174 01175 /* Read x[7], x[8], x[9] */ 01176 a = *px; 01177 b = *(px + 1); 01178 01179 #ifndef ARM_MATH_BIG_ENDIAN 01180 01181 x3 = __PKHBT(a, b, 16); 01182 a = *(px + 2); 01183 x2 = __PKHBT(b, a, 16); 01184 01185 #else 01186 01187 x3 = __PKHBT(b, a, 16); 01188 a = *(px + 2); 01189 x2 = __PKHBT(a, b, 16); 01190 01191 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01192 01193 /* Perform the multiply-accumulates */ 01194 acc0 = __SMLADX(x0, c0, acc0); 01195 acc1 = __SMLADX(x1, c0, acc1); 01196 acc2 = __SMLADX(x3, c0, acc2); 01197 acc3 = __SMLADX(x2, c0, acc3); 01198 01199 /* Read y[srcBLen - 7] */ 01200 c0 = *(py-1); 01201 #ifdef ARM_MATH_BIG_ENDIAN 01202 01203 c0 = c0 << 16u; 01204 #else 01205 01206 c0 = c0 & 0x0000FFFF; 01207 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 01208 01209 /* Read x[10] */ 01210 a = *(px+2); 01211 b = *(px+3); 01212 01213 #ifndef ARM_MATH_BIG_ENDIAN 01214 01215 x3 = __PKHBT(a, b, 16); 01216 01217 #else 01218 01219 x3 = __PKHBT(b, a, 16);; 01220 01221 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01222 01223 px += 3u; 01224 01225 /* Perform the multiply-accumulates */ 01226 acc0 = __SMLADX(x1, c0, acc0); 01227 acc1 = __SMLAD(x2, c0, acc1); 01228 acc2 = __SMLADX(x2, c0, acc2); 01229 acc3 = __SMLADX(x3, c0, acc3); 01230 } 01231 01232 /* Store the results in the accumulators in the destination buffer. 
*/ 01233 *pOut++ = (q15_t)(acc0 >> 15); 01234 *pOut++ = (q15_t)(acc1 >> 15); 01235 *pOut++ = (q15_t)(acc2 >> 15); 01236 *pOut++ = (q15_t)(acc3 >> 15); 01237 01238 /* Increment the pointer pIn1 index, count by 4 */ 01239 count += 4u; 01240 01241 /* Update the inputA and inputB pointers for next MAC calculation */ 01242 px = pIn1 + count; 01243 py = pSrc2; 01244 01245 /* Decrement the loop counter */ 01246 blkCnt--; 01247 } 01248 01249 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 01250 ** No loop unrolling is used. */ 01251 blkCnt = (uint32_t) blockSize2 % 0x4u; 01252 01253 while(blkCnt > 0u) 01254 { 01255 /* Accumulator is made zero for every iteration */ 01256 sum = 0; 01257 01258 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 01259 k = srcBLen >> 2u; 01260 01261 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 01262 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 01263 while(k > 0u) 01264 { 01265 /* Perform the multiply-accumulates */ 01266 sum += ((q31_t) * px++ * *py--); 01267 sum += ((q31_t) * px++ * *py--); 01268 sum += ((q31_t) * px++ * *py--); 01269 sum += ((q31_t) * px++ * *py--); 01270 01271 /* Decrement the loop counter */ 01272 k--; 01273 } 01274 01275 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 01276 ** No loop unrolling is used. */ 01277 k = srcBLen % 0x4u; 01278 01279 while(k > 0u) 01280 { 01281 /* Perform the multiply-accumulates */ 01282 sum += ((q31_t) * px++ * *py--); 01283 01284 /* Decrement the loop counter */ 01285 k--; 01286 } 01287 01288 /* Store the result in the accumulator in the destination buffer. 
*/ 01289 *pOut++ = (q15_t) (sum >> 15); 01290 01291 /* Increment the pointer pIn1 index, count by 1 */ 01292 count++; 01293 01294 /* Update the inputA and inputB pointers for next MAC calculation */ 01295 px = pIn1 + count; 01296 py = pSrc2; 01297 01298 /* Decrement the loop counter */ 01299 blkCnt--; 01300 } 01301 } 01302 else 01303 { 01304 /* If the srcBLen is not a multiple of 4, 01305 * the blockSize2 loop cannot be unrolled by 4 */ 01306 blkCnt = (uint32_t) blockSize2; 01307 01308 while(blkCnt > 0u) 01309 { 01310 /* Accumulator is made zero for every iteration */ 01311 sum = 0; 01312 01313 /* srcBLen number of MACS should be performed */ 01314 k = srcBLen; 01315 01316 while(k > 0u) 01317 { 01318 /* Perform the multiply-accumulate */ 01319 sum += ((q31_t) * px++ * *py--); 01320 01321 /* Decrement the loop counter */ 01322 k--; 01323 } 01324 01325 /* Store the result in the accumulator in the destination buffer. */ 01326 *pOut++ = (q15_t) (sum >> 15); 01327 01328 /* Increment the MAC count */ 01329 count++; 01330 01331 /* Update the inputA and inputB pointers for next MAC calculation */ 01332 px = pIn1 + count; 01333 py = pSrc2; 01334 01335 /* Decrement the loop counter */ 01336 blkCnt--; 01337 } 01338 } 01339 01340 01341 /* -------------------------- 01342 * Initializations of stage3 01343 * -------------------------*/ 01344 01345 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 01346 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 01347 * .... 01348 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 01349 * sum += x[srcALen-1] * y[srcBLen-1] 01350 */ 01351 01352 /* In this stage the MAC operations are decreased by 1 for every iteration. 
01353 The count variable holds the number of MAC operations performed */ 01354 count = srcBLen - 1u; 01355 01356 /* Working pointer of inputA */ 01357 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 01358 px = pSrc1; 01359 01360 /* Working pointer of inputB */ 01361 pSrc2 = pIn2 + (srcBLen - 1u); 01362 pIn2 = pSrc2 - 1u; 01363 py = pIn2; 01364 01365 /* ------------------- 01366 * Stage3 process 01367 * ------------------*/ 01368 01369 /* For loop unrolling by 4, this stage is divided into two. */ 01370 /* First part of this stage computes the MAC operations greater than 4 */ 01371 /* Second part of this stage computes the MAC operations less than or equal to 4 */ 01372 01373 /* The first part of the stage starts here */ 01374 j = count >> 2u; 01375 01376 while((j > 0u) && (blockSize3 > 0)) 01377 { 01378 /* Accumulator is made zero for every iteration */ 01379 sum = 0; 01380 01381 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 01382 k = count >> 2u; 01383 01384 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 01385 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 01386 py++; 01387 01388 while(k > 0u) 01389 { 01390 /* Perform the multiply-accumulates */ 01391 sum += ((q31_t) * px++ * *py--); 01392 sum += ((q31_t) * px++ * *py--); 01393 sum += ((q31_t) * px++ * *py--); 01394 sum += ((q31_t) * px++ * *py--); 01395 /* Decrement the loop counter */ 01396 k--; 01397 } 01398 01399 01400 /* If the count is not a multiple of 4, compute any remaining MACs here. 01401 ** No loop unrolling is used. */ 01402 k = count % 0x4u; 01403 01404 while(k > 0u) 01405 { 01406 /* Perform the multiply-accumulates */ 01407 sum += ((q31_t) * px++ * *py--); 01408 01409 /* Decrement the loop counter */ 01410 k--; 01411 } 01412 01413 /* Store the result in the accumulator in the destination buffer. 
*/ 01414 *pOut++ = (q15_t) (sum >> 15); 01415 01416 /* Update the inputA and inputB pointers for next MAC calculation */ 01417 px = ++pSrc1; 01418 py = pIn2; 01419 01420 /* Decrement the MAC count */ 01421 count--; 01422 01423 /* Decrement the loop counter */ 01424 blockSize3--; 01425 01426 j--; 01427 } 01428 01429 /* The second part of the stage starts here */ 01430 /* SIMD is not used for the next MAC operations, 01431 * so pointer py is updated to read only one sample at a time */ 01432 py = py + 1u; 01433 01434 while(blockSize3 > 0u) 01435 { 01436 /* Accumulator is made zero for every iteration */ 01437 sum = 0; 01438 01439 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 01440 k = count; 01441 01442 while(k > 0u) 01443 { 01444 /* Perform the multiply-accumulates */ 01445 /* sum += x[srcALen-1] * y[srcBLen-1] */ 01446 sum += ((q31_t) * px++ * *py--); 01447 01448 /* Decrement the loop counter */ 01449 k--; 01450 } 01451 01452 /* Store the result in the accumulator in the destination buffer. */ 01453 *pOut++ = (q15_t) (sum >> 15); 01454 01455 /* Update the inputA and inputB pointers for next MAC calculation */ 01456 px = ++pSrc1; 01457 py = pSrc2; 01458 01459 /* Decrement the MAC count */ 01460 count--; 01461 01462 /* Decrement the loop counter */ 01463 blockSize3--; 01464 } 01465 01466 /* set status as ARM_MATH_SUCCESS */ 01467 status = ARM_MATH_SUCCESS; 01468 } 01469 01470 /* Return to application */ 01471 return (status); 01472 01473 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 01474 } 01475 01476 /** 01477 * @} end of PartialConv group 01478 */
Generated on Tue Jul 12 2022 12:36:54 by Doxygen 1.7.2