CMSIS DSP library
Dependents: performance_timer Surfboard_ gps2rtty Capstone ... more
arm_conv_partial_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 19. March 2015 00005 * $Revision: V.1.4.5 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_partial_q15.c 00009 * 00010 * Description: Partial convolution of Q15 sequences. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupFilters 00045 */ 00046 00047 /** 00048 * @addtogroup PartialConv 00049 * @{ 00050 */ 00051 00052 /** 00053 * @brief Partial convolution of Q15 sequences. 00054 * @param[in] *pSrcA points to the first input sequence. 00055 * @param[in] srcALen length of the first input sequence. 00056 * @param[in] *pSrcB points to the second input sequence. 00057 * @param[in] srcBLen length of the second input sequence. 00058 * @param[out] *pDst points to the location where the output result is written. 00059 * @param[in] firstIndex is the first output sample to start with. 00060 * @param[in] numPoints is the number of output points to be computed. 00061 * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. 00062 * 00063 * Refer to <code>arm_conv_partial_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4. 00064 * 00065 * \par 00066 * Refer the function <code>arm_conv_partial_opt_q15()</code> for a faster implementation of this function using scratch buffers. 
00067 * 00068 */ 00069 00070 00071 arm_status arm_conv_partial_q15( 00072 q15_t * pSrcA, 00073 uint32_t srcALen, 00074 q15_t * pSrcB, 00075 uint32_t srcBLen, 00076 q15_t * pDst, 00077 uint32_t firstIndex, 00078 uint32_t numPoints) 00079 { 00080 00081 #if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) 00082 00083 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00084 00085 q15_t *pIn1; /* inputA pointer */ 00086 q15_t *pIn2; /* inputB pointer */ 00087 q15_t *pOut = pDst; /* output pointer */ 00088 q63_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00089 q15_t *px; /* Intermediate inputA pointer */ 00090 q15_t *py; /* Intermediate inputB pointer */ 00091 q15_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00092 q31_t x0, x1, x2, x3, c0; /* Temporary input variables */ 00093 uint32_t j, k, count, check, blkCnt; 00094 int32_t blockSize1, blockSize2, blockSize3; /* loop counter */ 00095 arm_status status; /* status of Partial convolution */ 00096 00097 /* Check for range of output samples to be calculated */ 00098 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00099 { 00100 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00101 status = ARM_MATH_ARGUMENT_ERROR; 00102 } 00103 else 00104 { 00105 00106 /* The algorithm implementation is based on the lengths of the inputs. */ 00107 /* srcB is always made to slide across srcA. 
*/ 00108 /* So srcBLen is always considered as shorter or equal to srcALen */ 00109 if(srcALen >= srcBLen) 00110 { 00111 /* Initialization of inputA pointer */ 00112 pIn1 = pSrcA; 00113 00114 /* Initialization of inputB pointer */ 00115 pIn2 = pSrcB; 00116 } 00117 else 00118 { 00119 /* Initialization of inputA pointer */ 00120 pIn1 = pSrcB; 00121 00122 /* Initialization of inputB pointer */ 00123 pIn2 = pSrcA; 00124 00125 /* srcBLen is always considered as shorter or equal to srcALen */ 00126 j = srcBLen; 00127 srcBLen = srcALen; 00128 srcALen = j; 00129 } 00130 00131 /* Conditions to check which loopCounter holds 00132 * the first and last indices of the output samples to be calculated. */ 00133 check = firstIndex + numPoints; 00134 blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0; 00135 blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3; 00136 blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); 00137 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 : 00138 (int32_t) numPoints) : 0; 00139 blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + 00140 (int32_t) firstIndex); 00141 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; 00142 00143 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00144 /* The function is internally 00145 * divided into three stages according to the number of multiplications that has to be 00146 * taken place between inputA samples and inputB samples. In the first stage of the 00147 * algorithm, the multiplications increase by one for every iteration. 00148 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00149 * In the third stage of the algorithm, the multiplications decrease by one 00150 * for every iteration. 
*/ 00151 00152 /* Set the output pointer to point to the firstIndex 00153 * of the output sample to be calculated. */ 00154 pOut = pDst + firstIndex; 00155 00156 /* -------------------------- 00157 * Initializations of stage1 00158 * -------------------------*/ 00159 00160 /* sum = x[0] * y[0] 00161 * sum = x[0] * y[1] + x[1] * y[0] 00162 * .... 00163 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00164 */ 00165 00166 /* In this stage the MAC operations are increased by 1 for every iteration. 00167 The count variable holds the number of MAC operations performed. 00168 Since the partial convolution starts from firstIndex 00169 Number of Macs to be performed is firstIndex + 1 */ 00170 count = 1u + firstIndex; 00171 00172 /* Working pointer of inputA */ 00173 px = pIn1; 00174 00175 /* Working pointer of inputB */ 00176 pSrc2 = pIn2 + firstIndex; 00177 py = pSrc2; 00178 00179 /* ------------------------ 00180 * Stage1 process 00181 * ----------------------*/ 00182 00183 /* For loop unrolling by 4, this stage is divided into two. */ 00184 /* First part of this stage computes the MAC operations less than 4 */ 00185 /* Second part of this stage computes the MAC operations greater than or equal to 4 */ 00186 00187 /* The first part of the stage starts here */ 00188 while((count < 4u) && (blockSize1 > 0)) 00189 { 00190 /* Accumulator is made zero for every iteration */ 00191 sum = 0; 00192 00193 /* Loop over number of MAC operations between 00194 * inputA samples and inputB samples */ 00195 k = count; 00196 00197 while(k > 0u) 00198 { 00199 /* Perform the multiply-accumulates */ 00200 sum = __SMLALD(*px++, *py--, sum); 00201 00202 /* Decrement the loop counter */ 00203 k--; 00204 } 00205 00206 /* Store the result in the accumulator in the destination buffer. 
*/ 00207 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 00208 00209 /* Update the inputA and inputB pointers for next MAC calculation */ 00210 py = ++pSrc2; 00211 px = pIn1; 00212 00213 /* Increment the MAC count */ 00214 count++; 00215 00216 /* Decrement the loop counter */ 00217 blockSize1--; 00218 } 00219 00220 /* The second part of the stage starts here */ 00221 /* The internal loop, over count, is unrolled by 4 */ 00222 /* To, read the last two inputB samples using SIMD: 00223 * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ 00224 py = py - 1; 00225 00226 while(blockSize1 > 0) 00227 { 00228 /* Accumulator is made zero for every iteration */ 00229 sum = 0; 00230 00231 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00232 k = count >> 2u; 00233 00234 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00235 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00236 while(k > 0u) 00237 { 00238 /* Perform the multiply-accumulates */ 00239 /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */ 00240 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00241 /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */ 00242 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00243 00244 /* Decrement the loop counter */ 00245 k--; 00246 } 00247 00248 /* For the next MAC operations, the pointer py is used without SIMD 00249 * So, py is incremented by 1 */ 00250 py = py + 1u; 00251 00252 /* If the count is not a multiple of 4, compute any remaining MACs here. 00253 ** No loop unrolling is used. */ 00254 k = count % 0x4u; 00255 00256 while(k > 0u) 00257 { 00258 /* Perform the multiply-accumulates */ 00259 sum = __SMLALD(*px++, *py--, sum); 00260 00261 /* Decrement the loop counter */ 00262 k--; 00263 } 00264 00265 /* Store the result in the accumulator in the destination buffer. 
*/ 00266 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 00267 00268 /* Update the inputA and inputB pointers for next MAC calculation */ 00269 py = ++pSrc2 - 1u; 00270 px = pIn1; 00271 00272 /* Increment the MAC count */ 00273 count++; 00274 00275 /* Decrement the loop counter */ 00276 blockSize1--; 00277 } 00278 00279 /* -------------------------- 00280 * Initializations of stage2 00281 * ------------------------*/ 00282 00283 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00284 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00285 * .... 00286 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00287 */ 00288 00289 /* Working pointer of inputA */ 00290 if((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) 00291 { 00292 px = pIn1 + firstIndex - srcBLen + 1; 00293 } 00294 else 00295 { 00296 px = pIn1; 00297 } 00298 00299 /* Working pointer of inputB */ 00300 pSrc2 = pIn2 + (srcBLen - 1u); 00301 py = pSrc2; 00302 00303 /* count is the index by which the pointer pIn1 to be incremented */ 00304 count = 0u; 00305 00306 00307 /* -------------------- 00308 * Stage2 process 00309 * -------------------*/ 00310 00311 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00312 * So, to loop unroll over blockSize2, 00313 * srcBLen should be greater than or equal to 4 */ 00314 if(srcBLen >= 4u) 00315 { 00316 /* Loop unroll over blockSize2, by 4 */ 00317 blkCnt = blockSize2 >> 2u; 00318 00319 while(blkCnt > 0u) 00320 { 00321 py = py - 1u; 00322 00323 /* Set all accumulators to zero */ 00324 acc0 = 0; 00325 acc1 = 0; 00326 acc2 = 0; 00327 acc3 = 0; 00328 00329 00330 /* read x[0], x[1] samples */ 00331 x0 = *__SIMD32(px); 00332 /* read x[1], x[2] samples */ 00333 x1 = _SIMD32_OFFSET(px+1); 00334 px+= 2u; 00335 00336 00337 /* Apply loop unrolling and compute 4 MACs simultaneously. 
*/ 00338 k = srcBLen >> 2u; 00339 00340 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00341 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00342 do 00343 { 00344 /* Read the last two inputB samples using SIMD: 00345 * y[srcBLen - 1] and y[srcBLen - 2] */ 00346 c0 = *__SIMD32(py)--; 00347 00348 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ 00349 acc0 = __SMLALDX(x0, c0, acc0); 00350 00351 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ 00352 acc1 = __SMLALDX(x1, c0, acc1); 00353 00354 /* Read x[2], x[3] */ 00355 x2 = *__SIMD32(px); 00356 00357 /* Read x[3], x[4] */ 00358 x3 = _SIMD32_OFFSET(px+1); 00359 00360 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ 00361 acc2 = __SMLALDX(x2, c0, acc2); 00362 00363 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ 00364 acc3 = __SMLALDX(x3, c0, acc3); 00365 00366 /* Read y[srcBLen - 3] and y[srcBLen - 4] */ 00367 c0 = *__SIMD32(py)--; 00368 00369 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ 00370 acc0 = __SMLALDX(x2, c0, acc0); 00371 00372 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ 00373 acc1 = __SMLALDX(x3, c0, acc1); 00374 00375 /* Read x[4], x[5] */ 00376 x0 = _SIMD32_OFFSET(px+2); 00377 00378 /* Read x[5], x[6] */ 00379 x1 = _SIMD32_OFFSET(px+3); 00380 px += 4u; 00381 00382 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ 00383 acc2 = __SMLALDX(x0, c0, acc2); 00384 00385 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ 00386 acc3 = __SMLALDX(x1, c0, acc3); 00387 00388 } while(--k); 00389 00390 /* For the next MAC operations, SIMD is not used 00391 * So, the 16 bit pointer if inputB, py is updated */ 00392 00393 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00394 ** No loop unrolling is used. 
*/ 00395 k = srcBLen % 0x4u; 00396 00397 if(k == 1u) 00398 { 00399 /* Read y[srcBLen - 5] */ 00400 c0 = *(py+1); 00401 00402 #ifdef ARM_MATH_BIG_ENDIAN 00403 00404 c0 = c0 << 16u; 00405 00406 #else 00407 00408 c0 = c0 & 0x0000FFFF; 00409 00410 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 00411 00412 /* Read x[7] */ 00413 x3 = *__SIMD32(px); 00414 px++; 00415 00416 /* Perform the multiply-accumulates */ 00417 acc0 = __SMLALD(x0, c0, acc0); 00418 acc1 = __SMLALD(x1, c0, acc1); 00419 acc2 = __SMLALDX(x1, c0, acc2); 00420 acc3 = __SMLALDX(x3, c0, acc3); 00421 } 00422 00423 if(k == 2u) 00424 { 00425 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 00426 c0 = _SIMD32_OFFSET(py); 00427 00428 /* Read x[7], x[8] */ 00429 x3 = *__SIMD32(px); 00430 00431 /* Read x[9] */ 00432 x2 = _SIMD32_OFFSET(px+1); 00433 px += 2u; 00434 00435 /* Perform the multiply-accumulates */ 00436 acc0 = __SMLALDX(x0, c0, acc0); 00437 acc1 = __SMLALDX(x1, c0, acc1); 00438 acc2 = __SMLALDX(x3, c0, acc2); 00439 acc3 = __SMLALDX(x2, c0, acc3); 00440 } 00441 00442 if(k == 3u) 00443 { 00444 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 00445 c0 = _SIMD32_OFFSET(py); 00446 00447 /* Read x[7], x[8] */ 00448 x3 = *__SIMD32(px); 00449 00450 /* Read x[9] */ 00451 x2 = _SIMD32_OFFSET(px+1); 00452 00453 /* Perform the multiply-accumulates */ 00454 acc0 = __SMLALDX(x0, c0, acc0); 00455 acc1 = __SMLALDX(x1, c0, acc1); 00456 acc2 = __SMLALDX(x3, c0, acc2); 00457 acc3 = __SMLALDX(x2, c0, acc3); 00458 00459 c0 = *(py-1); 00460 00461 #ifdef ARM_MATH_BIG_ENDIAN 00462 00463 c0 = c0 << 16u; 00464 #else 00465 00466 c0 = c0 & 0x0000FFFF; 00467 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 00468 00469 /* Read x[10] */ 00470 x3 = _SIMD32_OFFSET(px+2); 00471 px += 3u; 00472 00473 /* Perform the multiply-accumulates */ 00474 acc0 = __SMLALDX(x1, c0, acc0); 00475 acc1 = __SMLALD(x2, c0, acc1); 00476 acc2 = __SMLALDX(x2, c0, acc2); 00477 acc3 = __SMLALDX(x3, c0, acc3); 00478 } 00479 00480 00481 /* Store the results in the accumulators in the 
destination buffer. */ 00482 00483 #ifndef ARM_MATH_BIG_ENDIAN 00484 00485 *__SIMD32(pOut)++ = 00486 __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); 00487 *__SIMD32(pOut)++ = 00488 __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); 00489 00490 #else 00491 00492 *__SIMD32(pOut)++ = 00493 __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16); 00494 *__SIMD32(pOut)++ = 00495 __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16); 00496 00497 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00498 00499 /* Increment the pointer pIn1 index, count by 4 */ 00500 count += 4u; 00501 00502 /* Update the inputA and inputB pointers for next MAC calculation */ 00503 px = pIn1 + count; 00504 py = pSrc2; 00505 00506 /* Decrement the loop counter */ 00507 blkCnt--; 00508 } 00509 00510 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00511 ** No loop unrolling is used. */ 00512 blkCnt = (uint32_t) blockSize2 % 0x4u; 00513 00514 while(blkCnt > 0u) 00515 { 00516 /* Accumulator is made zero for every iteration */ 00517 sum = 0; 00518 00519 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00520 k = srcBLen >> 2u; 00521 00522 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00523 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00524 while(k > 0u) 00525 { 00526 /* Perform the multiply-accumulates */ 00527 sum += (q63_t) ((q31_t) * px++ * *py--); 00528 sum += (q63_t) ((q31_t) * px++ * *py--); 00529 sum += (q63_t) ((q31_t) * px++ * *py--); 00530 sum += (q63_t) ((q31_t) * px++ * *py--); 00531 00532 /* Decrement the loop counter */ 00533 k--; 00534 } 00535 00536 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00537 ** No loop unrolling is used. 
*/ 00538 k = srcBLen % 0x4u; 00539 00540 while(k > 0u) 00541 { 00542 /* Perform the multiply-accumulates */ 00543 sum += (q63_t) ((q31_t) * px++ * *py--); 00544 00545 /* Decrement the loop counter */ 00546 k--; 00547 } 00548 00549 /* Store the result in the accumulator in the destination buffer. */ 00550 *pOut++ = (q15_t) (__SSAT(sum >> 15, 16)); 00551 00552 /* Increment the pointer pIn1 index, count by 1 */ 00553 count++; 00554 00555 /* Update the inputA and inputB pointers for next MAC calculation */ 00556 px = pIn1 + count; 00557 py = pSrc2; 00558 00559 /* Decrement the loop counter */ 00560 blkCnt--; 00561 } 00562 } 00563 else 00564 { 00565 /* If the srcBLen is not a multiple of 4, 00566 * the blockSize2 loop cannot be unrolled by 4 */ 00567 blkCnt = (uint32_t) blockSize2; 00568 00569 while(blkCnt > 0u) 00570 { 00571 /* Accumulator is made zero for every iteration */ 00572 sum = 0; 00573 00574 /* srcBLen number of MACS should be performed */ 00575 k = srcBLen; 00576 00577 while(k > 0u) 00578 { 00579 /* Perform the multiply-accumulate */ 00580 sum += (q63_t) ((q31_t) * px++ * *py--); 00581 00582 /* Decrement the loop counter */ 00583 k--; 00584 } 00585 00586 /* Store the result in the accumulator in the destination buffer. */ 00587 *pOut++ = (q15_t) (__SSAT(sum >> 15, 16)); 00588 00589 /* Increment the MAC count */ 00590 count++; 00591 00592 /* Update the inputA and inputB pointers for next MAC calculation */ 00593 px = pIn1 + count; 00594 py = pSrc2; 00595 00596 /* Decrement the loop counter */ 00597 blkCnt--; 00598 } 00599 } 00600 00601 00602 /* -------------------------- 00603 * Initializations of stage3 00604 * -------------------------*/ 00605 00606 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00607 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00608 * .... 
00609 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00610 * sum += x[srcALen-1] * y[srcBLen-1] 00611 */ 00612 00613 /* In this stage the MAC operations are decreased by 1 for every iteration. 00614 The count variable holds the number of MAC operations performed */ 00615 count = srcBLen - 1u; 00616 00617 /* Working pointer of inputA */ 00618 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00619 px = pSrc1; 00620 00621 /* Working pointer of inputB */ 00622 pSrc2 = pIn2 + (srcBLen - 1u); 00623 pIn2 = pSrc2 - 1u; 00624 py = pIn2; 00625 00626 /* ------------------- 00627 * Stage3 process 00628 * ------------------*/ 00629 00630 /* For loop unrolling by 4, this stage is divided into two. */ 00631 /* First part of this stage computes the MAC operations greater than 4 */ 00632 /* Second part of this stage computes the MAC operations less than or equal to 4 */ 00633 00634 /* The first part of the stage starts here */ 00635 j = count >> 2u; 00636 00637 while((j > 0u) && (blockSize3 > 0)) 00638 { 00639 /* Accumulator is made zero for every iteration */ 00640 sum = 0; 00641 00642 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00643 k = count >> 2u; 00644 00645 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00646 ** a second loop below computes MACs for the remaining 1 to 3 samples. 
*/ 00647 while(k > 0u) 00648 { 00649 /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied 00650 * with y[srcBLen - 1], y[srcBLen - 2] respectively */ 00651 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00652 /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied 00653 * with y[srcBLen - 3], y[srcBLen - 4] respectively */ 00654 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00655 00656 /* Decrement the loop counter */ 00657 k--; 00658 } 00659 00660 /* For the next MAC operations, the pointer py is used without SIMD 00661 * So, py is incremented by 1 */ 00662 py = py + 1u; 00663 00664 /* If the count is not a multiple of 4, compute any remaining MACs here. 00665 ** No loop unrolling is used. */ 00666 k = count % 0x4u; 00667 00668 while(k > 0u) 00669 { 00670 /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */ 00671 sum = __SMLALD(*px++, *py--, sum); 00672 00673 /* Decrement the loop counter */ 00674 k--; 00675 } 00676 00677 /* Store the result in the accumulator in the destination buffer. */ 00678 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 00679 00680 /* Update the inputA and inputB pointers for next MAC calculation */ 00681 px = ++pSrc1; 00682 py = pIn2; 00683 00684 /* Decrement the MAC count */ 00685 count--; 00686 00687 /* Decrement the loop counter */ 00688 blockSize3--; 00689 00690 j--; 00691 } 00692 00693 /* The second part of the stage starts here */ 00694 /* SIMD is not used for the next MAC operations, 00695 * so pointer py is updated to read only one sample at a time */ 00696 py = py + 1u; 00697 00698 while(blockSize3 > 0) 00699 { 00700 /* Accumulator is made zero for every iteration */ 00701 sum = 0; 00702 00703 /* Apply loop unrolling and compute 4 MACs simultaneously. 
*/ 00704 k = count; 00705 00706 while(k > 0u) 00707 { 00708 /* Perform the multiply-accumulates */ 00709 /* sum += x[srcALen-1] * y[srcBLen-1] */ 00710 sum = __SMLALD(*px++, *py--, sum); 00711 00712 /* Decrement the loop counter */ 00713 k--; 00714 } 00715 00716 /* Store the result in the accumulator in the destination buffer. */ 00717 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 00718 00719 /* Update the inputA and inputB pointers for next MAC calculation */ 00720 px = ++pSrc1; 00721 py = pSrc2; 00722 00723 /* Decrement the MAC count */ 00724 count--; 00725 00726 /* Decrement the loop counter */ 00727 blockSize3--; 00728 } 00729 00730 /* set status as ARM_MATH_SUCCESS */ 00731 status = ARM_MATH_SUCCESS; 00732 } 00733 00734 /* Return to application */ 00735 return (status); 00736 00737 #else 00738 00739 /* Run the below code for Cortex-M0 */ 00740 00741 q15_t *pIn1 = pSrcA; /* inputA pointer */ 00742 q15_t *pIn2 = pSrcB; /* inputB pointer */ 00743 q63_t sum; /* Accumulator */ 00744 uint32_t i, j; /* loop counters */ 00745 arm_status status; /* status of Partial convolution */ 00746 00747 /* Check for range of output samples to be calculated */ 00748 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00749 { 00750 /* Set status as ARM_ARGUMENT_ERROR */ 00751 status = ARM_MATH_ARGUMENT_ERROR; 00752 } 00753 else 00754 { 00755 /* Loop to calculate convolution for output length number of values */ 00756 for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++) 00757 { 00758 /* Initialize sum with zero to carry on MAC operations */ 00759 sum = 0; 00760 00761 /* Loop to perform MAC operations according to convolution equation */ 00762 for (j = 0; j <= i; j++) 00763 { 00764 /* Check the array limitations */ 00765 if(((i - j) < srcBLen) && (j < srcALen)) 00766 { 00767 /* z[i] += x[i-j] * y[j] */ 00768 sum += ((q31_t) pIn1[j] * (pIn2[i - j])); 00769 } 00770 } 00771 00772 /* Store the output in the destination buffer */ 00773 pDst[i] = (q15_t) __SSAT((sum >> 15u), 
16u); 00774 } 00775 /* set status as ARM_SUCCESS as there are no argument errors */ 00776 status = ARM_MATH_SUCCESS; 00777 } 00778 return (status); 00779 00780 #endif /* #if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) */ 00781 00782 } 00783 00784 /** 00785 * @} end of PartialConv group 00786 */
Generated on Tue Jul 12 2022 11:59:16 by Doxygen 1.7.2