CMSIS DSP library
Dependents: KL25Z_FFT_Demo Hat_Board_v5_1 KL25Z_FFT_Demo_tony KL25Z_FFT_Demo_tony ... more
Fork of mbed-dsp by
arm_conv_partial_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 17. January 2013 00005 * $Revision: V1.4.1 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_partial_q15.c 00009 * 00010 * Description: Partial convolution of Q15 sequences. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupFilters 00045 */ 00046 00047 /** 00048 * @addtogroup PartialConv 00049 * @{ 00050 */ 00051 00052 /** 00053 * @brief Partial convolution of Q15 sequences. 00054 * @param[in] *pSrcA points to the first input sequence. 00055 * @param[in] srcALen length of the first input sequence. 00056 * @param[in] *pSrcB points to the second input sequence. 00057 * @param[in] srcBLen length of the second input sequence. 00058 * @param[out] *pDst points to the location where the output result is written. 00059 * @param[in] firstIndex is the first output sample to start with. 00060 * @param[in] numPoints is the number of output points to be computed. 00061 * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. 00062 * 00063 * Refer to <code>arm_conv_partial_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4. 00064 * 00065 * \par 00066 * Refer the function <code>arm_conv_partial_opt_q15()</code> for a faster implementation of this function using scratch buffers. 00067 * 00068 */ 00069 00070 00071 arm_status arm_conv_partial_q15( 00072 q15_t * pSrcA, 00073 uint32_t srcALen, 00074 q15_t * pSrcB, 00075 uint32_t srcBLen, 00076 q15_t * pDst, 00077 uint32_t firstIndex, 00078 uint32_t numPoints) 00079 { 00080 00081 #if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) 00082 00083 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00084 00085 q15_t *pIn1; /* inputA pointer */ 00086 q15_t *pIn2; /* inputB pointer */ 00087 q15_t *pOut = pDst; /* output pointer */ 00088 q63_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00089 q15_t *px; /* Intermediate inputA pointer */ 00090 q15_t *py; /* Intermediate inputB pointer */ 00091 q15_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00092 q31_t x0, x1, x2, x3, c0; /* Temporary input variables */ 00093 uint32_t j, k, count, check, blkCnt; 00094 int32_t blockSize1, blockSize2, blockSize3; /* loop counter */ 00095 arm_status status; /* status of Partial convolution */ 00096 00097 /* Check for range of output samples to be calculated */ 00098 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00099 { 00100 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00101 status = ARM_MATH_ARGUMENT_ERROR; 00102 } 00103 else 00104 { 00105 00106 /* The algorithm implementation is based on the lengths of the inputs. */ 00107 /* srcB is always made to slide across srcA. */ 00108 /* So srcBLen is always considered as shorter or equal to srcALen */ 00109 if(srcALen >= srcBLen) 00110 { 00111 /* Initialization of inputA pointer */ 00112 pIn1 = pSrcA; 00113 00114 /* Initialization of inputB pointer */ 00115 pIn2 = pSrcB; 00116 } 00117 else 00118 { 00119 /* Initialization of inputA pointer */ 00120 pIn1 = pSrcB; 00121 00122 /* Initialization of inputB pointer */ 00123 pIn2 = pSrcA; 00124 00125 /* srcBLen is always considered as shorter or equal to srcALen */ 00126 j = srcBLen; 00127 srcBLen = srcALen; 00128 srcALen = j; 00129 } 00130 00131 /* Conditions to check which loopCounter holds 00132 * the first and last indices of the output samples to be calculated. */ 00133 check = firstIndex + numPoints; 00134 blockSize3 = ((int32_t) check - (int32_t) srcALen); 00135 blockSize3 = (blockSize3 > 0) ? blockSize3 : 0; 00136 blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); 00137 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 : 00138 (int32_t) numPoints) : 0; 00139 blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + 00140 (int32_t) firstIndex); 00141 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; 00142 00143 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00144 /* The function is internally 00145 * divided into three stages according to the number of multiplications that has to be 00146 * taken place between inputA samples and inputB samples. In the first stage of the 00147 * algorithm, the multiplications increase by one for every iteration. 00148 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00149 * In the third stage of the algorithm, the multiplications decrease by one 00150 * for every iteration. */ 00151 00152 /* Set the output pointer to point to the firstIndex 00153 * of the output sample to be calculated. */ 00154 pOut = pDst + firstIndex; 00155 00156 /* -------------------------- 00157 * Initializations of stage1 00158 * -------------------------*/ 00159 00160 /* sum = x[0] * y[0] 00161 * sum = x[0] * y[1] + x[1] * y[0] 00162 * .... 00163 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00164 */ 00165 00166 /* In this stage the MAC operations are increased by 1 for every iteration. 00167 The count variable holds the number of MAC operations performed. 00168 Since the partial convolution starts from firstIndex 00169 Number of Macs to be performed is firstIndex + 1 */ 00170 count = 1u + firstIndex; 00171 00172 /* Working pointer of inputA */ 00173 px = pIn1; 00174 00175 /* Working pointer of inputB */ 00176 pSrc2 = pIn2 + firstIndex; 00177 py = pSrc2; 00178 00179 /* ------------------------ 00180 * Stage1 process 00181 * ----------------------*/ 00182 00183 /* For loop unrolling by 4, this stage is divided into two. */ 00184 /* First part of this stage computes the MAC operations less than 4 */ 00185 /* Second part of this stage computes the MAC operations greater than or equal to 4 */ 00186 00187 /* The first part of the stage starts here */ 00188 while((count < 4u) && (blockSize1 > 0)) 00189 { 00190 /* Accumulator is made zero for every iteration */ 00191 sum = 0; 00192 00193 /* Loop over number of MAC operations between 00194 * inputA samples and inputB samples */ 00195 k = count; 00196 00197 while(k > 0u) 00198 { 00199 /* Perform the multiply-accumulates */ 00200 sum = __SMLALD(*px++, *py--, sum); 00201 00202 /* Decrement the loop counter */ 00203 k--; 00204 } 00205 00206 /* Store the result in the accumulator in the destination buffer. */ 00207 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 00208 00209 /* Update the inputA and inputB pointers for next MAC calculation */ 00210 py = ++pSrc2; 00211 px = pIn1; 00212 00213 /* Increment the MAC count */ 00214 count++; 00215 00216 /* Decrement the loop counter */ 00217 blockSize1--; 00218 } 00219 00220 /* The second part of the stage starts here */ 00221 /* The internal loop, over count, is unrolled by 4 */ 00222 /* To, read the last two inputB samples using SIMD: 00223 * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ 00224 py = py - 1; 00225 00226 while(blockSize1 > 0) 00227 { 00228 /* Accumulator is made zero for every iteration */ 00229 sum = 0; 00230 00231 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00232 k = count >> 2u; 00233 00234 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00235 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00236 while(k > 0u) 00237 { 00238 /* Perform the multiply-accumulates */ 00239 /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */ 00240 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00241 /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */ 00242 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00243 00244 /* Decrement the loop counter */ 00245 k--; 00246 } 00247 00248 /* For the next MAC operations, the pointer py is used without SIMD 00249 * So, py is incremented by 1 */ 00250 py = py + 1u; 00251 00252 /* If the count is not a multiple of 4, compute any remaining MACs here. 00253 ** No loop unrolling is used. */ 00254 k = count % 0x4u; 00255 00256 while(k > 0u) 00257 { 00258 /* Perform the multiply-accumulates */ 00259 sum = __SMLALD(*px++, *py--, sum); 00260 00261 /* Decrement the loop counter */ 00262 k--; 00263 } 00264 00265 /* Store the result in the accumulator in the destination buffer. */ 00266 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 00267 00268 /* Update the inputA and inputB pointers for next MAC calculation */ 00269 py = ++pSrc2 - 1u; 00270 px = pIn1; 00271 00272 /* Increment the MAC count */ 00273 count++; 00274 00275 /* Decrement the loop counter */ 00276 blockSize1--; 00277 } 00278 00279 /* -------------------------- 00280 * Initializations of stage2 00281 * ------------------------*/ 00282 00283 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00284 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00285 * .... 00286 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00287 */ 00288 00289 /* Working pointer of inputA */ 00290 px = pIn1; 00291 00292 /* Working pointer of inputB */ 00293 pSrc2 = pIn2 + (srcBLen - 1u); 00294 py = pSrc2; 00295 00296 /* count is the index by which the pointer pIn1 to be incremented */ 00297 count = 0u; 00298 00299 00300 /* -------------------- 00301 * Stage2 process 00302 * -------------------*/ 00303 00304 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00305 * So, to loop unroll over blockSize2, 00306 * srcBLen should be greater than or equal to 4 */ 00307 if(srcBLen >= 4u) 00308 { 00309 /* Loop unroll over blockSize2, by 4 */ 00310 blkCnt = blockSize2 >> 2u; 00311 00312 while(blkCnt > 0u) 00313 { 00314 py = py - 1u; 00315 00316 /* Set all accumulators to zero */ 00317 acc0 = 0; 00318 acc1 = 0; 00319 acc2 = 0; 00320 acc3 = 0; 00321 00322 00323 /* read x[0], x[1] samples */ 00324 x0 = *__SIMD32(px); 00325 /* read x[1], x[2] samples */ 00326 x1 = _SIMD32_OFFSET(px+1); 00327 px+= 2u; 00328 00329 00330 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00331 k = srcBLen >> 2u; 00332 00333 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00334 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00335 do 00336 { 00337 /* Read the last two inputB samples using SIMD: 00338 * y[srcBLen - 1] and y[srcBLen - 2] */ 00339 c0 = *__SIMD32(py)--; 00340 00341 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ 00342 acc0 = __SMLALDX(x0, c0, acc0); 00343 00344 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ 00345 acc1 = __SMLALDX(x1, c0, acc1); 00346 00347 /* Read x[2], x[3] */ 00348 x2 = *__SIMD32(px); 00349 00350 /* Read x[3], x[4] */ 00351 x3 = _SIMD32_OFFSET(px+1); 00352 00353 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ 00354 acc2 = __SMLALDX(x2, c0, acc2); 00355 00356 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ 00357 acc3 = __SMLALDX(x3, c0, acc3); 00358 00359 /* Read y[srcBLen - 3] and y[srcBLen - 4] */ 00360 c0 = *__SIMD32(py)--; 00361 00362 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ 00363 acc0 = __SMLALDX(x2, c0, acc0); 00364 00365 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ 00366 acc1 = __SMLALDX(x3, c0, acc1); 00367 00368 /* Read x[4], x[5] */ 00369 x0 = _SIMD32_OFFSET(px+2); 00370 00371 /* Read x[5], x[6] */ 00372 x1 = _SIMD32_OFFSET(px+3); 00373 px += 4u; 00374 00375 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ 00376 acc2 = __SMLALDX(x0, c0, acc2); 00377 00378 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ 00379 acc3 = __SMLALDX(x1, c0, acc3); 00380 00381 } while(--k); 00382 00383 /* For the next MAC operations, SIMD is not used 00384 * So, the 16 bit pointer if inputB, py is updated */ 00385 00386 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00387 ** No loop unrolling is used. */ 00388 k = srcBLen % 0x4u; 00389 00390 if(k == 1u) 00391 { 00392 /* Read y[srcBLen - 5] */ 00393 c0 = *(py+1); 00394 00395 #ifdef ARM_MATH_BIG_ENDIAN 00396 00397 c0 = c0 << 16u; 00398 00399 #else 00400 00401 c0 = c0 & 0x0000FFFF; 00402 00403 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 00404 00405 /* Read x[7] */ 00406 x3 = *__SIMD32(px); 00407 px++; 00408 00409 /* Perform the multiply-accumulates */ 00410 acc0 = __SMLALD(x0, c0, acc0); 00411 acc1 = __SMLALD(x1, c0, acc1); 00412 acc2 = __SMLALDX(x1, c0, acc2); 00413 acc3 = __SMLALDX(x3, c0, acc3); 00414 } 00415 00416 if(k == 2u) 00417 { 00418 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 00419 c0 = _SIMD32_OFFSET(py); 00420 00421 /* Read x[7], x[8] */ 00422 x3 = *__SIMD32(px); 00423 00424 /* Read x[9] */ 00425 x2 = _SIMD32_OFFSET(px+1); 00426 px += 2u; 00427 00428 /* Perform the multiply-accumulates */ 00429 acc0 = __SMLALDX(x0, c0, acc0); 00430 acc1 = __SMLALDX(x1, c0, acc1); 00431 acc2 = __SMLALDX(x3, c0, acc2); 00432 acc3 = __SMLALDX(x2, c0, acc3); 00433 } 00434 00435 if(k == 3u) 00436 { 00437 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 00438 c0 = _SIMD32_OFFSET(py); 00439 00440 /* Read x[7], x[8] */ 00441 x3 = *__SIMD32(px); 00442 00443 /* Read x[9] */ 00444 x2 = _SIMD32_OFFSET(px+1); 00445 00446 /* Perform the multiply-accumulates */ 00447 acc0 = __SMLALDX(x0, c0, acc0); 00448 acc1 = __SMLALDX(x1, c0, acc1); 00449 acc2 = __SMLALDX(x3, c0, acc2); 00450 acc3 = __SMLALDX(x2, c0, acc3); 00451 00452 c0 = *(py-1); 00453 00454 #ifdef ARM_MATH_BIG_ENDIAN 00455 00456 c0 = c0 << 16u; 00457 #else 00458 00459 c0 = c0 & 0x0000FFFF; 00460 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 00461 00462 /* Read x[10] */ 00463 x3 = _SIMD32_OFFSET(px+2); 00464 px += 3u; 00465 00466 /* Perform the multiply-accumulates */ 00467 acc0 = __SMLALDX(x1, c0, acc0); 00468 acc1 = __SMLALD(x2, c0, acc1); 00469 acc2 = __SMLALDX(x2, c0, acc2); 00470 acc3 = __SMLALDX(x3, c0, acc3); 00471 } 00472 00473 00474 /* Store the results in the accumulators in the destination buffer. */ 00475 00476 #ifndef ARM_MATH_BIG_ENDIAN 00477 00478 *__SIMD32(pOut)++ = 00479 __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); 00480 *__SIMD32(pOut)++ = 00481 __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); 00482 00483 #else 00484 00485 *__SIMD32(pOut)++ = 00486 __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16); 00487 *__SIMD32(pOut)++ = 00488 __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16); 00489 00490 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00491 00492 /* Increment the pointer pIn1 index, count by 4 */ 00493 count += 4u; 00494 00495 /* Update the inputA and inputB pointers for next MAC calculation */ 00496 px = pIn1 + count; 00497 py = pSrc2; 00498 00499 /* Decrement the loop counter */ 00500 blkCnt--; 00501 } 00502 00503 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00504 ** No loop unrolling is used. */ 00505 blkCnt = (uint32_t) blockSize2 % 0x4u; 00506 00507 while(blkCnt > 0u) 00508 { 00509 /* Accumulator is made zero for every iteration */ 00510 sum = 0; 00511 00512 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00513 k = srcBLen >> 2u; 00514 00515 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00516 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00517 while(k > 0u) 00518 { 00519 /* Perform the multiply-accumulates */ 00520 sum += (q63_t) ((q31_t) * px++ * *py--); 00521 sum += (q63_t) ((q31_t) * px++ * *py--); 00522 sum += (q63_t) ((q31_t) * px++ * *py--); 00523 sum += (q63_t) ((q31_t) * px++ * *py--); 00524 00525 /* Decrement the loop counter */ 00526 k--; 00527 } 00528 00529 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00530 ** No loop unrolling is used. */ 00531 k = srcBLen % 0x4u; 00532 00533 while(k > 0u) 00534 { 00535 /* Perform the multiply-accumulates */ 00536 sum += (q63_t) ((q31_t) * px++ * *py--); 00537 00538 /* Decrement the loop counter */ 00539 k--; 00540 } 00541 00542 /* Store the result in the accumulator in the destination buffer. */ 00543 *pOut++ = (q15_t) (__SSAT(sum >> 15, 16)); 00544 00545 /* Increment the pointer pIn1 index, count by 1 */ 00546 count++; 00547 00548 /* Update the inputA and inputB pointers for next MAC calculation */ 00549 px = pIn1 + count; 00550 py = pSrc2; 00551 00552 /* Decrement the loop counter */ 00553 blkCnt--; 00554 } 00555 } 00556 else 00557 { 00558 /* If the srcBLen is not a multiple of 4, 00559 * the blockSize2 loop cannot be unrolled by 4 */ 00560 blkCnt = (uint32_t) blockSize2; 00561 00562 while(blkCnt > 0u) 00563 { 00564 /* Accumulator is made zero for every iteration */ 00565 sum = 0; 00566 00567 /* srcBLen number of MACS should be performed */ 00568 k = srcBLen; 00569 00570 while(k > 0u) 00571 { 00572 /* Perform the multiply-accumulate */ 00573 sum += (q63_t) ((q31_t) * px++ * *py--); 00574 00575 /* Decrement the loop counter */ 00576 k--; 00577 } 00578 00579 /* Store the result in the accumulator in the destination buffer. */ 00580 *pOut++ = (q15_t) (__SSAT(sum >> 15, 16)); 00581 00582 /* Increment the MAC count */ 00583 count++; 00584 00585 /* Update the inputA and inputB pointers for next MAC calculation */ 00586 px = pIn1 + count; 00587 py = pSrc2; 00588 00589 /* Decrement the loop counter */ 00590 blkCnt--; 00591 } 00592 } 00593 00594 00595 /* -------------------------- 00596 * Initializations of stage3 00597 * -------------------------*/ 00598 00599 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00600 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00601 * .... 00602 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00603 * sum += x[srcALen-1] * y[srcBLen-1] 00604 */ 00605 00606 /* In this stage the MAC operations are decreased by 1 for every iteration. 00607 The count variable holds the number of MAC operations performed */ 00608 count = srcBLen - 1u; 00609 00610 /* Working pointer of inputA */ 00611 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00612 px = pSrc1; 00613 00614 /* Working pointer of inputB */ 00615 pSrc2 = pIn2 + (srcBLen - 1u); 00616 pIn2 = pSrc2 - 1u; 00617 py = pIn2; 00618 00619 /* ------------------- 00620 * Stage3 process 00621 * ------------------*/ 00622 00623 /* For loop unrolling by 4, this stage is divided into two. */ 00624 /* First part of this stage computes the MAC operations greater than 4 */ 00625 /* Second part of this stage computes the MAC operations less than or equal to 4 */ 00626 00627 /* The first part of the stage starts here */ 00628 j = count >> 2u; 00629 00630 while((j > 0u) && (blockSize3 > 0)) 00631 { 00632 /* Accumulator is made zero for every iteration */ 00633 sum = 0; 00634 00635 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00636 k = count >> 2u; 00637 00638 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00639 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00640 while(k > 0u) 00641 { 00642 /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied 00643 * with y[srcBLen - 1], y[srcBLen - 2] respectively */ 00644 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00645 /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied 00646 * with y[srcBLen - 3], y[srcBLen - 4] respectively */ 00647 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00648 00649 /* Decrement the loop counter */ 00650 k--; 00651 } 00652 00653 /* For the next MAC operations, the pointer py is used without SIMD 00654 * So, py is incremented by 1 */ 00655 py = py + 1u; 00656 00657 /* If the count is not a multiple of 4, compute any remaining MACs here. 00658 ** No loop unrolling is used. */ 00659 k = count % 0x4u; 00660 00661 while(k > 0u) 00662 { 00663 /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */ 00664 sum = __SMLALD(*px++, *py--, sum); 00665 00666 /* Decrement the loop counter */ 00667 k--; 00668 } 00669 00670 /* Store the result in the accumulator in the destination buffer. */ 00671 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 00672 00673 /* Update the inputA and inputB pointers for next MAC calculation */ 00674 px = ++pSrc1; 00675 py = pIn2; 00676 00677 /* Decrement the MAC count */ 00678 count--; 00679 00680 /* Decrement the loop counter */ 00681 blockSize3--; 00682 00683 j--; 00684 } 00685 00686 /* The second part of the stage starts here */ 00687 /* SIMD is not used for the next MAC operations, 00688 * so pointer py is updated to read only one sample at a time */ 00689 py = py + 1u; 00690 00691 while(blockSize3 > 0) 00692 { 00693 /* Accumulator is made zero for every iteration */ 00694 sum = 0; 00695 00696 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00697 k = count; 00698 00699 while(k > 0u) 00700 { 00701 /* Perform the multiply-accumulates */ 00702 /* sum += x[srcALen-1] * y[srcBLen-1] */ 00703 sum = __SMLALD(*px++, *py--, sum); 00704 00705 /* Decrement the loop counter */ 00706 k--; 00707 } 00708 00709 /* Store the result in the accumulator in the destination buffer. */ 00710 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 00711 00712 /* Update the inputA and inputB pointers for next MAC calculation */ 00713 px = ++pSrc1; 00714 py = pSrc2; 00715 00716 /* Decrement the MAC count */ 00717 count--; 00718 00719 /* Decrement the loop counter */ 00720 blockSize3--; 00721 } 00722 00723 /* set status as ARM_MATH_SUCCESS */ 00724 status = ARM_MATH_SUCCESS; 00725 } 00726 00727 /* Return to application */ 00728 return (status); 00729 00730 #else 00731 00732 /* Run the below code for Cortex-M0 */ 00733 00734 q15_t *pIn1 = pSrcA; /* inputA pointer */ 00735 q15_t *pIn2 = pSrcB; /* inputB pointer */ 00736 q63_t sum; /* Accumulator */ 00737 uint32_t i, j; /* loop counters */ 00738 arm_status status; /* status of Partial convolution */ 00739 00740 /* Check for range of output samples to be calculated */ 00741 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00742 { 00743 /* Set status as ARM_ARGUMENT_ERROR */ 00744 status = ARM_MATH_ARGUMENT_ERROR; 00745 } 00746 else 00747 { 00748 /* Loop to calculate convolution for output length number of values */ 00749 for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++) 00750 { 00751 /* Initialize sum with zero to carry on MAC operations */ 00752 sum = 0; 00753 00754 /* Loop to perform MAC operations according to convolution equation */ 00755 for (j = 0; j <= i; j++) 00756 { 00757 /* Check the array limitations */ 00758 if(((i - j) < srcBLen) && (j < srcALen)) 00759 { 00760 /* z[i] += x[i-j] * y[j] */ 00761 sum += ((q31_t) pIn1[j] * (pIn2[i - j])); 00762 } 00763 } 00764 00765 /* Store the output in the destination buffer */ 00766 pDst[i] = (q15_t) __SSAT((sum >> 15u), 16u); 00767 } 00768 /* set status as ARM_SUCCESS as there are no argument errors */ 00769 status = ARM_MATH_SUCCESS; 00770 } 00771 return (status); 00772 00773 #endif /* #if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) */ 00774 00775 } 00776 00777 /** 00778 * @} end of PartialConv group 00779 */
Generated on Tue Jul 12 2022 12:36:54 by 1.7.2