CMSIS DSP library
Dependents: performance_timer Surfboard_ gps2rtty Capstone ... more
arm_conv_partial_q31.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 19. March 2015 00005 * $Revision: V.1.4.5 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_partial_q31.c 00009 * 00010 * Description: Partial convolution of Q31 sequences. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupFilters 00045 */ 00046 00047 /** 00048 * @addtogroup PartialConv 00049 * @{ 00050 */ 00051 00052 /** 00053 * @brief Partial convolution of Q31 sequences. 00054 * @param[in] *pSrcA points to the first input sequence. 00055 * @param[in] srcALen length of the first input sequence. 00056 * @param[in] *pSrcB points to the second input sequence. 00057 * @param[in] srcBLen length of the second input sequence. 00058 * @param[out] *pDst points to the location where the output result is written. 00059 * @param[in] firstIndex is the first output sample to start with. 00060 * @param[in] numPoints is the number of output points to be computed. 00061 * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. 00062 * 00063 * See <code>arm_conv_partial_fast_q31()</code> for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4. 00064 */ 00065 00066 arm_status arm_conv_partial_q31( 00067 q31_t * pSrcA, 00068 uint32_t srcALen, 00069 q31_t * pSrcB, 00070 uint32_t srcBLen, 00071 q31_t * pDst, 00072 uint32_t firstIndex, 00073 uint32_t numPoints) 00074 { 00075 00076 00077 #ifndef ARM_MATH_CM0_FAMILY 00078 00079 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00080 00081 q31_t *pIn1; /* inputA pointer */ 00082 q31_t *pIn2; /* inputB pointer */ 00083 q31_t *pOut = pDst; /* output pointer */ 00084 q31_t *px; /* Intermediate inputA pointer */ 00085 q31_t *py; /* Intermediate inputB pointer */ 00086 q31_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00087 q63_t sum, acc0, acc1, acc2; /* Accumulator */ 00088 q31_t x0, x1, x2, c0; 00089 uint32_t j, k, count, check, blkCnt; 00090 int32_t blockSize1, blockSize2, blockSize3; /* loop counter */ 00091 arm_status status; /* status of Partial convolution */ 00092 00093 00094 /* Check for range of output samples to be calculated */ 00095 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00096 { 00097 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00098 status = ARM_MATH_ARGUMENT_ERROR; 00099 } 00100 else 00101 { 00102 00103 /* The algorithm implementation is based on the lengths of the inputs. */ 00104 /* srcB is always made to slide across srcA. */ 00105 /* So srcBLen is always considered as shorter or equal to srcALen */ 00106 if(srcALen >= srcBLen) 00107 { 00108 /* Initialization of inputA pointer */ 00109 pIn1 = pSrcA; 00110 00111 /* Initialization of inputB pointer */ 00112 pIn2 = pSrcB; 00113 } 00114 else 00115 { 00116 /* Initialization of inputA pointer */ 00117 pIn1 = pSrcB; 00118 00119 /* Initialization of inputB pointer */ 00120 pIn2 = pSrcA; 00121 00122 /* srcBLen is always considered as shorter or equal to srcALen */ 00123 j = srcBLen; 00124 srcBLen = srcALen; 00125 srcALen = j; 00126 } 00127 00128 /* Conditions to check which loopCounter holds 00129 * the first and last indices of the output samples to be calculated. */ 00130 check = firstIndex + numPoints; 00131 blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0; 00132 blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3; 00133 blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); 00134 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 : 00135 (int32_t) numPoints) : 0; 00136 blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + 00137 (int32_t) firstIndex); 00138 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; 00139 00140 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00141 /* The function is internally 00142 * divided into three stages according to the number of multiplications that has to be 00143 * taken place between inputA samples and inputB samples. In the first stage of the 00144 * algorithm, the multiplications increase by one for every iteration. 00145 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00146 * In the third stage of the algorithm, the multiplications decrease by one 00147 * for every iteration. */ 00148 00149 /* Set the output pointer to point to the firstIndex 00150 * of the output sample to be calculated. */ 00151 pOut = pDst + firstIndex; 00152 00153 /* -------------------------- 00154 * Initializations of stage1 00155 * -------------------------*/ 00156 00157 /* sum = x[0] * y[0] 00158 * sum = x[0] * y[1] + x[1] * y[0] 00159 * .... 00160 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00161 */ 00162 00163 /* In this stage the MAC operations are increased by 1 for every iteration. 00164 The count variable holds the number of MAC operations performed. 00165 Since the partial convolution starts from firstIndex 00166 Number of Macs to be performed is firstIndex + 1 */ 00167 count = 1u + firstIndex; 00168 00169 /* Working pointer of inputA */ 00170 px = pIn1; 00171 00172 /* Working pointer of inputB */ 00173 pSrc2 = pIn2 + firstIndex; 00174 py = pSrc2; 00175 00176 /* ------------------------ 00177 * Stage1 process 00178 * ----------------------*/ 00179 00180 /* The first loop starts here */ 00181 while(blockSize1 > 0) 00182 { 00183 /* Accumulator is made zero for every iteration */ 00184 sum = 0; 00185 00186 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00187 k = count >> 2u; 00188 00189 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00190 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00191 while(k > 0u) 00192 { 00193 /* x[0] * y[srcBLen - 1] */ 00194 sum += (q63_t) * px++ * (*py--); 00195 /* x[1] * y[srcBLen - 2] */ 00196 sum += (q63_t) * px++ * (*py--); 00197 /* x[2] * y[srcBLen - 3] */ 00198 sum += (q63_t) * px++ * (*py--); 00199 /* x[3] * y[srcBLen - 4] */ 00200 sum += (q63_t) * px++ * (*py--); 00201 00202 /* Decrement the loop counter */ 00203 k--; 00204 } 00205 00206 /* If the count is not a multiple of 4, compute any remaining MACs here. 00207 ** No loop unrolling is used. */ 00208 k = count % 0x4u; 00209 00210 while(k > 0u) 00211 { 00212 /* Perform the multiply-accumulate */ 00213 sum += (q63_t) * px++ * (*py--); 00214 00215 /* Decrement the loop counter */ 00216 k--; 00217 } 00218 00219 /* Store the result in the accumulator in the destination buffer. */ 00220 *pOut++ = (q31_t) (sum >> 31); 00221 00222 /* Update the inputA and inputB pointers for next MAC calculation */ 00223 py = ++pSrc2; 00224 px = pIn1; 00225 00226 /* Increment the MAC count */ 00227 count++; 00228 00229 /* Decrement the loop counter */ 00230 blockSize1--; 00231 } 00232 00233 /* -------------------------- 00234 * Initializations of stage2 00235 * ------------------------*/ 00236 00237 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00238 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00239 * .... 00240 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00241 */ 00242 00243 /* Working pointer of inputA */ 00244 if((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) 00245 { 00246 px = pIn1 + firstIndex - srcBLen + 1; 00247 } 00248 else 00249 { 00250 px = pIn1; 00251 } 00252 00253 /* Working pointer of inputB */ 00254 pSrc2 = pIn2 + (srcBLen - 1u); 00255 py = pSrc2; 00256 00257 /* count is index by which the pointer pIn1 to be incremented */ 00258 count = 0u; 00259 00260 /* ------------------- 00261 * Stage2 process 00262 * ------------------*/ 00263 00264 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00265 * So, to loop unroll over blockSize2, 00266 * srcBLen should be greater than or equal to 4 */ 00267 if(srcBLen >= 4u) 00268 { 00269 /* Loop unroll over blkCnt */ 00270 00271 blkCnt = blockSize2 / 3; 00272 while(blkCnt > 0u) 00273 { 00274 /* Set all accumulators to zero */ 00275 acc0 = 0; 00276 acc1 = 0; 00277 acc2 = 0; 00278 00279 /* read x[0], x[1] samples */ 00280 x0 = *(px++); 00281 x1 = *(px++); 00282 00283 /* Apply loop unrolling and compute 3 MACs simultaneously. */ 00284 k = srcBLen / 3; 00285 00286 /* First part of the processing with loop unrolling. Compute 3 MACs at a time. 00287 ** a second loop below computes MACs for the remaining 1 to 2 samples. */ 00288 do 00289 { 00290 /* Read y[srcBLen - 1] sample */ 00291 c0 = *(py); 00292 00293 /* Read x[2] sample */ 00294 x2 = *(px); 00295 00296 /* Perform the multiply-accumulates */ 00297 /* acc0 += x[0] * y[srcBLen - 1] */ 00298 acc0 += (q63_t) x0 *c0; 00299 /* acc1 += x[1] * y[srcBLen - 1] */ 00300 acc1 += (q63_t) x1 *c0; 00301 /* acc2 += x[2] * y[srcBLen - 1] */ 00302 acc2 += (q63_t) x2 *c0; 00303 00304 /* Read y[srcBLen - 2] sample */ 00305 c0 = *(py - 1u); 00306 00307 /* Read x[3] sample */ 00308 x0 = *(px + 1u); 00309 00310 /* Perform the multiply-accumulate */ 00311 /* acc0 += x[1] * y[srcBLen - 2] */ 00312 acc0 += (q63_t) x1 *c0; 00313 /* acc1 += x[2] * y[srcBLen - 2] */ 00314 acc1 += (q63_t) x2 *c0; 00315 /* acc2 += x[3] * y[srcBLen - 2] */ 00316 acc2 += (q63_t) x0 *c0; 00317 00318 /* Read y[srcBLen - 3] sample */ 00319 c0 = *(py - 2u); 00320 00321 /* Read x[4] sample */ 00322 x1 = *(px + 2u); 00323 00324 /* Perform the multiply-accumulates */ 00325 /* acc0 += x[2] * y[srcBLen - 3] */ 00326 acc0 += (q63_t) x2 *c0; 00327 /* acc1 += x[3] * y[srcBLen - 2] */ 00328 acc1 += (q63_t) x0 *c0; 00329 /* acc2 += x[4] * y[srcBLen - 2] */ 00330 acc2 += (q63_t) x1 *c0; 00331 00332 00333 px += 3u; 00334 00335 py -= 3u; 00336 00337 } while(--k); 00338 00339 /* If the srcBLen is not a multiple of 3, compute any remaining MACs here. 00340 ** No loop unrolling is used. */ 00341 k = srcBLen - (3 * (srcBLen / 3)); 00342 00343 while(k > 0u) 00344 { 00345 /* Read y[srcBLen - 5] sample */ 00346 c0 = *(py--); 00347 00348 /* Read x[7] sample */ 00349 x2 = *(px++); 00350 00351 /* Perform the multiply-accumulates */ 00352 /* acc0 += x[4] * y[srcBLen - 5] */ 00353 acc0 += (q63_t) x0 *c0; 00354 /* acc1 += x[5] * y[srcBLen - 5] */ 00355 acc1 += (q63_t) x1 *c0; 00356 /* acc2 += x[6] * y[srcBLen - 5] */ 00357 acc2 += (q63_t) x2 *c0; 00358 00359 /* Reuse the present samples for the next MAC */ 00360 x0 = x1; 00361 x1 = x2; 00362 00363 /* Decrement the loop counter */ 00364 k--; 00365 } 00366 00367 /* Store the result in the accumulator in the destination buffer. */ 00368 *pOut++ = (q31_t) (acc0 >> 31); 00369 *pOut++ = (q31_t) (acc1 >> 31); 00370 *pOut++ = (q31_t) (acc2 >> 31); 00371 00372 /* Increment the pointer pIn1 index, count by 3 */ 00373 count += 3u; 00374 00375 /* Update the inputA and inputB pointers for next MAC calculation */ 00376 px = pIn1 + count; 00377 py = pSrc2; 00378 00379 /* Decrement the loop counter */ 00380 blkCnt--; 00381 } 00382 00383 /* If the blockSize2 is not a multiple of 3, compute any remaining output samples here. 00384 ** No loop unrolling is used. */ 00385 blkCnt = blockSize2 - 3 * (blockSize2 / 3); 00386 00387 while(blkCnt > 0u) 00388 { 00389 /* Accumulator is made zero for every iteration */ 00390 sum = 0; 00391 00392 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00393 k = srcBLen >> 2u; 00394 00395 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00396 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00397 while(k > 0u) 00398 { 00399 /* Perform the multiply-accumulates */ 00400 sum += (q63_t) * px++ * (*py--); 00401 sum += (q63_t) * px++ * (*py--); 00402 sum += (q63_t) * px++ * (*py--); 00403 sum += (q63_t) * px++ * (*py--); 00404 00405 /* Decrement the loop counter */ 00406 k--; 00407 } 00408 00409 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00410 ** No loop unrolling is used. */ 00411 k = srcBLen % 0x4u; 00412 00413 while(k > 0u) 00414 { 00415 /* Perform the multiply-accumulate */ 00416 sum += (q63_t) * px++ * (*py--); 00417 00418 /* Decrement the loop counter */ 00419 k--; 00420 } 00421 00422 /* Store the result in the accumulator in the destination buffer. */ 00423 *pOut++ = (q31_t) (sum >> 31); 00424 00425 /* Increment the MAC count */ 00426 count++; 00427 00428 /* Update the inputA and inputB pointers for next MAC calculation */ 00429 px = pIn1 + count; 00430 py = pSrc2; 00431 00432 /* Decrement the loop counter */ 00433 blkCnt--; 00434 } 00435 } 00436 else 00437 { 00438 /* If the srcBLen is not a multiple of 4, 00439 * the blockSize2 loop cannot be unrolled by 4 */ 00440 blkCnt = (uint32_t) blockSize2; 00441 00442 while(blkCnt > 0u) 00443 { 00444 /* Accumulator is made zero for every iteration */ 00445 sum = 0; 00446 00447 /* srcBLen number of MACS should be performed */ 00448 k = srcBLen; 00449 00450 while(k > 0u) 00451 { 00452 /* Perform the multiply-accumulate */ 00453 sum += (q63_t) * px++ * (*py--); 00454 00455 /* Decrement the loop counter */ 00456 k--; 00457 } 00458 00459 /* Store the result in the accumulator in the destination buffer. */ 00460 *pOut++ = (q31_t) (sum >> 31); 00461 00462 /* Increment the MAC count */ 00463 count++; 00464 00465 /* Update the inputA and inputB pointers for next MAC calculation */ 00466 px = pIn1 + count; 00467 py = pSrc2; 00468 00469 /* Decrement the loop counter */ 00470 blkCnt--; 00471 } 00472 } 00473 00474 00475 /* -------------------------- 00476 * Initializations of stage3 00477 * -------------------------*/ 00478 00479 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00480 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00481 * .... 00482 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00483 * sum += x[srcALen-1] * y[srcBLen-1] 00484 */ 00485 00486 /* In this stage the MAC operations are decreased by 1 for every iteration. 00487 The blockSize3 variable holds the number of MAC operations performed */ 00488 count = srcBLen - 1u; 00489 00490 /* Working pointer of inputA */ 00491 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00492 px = pSrc1; 00493 00494 /* Working pointer of inputB */ 00495 pSrc2 = pIn2 + (srcBLen - 1u); 00496 py = pSrc2; 00497 00498 /* ------------------- 00499 * Stage3 process 00500 * ------------------*/ 00501 00502 while(blockSize3 > 0) 00503 { 00504 /* Accumulator is made zero for every iteration */ 00505 sum = 0; 00506 00507 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00508 k = count >> 2u; 00509 00510 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00511 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00512 while(k > 0u) 00513 { 00514 sum += (q63_t) * px++ * (*py--); 00515 sum += (q63_t) * px++ * (*py--); 00516 sum += (q63_t) * px++ * (*py--); 00517 sum += (q63_t) * px++ * (*py--); 00518 00519 /* Decrement the loop counter */ 00520 k--; 00521 } 00522 00523 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. 00524 ** No loop unrolling is used. */ 00525 k = count % 0x4u; 00526 00527 while(k > 0u) 00528 { 00529 /* Perform the multiply-accumulate */ 00530 sum += (q63_t) * px++ * (*py--); 00531 00532 /* Decrement the loop counter */ 00533 k--; 00534 } 00535 00536 /* Store the result in the accumulator in the destination buffer. */ 00537 *pOut++ = (q31_t) (sum >> 31); 00538 00539 /* Update the inputA and inputB pointers for next MAC calculation */ 00540 px = ++pSrc1; 00541 py = pSrc2; 00542 00543 /* Decrement the MAC count */ 00544 count--; 00545 00546 /* Decrement the loop counter */ 00547 blockSize3--; 00548 00549 } 00550 00551 /* set status as ARM_MATH_SUCCESS */ 00552 status = ARM_MATH_SUCCESS; 00553 } 00554 00555 /* Return to application */ 00556 return (status); 00557 00558 #else 00559 00560 /* Run the below code for Cortex-M0 */ 00561 00562 q31_t *pIn1 = pSrcA; /* inputA pointer */ 00563 q31_t *pIn2 = pSrcB; /* inputB pointer */ 00564 q63_t sum; /* Accumulator */ 00565 uint32_t i, j; /* loop counters */ 00566 arm_status status; /* status of Partial convolution */ 00567 00568 /* Check for range of output samples to be calculated */ 00569 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00570 { 00571 /* Set status as ARM_ARGUMENT_ERROR */ 00572 status = ARM_MATH_ARGUMENT_ERROR; 00573 } 00574 else 00575 { 00576 /* Loop to calculate convolution for output length number of values */ 00577 for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++) 00578 { 00579 /* Initialize sum with zero to carry on MAC operations */ 00580 sum = 0; 00581 00582 /* Loop to perform MAC operations according to convolution equation */ 00583 for (j = 0; j <= i; j++) 00584 { 00585 /* Check the array limitations */ 00586 if(((i - j) < srcBLen) && (j < srcALen)) 00587 { 00588 /* z[i] += x[i-j] * y[j] */ 00589 sum += ((q63_t) pIn1[j] * (pIn2[i - j])); 00590 } 00591 } 00592 00593 /* Store the output in the destination buffer */ 00594 pDst[i] = (q31_t) (sum >> 31u); 00595 } 00596 /* set status as ARM_SUCCESS as there are no argument errors */ 00597 status = ARM_MATH_SUCCESS; 00598 } 00599 return (status); 00600 00601 #endif /* #ifndef ARM_MATH_CM0_FAMILY */ 00602 00603 } 00604 00605 /** 00606 * @} end of PartialConv group 00607 */
Generated on Tue Jul 12 2022 11:59:16 by 1.7.2