CMSIS DSP library
Dependents: KL25Z_FFT_Demo Hat_Board_v5_1 KL25Z_FFT_Demo_tony KL25Z_FFT_Demo_tony ... more
Fork of mbed-dsp by
arm_conv_partial_q31.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 17. January 2013 00005 * $Revision: V1.4.1 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_partial_q31.c 00009 * 00010 * Description: Partial convolution of Q31 sequences. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupFilters 00045 */ 00046 00047 /** 00048 * @addtogroup PartialConv 00049 * @{ 00050 */ 00051 00052 /** 00053 * @brief Partial convolution of Q31 sequences. 00054 * @param[in] *pSrcA points to the first input sequence. 00055 * @param[in] srcALen length of the first input sequence. 00056 * @param[in] *pSrcB points to the second input sequence. 00057 * @param[in] srcBLen length of the second input sequence. 00058 * @param[out] *pDst points to the location where the output result is written. 00059 * @param[in] firstIndex is the first output sample to start with. 00060 * @param[in] numPoints is the number of output points to be computed. 00061 * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. 00062 * 00063 * See <code>arm_conv_partial_fast_q31()</code> for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4. 00064 */ 00065 00066 arm_status arm_conv_partial_q31( 00067 q31_t * pSrcA, 00068 uint32_t srcALen, 00069 q31_t * pSrcB, 00070 uint32_t srcBLen, 00071 q31_t * pDst, 00072 uint32_t firstIndex, 00073 uint32_t numPoints) 00074 { 00075 00076 00077 #ifndef ARM_MATH_CM0_FAMILY 00078 00079 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00080 00081 q31_t *pIn1; /* inputA pointer */ 00082 q31_t *pIn2; /* inputB pointer */ 00083 q31_t *pOut = pDst; /* output pointer */ 00084 q31_t *px; /* Intermediate inputA pointer */ 00085 q31_t *py; /* Intermediate inputB pointer */ 00086 q31_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00087 q63_t sum, acc0, acc1, acc2; /* Accumulator */ 00088 q31_t x0, x1, x2, c0; 00089 uint32_t j, k, count, check, blkCnt; 00090 int32_t blockSize1, blockSize2, blockSize3; /* loop counter */ 00091 arm_status status; /* status of Partial convolution */ 00092 00093 00094 /* Check for range of output samples to be calculated */ 00095 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00096 { 00097 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00098 status = ARM_MATH_ARGUMENT_ERROR; 00099 } 00100 else 00101 { 00102 00103 /* The algorithm implementation is based on the lengths of the inputs. */ 00104 /* srcB is always made to slide across srcA. */ 00105 /* So srcBLen is always considered as shorter or equal to srcALen */ 00106 if(srcALen >= srcBLen) 00107 { 00108 /* Initialization of inputA pointer */ 00109 pIn1 = pSrcA; 00110 00111 /* Initialization of inputB pointer */ 00112 pIn2 = pSrcB; 00113 } 00114 else 00115 { 00116 /* Initialization of inputA pointer */ 00117 pIn1 = pSrcB; 00118 00119 /* Initialization of inputB pointer */ 00120 pIn2 = pSrcA; 00121 00122 /* srcBLen is always considered as shorter or equal to srcALen */ 00123 j = srcBLen; 00124 srcBLen = srcALen; 00125 srcALen = j; 00126 } 00127 00128 /* Conditions to check which loopCounter holds 00129 * the first and last indices of the output samples to be calculated. */ 00130 check = firstIndex + numPoints; 00131 blockSize3 = ((int32_t) check - (int32_t) srcALen); 00132 blockSize3 = (blockSize3 > 0) ? blockSize3 : 0; 00133 blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); 00134 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 : 00135 (int32_t) numPoints) : 0; 00136 blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + 00137 (int32_t) firstIndex); 00138 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; 00139 00140 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00141 /* The function is internally 00142 * divided into three stages according to the number of multiplications that has to be 00143 * taken place between inputA samples and inputB samples. In the first stage of the 00144 * algorithm, the multiplications increase by one for every iteration. 00145 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00146 * In the third stage of the algorithm, the multiplications decrease by one 00147 * for every iteration. */ 00148 00149 /* Set the output pointer to point to the firstIndex 00150 * of the output sample to be calculated. */ 00151 pOut = pDst + firstIndex; 00152 00153 /* -------------------------- 00154 * Initializations of stage1 00155 * -------------------------*/ 00156 00157 /* sum = x[0] * y[0] 00158 * sum = x[0] * y[1] + x[1] * y[0] 00159 * .... 00160 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00161 */ 00162 00163 /* In this stage the MAC operations are increased by 1 for every iteration. 00164 The count variable holds the number of MAC operations performed. 00165 Since the partial convolution starts from firstIndex 00166 Number of Macs to be performed is firstIndex + 1 */ 00167 count = 1u + firstIndex; 00168 00169 /* Working pointer of inputA */ 00170 px = pIn1; 00171 00172 /* Working pointer of inputB */ 00173 pSrc2 = pIn2 + firstIndex; 00174 py = pSrc2; 00175 00176 /* ------------------------ 00177 * Stage1 process 00178 * ----------------------*/ 00179 00180 /* The first loop starts here */ 00181 while(blockSize1 > 0) 00182 { 00183 /* Accumulator is made zero for every iteration */ 00184 sum = 0; 00185 00186 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00187 k = count >> 2u; 00188 00189 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00190 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00191 while(k > 0u) 00192 { 00193 /* x[0] * y[srcBLen - 1] */ 00194 sum += (q63_t) * px++ * (*py--); 00195 /* x[1] * y[srcBLen - 2] */ 00196 sum += (q63_t) * px++ * (*py--); 00197 /* x[2] * y[srcBLen - 3] */ 00198 sum += (q63_t) * px++ * (*py--); 00199 /* x[3] * y[srcBLen - 4] */ 00200 sum += (q63_t) * px++ * (*py--); 00201 00202 /* Decrement the loop counter */ 00203 k--; 00204 } 00205 00206 /* If the count is not a multiple of 4, compute any remaining MACs here. 00207 ** No loop unrolling is used. */ 00208 k = count % 0x4u; 00209 00210 while(k > 0u) 00211 { 00212 /* Perform the multiply-accumulate */ 00213 sum += (q63_t) * px++ * (*py--); 00214 00215 /* Decrement the loop counter */ 00216 k--; 00217 } 00218 00219 /* Store the result in the accumulator in the destination buffer. */ 00220 *pOut++ = (q31_t) (sum >> 31); 00221 00222 /* Update the inputA and inputB pointers for next MAC calculation */ 00223 py = ++pSrc2; 00224 px = pIn1; 00225 00226 /* Increment the MAC count */ 00227 count++; 00228 00229 /* Decrement the loop counter */ 00230 blockSize1--; 00231 } 00232 00233 /* -------------------------- 00234 * Initializations of stage2 00235 * ------------------------*/ 00236 00237 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00238 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00239 * .... 00240 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00241 */ 00242 00243 /* Working pointer of inputA */ 00244 px = pIn1; 00245 00246 /* Working pointer of inputB */ 00247 pSrc2 = pIn2 + (srcBLen - 1u); 00248 py = pSrc2; 00249 00250 /* count is index by which the pointer pIn1 to be incremented */ 00251 count = 0u; 00252 00253 /* ------------------- 00254 * Stage2 process 00255 * ------------------*/ 00256 00257 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00258 * So, to loop unroll over blockSize2, 00259 * srcBLen should be greater than or equal to 4 */ 00260 if(srcBLen >= 4u) 00261 { 00262 /* Loop unroll over blkCnt */ 00263 00264 blkCnt = blockSize2 / 3; 00265 while(blkCnt > 0u) 00266 { 00267 /* Set all accumulators to zero */ 00268 acc0 = 0; 00269 acc1 = 0; 00270 acc2 = 0; 00271 00272 /* read x[0], x[1] samples */ 00273 x0 = *(px++); 00274 x1 = *(px++); 00275 00276 /* Apply loop unrolling and compute 3 MACs simultaneously. */ 00277 k = srcBLen / 3; 00278 00279 /* First part of the processing with loop unrolling. Compute 3 MACs at a time. 00280 ** a second loop below computes MACs for the remaining 1 to 2 samples. */ 00281 do 00282 { 00283 /* Read y[srcBLen - 1] sample */ 00284 c0 = *(py); 00285 00286 /* Read x[2] sample */ 00287 x2 = *(px); 00288 00289 /* Perform the multiply-accumulates */ 00290 /* acc0 += x[0] * y[srcBLen - 1] */ 00291 acc0 += (q63_t) x0 *c0; 00292 /* acc1 += x[1] * y[srcBLen - 1] */ 00293 acc1 += (q63_t) x1 *c0; 00294 /* acc2 += x[2] * y[srcBLen - 1] */ 00295 acc2 += (q63_t) x2 *c0; 00296 00297 /* Read y[srcBLen - 2] sample */ 00298 c0 = *(py - 1u); 00299 00300 /* Read x[3] sample */ 00301 x0 = *(px + 1u); 00302 00303 /* Perform the multiply-accumulate */ 00304 /* acc0 += x[1] * y[srcBLen - 2] */ 00305 acc0 += (q63_t) x1 *c0; 00306 /* acc1 += x[2] * y[srcBLen - 2] */ 00307 acc1 += (q63_t) x2 *c0; 00308 /* acc2 += x[3] * y[srcBLen - 2] */ 00309 acc2 += (q63_t) x0 *c0; 00310 00311 /* Read y[srcBLen - 3] sample */ 00312 c0 = *(py - 2u); 00313 00314 /* Read x[4] sample */ 00315 x1 = *(px + 2u); 00316 00317 /* Perform the multiply-accumulates */ 00318 /* acc0 += x[2] * y[srcBLen - 3] */ 00319 acc0 += (q63_t) x2 *c0; 00320 /* acc1 += x[3] * y[srcBLen - 2] */ 00321 acc1 += (q63_t) x0 *c0; 00322 /* acc2 += x[4] * y[srcBLen - 2] */ 00323 acc2 += (q63_t) x1 *c0; 00324 00325 00326 px += 3u; 00327 00328 py -= 3u; 00329 00330 } while(--k); 00331 00332 /* If the srcBLen is not a multiple of 3, compute any remaining MACs here. 00333 ** No loop unrolling is used. */ 00334 k = srcBLen - (3 * (srcBLen / 3)); 00335 00336 while(k > 0u) 00337 { 00338 /* Read y[srcBLen - 5] sample */ 00339 c0 = *(py--); 00340 00341 /* Read x[7] sample */ 00342 x2 = *(px++); 00343 00344 /* Perform the multiply-accumulates */ 00345 /* acc0 += x[4] * y[srcBLen - 5] */ 00346 acc0 += (q63_t) x0 *c0; 00347 /* acc1 += x[5] * y[srcBLen - 5] */ 00348 acc1 += (q63_t) x1 *c0; 00349 /* acc2 += x[6] * y[srcBLen - 5] */ 00350 acc2 += (q63_t) x2 *c0; 00351 00352 /* Reuse the present samples for the next MAC */ 00353 x0 = x1; 00354 x1 = x2; 00355 00356 /* Decrement the loop counter */ 00357 k--; 00358 } 00359 00360 /* Store the result in the accumulator in the destination buffer. */ 00361 *pOut++ = (q31_t) (acc0 >> 31); 00362 *pOut++ = (q31_t) (acc1 >> 31); 00363 *pOut++ = (q31_t) (acc2 >> 31); 00364 00365 /* Increment the pointer pIn1 index, count by 3 */ 00366 count += 3u; 00367 00368 /* Update the inputA and inputB pointers for next MAC calculation */ 00369 px = pIn1 + count; 00370 py = pSrc2; 00371 00372 /* Decrement the loop counter */ 00373 blkCnt--; 00374 } 00375 00376 /* If the blockSize2 is not a multiple of 3, compute any remaining output samples here. 00377 ** No loop unrolling is used. */ 00378 blkCnt = blockSize2 - 3 * (blockSize2 / 3); 00379 00380 while(blkCnt > 0u) 00381 { 00382 /* Accumulator is made zero for every iteration */ 00383 sum = 0; 00384 00385 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00386 k = srcBLen >> 2u; 00387 00388 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00389 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00390 while(k > 0u) 00391 { 00392 /* Perform the multiply-accumulates */ 00393 sum += (q63_t) * px++ * (*py--); 00394 sum += (q63_t) * px++ * (*py--); 00395 sum += (q63_t) * px++ * (*py--); 00396 sum += (q63_t) * px++ * (*py--); 00397 00398 /* Decrement the loop counter */ 00399 k--; 00400 } 00401 00402 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00403 ** No loop unrolling is used. */ 00404 k = srcBLen % 0x4u; 00405 00406 while(k > 0u) 00407 { 00408 /* Perform the multiply-accumulate */ 00409 sum += (q63_t) * px++ * (*py--); 00410 00411 /* Decrement the loop counter */ 00412 k--; 00413 } 00414 00415 /* Store the result in the accumulator in the destination buffer. */ 00416 *pOut++ = (q31_t) (sum >> 31); 00417 00418 /* Increment the MAC count */ 00419 count++; 00420 00421 /* Update the inputA and inputB pointers for next MAC calculation */ 00422 px = pIn1 + count; 00423 py = pSrc2; 00424 00425 /* Decrement the loop counter */ 00426 blkCnt--; 00427 } 00428 } 00429 else 00430 { 00431 /* If the srcBLen is not a multiple of 4, 00432 * the blockSize2 loop cannot be unrolled by 4 */ 00433 blkCnt = (uint32_t) blockSize2; 00434 00435 while(blkCnt > 0u) 00436 { 00437 /* Accumulator is made zero for every iteration */ 00438 sum = 0; 00439 00440 /* srcBLen number of MACS should be performed */ 00441 k = srcBLen; 00442 00443 while(k > 0u) 00444 { 00445 /* Perform the multiply-accumulate */ 00446 sum += (q63_t) * px++ * (*py--); 00447 00448 /* Decrement the loop counter */ 00449 k--; 00450 } 00451 00452 /* Store the result in the accumulator in the destination buffer. */ 00453 *pOut++ = (q31_t) (sum >> 31); 00454 00455 /* Increment the MAC count */ 00456 count++; 00457 00458 /* Update the inputA and inputB pointers for next MAC calculation */ 00459 px = pIn1 + count; 00460 py = pSrc2; 00461 00462 /* Decrement the loop counter */ 00463 blkCnt--; 00464 } 00465 } 00466 00467 00468 /* -------------------------- 00469 * Initializations of stage3 00470 * -------------------------*/ 00471 00472 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00473 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00474 * .... 00475 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00476 * sum += x[srcALen-1] * y[srcBLen-1] 00477 */ 00478 00479 /* In this stage the MAC operations are decreased by 1 for every iteration. 00480 The blockSize3 variable holds the number of MAC operations performed */ 00481 count = srcBLen - 1u; 00482 00483 /* Working pointer of inputA */ 00484 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00485 px = pSrc1; 00486 00487 /* Working pointer of inputB */ 00488 pSrc2 = pIn2 + (srcBLen - 1u); 00489 py = pSrc2; 00490 00491 /* ------------------- 00492 * Stage3 process 00493 * ------------------*/ 00494 00495 while(blockSize3 > 0) 00496 { 00497 /* Accumulator is made zero for every iteration */ 00498 sum = 0; 00499 00500 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00501 k = count >> 2u; 00502 00503 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00504 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00505 while(k > 0u) 00506 { 00507 sum += (q63_t) * px++ * (*py--); 00508 sum += (q63_t) * px++ * (*py--); 00509 sum += (q63_t) * px++ * (*py--); 00510 sum += (q63_t) * px++ * (*py--); 00511 00512 /* Decrement the loop counter */ 00513 k--; 00514 } 00515 00516 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. 00517 ** No loop unrolling is used. */ 00518 k = count % 0x4u; 00519 00520 while(k > 0u) 00521 { 00522 /* Perform the multiply-accumulate */ 00523 sum += (q63_t) * px++ * (*py--); 00524 00525 /* Decrement the loop counter */ 00526 k--; 00527 } 00528 00529 /* Store the result in the accumulator in the destination buffer. */ 00530 *pOut++ = (q31_t) (sum >> 31); 00531 00532 /* Update the inputA and inputB pointers for next MAC calculation */ 00533 px = ++pSrc1; 00534 py = pSrc2; 00535 00536 /* Decrement the MAC count */ 00537 count--; 00538 00539 /* Decrement the loop counter */ 00540 blockSize3--; 00541 00542 } 00543 00544 /* set status as ARM_MATH_SUCCESS */ 00545 status = ARM_MATH_SUCCESS; 00546 } 00547 00548 /* Return to application */ 00549 return (status); 00550 00551 #else 00552 00553 /* Run the below code for Cortex-M0 */ 00554 00555 q31_t *pIn1 = pSrcA; /* inputA pointer */ 00556 q31_t *pIn2 = pSrcB; /* inputB pointer */ 00557 q63_t sum; /* Accumulator */ 00558 uint32_t i, j; /* loop counters */ 00559 arm_status status; /* status of Partial convolution */ 00560 00561 /* Check for range of output samples to be calculated */ 00562 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00563 { 00564 /* Set status as ARM_ARGUMENT_ERROR */ 00565 status = ARM_MATH_ARGUMENT_ERROR; 00566 } 00567 else 00568 { 00569 /* Loop to calculate convolution for output length number of values */ 00570 for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++) 00571 { 00572 /* Initialize sum with zero to carry on MAC operations */ 00573 sum = 0; 00574 00575 /* Loop to perform MAC operations according to convolution equation */ 00576 for (j = 0; j <= i; j++) 00577 { 00578 /* Check the array limitations */ 00579 if(((i - j) < srcBLen) && (j < srcALen)) 00580 { 00581 /* z[i] += x[i-j] * y[j] */ 00582 sum += ((q63_t) pIn1[j] * (pIn2[i - j])); 00583 } 00584 } 00585 00586 /* Store the output in the destination buffer */ 00587 pDst[i] = (q31_t) (sum >> 31u); 00588 } 00589 /* set status as ARM_SUCCESS as there are no argument errors */ 00590 status = ARM_MATH_SUCCESS; 00591 } 00592 return (status); 00593 00594 #endif /* #ifndef ARM_MATH_CM0_FAMILY */ 00595 00596 } 00597 00598 /** 00599 * @} end of PartialConv group 00600 */
Generated on Tue Jul 12 2022 12:36:54 by 1.7.2