Added CMSIS5 DSP and NN folders. Needs some work.
Embed:
(wiki syntax)
Show/hide line numbers
arm_conv_partial_q31.c
00001 /* ---------------------------------------------------------------------- 00002 * Project: CMSIS DSP Library 00003 * Title: arm_conv_partial_q31.c 00004 * Description: Partial convolution of Q31 sequences 00005 * 00006 * $Date: 27. January 2017 00007 * $Revision: V.1.5.1 00008 * 00009 * Target Processor: Cortex-M cores 00010 * -------------------------------------------------------------------- */ 00011 /* 00012 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. 00013 * 00014 * SPDX-License-Identifier: Apache-2.0 00015 * 00016 * Licensed under the Apache License, Version 2.0 (the License); you may 00017 * not use this file except in compliance with the License. 00018 * You may obtain a copy of the License at 00019 * 00020 * www.apache.org/licenses/LICENSE-2.0 00021 * 00022 * Unless required by applicable law or agreed to in writing, software 00023 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 00024 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00025 * See the License for the specific language governing permissions and 00026 * limitations under the License. 00027 */ 00028 00029 #include "arm_math.h" 00030 00031 /** 00032 * @ingroup groupFilters 00033 */ 00034 00035 /** 00036 * @addtogroup PartialConv 00037 * @{ 00038 */ 00039 00040 /** 00041 * @brief Partial convolution of Q31 sequences. 00042 * @param[in] *pSrcA points to the first input sequence. 00043 * @param[in] srcALen length of the first input sequence. 00044 * @param[in] *pSrcB points to the second input sequence. 00045 * @param[in] srcBLen length of the second input sequence. 00046 * @param[out] *pDst points to the location where the output result is written. 00047 * @param[in] firstIndex is the first output sample to start with. 00048 * @param[in] numPoints is the number of output points to be computed. 
00049 * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. 00050 * 00051 * See <code>arm_conv_partial_fast_q31()</code> for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4. 00052 */ 00053 00054 arm_status arm_conv_partial_q31( 00055 q31_t * pSrcA, 00056 uint32_t srcALen, 00057 q31_t * pSrcB, 00058 uint32_t srcBLen, 00059 q31_t * pDst, 00060 uint32_t firstIndex, 00061 uint32_t numPoints) 00062 { 00063 00064 00065 #if defined (ARM_MATH_DSP) 00066 00067 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00068 00069 q31_t *pIn1; /* inputA pointer */ 00070 q31_t *pIn2; /* inputB pointer */ 00071 q31_t *pOut = pDst; /* output pointer */ 00072 q31_t *px; /* Intermediate inputA pointer */ 00073 q31_t *py; /* Intermediate inputB pointer */ 00074 q31_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00075 q63_t sum, acc0, acc1, acc2; /* Accumulator */ 00076 q31_t x0, x1, x2, c0; 00077 uint32_t j, k, count, check, blkCnt; 00078 int32_t blockSize1, blockSize2, blockSize3; /* loop counter */ 00079 arm_status status; /* status of Partial convolution */ 00080 00081 00082 /* Check for range of output samples to be calculated */ 00083 if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U)))) 00084 { 00085 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00086 status = ARM_MATH_ARGUMENT_ERROR; 00087 } 00088 else 00089 { 00090 00091 /* The algorithm implementation is based on the lengths of the inputs. */ 00092 /* srcB is always made to slide across srcA. 
*/ 00093 /* So srcBLen is always considered as shorter or equal to srcALen */ 00094 if (srcALen >= srcBLen) 00095 { 00096 /* Initialization of inputA pointer */ 00097 pIn1 = pSrcA; 00098 00099 /* Initialization of inputB pointer */ 00100 pIn2 = pSrcB; 00101 } 00102 else 00103 { 00104 /* Initialization of inputA pointer */ 00105 pIn1 = pSrcB; 00106 00107 /* Initialization of inputB pointer */ 00108 pIn2 = pSrcA; 00109 00110 /* srcBLen is always considered as shorter or equal to srcALen */ 00111 j = srcBLen; 00112 srcBLen = srcALen; 00113 srcALen = j; 00114 } 00115 00116 /* Conditions to check which loopCounter holds 00117 * the first and last indices of the output samples to be calculated. */ 00118 check = firstIndex + numPoints; 00119 blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0; 00120 blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3; 00121 blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); 00122 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : 00123 (int32_t) numPoints) : 0; 00124 blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + 00125 (int32_t) firstIndex); 00126 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; 00127 00128 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00129 /* The function is internally 00130 * divided into three stages according to the number of multiplications that has to be 00131 * taken place between inputA samples and inputB samples. In the first stage of the 00132 * algorithm, the multiplications increase by one for every iteration. 00133 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00134 * In the third stage of the algorithm, the multiplications decrease by one 00135 * for every iteration. 
*/ 00136 00137 /* Set the output pointer to point to the firstIndex 00138 * of the output sample to be calculated. */ 00139 pOut = pDst + firstIndex; 00140 00141 /* -------------------------- 00142 * Initializations of stage1 00143 * -------------------------*/ 00144 00145 /* sum = x[0] * y[0] 00146 * sum = x[0] * y[1] + x[1] * y[0] 00147 * .... 00148 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00149 */ 00150 00151 /* In this stage the MAC operations are increased by 1 for every iteration. 00152 The count variable holds the number of MAC operations performed. 00153 Since the partial convolution starts from firstIndex 00154 Number of Macs to be performed is firstIndex + 1 */ 00155 count = 1U + firstIndex; 00156 00157 /* Working pointer of inputA */ 00158 px = pIn1; 00159 00160 /* Working pointer of inputB */ 00161 pSrc2 = pIn2 + firstIndex; 00162 py = pSrc2; 00163 00164 /* ------------------------ 00165 * Stage1 process 00166 * ----------------------*/ 00167 00168 /* The first loop starts here */ 00169 while (blockSize1 > 0) 00170 { 00171 /* Accumulator is made zero for every iteration */ 00172 sum = 0; 00173 00174 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00175 k = count >> 2U; 00176 00177 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00178 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00179 while (k > 0U) 00180 { 00181 /* x[0] * y[srcBLen - 1] */ 00182 sum += (q63_t) * px++ * (*py--); 00183 /* x[1] * y[srcBLen - 2] */ 00184 sum += (q63_t) * px++ * (*py--); 00185 /* x[2] * y[srcBLen - 3] */ 00186 sum += (q63_t) * px++ * (*py--); 00187 /* x[3] * y[srcBLen - 4] */ 00188 sum += (q63_t) * px++ * (*py--); 00189 00190 /* Decrement the loop counter */ 00191 k--; 00192 } 00193 00194 /* If the count is not a multiple of 4, compute any remaining MACs here. 00195 ** No loop unrolling is used. 
*/ 00196 k = count % 0x4U; 00197 00198 while (k > 0U) 00199 { 00200 /* Perform the multiply-accumulate */ 00201 sum += (q63_t) * px++ * (*py--); 00202 00203 /* Decrement the loop counter */ 00204 k--; 00205 } 00206 00207 /* Store the result in the accumulator in the destination buffer. */ 00208 *pOut++ = (q31_t) (sum >> 31); 00209 00210 /* Update the inputA and inputB pointers for next MAC calculation */ 00211 py = ++pSrc2; 00212 px = pIn1; 00213 00214 /* Increment the MAC count */ 00215 count++; 00216 00217 /* Decrement the loop counter */ 00218 blockSize1--; 00219 } 00220 00221 /* -------------------------- 00222 * Initializations of stage2 00223 * ------------------------*/ 00224 00225 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00226 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00227 * .... 00228 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00229 */ 00230 00231 /* Working pointer of inputA */ 00232 if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) 00233 { 00234 px = pIn1 + firstIndex - srcBLen + 1; 00235 } 00236 else 00237 { 00238 px = pIn1; 00239 } 00240 00241 /* Working pointer of inputB */ 00242 pSrc2 = pIn2 + (srcBLen - 1U); 00243 py = pSrc2; 00244 00245 /* count is index by which the pointer pIn1 to be incremented */ 00246 count = 0U; 00247 00248 /* ------------------- 00249 * Stage2 process 00250 * ------------------*/ 00251 00252 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 
00253 * So, to loop unroll over blockSize2, 00254 * srcBLen should be greater than or equal to 4 */ 00255 if (srcBLen >= 4U) 00256 { 00257 /* Loop unroll over blkCnt */ 00258 00259 blkCnt = blockSize2 / 3; 00260 while (blkCnt > 0U) 00261 { 00262 /* Set all accumulators to zero */ 00263 acc0 = 0; 00264 acc1 = 0; 00265 acc2 = 0; 00266 00267 /* read x[0], x[1] samples */ 00268 x0 = *(px++); 00269 x1 = *(px++); 00270 00271 /* Apply loop unrolling and compute 3 MACs simultaneously. */ 00272 k = srcBLen / 3; 00273 00274 /* First part of the processing with loop unrolling. Compute 3 MACs at a time. 00275 ** a second loop below computes MACs for the remaining 1 to 2 samples. */ 00276 do 00277 { 00278 /* Read y[srcBLen - 1] sample */ 00279 c0 = *(py); 00280 00281 /* Read x[2] sample */ 00282 x2 = *(px); 00283 00284 /* Perform the multiply-accumulates */ 00285 /* acc0 += x[0] * y[srcBLen - 1] */ 00286 acc0 += (q63_t) x0 *c0; 00287 /* acc1 += x[1] * y[srcBLen - 1] */ 00288 acc1 += (q63_t) x1 *c0; 00289 /* acc2 += x[2] * y[srcBLen - 1] */ 00290 acc2 += (q63_t) x2 *c0; 00291 00292 /* Read y[srcBLen - 2] sample */ 00293 c0 = *(py - 1U); 00294 00295 /* Read x[3] sample */ 00296 x0 = *(px + 1U); 00297 00298 /* Perform the multiply-accumulate */ 00299 /* acc0 += x[1] * y[srcBLen - 2] */ 00300 acc0 += (q63_t) x1 *c0; 00301 /* acc1 += x[2] * y[srcBLen - 2] */ 00302 acc1 += (q63_t) x2 *c0; 00303 /* acc2 += x[3] * y[srcBLen - 2] */ 00304 acc2 += (q63_t) x0 *c0; 00305 00306 /* Read y[srcBLen - 3] sample */ 00307 c0 = *(py - 2U); 00308 00309 /* Read x[4] sample */ 00310 x1 = *(px + 2U); 00311 00312 /* Perform the multiply-accumulates */ 00313 /* acc0 += x[2] * y[srcBLen - 3] */ 00314 acc0 += (q63_t) x2 *c0; 00315 /* acc1 += x[3] * y[srcBLen - 2] */ 00316 acc1 += (q63_t) x0 *c0; 00317 /* acc2 += x[4] * y[srcBLen - 2] */ 00318 acc2 += (q63_t) x1 *c0; 00319 00320 00321 px += 3U; 00322 00323 py -= 3U; 00324 00325 } while (--k); 00326 00327 /* If the srcBLen is not a multiple of 3, compute 
any remaining MACs here. 00328 ** No loop unrolling is used. */ 00329 k = srcBLen - (3 * (srcBLen / 3)); 00330 00331 while (k > 0U) 00332 { 00333 /* Read y[srcBLen - 5] sample */ 00334 c0 = *(py--); 00335 00336 /* Read x[7] sample */ 00337 x2 = *(px++); 00338 00339 /* Perform the multiply-accumulates */ 00340 /* acc0 += x[4] * y[srcBLen - 5] */ 00341 acc0 += (q63_t) x0 *c0; 00342 /* acc1 += x[5] * y[srcBLen - 5] */ 00343 acc1 += (q63_t) x1 *c0; 00344 /* acc2 += x[6] * y[srcBLen - 5] */ 00345 acc2 += (q63_t) x2 *c0; 00346 00347 /* Reuse the present samples for the next MAC */ 00348 x0 = x1; 00349 x1 = x2; 00350 00351 /* Decrement the loop counter */ 00352 k--; 00353 } 00354 00355 /* Store the result in the accumulator in the destination buffer. */ 00356 *pOut++ = (q31_t) (acc0 >> 31); 00357 *pOut++ = (q31_t) (acc1 >> 31); 00358 *pOut++ = (q31_t) (acc2 >> 31); 00359 00360 /* Increment the pointer pIn1 index, count by 3 */ 00361 count += 3U; 00362 00363 /* Update the inputA and inputB pointers for next MAC calculation */ 00364 if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) 00365 { 00366 px = pIn1 + firstIndex - srcBLen + 1 + count; 00367 } 00368 else 00369 { 00370 px = pIn1 + count; 00371 } 00372 py = pSrc2; 00373 00374 /* Decrement the loop counter */ 00375 blkCnt--; 00376 } 00377 00378 /* If the blockSize2 is not a multiple of 3, compute any remaining output samples here. 00379 ** No loop unrolling is used. */ 00380 blkCnt = blockSize2 - 3 * (blockSize2 / 3); 00381 00382 while (blkCnt > 0U) 00383 { 00384 /* Accumulator is made zero for every iteration */ 00385 sum = 0; 00386 00387 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00388 k = srcBLen >> 2U; 00389 00390 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00391 ** a second loop below computes MACs for the remaining 1 to 3 samples. 
*/ 00392 while (k > 0U) 00393 { 00394 /* Perform the multiply-accumulates */ 00395 sum += (q63_t) * px++ * (*py--); 00396 sum += (q63_t) * px++ * (*py--); 00397 sum += (q63_t) * px++ * (*py--); 00398 sum += (q63_t) * px++ * (*py--); 00399 00400 /* Decrement the loop counter */ 00401 k--; 00402 } 00403 00404 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00405 ** No loop unrolling is used. */ 00406 k = srcBLen % 0x4U; 00407 00408 while (k > 0U) 00409 { 00410 /* Perform the multiply-accumulate */ 00411 sum += (q63_t) * px++ * (*py--); 00412 00413 /* Decrement the loop counter */ 00414 k--; 00415 } 00416 00417 /* Store the result in the accumulator in the destination buffer. */ 00418 *pOut++ = (q31_t) (sum >> 31); 00419 00420 /* Increment the MAC count */ 00421 count++; 00422 00423 /* Update the inputA and inputB pointers for next MAC calculation */ 00424 if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) 00425 { 00426 px = pIn1 + firstIndex - srcBLen + 1 + count; 00427 } 00428 else 00429 { 00430 px = pIn1 + count; 00431 } 00432 py = pSrc2; 00433 00434 /* Decrement the loop counter */ 00435 blkCnt--; 00436 } 00437 } 00438 else 00439 { 00440 /* If the srcBLen is not a multiple of 4, 00441 * the blockSize2 loop cannot be unrolled by 4 */ 00442 blkCnt = (uint32_t) blockSize2; 00443 00444 while (blkCnt > 0U) 00445 { 00446 /* Accumulator is made zero for every iteration */ 00447 sum = 0; 00448 00449 /* srcBLen number of MACS should be performed */ 00450 k = srcBLen; 00451 00452 while (k > 0U) 00453 { 00454 /* Perform the multiply-accumulate */ 00455 sum += (q63_t) * px++ * (*py--); 00456 00457 /* Decrement the loop counter */ 00458 k--; 00459 } 00460 00461 /* Store the result in the accumulator in the destination buffer. 
*/ 00462 *pOut++ = (q31_t) (sum >> 31); 00463 00464 /* Increment the MAC count */ 00465 count++; 00466 00467 /* Update the inputA and inputB pointers for next MAC calculation */ 00468 if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) 00469 { 00470 px = pIn1 + firstIndex - srcBLen + 1 + count; 00471 } 00472 else 00473 { 00474 px = pIn1 + count; 00475 } 00476 py = pSrc2; 00477 00478 /* Decrement the loop counter */ 00479 blkCnt--; 00480 } 00481 } 00482 00483 00484 /* -------------------------- 00485 * Initializations of stage3 00486 * -------------------------*/ 00487 00488 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00489 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00490 * .... 00491 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00492 * sum += x[srcALen-1] * y[srcBLen-1] 00493 */ 00494 00495 /* In this stage the MAC operations are decreased by 1 for every iteration. 00496 The blockSize3 variable holds the number of MAC operations performed */ 00497 count = srcBLen - 1U; 00498 00499 /* Working pointer of inputA */ 00500 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U); 00501 px = pSrc1; 00502 00503 /* Working pointer of inputB */ 00504 pSrc2 = pIn2 + (srcBLen - 1U); 00505 py = pSrc2; 00506 00507 /* ------------------- 00508 * Stage3 process 00509 * ------------------*/ 00510 00511 while (blockSize3 > 0) 00512 { 00513 /* Accumulator is made zero for every iteration */ 00514 sum = 0; 00515 00516 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00517 k = count >> 2U; 00518 00519 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00520 ** a second loop below computes MACs for the remaining 1 to 3 samples. 
*/ 00521 while (k > 0U) 00522 { 00523 sum += (q63_t) * px++ * (*py--); 00524 sum += (q63_t) * px++ * (*py--); 00525 sum += (q63_t) * px++ * (*py--); 00526 sum += (q63_t) * px++ * (*py--); 00527 00528 /* Decrement the loop counter */ 00529 k--; 00530 } 00531 00532 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. 00533 ** No loop unrolling is used. */ 00534 k = count % 0x4U; 00535 00536 while (k > 0U) 00537 { 00538 /* Perform the multiply-accumulate */ 00539 sum += (q63_t) * px++ * (*py--); 00540 00541 /* Decrement the loop counter */ 00542 k--; 00543 } 00544 00545 /* Store the result in the accumulator in the destination buffer. */ 00546 *pOut++ = (q31_t) (sum >> 31); 00547 00548 /* Update the inputA and inputB pointers for next MAC calculation */ 00549 px = ++pSrc1; 00550 py = pSrc2; 00551 00552 /* Decrement the MAC count */ 00553 count--; 00554 00555 /* Decrement the loop counter */ 00556 blockSize3--; 00557 00558 } 00559 00560 /* set status as ARM_MATH_SUCCESS */ 00561 status = ARM_MATH_SUCCESS; 00562 } 00563 00564 /* Return to application */ 00565 return (status); 00566 00567 #else 00568 00569 /* Run the below code for Cortex-M0 */ 00570 00571 q31_t *pIn1 = pSrcA; /* inputA pointer */ 00572 q31_t *pIn2 = pSrcB; /* inputB pointer */ 00573 q63_t sum; /* Accumulator */ 00574 uint32_t i, j; /* loop counters */ 00575 arm_status status; /* status of Partial convolution */ 00576 00577 /* Check for range of output samples to be calculated */ 00578 if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U)))) 00579 { 00580 /* Set status as ARM_ARGUMENT_ERROR */ 00581 status = ARM_MATH_ARGUMENT_ERROR; 00582 } 00583 else 00584 { 00585 /* Loop to calculate convolution for output length number of values */ 00586 for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++) 00587 { 00588 /* Initialize sum with zero to carry on MAC operations */ 00589 sum = 0; 00590 00591 /* Loop to perform MAC operations according to convolution equation */ 00592 
for (j = 0; j <= i; j++) 00593 { 00594 /* Check the array limitations */ 00595 if (((i - j) < srcBLen) && (j < srcALen)) 00596 { 00597 /* z[i] += x[i-j] * y[j] */ 00598 sum += ((q63_t) pIn1[j] * (pIn2[i - j])); 00599 } 00600 } 00601 00602 /* Store the output in the destination buffer */ 00603 pDst[i] = (q31_t) (sum >> 31U); 00604 } 00605 /* set status as ARM_SUCCESS as there are no argument errors */ 00606 status = ARM_MATH_SUCCESS; 00607 } 00608 return (status); 00609 00610 #endif /* #if defined (ARM_MATH_DSP) */ 00611 00612 } 00613 00614 /** 00615 * @} end of PartialConv group 00616 */ 00617
Generated on Tue Jul 12 2022 16:46:23 by 1.7.2