Important changes to repositories hosted on mbed.com
Mbed hosted mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software, download the repository Zip archive or clone it locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
arm_conv_partial_q7.c
00001 /* ---------------------------------------------------------------------- 00002 * Project: CMSIS DSP Library 00003 * Title: arm_conv_partial_q7.c 00004 * Description: Partial convolution of Q7 sequences 00005 * 00006 * $Date: 27. January 2017 00007 * $Revision: V.1.5.1 00008 * 00009 * Target Processor: Cortex-M cores 00010 * -------------------------------------------------------------------- */ 00011 /* 00012 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. 00013 * 00014 * SPDX-License-Identifier: Apache-2.0 00015 * 00016 * Licensed under the Apache License, Version 2.0 (the License); you may 00017 * not use this file except in compliance with the License. 00018 * You may obtain a copy of the License at 00019 * 00020 * www.apache.org/licenses/LICENSE-2.0 00021 * 00022 * Unless required by applicable law or agreed to in writing, software 00023 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 00024 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00025 * See the License for the specific language governing permissions and 00026 * limitations under the License. 00027 */ 00028 00029 #include "arm_math.h" 00030 00031 /** 00032 * @ingroup groupFilters 00033 */ 00034 00035 /** 00036 * @addtogroup PartialConv 00037 * @{ 00038 */ 00039 00040 /** 00041 * @brief Partial convolution of Q7 sequences. 00042 * @param[in] *pSrcA points to the first input sequence. 00043 * @param[in] srcALen length of the first input sequence. 00044 * @param[in] *pSrcB points to the second input sequence. 00045 * @param[in] srcBLen length of the second input sequence. 00046 * @param[out] *pDst points to the location where the output result is written. 00047 * @param[in] firstIndex is the first output sample to start with. 00048 * @param[in] numPoints is the number of output points to be computed. 
00049 * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. 00050 * 00051 * \par 00052 * Refer the function <code>arm_conv_partial_opt_q7()</code> for a faster implementation of this function. 00053 * 00054 */ 00055 00056 arm_status arm_conv_partial_q7( 00057 q7_t * pSrcA, 00058 uint32_t srcALen, 00059 q7_t * pSrcB, 00060 uint32_t srcBLen, 00061 q7_t * pDst, 00062 uint32_t firstIndex, 00063 uint32_t numPoints) 00064 { 00065 00066 00067 #if defined (ARM_MATH_DSP) 00068 00069 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00070 00071 q7_t *pIn1; /* inputA pointer */ 00072 q7_t *pIn2; /* inputB pointer */ 00073 q7_t *pOut = pDst; /* output pointer */ 00074 q7_t *px; /* Intermediate inputA pointer */ 00075 q7_t *py; /* Intermediate inputB pointer */ 00076 q7_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00077 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00078 q31_t input1, input2; 00079 q15_t in1, in2; 00080 q7_t x0, x1, x2, x3, c0, c1; 00081 uint32_t j, k, count, check, blkCnt; 00082 int32_t blockSize1, blockSize2, blockSize3; /* loop counter */ 00083 arm_status status; 00084 00085 00086 /* Check for range of output samples to be calculated */ 00087 if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U)))) 00088 { 00089 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00090 status = ARM_MATH_ARGUMENT_ERROR; 00091 } 00092 else 00093 { 00094 00095 /* The algorithm implementation is based on the lengths of the inputs. */ 00096 /* srcB is always made to slide across srcA. 
*/ 00097 /* So srcBLen is always considered as shorter or equal to srcALen */ 00098 if (srcALen >= srcBLen) 00099 { 00100 /* Initialization of inputA pointer */ 00101 pIn1 = pSrcA; 00102 00103 /* Initialization of inputB pointer */ 00104 pIn2 = pSrcB; 00105 } 00106 else 00107 { 00108 /* Initialization of inputA pointer */ 00109 pIn1 = pSrcB; 00110 00111 /* Initialization of inputB pointer */ 00112 pIn2 = pSrcA; 00113 00114 /* srcBLen is always considered as shorter or equal to srcALen */ 00115 j = srcBLen; 00116 srcBLen = srcALen; 00117 srcALen = j; 00118 } 00119 00120 /* Conditions to check which loopCounter holds 00121 * the first and last indices of the output samples to be calculated. */ 00122 check = firstIndex + numPoints; 00123 blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0; 00124 blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3; 00125 blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); 00126 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : 00127 (int32_t) numPoints) : 0; 00128 blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + 00129 (int32_t) firstIndex); 00130 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; 00131 00132 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00133 /* The function is internally 00134 * divided into three stages according to the number of multiplications that has to be 00135 * taken place between inputA samples and inputB samples. In the first stage of the 00136 * algorithm, the multiplications increase by one for every iteration. 00137 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00138 * In the third stage of the algorithm, the multiplications decrease by one 00139 * for every iteration. 
*/ 00140 00141 /* Set the output pointer to point to the firstIndex 00142 * of the output sample to be calculated. */ 00143 pOut = pDst + firstIndex; 00144 00145 /* -------------------------- 00146 * Initializations of stage1 00147 * -------------------------*/ 00148 00149 /* sum = x[0] * y[0] 00150 * sum = x[0] * y[1] + x[1] * y[0] 00151 * .... 00152 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00153 */ 00154 00155 /* In this stage the MAC operations are increased by 1 for every iteration. 00156 The count variable holds the number of MAC operations performed. 00157 Since the partial convolution starts from from firstIndex 00158 Number of Macs to be performed is firstIndex + 1 */ 00159 count = 1U + firstIndex; 00160 00161 /* Working pointer of inputA */ 00162 px = pIn1; 00163 00164 /* Working pointer of inputB */ 00165 pSrc2 = pIn2 + firstIndex; 00166 py = pSrc2; 00167 00168 /* ------------------------ 00169 * Stage1 process 00170 * ----------------------*/ 00171 00172 /* The first stage starts here */ 00173 while (blockSize1 > 0) 00174 { 00175 /* Accumulator is made zero for every iteration */ 00176 sum = 0; 00177 00178 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00179 k = count >> 2U; 00180 00181 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00182 ** a second loop below computes MACs for the remaining 1 to 3 samples. 
*/ 00183 while (k > 0U) 00184 { 00185 /* x[0] , x[1] */ 00186 in1 = (q15_t) * px++; 00187 in2 = (q15_t) * px++; 00188 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00189 00190 /* y[srcBLen - 1] , y[srcBLen - 2] */ 00191 in1 = (q15_t) * py--; 00192 in2 = (q15_t) * py--; 00193 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00194 00195 /* x[0] * y[srcBLen - 1] */ 00196 /* x[1] * y[srcBLen - 2] */ 00197 sum = __SMLAD(input1, input2, sum); 00198 00199 /* x[2] , x[3] */ 00200 in1 = (q15_t) * px++; 00201 in2 = (q15_t) * px++; 00202 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00203 00204 /* y[srcBLen - 3] , y[srcBLen - 4] */ 00205 in1 = (q15_t) * py--; 00206 in2 = (q15_t) * py--; 00207 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00208 00209 /* x[2] * y[srcBLen - 3] */ 00210 /* x[3] * y[srcBLen - 4] */ 00211 sum = __SMLAD(input1, input2, sum); 00212 00213 /* Decrement the loop counter */ 00214 k--; 00215 } 00216 00217 /* If the count is not a multiple of 4, compute any remaining MACs here. 00218 ** No loop unrolling is used. */ 00219 k = count % 0x4U; 00220 00221 while (k > 0U) 00222 { 00223 /* Perform the multiply-accumulates */ 00224 sum += ((q31_t) * px++ * *py--); 00225 00226 /* Decrement the loop counter */ 00227 k--; 00228 } 00229 00230 /* Store the result in the accumulator in the destination buffer. */ 00231 *pOut++ = (q7_t) (__SSAT(sum >> 7, 8)); 00232 00233 /* Update the inputA and inputB pointers for next MAC calculation */ 00234 py = ++pSrc2; 00235 px = pIn1; 00236 00237 /* Increment the MAC count */ 00238 count++; 00239 00240 /* Decrement the loop counter */ 00241 blockSize1--; 00242 } 00243 00244 /* -------------------------- 00245 * Initializations of stage2 00246 * ------------------------*/ 00247 00248 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00249 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00250 * .... 
00251 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00252 */ 00253 00254 /* Working pointer of inputA */ 00255 if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) 00256 { 00257 px = pIn1 + firstIndex - srcBLen + 1; 00258 } 00259 else 00260 { 00261 px = pIn1; 00262 } 00263 00264 /* Working pointer of inputB */ 00265 pSrc2 = pIn2 + (srcBLen - 1U); 00266 py = pSrc2; 00267 00268 /* count is index by which the pointer pIn1 to be incremented */ 00269 count = 0U; 00270 00271 /* ------------------- 00272 * Stage2 process 00273 * ------------------*/ 00274 00275 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00276 * So, to loop unroll over blockSize2, 00277 * srcBLen should be greater than or equal to 4 */ 00278 if (srcBLen >= 4U) 00279 { 00280 /* Loop unroll over blockSize2, by 4 */ 00281 blkCnt = ((uint32_t) blockSize2 >> 2U); 00282 00283 while (blkCnt > 0U) 00284 { 00285 /* Set all accumulators to zero */ 00286 acc0 = 0; 00287 acc1 = 0; 00288 acc2 = 0; 00289 acc3 = 0; 00290 00291 /* read x[0], x[1], x[2] samples */ 00292 x0 = *(px++); 00293 x1 = *(px++); 00294 x2 = *(px++); 00295 00296 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00297 k = srcBLen >> 2U; 00298 00299 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00300 ** a second loop below computes MACs for the remaining 1 to 3 samples. 
*/ 00301 do 00302 { 00303 /* Read y[srcBLen - 1] sample */ 00304 c0 = *(py--); 00305 /* Read y[srcBLen - 2] sample */ 00306 c1 = *(py--); 00307 00308 /* Read x[3] sample */ 00309 x3 = *(px++); 00310 00311 /* x[0] and x[1] are packed */ 00312 in1 = (q15_t) x0; 00313 in2 = (q15_t) x1; 00314 00315 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00316 00317 /* y[srcBLen - 1] and y[srcBLen - 2] are packed */ 00318 in1 = (q15_t) c0; 00319 in2 = (q15_t) c1; 00320 00321 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00322 00323 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ 00324 acc0 = __SMLAD(input1, input2, acc0); 00325 00326 /* x[1] and x[2] are packed */ 00327 in1 = (q15_t) x1; 00328 in2 = (q15_t) x2; 00329 00330 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00331 00332 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ 00333 acc1 = __SMLAD(input1, input2, acc1); 00334 00335 /* x[2] and x[3] are packed */ 00336 in1 = (q15_t) x2; 00337 in2 = (q15_t) x3; 00338 00339 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00340 00341 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ 00342 acc2 = __SMLAD(input1, input2, acc2); 00343 00344 /* Read x[4] sample */ 00345 x0 = *(px++); 00346 00347 /* x[3] and x[4] are packed */ 00348 in1 = (q15_t) x3; 00349 in2 = (q15_t) x0; 00350 00351 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00352 00353 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ 00354 acc3 = __SMLAD(input1, input2, acc3); 00355 00356 /* Read y[srcBLen - 3] sample */ 00357 c0 = *(py--); 00358 /* Read y[srcBLen - 4] sample */ 00359 c1 = *(py--); 00360 00361 /* Read x[5] sample */ 00362 x1 = *(px++); 00363 00364 /* x[2] and x[3] are packed */ 00365 in1 = (q15_t) x2; 00366 in2 = (q15_t) x3; 00367 00368 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00369 00370 /* y[srcBLen - 3] and y[srcBLen - 4] are packed */ 00371 in1 = (q15_t) c0; 00372 in2 = (q15_t) c1; 00373 
00374 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00375 00376 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ 00377 acc0 = __SMLAD(input1, input2, acc0); 00378 00379 /* x[3] and x[4] are packed */ 00380 in1 = (q15_t) x3; 00381 in2 = (q15_t) x0; 00382 00383 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00384 00385 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ 00386 acc1 = __SMLAD(input1, input2, acc1); 00387 00388 /* x[4] and x[5] are packed */ 00389 in1 = (q15_t) x0; 00390 in2 = (q15_t) x1; 00391 00392 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00393 00394 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ 00395 acc2 = __SMLAD(input1, input2, acc2); 00396 00397 /* Read x[6] sample */ 00398 x2 = *(px++); 00399 00400 /* x[5] and x[6] are packed */ 00401 in1 = (q15_t) x1; 00402 in2 = (q15_t) x2; 00403 00404 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00405 00406 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ 00407 acc3 = __SMLAD(input1, input2, acc3); 00408 00409 } while (--k); 00410 00411 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00412 ** No loop unrolling is used. */ 00413 k = srcBLen % 0x4U; 00414 00415 while (k > 0U) 00416 { 00417 /* Read y[srcBLen - 5] sample */ 00418 c0 = *(py--); 00419 00420 /* Read x[7] sample */ 00421 x3 = *(px++); 00422 00423 /* Perform the multiply-accumulates */ 00424 /* acc0 += x[4] * y[srcBLen - 5] */ 00425 acc0 += ((q31_t) x0 * c0); 00426 /* acc1 += x[5] * y[srcBLen - 5] */ 00427 acc1 += ((q31_t) x1 * c0); 00428 /* acc2 += x[6] * y[srcBLen - 5] */ 00429 acc2 += ((q31_t) x2 * c0); 00430 /* acc3 += x[7] * y[srcBLen - 5] */ 00431 acc3 += ((q31_t) x3 * c0); 00432 00433 /* Reuse the present samples for the next MAC */ 00434 x0 = x1; 00435 x1 = x2; 00436 x2 = x3; 00437 00438 /* Decrement the loop counter */ 00439 k--; 00440 } 00441 00442 /* Store the result in the accumulator in the destination buffer. 
*/ 00443 *pOut++ = (q7_t) (__SSAT(acc0 >> 7, 8)); 00444 *pOut++ = (q7_t) (__SSAT(acc1 >> 7, 8)); 00445 *pOut++ = (q7_t) (__SSAT(acc2 >> 7, 8)); 00446 *pOut++ = (q7_t) (__SSAT(acc3 >> 7, 8)); 00447 00448 /* Increment the pointer pIn1 index, count by 4 */ 00449 count += 4U; 00450 00451 /* Update the inputA and inputB pointers for next MAC calculation */ 00452 if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) 00453 { 00454 px = pIn1 + firstIndex - srcBLen + 1 + count; 00455 } 00456 else 00457 { 00458 px = pIn1 + count; 00459 } 00460 py = pSrc2; 00461 00462 00463 /* Decrement the loop counter */ 00464 blkCnt--; 00465 } 00466 00467 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00468 ** No loop unrolling is used. */ 00469 blkCnt = (uint32_t) blockSize2 % 0x4U; 00470 00471 while (blkCnt > 0U) 00472 { 00473 /* Accumulator is made zero for every iteration */ 00474 sum = 0; 00475 00476 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00477 k = srcBLen >> 2U; 00478 00479 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00480 ** a second loop below computes MACs for the remaining 1 to 3 samples. 
*/ 00481 while (k > 0U) 00482 { 00483 00484 /* Reading two inputs of SrcA buffer and packing */ 00485 in1 = (q15_t) * px++; 00486 in2 = (q15_t) * px++; 00487 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00488 00489 /* Reading two inputs of SrcB buffer and packing */ 00490 in1 = (q15_t) * py--; 00491 in2 = (q15_t) * py--; 00492 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00493 00494 /* Perform the multiply-accumulates */ 00495 sum = __SMLAD(input1, input2, sum); 00496 00497 /* Reading two inputs of SrcA buffer and packing */ 00498 in1 = (q15_t) * px++; 00499 in2 = (q15_t) * px++; 00500 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00501 00502 /* Reading two inputs of SrcB buffer and packing */ 00503 in1 = (q15_t) * py--; 00504 in2 = (q15_t) * py--; 00505 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00506 00507 /* Perform the multiply-accumulates */ 00508 sum = __SMLAD(input1, input2, sum); 00509 00510 /* Decrement the loop counter */ 00511 k--; 00512 } 00513 00514 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00515 ** No loop unrolling is used. */ 00516 k = srcBLen % 0x4U; 00517 00518 while (k > 0U) 00519 { 00520 /* Perform the multiply-accumulates */ 00521 sum += ((q31_t) * px++ * *py--); 00522 00523 /* Decrement the loop counter */ 00524 k--; 00525 } 00526 00527 /* Store the result in the accumulator in the destination buffer. 
*/ 00528 *pOut++ = (q7_t) (__SSAT(sum >> 7, 8)); 00529 00530 /* Increment the pointer pIn1 index, count by 1 */ 00531 count++; 00532 00533 /* Update the inputA and inputB pointers for next MAC calculation */ 00534 if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) 00535 { 00536 px = pIn1 + firstIndex - srcBLen + 1 + count; 00537 } 00538 else 00539 { 00540 px = pIn1 + count; 00541 } 00542 py = pSrc2; 00543 00544 /* Decrement the loop counter */ 00545 blkCnt--; 00546 } 00547 } 00548 else 00549 { 00550 /* If the srcBLen is not a multiple of 4, 00551 * the blockSize2 loop cannot be unrolled by 4 */ 00552 blkCnt = (uint32_t) blockSize2; 00553 00554 while (blkCnt > 0U) 00555 { 00556 /* Accumulator is made zero for every iteration */ 00557 sum = 0; 00558 00559 /* srcBLen number of MACS should be performed */ 00560 k = srcBLen; 00561 00562 while (k > 0U) 00563 { 00564 /* Perform the multiply-accumulate */ 00565 sum += ((q31_t) * px++ * *py--); 00566 00567 /* Decrement the loop counter */ 00568 k--; 00569 } 00570 00571 /* Store the result in the accumulator in the destination buffer. */ 00572 *pOut++ = (q7_t) (__SSAT(sum >> 7, 8)); 00573 00574 /* Increment the MAC count */ 00575 count++; 00576 00577 /* Update the inputA and inputB pointers for next MAC calculation */ 00578 if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) 00579 { 00580 px = pIn1 + firstIndex - srcBLen + 1 + count; 00581 } 00582 else 00583 { 00584 px = pIn1 + count; 00585 } 00586 py = pSrc2; 00587 00588 /* Decrement the loop counter */ 00589 blkCnt--; 00590 } 00591 } 00592 00593 00594 /* -------------------------- 00595 * Initializations of stage3 00596 * -------------------------*/ 00597 00598 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00599 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00600 * .... 
00601 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00602 * sum += x[srcALen-1] * y[srcBLen-1] 00603 */ 00604 00605 /* In this stage the MAC operations are decreased by 1 for every iteration. 00606 The count variable holds the number of MAC operations performed */ 00607 count = srcBLen - 1U; 00608 00609 /* Working pointer of inputA */ 00610 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U); 00611 px = pSrc1; 00612 00613 /* Working pointer of inputB */ 00614 pSrc2 = pIn2 + (srcBLen - 1U); 00615 py = pSrc2; 00616 00617 /* ------------------- 00618 * Stage3 process 00619 * ------------------*/ 00620 00621 while (blockSize3 > 0) 00622 { 00623 /* Accumulator is made zero for every iteration */ 00624 sum = 0; 00625 00626 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00627 k = count >> 2U; 00628 00629 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00630 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00631 while (k > 0U) 00632 { 00633 /* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */ 00634 in1 = (q15_t) * px++; 00635 in2 = (q15_t) * px++; 00636 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00637 00638 /* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */ 00639 in1 = (q15_t) * py--; 00640 in2 = (q15_t) * py--; 00641 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00642 00643 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ 00644 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ 00645 sum = __SMLAD(input1, input2, sum); 00646 00647 /* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */ 00648 in1 = (q15_t) * px++; 00649 in2 = (q15_t) * px++; 00650 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00651 00652 /* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */ 00653 in1 = 
(q15_t) * py--; 00654 in2 = (q15_t) * py--; 00655 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00656 00657 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ 00658 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ 00659 sum = __SMLAD(input1, input2, sum); 00660 00661 /* Decrement the loop counter */ 00662 k--; 00663 } 00664 00665 /* If the count is not a multiple of 4, compute any remaining MACs here. 00666 ** No loop unrolling is used. */ 00667 k = count % 0x4U; 00668 00669 while (k > 0U) 00670 { 00671 /* Perform the multiply-accumulates */ 00672 /* sum += x[srcALen-1] * y[srcBLen-1] */ 00673 sum += ((q31_t) * px++ * *py--); 00674 00675 /* Decrement the loop counter */ 00676 k--; 00677 } 00678 00679 /* Store the result in the accumulator in the destination buffer. */ 00680 *pOut++ = (q7_t) (__SSAT(sum >> 7, 8)); 00681 00682 /* Update the inputA and inputB pointers for next MAC calculation */ 00683 px = ++pSrc1; 00684 py = pSrc2; 00685 00686 /* Decrement the MAC count */ 00687 count--; 00688 00689 /* Decrement the loop counter */ 00690 blockSize3--; 00691 00692 } 00693 00694 /* set status as ARM_MATH_SUCCESS */ 00695 status = ARM_MATH_SUCCESS; 00696 } 00697 00698 /* Return to application */ 00699 return (status); 00700 00701 #else 00702 00703 /* Run the below code for Cortex-M0 */ 00704 00705 q7_t *pIn1 = pSrcA; /* inputA pointer */ 00706 q7_t *pIn2 = pSrcB; /* inputB pointer */ 00707 q31_t sum; /* Accumulator */ 00708 uint32_t i, j; /* loop counters */ 00709 arm_status status; /* status of Partial convolution */ 00710 00711 /* Check for range of output samples to be calculated */ 00712 if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U)))) 00713 { 00714 /* Set status as ARM_ARGUMENT_ERROR */ 00715 status = ARM_MATH_ARGUMENT_ERROR; 00716 } 00717 else 00718 { 00719 /* Loop to calculate convolution for output length number of values */ 00720 for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++) 00721 { 00722 /* Initialize sum with 
zero to carry on MAC operations */ 00723 sum = 0; 00724 00725 /* Loop to perform MAC operations according to convolution equation */ 00726 for (j = 0; j <= i; j++) 00727 { 00728 /* Check the array limitations */ 00729 if (((i - j) < srcBLen) && (j < srcALen)) 00730 { 00731 /* z[i] += x[i-j] * y[j] */ 00732 sum += ((q15_t) pIn1[j] * (pIn2[i - j])); 00733 } 00734 } 00735 00736 /* Store the output in the destination buffer */ 00737 pDst[i] = (q7_t) __SSAT((sum >> 7U), 8U); 00738 } 00739 /* set status as ARM_SUCCESS as there are no argument errors */ 00740 status = ARM_MATH_SUCCESS; 00741 } 00742 return (status); 00743 00744 #endif /* #if defined (ARM_MATH_DSP) */ 00745 00746 } 00747 00748 /** 00749 * @} end of PartialConv group 00750 */ 00751
Generated on Tue Jul 12 2022 16:46:23 by
 1.7.2
 1.7.2