Important changes to repositories hosted on mbed.com
Mbed-hosted Mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software, download the repository Zip archive or clone it locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
arm_conv_q7.c
00001 /* ---------------------------------------------------------------------- 00002 * Project: CMSIS DSP Library 00003 * Title: arm_conv_q7.c 00004 * Description: Convolution of Q7 sequences 00005 * 00006 * $Date: 27. January 2017 00007 * $Revision: V.1.5.1 00008 * 00009 * Target Processor: Cortex-M cores 00010 * -------------------------------------------------------------------- */ 00011 /* 00012 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. 00013 * 00014 * SPDX-License-Identifier: Apache-2.0 00015 * 00016 * Licensed under the Apache License, Version 2.0 (the License); you may 00017 * not use this file except in compliance with the License. 00018 * You may obtain a copy of the License at 00019 * 00020 * www.apache.org/licenses/LICENSE-2.0 00021 * 00022 * Unless required by applicable law or agreed to in writing, software 00023 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 00024 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00025 * See the License for the specific language governing permissions and 00026 * limitations under the License. 00027 */ 00028 00029 #include "arm_math.h" 00030 00031 /** 00032 * @ingroup groupFilters 00033 */ 00034 00035 /** 00036 * @addtogroup Conv 00037 * @{ 00038 */ 00039 00040 /** 00041 * @brief Convolution of Q7 sequences. 00042 * @param[in] *pSrcA points to the first input sequence. 00043 * @param[in] srcALen length of the first input sequence. 00044 * @param[in] *pSrcB points to the second input sequence. 00045 * @param[in] srcBLen length of the second input sequence. 00046 * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. 00047 * @return none. 00048 * 00049 * @details 00050 * <b>Scaling and Overflow Behavior:</b> 00051 * 00052 * \par 00053 * The function is implemented using a 32-bit internal accumulator. 
00054 * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result. 00055 * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format. 00056 * This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>. 00057 * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format. 00058 * 00059 * \par 00060 * Refer the function <code>arm_conv_opt_q7()</code> for a faster implementation of this function. 00061 * 00062 */ 00063 00064 void arm_conv_q7( 00065 q7_t * pSrcA, 00066 uint32_t srcALen, 00067 q7_t * pSrcB, 00068 uint32_t srcBLen, 00069 q7_t * pDst) 00070 { 00071 00072 00073 #if defined (ARM_MATH_DSP) 00074 00075 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00076 00077 q7_t *pIn1; /* inputA pointer */ 00078 q7_t *pIn2; /* inputB pointer */ 00079 q7_t *pOut = pDst; /* output pointer */ 00080 q7_t *px; /* Intermediate inputA pointer */ 00081 q7_t *py; /* Intermediate inputB pointer */ 00082 q7_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00083 q7_t x0, x1, x2, x3, c0, c1; /* Temporary variables to hold state and coefficient values */ 00084 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00085 q31_t input1, input2; /* Temporary input variables */ 00086 q15_t in1, in2; /* Temporary input variables */ 00087 uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counter */ 00088 00089 /* The algorithm implementation is based on the lengths of the inputs. */ 00090 /* srcB is always made to slide across srcA. 
*/ 00091 /* So srcBLen is always considered as shorter or equal to srcALen */ 00092 if (srcALen >= srcBLen) 00093 { 00094 /* Initialization of inputA pointer */ 00095 pIn1 = pSrcA; 00096 00097 /* Initialization of inputB pointer */ 00098 pIn2 = pSrcB; 00099 } 00100 else 00101 { 00102 /* Initialization of inputA pointer */ 00103 pIn1 = pSrcB; 00104 00105 /* Initialization of inputB pointer */ 00106 pIn2 = pSrcA; 00107 00108 /* srcBLen is always considered as shorter or equal to srcALen */ 00109 j = srcBLen; 00110 srcBLen = srcALen; 00111 srcALen = j; 00112 } 00113 00114 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00115 /* The function is internally 00116 * divided into three stages according to the number of multiplications that has to be 00117 * taken place between inputA samples and inputB samples. In the first stage of the 00118 * algorithm, the multiplications increase by one for every iteration. 00119 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00120 * In the third stage of the algorithm, the multiplications decrease by one 00121 * for every iteration. */ 00122 00123 /* The algorithm is implemented in three stages. 00124 The loop counters of each stage is initiated here. */ 00125 blockSize1 = srcBLen - 1U; 00126 blockSize2 = (srcALen - srcBLen) + 1U; 00127 blockSize3 = blockSize1; 00128 00129 /* -------------------------- 00130 * Initializations of stage1 00131 * -------------------------*/ 00132 00133 /* sum = x[0] * y[0] 00134 * sum = x[0] * y[1] + x[1] * y[0] 00135 * .... 00136 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00137 */ 00138 00139 /* In this stage the MAC operations are increased by 1 for every iteration. 
00140 The count variable holds the number of MAC operations performed */ 00141 count = 1U; 00142 00143 /* Working pointer of inputA */ 00144 px = pIn1; 00145 00146 /* Working pointer of inputB */ 00147 py = pIn2; 00148 00149 00150 /* ------------------------ 00151 * Stage1 process 00152 * ----------------------*/ 00153 00154 /* The first stage starts here */ 00155 while (blockSize1 > 0U) 00156 { 00157 /* Accumulator is made zero for every iteration */ 00158 sum = 0; 00159 00160 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00161 k = count >> 2U; 00162 00163 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00164 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00165 while (k > 0U) 00166 { 00167 /* x[0] , x[1] */ 00168 in1 = (q15_t) * px++; 00169 in2 = (q15_t) * px++; 00170 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 00171 00172 /* y[srcBLen - 1] , y[srcBLen - 2] */ 00173 in1 = (q15_t) * py--; 00174 in2 = (q15_t) * py--; 00175 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 00176 00177 /* x[0] * y[srcBLen - 1] */ 00178 /* x[1] * y[srcBLen - 2] */ 00179 sum = __SMLAD(input1, input2, sum); 00180 00181 /* x[2] , x[3] */ 00182 in1 = (q15_t) * px++; 00183 in2 = (q15_t) * px++; 00184 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 00185 00186 /* y[srcBLen - 3] , y[srcBLen - 4] */ 00187 in1 = (q15_t) * py--; 00188 in2 = (q15_t) * py--; 00189 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 00190 00191 /* x[2] * y[srcBLen - 3] */ 00192 /* x[3] * y[srcBLen - 4] */ 00193 sum = __SMLAD(input1, input2, sum); 00194 00195 /* Decrement the loop counter */ 00196 k--; 00197 } 00198 00199 /* If the count is not a multiple of 4, compute any remaining MACs here. 00200 ** No loop unrolling is used. 
*/ 00201 k = count % 0x4U; 00202 00203 while (k > 0U) 00204 { 00205 /* Perform the multiply-accumulates */ 00206 sum += ((q15_t) * px++ * *py--); 00207 00208 /* Decrement the loop counter */ 00209 k--; 00210 } 00211 00212 /* Store the result in the accumulator in the destination buffer. */ 00213 *pOut++ = (q7_t) (__SSAT(sum >> 7U, 8)); 00214 00215 /* Update the inputA and inputB pointers for next MAC calculation */ 00216 py = pIn2 + count; 00217 px = pIn1; 00218 00219 /* Increment the MAC count */ 00220 count++; 00221 00222 /* Decrement the loop counter */ 00223 blockSize1--; 00224 } 00225 00226 /* -------------------------- 00227 * Initializations of stage2 00228 * ------------------------*/ 00229 00230 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00231 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00232 * .... 00233 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00234 */ 00235 00236 /* Working pointer of inputA */ 00237 px = pIn1; 00238 00239 /* Working pointer of inputB */ 00240 pSrc2 = pIn2 + (srcBLen - 1U); 00241 py = pSrc2; 00242 00243 /* count is index by which the pointer pIn1 to be incremented */ 00244 count = 0U; 00245 00246 /* ------------------- 00247 * Stage2 process 00248 * ------------------*/ 00249 00250 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00251 * So, to loop unroll over blockSize2, 00252 * srcBLen should be greater than or equal to 4 */ 00253 if (srcBLen >= 4U) 00254 { 00255 /* Loop unroll over blockSize2, by 4 */ 00256 blkCnt = blockSize2 >> 2U; 00257 00258 while (blkCnt > 0U) 00259 { 00260 /* Set all accumulators to zero */ 00261 acc0 = 0; 00262 acc1 = 0; 00263 acc2 = 0; 00264 acc3 = 0; 00265 00266 /* read x[0], x[1], x[2] samples */ 00267 x0 = *(px++); 00268 x1 = *(px++); 00269 x2 = *(px++); 00270 00271 /* Apply loop unrolling and compute 4 MACs simultaneously. 
*/ 00272 k = srcBLen >> 2U; 00273 00274 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00275 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00276 do 00277 { 00278 /* Read y[srcBLen - 1] sample */ 00279 c0 = *(py--); 00280 /* Read y[srcBLen - 2] sample */ 00281 c1 = *(py--); 00282 00283 /* Read x[3] sample */ 00284 x3 = *(px++); 00285 00286 /* x[0] and x[1] are packed */ 00287 in1 = (q15_t) x0; 00288 in2 = (q15_t) x1; 00289 00290 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 00291 00292 /* y[srcBLen - 1] and y[srcBLen - 2] are packed */ 00293 in1 = (q15_t) c0; 00294 in2 = (q15_t) c1; 00295 00296 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 00297 00298 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ 00299 acc0 = __SMLAD(input1, input2, acc0); 00300 00301 /* x[1] and x[2] are packed */ 00302 in1 = (q15_t) x1; 00303 in2 = (q15_t) x2; 00304 00305 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 00306 00307 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ 00308 acc1 = __SMLAD(input1, input2, acc1); 00309 00310 /* x[2] and x[3] are packed */ 00311 in1 = (q15_t) x2; 00312 in2 = (q15_t) x3; 00313 00314 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 00315 00316 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ 00317 acc2 = __SMLAD(input1, input2, acc2); 00318 00319 /* Read x[4] sample */ 00320 x0 = *(px++); 00321 00322 /* x[3] and x[4] are packed */ 00323 in1 = (q15_t) x3; 00324 in2 = (q15_t) x0; 00325 00326 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 00327 00328 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ 00329 acc3 = __SMLAD(input1, input2, acc3); 00330 00331 /* Read y[srcBLen - 3] sample */ 00332 c0 = *(py--); 00333 /* Read y[srcBLen - 4] sample */ 00334 c1 = *(py--); 00335 00336 /* Read x[5] sample */ 00337 x1 = *(px++); 00338 00339 /* x[2] and x[3] are packed */ 00340 in1 = (q15_t) x2; 00341 in2 
= (q15_t) x3; 00342 00343 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 00344 00345 /* y[srcBLen - 3] and y[srcBLen - 4] are packed */ 00346 in1 = (q15_t) c0; 00347 in2 = (q15_t) c1; 00348 00349 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 00350 00351 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ 00352 acc0 = __SMLAD(input1, input2, acc0); 00353 00354 /* x[3] and x[4] are packed */ 00355 in1 = (q15_t) x3; 00356 in2 = (q15_t) x0; 00357 00358 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 00359 00360 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ 00361 acc1 = __SMLAD(input1, input2, acc1); 00362 00363 /* x[4] and x[5] are packed */ 00364 in1 = (q15_t) x0; 00365 in2 = (q15_t) x1; 00366 00367 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 00368 00369 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ 00370 acc2 = __SMLAD(input1, input2, acc2); 00371 00372 /* Read x[6] sample */ 00373 x2 = *(px++); 00374 00375 /* x[5] and x[6] are packed */ 00376 in1 = (q15_t) x1; 00377 in2 = (q15_t) x2; 00378 00379 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 00380 00381 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ 00382 acc3 = __SMLAD(input1, input2, acc3); 00383 00384 } while (--k); 00385 00386 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00387 ** No loop unrolling is used. 
*/ 00388 k = srcBLen % 0x4U; 00389 00390 while (k > 0U) 00391 { 00392 /* Read y[srcBLen - 5] sample */ 00393 c0 = *(py--); 00394 00395 /* Read x[7] sample */ 00396 x3 = *(px++); 00397 00398 /* Perform the multiply-accumulates */ 00399 /* acc0 += x[4] * y[srcBLen - 5] */ 00400 acc0 += ((q15_t) x0 * c0); 00401 /* acc1 += x[5] * y[srcBLen - 5] */ 00402 acc1 += ((q15_t) x1 * c0); 00403 /* acc2 += x[6] * y[srcBLen - 5] */ 00404 acc2 += ((q15_t) x2 * c0); 00405 /* acc3 += x[7] * y[srcBLen - 5] */ 00406 acc3 += ((q15_t) x3 * c0); 00407 00408 /* Reuse the present samples for the next MAC */ 00409 x0 = x1; 00410 x1 = x2; 00411 x2 = x3; 00412 00413 /* Decrement the loop counter */ 00414 k--; 00415 } 00416 00417 00418 /* Store the result in the accumulator in the destination buffer. */ 00419 *pOut++ = (q7_t) (__SSAT(acc0 >> 7U, 8)); 00420 *pOut++ = (q7_t) (__SSAT(acc1 >> 7U, 8)); 00421 *pOut++ = (q7_t) (__SSAT(acc2 >> 7U, 8)); 00422 *pOut++ = (q7_t) (__SSAT(acc3 >> 7U, 8)); 00423 00424 /* Increment the pointer pIn1 index, count by 4 */ 00425 count += 4U; 00426 00427 /* Update the inputA and inputB pointers for next MAC calculation */ 00428 px = pIn1 + count; 00429 py = pSrc2; 00430 00431 /* Decrement the loop counter */ 00432 blkCnt--; 00433 } 00434 00435 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00436 ** No loop unrolling is used. */ 00437 blkCnt = blockSize2 % 0x4U; 00438 00439 while (blkCnt > 0U) 00440 { 00441 /* Accumulator is made zero for every iteration */ 00442 sum = 0; 00443 00444 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00445 k = srcBLen >> 2U; 00446 00447 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00448 ** a second loop below computes MACs for the remaining 1 to 3 samples. 
*/ 00449 while (k > 0U) 00450 { 00451 00452 /* Reading two inputs of SrcA buffer and packing */ 00453 in1 = (q15_t) * px++; 00454 in2 = (q15_t) * px++; 00455 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 00456 00457 /* Reading two inputs of SrcB buffer and packing */ 00458 in1 = (q15_t) * py--; 00459 in2 = (q15_t) * py--; 00460 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 00461 00462 /* Perform the multiply-accumulates */ 00463 sum = __SMLAD(input1, input2, sum); 00464 00465 /* Reading two inputs of SrcA buffer and packing */ 00466 in1 = (q15_t) * px++; 00467 in2 = (q15_t) * px++; 00468 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 00469 00470 /* Reading two inputs of SrcB buffer and packing */ 00471 in1 = (q15_t) * py--; 00472 in2 = (q15_t) * py--; 00473 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 00474 00475 /* Perform the multiply-accumulates */ 00476 sum = __SMLAD(input1, input2, sum); 00477 00478 /* Decrement the loop counter */ 00479 k--; 00480 } 00481 00482 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00483 ** No loop unrolling is used. */ 00484 k = srcBLen % 0x4U; 00485 00486 while (k > 0U) 00487 { 00488 /* Perform the multiply-accumulates */ 00489 sum += ((q15_t) * px++ * *py--); 00490 00491 /* Decrement the loop counter */ 00492 k--; 00493 } 00494 00495 /* Store the result in the accumulator in the destination buffer. 
*/ 00496 *pOut++ = (q7_t) (__SSAT(sum >> 7U, 8)); 00497 00498 /* Increment the pointer pIn1 index, count by 1 */ 00499 count++; 00500 00501 /* Update the inputA and inputB pointers for next MAC calculation */ 00502 px = pIn1 + count; 00503 py = pSrc2; 00504 00505 /* Decrement the loop counter */ 00506 blkCnt--; 00507 } 00508 } 00509 else 00510 { 00511 /* If the srcBLen is not a multiple of 4, 00512 * the blockSize2 loop cannot be unrolled by 4 */ 00513 blkCnt = blockSize2; 00514 00515 while (blkCnt > 0U) 00516 { 00517 /* Accumulator is made zero for every iteration */ 00518 sum = 0; 00519 00520 /* srcBLen number of MACS should be performed */ 00521 k = srcBLen; 00522 00523 while (k > 0U) 00524 { 00525 /* Perform the multiply-accumulate */ 00526 sum += ((q15_t) * px++ * *py--); 00527 00528 /* Decrement the loop counter */ 00529 k--; 00530 } 00531 00532 /* Store the result in the accumulator in the destination buffer. */ 00533 *pOut++ = (q7_t) (__SSAT(sum >> 7U, 8)); 00534 00535 /* Increment the MAC count */ 00536 count++; 00537 00538 /* Update the inputA and inputB pointers for next MAC calculation */ 00539 px = pIn1 + count; 00540 py = pSrc2; 00541 00542 /* Decrement the loop counter */ 00543 blkCnt--; 00544 } 00545 } 00546 00547 00548 /* -------------------------- 00549 * Initializations of stage3 00550 * -------------------------*/ 00551 00552 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00553 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00554 * .... 00555 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00556 * sum += x[srcALen-1] * y[srcBLen-1] 00557 */ 00558 00559 /* In this stage the MAC operations are decreased by 1 for every iteration. 
00560 The blockSize3 variable holds the number of MAC operations performed */ 00561 00562 /* Working pointer of inputA */ 00563 pSrc1 = pIn1 + (srcALen - (srcBLen - 1U)); 00564 px = pSrc1; 00565 00566 /* Working pointer of inputB */ 00567 pSrc2 = pIn2 + (srcBLen - 1U); 00568 py = pSrc2; 00569 00570 /* ------------------- 00571 * Stage3 process 00572 * ------------------*/ 00573 00574 while (blockSize3 > 0U) 00575 { 00576 /* Accumulator is made zero for every iteration */ 00577 sum = 0; 00578 00579 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00580 k = blockSize3 >> 2U; 00581 00582 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00583 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00584 while (k > 0U) 00585 { 00586 /* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */ 00587 in1 = (q15_t) * px++; 00588 in2 = (q15_t) * px++; 00589 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 00590 00591 /* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */ 00592 in1 = (q15_t) * py--; 00593 in2 = (q15_t) * py--; 00594 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 00595 00596 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ 00597 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ 00598 sum = __SMLAD(input1, input2, sum); 00599 00600 /* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */ 00601 in1 = (q15_t) * px++; 00602 in2 = (q15_t) * px++; 00603 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 00604 00605 /* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */ 00606 in1 = (q15_t) * py--; 00607 in2 = (q15_t) * py--; 00608 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U); 00609 00610 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ 00611 /* sum += x[srcALen - srcBLen + 4] * 
y[srcBLen - 4] */ 00612 sum = __SMLAD(input1, input2, sum); 00613 00614 /* Decrement the loop counter */ 00615 k--; 00616 } 00617 00618 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. 00619 ** No loop unrolling is used. */ 00620 k = blockSize3 % 0x4U; 00621 00622 while (k > 0U) 00623 { 00624 /* Perform the multiply-accumulates */ 00625 sum += ((q15_t) * px++ * *py--); 00626 00627 /* Decrement the loop counter */ 00628 k--; 00629 } 00630 00631 /* Store the result in the accumulator in the destination buffer. */ 00632 *pOut++ = (q7_t) (__SSAT(sum >> 7U, 8)); 00633 00634 /* Update the inputA and inputB pointers for next MAC calculation */ 00635 px = ++pSrc1; 00636 py = pSrc2; 00637 00638 /* Decrement the loop counter */ 00639 blockSize3--; 00640 } 00641 00642 #else 00643 00644 /* Run the below code for Cortex-M0 */ 00645 00646 q7_t *pIn1 = pSrcA; /* input pointer */ 00647 q7_t *pIn2 = pSrcB; /* coefficient pointer */ 00648 q31_t sum; /* Accumulator */ 00649 uint32_t i, j; /* loop counter */ 00650 00651 /* Loop to calculate output of convolution for output length number of times */ 00652 for (i = 0; i < (srcALen + srcBLen - 1); i++) 00653 { 00654 /* Initialize sum with zero to carry on MAC operations */ 00655 sum = 0; 00656 00657 /* Loop to perform MAC operations according to convolution equation */ 00658 for (j = 0; j <= i; j++) 00659 { 00660 /* Check the array limitations */ 00661 if (((i - j) < srcBLen) && (j < srcALen)) 00662 { 00663 /* z[i] += x[i-j] * y[j] */ 00664 sum += (q15_t) pIn1[j] * (pIn2[i - j]); 00665 } 00666 } 00667 00668 /* Store the output in the destination buffer */ 00669 pDst[i] = (q7_t) __SSAT((sum >> 7U), 8U); 00670 } 00671 00672 #endif /* #if defined (ARM_MATH_DSP) */ 00673 00674 } 00675 00676 /** 00677 * @} end of Conv group 00678 */ 00679
Generated on Tue Jul 12 2022 16:46:23 by Doxygen 1.7.2