CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details
Dependents: K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more
arm_conv_q7.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 29. November 2010 00005 * $Revision: V1.0.3 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_q7.c 00009 * 00010 * Description: Q7 Convolution. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Version 1.0.3 2010/11/29 00015 * Re-organized the CMSIS folders and updated documentation. 00016 * 00017 * Version 1.0.2 2010/11/11 00018 * Documentation updated. 00019 * 00020 * Version 1.0.1 2010/10/05 00021 * Production release and review comments incorporated. 00022 * 00023 * Version 1.0.0 2010/09/20 00024 * Production release and review comments incorporated 00025 * 00026 * Version 0.0.7 2010/06/10 00027 * Misra-C changes done 00028 * 00029 * -------------------------------------------------------------------- */ 00030 00031 #include "arm_math.h" 00032 00033 /** 00034 * @ingroup groupFilters 00035 */ 00036 00037 /** 00038 * @addtogroup Conv 00039 * @{ 00040 */ 00041 00042 /** 00043 * @brief Convolution of Q7 sequences. 00044 * @param[in] *pSrcA points to the first input sequence. 00045 * @param[in] srcALen length of the first input sequence. 00046 * @param[in] *pSrcB points to the second input sequence. 00047 * @param[in] srcBLen length of the second input sequence. 00048 * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. 00049 * @return none. 00050 * 00051 * @details 00052 * <b>Scaling and Overflow Behavior:</b> 00053 * 00054 * \par 00055 * The function is implemented using a 32-bit internal accumulator. 00056 * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result. 00057 * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format. 
00058 * This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>. 00059 * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format. 00060 */ 00061 00062 void arm_conv_q7( 00063 q7_t * pSrcA, 00064 uint32_t srcALen, 00065 q7_t * pSrcB, 00066 uint32_t srcBLen, 00067 q7_t * pDst) 00068 { 00069 q7_t *pIn1; /* inputA pointer */ 00070 q7_t *pIn2; /* inputB pointer */ 00071 q7_t *pOut = pDst; /* output pointer */ 00072 q7_t *px; /* Intermediate inputA pointer */ 00073 q7_t *py; /* Intermediate inputB pointer */ 00074 q7_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00075 q7_t x0, x1, x2, x3, c0, c1; /* Temporary variables to hold state and coefficient values */ 00076 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00077 q31_t input1, input2; /* Temporary input variables */ 00078 q15_t in1, in2; /* Temporary input variables */ 00079 uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counter */ 00080 00081 00082 /* The algorithm implementation is based on the lengths of the inputs. */ 00083 /* srcB is always made to slide across srcA. 
*/ 00084 /* So srcBLen is always considered as shorter or equal to srcALen */ 00085 if(srcALen >= srcBLen) 00086 { 00087 /* Initialization of inputA pointer */ 00088 pIn1 = pSrcA; 00089 00090 /* Initialization of inputB pointer */ 00091 pIn2 = pSrcB; 00092 } 00093 else 00094 { 00095 /* Initialization of inputA pointer */ 00096 pIn1 = pSrcB; 00097 00098 /* Initialization of inputB pointer */ 00099 pIn2 = pSrcA; 00100 00101 /* srcBLen is always considered as shorter or equal to srcALen */ 00102 j = srcBLen; 00103 srcBLen = srcALen; 00104 srcALen = j; 00105 } 00106 00107 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00108 /* The function is internally 00109 * divided into three stages according to the number of multiplications that has to be 00110 * taken place between inputA samples and inputB samples. In the first stage of the 00111 * algorithm, the multiplications increase by one for every iteration. 00112 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00113 * In the third stage of the algorithm, the multiplications decrease by one 00114 * for every iteration. */ 00115 00116 /* The algorithm is implemented in three stages. 00117 The loop counters of each stage is initiated here. */ 00118 blockSize1 = srcBLen - 1u; 00119 blockSize2 = (srcALen - srcBLen) + 1u; 00120 blockSize3 = blockSize1; 00121 00122 /* -------------------------- 00123 * Initializations of stage1 00124 * -------------------------*/ 00125 00126 /* sum = x[0] * y[0] 00127 * sum = x[0] * y[1] + x[1] * y[0] 00128 * .... 00129 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00130 */ 00131 00132 /* In this stage the MAC operations are increased by 1 for every iteration. 
00133 The count variable holds the number of MAC operations performed */ 00134 count = 1u; 00135 00136 /* Working pointer of inputA */ 00137 px = pIn1; 00138 00139 /* Working pointer of inputB */ 00140 py = pIn2; 00141 00142 00143 /* ------------------------ 00144 * Stage1 process 00145 * ----------------------*/ 00146 00147 /* The first stage starts here */ 00148 while(blockSize1 > 0u) 00149 { 00150 /* Accumulator is made zero for every iteration */ 00151 sum = 0; 00152 00153 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00154 k = count >> 2u; 00155 00156 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00157 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00158 while(k > 0u) 00159 { 00160 /* x[0] , x[1] */ 00161 in1 = (q15_t) * px++; 00162 in2 = (q15_t) * px++; 00163 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00164 00165 /* y[srcBLen - 1] , y[srcBLen - 2] */ 00166 in1 = (q15_t) * py--; 00167 in2 = (q15_t) * py--; 00168 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00169 00170 /* x[0] * y[srcBLen - 1] */ 00171 /* x[1] * y[srcBLen - 2] */ 00172 sum = __SMLAD(input1, input2, sum); 00173 00174 /* x[2] , x[3] */ 00175 in1 = (q15_t) * px++; 00176 in2 = (q15_t) * px++; 00177 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00178 00179 /* y[srcBLen - 3] , y[srcBLen - 4] */ 00180 in1 = (q15_t) * py--; 00181 in2 = (q15_t) * py--; 00182 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00183 00184 /* x[2] * y[srcBLen - 3] */ 00185 /* x[3] * y[srcBLen - 4] */ 00186 sum = __SMLAD(input1, input2, sum); 00187 00188 /* Decrement the loop counter */ 00189 k--; 00190 } 00191 00192 /* If the count is not a multiple of 4, compute any remaining MACs here. 00193 ** No loop unrolling is used. 
*/ 00194 k = count % 0x4u; 00195 00196 while(k > 0u) 00197 { 00198 /* Perform the multiply-accumulates */ 00199 sum += ((q15_t) * px++ * *py--); 00200 00201 /* Decrement the loop counter */ 00202 k--; 00203 } 00204 00205 /* Store the result in the accumulator in the destination buffer. */ 00206 *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8)); 00207 00208 /* Update the inputA and inputB pointers for next MAC calculation */ 00209 py = pIn2 + count; 00210 px = pIn1; 00211 00212 /* Increment the MAC count */ 00213 count++; 00214 00215 /* Decrement the loop counter */ 00216 blockSize1--; 00217 } 00218 00219 /* -------------------------- 00220 * Initializations of stage2 00221 * ------------------------*/ 00222 00223 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00224 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00225 * .... 00226 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00227 */ 00228 00229 /* Working pointer of inputA */ 00230 px = pIn1; 00231 00232 /* Working pointer of inputB */ 00233 pSrc2 = pIn2 + (srcBLen - 1u); 00234 py = pSrc2; 00235 00236 /* count is index by which the pointer pIn1 to be incremented */ 00237 count = 1u; 00238 00239 /* ------------------- 00240 * Stage2 process 00241 * ------------------*/ 00242 00243 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00244 * So, to loop unroll over blockSize2, 00245 * srcBLen should be greater than or equal to 4 */ 00246 if(srcBLen >= 4u) 00247 { 00248 /* Loop unroll over blockSize2, by 4 */ 00249 blkCnt = blockSize2 >> 2u; 00250 00251 while(blkCnt > 0u) 00252 { 00253 /* Set all accumulators to zero */ 00254 acc0 = 0; 00255 acc1 = 0; 00256 acc2 = 0; 00257 acc3 = 0; 00258 00259 /* read x[0], x[1], x[2] samples */ 00260 x0 = *(px++); 00261 x1 = *(px++); 00262 x2 = *(px++); 00263 00264 /* Apply loop unrolling and compute 4 MACs simultaneously. 
*/ 00265 k = srcBLen >> 2u; 00266 00267 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00268 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00269 do 00270 { 00271 /* Read y[srcBLen - 1] sample */ 00272 c0 = *(py--); 00273 /* Read y[srcBLen - 2] sample */ 00274 c1 = *(py--); 00275 00276 /* Read x[3] sample */ 00277 x3 = *(px++); 00278 00279 /* x[0] and x[1] are packed */ 00280 in1 = (q15_t) x0; 00281 in2 = (q15_t) x1; 00282 00283 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00284 00285 /* y[srcBLen - 1] and y[srcBLen - 2] are packed */ 00286 in1 = (q15_t) c0; 00287 in2 = (q15_t) c1; 00288 00289 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00290 00291 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ 00292 acc0 = __SMLAD(input1, input2, acc0); 00293 00294 /* x[1] and x[2] are packed */ 00295 in1 = (q15_t) x1; 00296 in2 = (q15_t) x2; 00297 00298 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00299 00300 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ 00301 acc1 = __SMLAD(input1, input2, acc1); 00302 00303 /* x[2] and x[3] are packed */ 00304 in1 = (q15_t) x2; 00305 in2 = (q15_t) x3; 00306 00307 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00308 00309 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ 00310 acc2 = __SMLAD(input1, input2, acc2); 00311 00312 /* Read x[4] sample */ 00313 x0 = *(px++); 00314 00315 /* x[3] and x[4] are packed */ 00316 in1 = (q15_t) x3; 00317 in2 = (q15_t) x0; 00318 00319 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00320 00321 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ 00322 acc3 = __SMLAD(input1, input2, acc3); 00323 00324 /* Read y[srcBLen - 3] sample */ 00325 c0 = *(py--); 00326 /* Read y[srcBLen - 4] sample */ 00327 c1 = *(py--); 00328 00329 /* Read x[5] sample */ 00330 x1 = *(px++); 00331 00332 /* x[2] and x[3] are packed */ 00333 in1 = (q15_t) x2; 00334 in2 
= (q15_t) x3; 00335 00336 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00337 00338 /* y[srcBLen - 3] and y[srcBLen - 4] are packed */ 00339 in1 = (q15_t) c0; 00340 in2 = (q15_t) c1; 00341 00342 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00343 00344 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ 00345 acc0 = __SMLAD(input1, input2, acc0); 00346 00347 /* x[3] and x[4] are packed */ 00348 in1 = (q15_t) x3; 00349 in2 = (q15_t) x0; 00350 00351 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00352 00353 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ 00354 acc1 = __SMLAD(input1, input2, acc1); 00355 00356 /* x[4] and x[5] are packed */ 00357 in1 = (q15_t) x0; 00358 in2 = (q15_t) x1; 00359 00360 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00361 00362 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ 00363 acc2 = __SMLAD(input1, input2, acc2); 00364 00365 /* Read x[6] sample */ 00366 x2 = *(px++); 00367 00368 /* x[5] and x[6] are packed */ 00369 in1 = (q15_t) x1; 00370 in2 = (q15_t) x2; 00371 00372 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00373 00374 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ 00375 acc3 = __SMLAD(input1, input2, acc3); 00376 00377 } while(--k); 00378 00379 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00380 ** No loop unrolling is used. 
*/ 00381 k = srcBLen % 0x4u; 00382 00383 while(k > 0u) 00384 { 00385 /* Read y[srcBLen - 5] sample */ 00386 c0 = *(py--); 00387 00388 /* Read x[7] sample */ 00389 x3 = *(px++); 00390 00391 /* Perform the multiply-accumulates */ 00392 /* acc0 += x[4] * y[srcBLen - 5] */ 00393 acc0 += ((q15_t) x0 * c0); 00394 /* acc1 += x[5] * y[srcBLen - 5] */ 00395 acc1 += ((q15_t) x1 * c0); 00396 /* acc2 += x[6] * y[srcBLen - 5] */ 00397 acc2 += ((q15_t) x2 * c0); 00398 /* acc3 += x[7] * y[srcBLen - 5] */ 00399 acc3 += ((q15_t) x3 * c0); 00400 00401 /* Reuse the present samples for the next MAC */ 00402 x0 = x1; 00403 x1 = x2; 00404 x2 = x3; 00405 00406 /* Decrement the loop counter */ 00407 k--; 00408 } 00409 00410 00411 /* Store the result in the accumulator in the destination buffer. */ 00412 *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8)); 00413 *pOut++ = (q7_t) (__SSAT(acc1 >> 7u, 8)); 00414 *pOut++ = (q7_t) (__SSAT(acc2 >> 7u, 8)); 00415 *pOut++ = (q7_t) (__SSAT(acc3 >> 7u, 8)); 00416 00417 /* Update the inputA and inputB pointers for next MAC calculation */ 00418 px = pIn1 + (count * 4u); 00419 py = pSrc2; 00420 00421 /* Increment the pointer pIn1 index, count by 1 */ 00422 count++; 00423 00424 /* Decrement the loop counter */ 00425 blkCnt--; 00426 } 00427 00428 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00429 ** No loop unrolling is used. */ 00430 blkCnt = blockSize2 % 0x4u; 00431 00432 while(blkCnt > 0u) 00433 { 00434 /* Accumulator is made zero for every iteration */ 00435 sum = 0; 00436 00437 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00438 k = srcBLen >> 2u; 00439 00440 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00441 ** a second loop below computes MACs for the remaining 1 to 3 samples. 
*/ 00442 while(k > 0u) 00443 { 00444 00445 /* Reading two inputs of SrcA buffer and packing */ 00446 in1 = (q15_t) * px++; 00447 in2 = (q15_t) * px++; 00448 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00449 00450 /* Reading two inputs of SrcB buffer and packing */ 00451 in1 = (q15_t) * py--; 00452 in2 = (q15_t) * py--; 00453 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00454 00455 /* Perform the multiply-accumulates */ 00456 sum = __SMLAD(input1, input2, sum); 00457 00458 /* Reading two inputs of SrcA buffer and packing */ 00459 in1 = (q15_t) * px++; 00460 in2 = (q15_t) * px++; 00461 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00462 00463 /* Reading two inputs of SrcB buffer and packing */ 00464 in1 = (q15_t) * py--; 00465 in2 = (q15_t) * py--; 00466 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00467 00468 /* Perform the multiply-accumulates */ 00469 sum = __SMLAD(input1, input2, sum); 00470 00471 /* Decrement the loop counter */ 00472 k--; 00473 } 00474 00475 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00476 ** No loop unrolling is used. */ 00477 k = srcBLen % 0x4u; 00478 00479 while(k > 0u) 00480 { 00481 /* Perform the multiply-accumulates */ 00482 sum += ((q15_t) * px++ * *py--); 00483 00484 /* Decrement the loop counter */ 00485 k--; 00486 } 00487 00488 /* Store the result in the accumulator in the destination buffer. 
*/ 00489 *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8)); 00490 00491 /* Update the inputA and inputB pointers for next MAC calculation */ 00492 px = pIn1 + count; 00493 py = pSrc2; 00494 00495 /* Increment the pointer pIn1 index, count by 1 */ 00496 count++; 00497 00498 /* Decrement the loop counter */ 00499 blkCnt--; 00500 } 00501 } 00502 else 00503 { 00504 /* If the srcBLen is not a multiple of 4, 00505 * the blockSize2 loop cannot be unrolled by 4 */ 00506 blkCnt = blockSize2; 00507 00508 while(blkCnt > 0u) 00509 { 00510 /* Accumulator is made zero for every iteration */ 00511 sum = 0; 00512 00513 /* srcBLen number of MACS should be performed */ 00514 k = srcBLen; 00515 00516 while(k > 0u) 00517 { 00518 /* Perform the multiply-accumulate */ 00519 sum += ((q15_t) * px++ * *py--); 00520 00521 /* Decrement the loop counter */ 00522 k--; 00523 } 00524 00525 /* Store the result in the accumulator in the destination buffer. */ 00526 *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8)); 00527 00528 /* Update the inputA and inputB pointers for next MAC calculation */ 00529 px = pIn1 + count; 00530 py = pSrc2; 00531 00532 /* Increment the MAC count */ 00533 count++; 00534 00535 /* Decrement the loop counter */ 00536 blkCnt--; 00537 } 00538 } 00539 00540 00541 /* -------------------------- 00542 * Initializations of stage3 00543 * -------------------------*/ 00544 00545 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00546 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00547 * .... 00548 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00549 * sum += x[srcALen-1] * y[srcBLen-1] 00550 */ 00551 00552 /* In this stage the MAC operations are decreased by 1 for every iteration. 
00553 The blockSize3 variable holds the number of MAC operations performed */ 00554 00555 /* Working pointer of inputA */ 00556 pSrc1 = pIn1 + (srcALen - (srcBLen - 1u)); 00557 px = pSrc1; 00558 00559 /* Working pointer of inputB */ 00560 pSrc2 = pIn2 + (srcBLen - 1u); 00561 py = pSrc2; 00562 00563 /* ------------------- 00564 * Stage3 process 00565 * ------------------*/ 00566 00567 while(blockSize3 > 0u) 00568 { 00569 /* Accumulator is made zero for every iteration */ 00570 sum = 0; 00571 00572 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00573 k = blockSize3 >> 2u; 00574 00575 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00576 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00577 while(k > 0u) 00578 { 00579 /* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */ 00580 in1 = (q15_t) * px++; 00581 in2 = (q15_t) * px++; 00582 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00583 00584 /* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */ 00585 in1 = (q15_t) * py--; 00586 in2 = (q15_t) * py--; 00587 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00588 00589 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ 00590 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ 00591 sum = __SMLAD(input1, input2, sum); 00592 00593 /* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */ 00594 in1 = (q15_t) * px++; 00595 in2 = (q15_t) * px++; 00596 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00597 00598 /* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */ 00599 in1 = (q15_t) * py--; 00600 in2 = (q15_t) * py--; 00601 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00602 00603 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ 00604 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen 
- 4] */ 00605 sum = __SMLAD(input1, input2, sum); 00606 00607 /* Decrement the loop counter */ 00608 k--; 00609 } 00610 00611 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. 00612 ** No loop unrolling is used. */ 00613 k = blockSize3 % 0x4u; 00614 00615 while(k > 0u) 00616 { 00617 /* Perform the multiply-accumulates */ 00618 sum += ((q15_t) * px++ * *py--); 00619 00620 /* Decrement the loop counter */ 00621 k--; 00622 } 00623 00624 /* Store the result in the accumulator in the destination buffer. */ 00625 *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8)); 00626 00627 /* Update the inputA and inputB pointers for next MAC calculation */ 00628 px = ++pSrc1; 00629 py = pSrc2; 00630 00631 /* Decrement the loop counter */ 00632 blockSize3--; 00633 } 00634 00635 } 00636 00637 /** 00638 * @} end of Conv group 00639 */
Generated on Tue Jul 12 2022 14:13:52 by Doxygen 1.7.2