CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details
Dependents: K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more
arm_correlate_q7.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 29. November 2010 00005 * $Revision: V1.0.3 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_correlate_q7.c 00009 * 00010 * Description: Process function for Q7 Correlation. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Version 1.0.3 2010/11/29 00015 * Re-organized the CMSIS folders and updated documentation. 00016 * 00017 * Version 1.0.2 2010/11/11 00018 * Documentation updated. 00019 * 00020 * Version 1.0.1 2010/10/05 00021 * Production release and review comments incorporated. 00022 * 00023 * Version 1.0.0 2010/09/20 00024 * Production release and review comments incorporated 00025 * 00026 * Version 0.0.7 2010/06/10 00027 * Misra-C changes done 00028 * 00029 * -------------------------------------------------------------------- */ 00030 00031 #include "arm_math.h" 00032 00033 /** 00034 * @ingroup groupFilters 00035 */ 00036 00037 /** 00038 * @addtogroup Corr 00039 * @{ 00040 */ 00041 00042 /** 00043 * @brief Correlation of Q7 sequences. 00044 * @param[in] *pSrcA points to the first input sequence. 00045 * @param[in] srcALen length of the first input sequence. 00046 * @param[in] *pSrcB points to the second input sequence. 00047 * @param[in] srcBLen length of the second input sequence. 00048 * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1. 00049 * @return none. 00050 * 00051 * @details 00052 * <b>Scaling and Overflow Behavior:</b> 00053 * 00054 * \par 00055 * The function is implemented using a 32-bit internal accumulator. 00056 * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result. 00057 * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format. 
00058 * This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>. 00059 * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and saturated to 1.7 format. 00060 */ 00061 00062 void arm_correlate_q7( 00063 q7_t * pSrcA, 00064 uint32_t srcALen, 00065 q7_t * pSrcB, 00066 uint32_t srcBLen, 00067 q7_t * pDst) 00068 { 00069 q7_t *pIn1; /* inputA pointer */ 00070 q7_t *pIn2; /* inputB pointer */ 00071 q7_t *pOut = pDst; /* output pointer */ 00072 q7_t *px; /* Intermediate inputA pointer */ 00073 q7_t *py; /* Intermediate inputB pointer */ 00074 q7_t *pSrc1; /* Intermediate pointers */ 00075 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */ 00076 q31_t input1, input2; /* temporary variables */ 00077 q15_t in1, in2; /* temporary variables */ 00078 q7_t x0, x1, x2, x3, c0, c1; /* temporary variables for holding input and coefficient values */ 00079 uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */ 00080 int32_t inc = 1; 00081 00082 00083 /* The algorithm implementation is based on the lengths of the inputs. */ 00084 /* srcB is always made to slide across srcA. 
*/ 00085 /* So srcBLen is always considered as shorter or equal to srcALen */ 00086 /* But CORR(x, y) is reverse of CORR(y, x) */ 00087 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ 00088 /* and the destination pointer modifier, inc is set to -1 */ 00089 /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ 00090 /* But to improve the performance, 00091 * we include zeroes in the output instead of zero padding either of the the inputs*/ 00092 /* If srcALen > srcBLen, 00093 * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */ 00094 /* If srcALen < srcBLen, 00095 * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */ 00096 if(srcALen >= srcBLen) 00097 { 00098 /* Initialization of inputA pointer */ 00099 pIn1 = (pSrcA); 00100 00101 /* Initialization of inputB pointer */ 00102 pIn2 = (pSrcB); 00103 00104 /* Number of output samples is calculated */ 00105 outBlockSize = (2u * srcALen) - 1u; 00106 00107 /* When srcALen > srcBLen, zero padding is done to srcB 00108 * to make their lengths equal. 
00109 * Instead, (outBlockSize - (srcALen + srcBLen - 1)) 00110 * number of output samples are made zero */ 00111 j = outBlockSize - (srcALen + (srcBLen - 1u)); 00112 00113 while(j > 0u) 00114 { 00115 /* Zero is stored in the destination buffer */ 00116 *pOut++ = 0; 00117 00118 /* Decrement the loop counter */ 00119 j--; 00120 } 00121 00122 } 00123 else 00124 { 00125 /* Initialization of inputA pointer */ 00126 pIn1 = (pSrcB); 00127 00128 /* Initialization of inputB pointer */ 00129 pIn2 = (pSrcA); 00130 00131 /* srcBLen is always considered as shorter or equal to srcALen */ 00132 j = srcBLen; 00133 srcBLen = srcALen; 00134 srcALen = j; 00135 00136 /* CORR(x, y) = Reverse order(CORR(y, x)) */ 00137 /* Hence set the destination pointer to point to the last output sample */ 00138 pOut = pDst + ((srcALen + srcBLen) - 2u); 00139 00140 /* Destination address modifier is set to -1 */ 00141 inc = -1; 00142 00143 } 00144 00145 /* The function is internally 00146 * divided into three parts according to the number of multiplications that has to be 00147 * taken place between inputA samples and inputB samples. In the first part of the 00148 * algorithm, the multiplications increase by one for every iteration. 00149 * In the second part of the algorithm, srcBLen number of multiplications are done. 00150 * In the third part of the algorithm, the multiplications decrease by one 00151 * for every iteration.*/ 00152 /* The algorithm is implemented in three stages. 00153 * The loop counters of each stage is initiated here. */ 00154 blockSize1 = srcBLen - 1u; 00155 blockSize2 = srcALen - (srcBLen - 1u); 00156 blockSize3 = blockSize1; 00157 00158 /* -------------------------- 00159 * Initializations of stage1 00160 * -------------------------*/ 00161 00162 /* sum = x[0] * y[srcBlen - 1] 00163 * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1] 00164 * .... 
00165 * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1] 00166 */ 00167 00168 /* In this stage the MAC operations are increased by 1 for every iteration. 00169 The count variable holds the number of MAC operations performed */ 00170 count = 1u; 00171 00172 /* Working pointer of inputA */ 00173 px = pIn1; 00174 00175 /* Working pointer of inputB */ 00176 pSrc1 = pIn2 + (srcBLen - 1u); 00177 py = pSrc1; 00178 00179 /* ------------------------ 00180 * Stage1 process 00181 * ----------------------*/ 00182 00183 /* The first stage starts here */ 00184 while(blockSize1 > 0u) 00185 { 00186 /* Accumulator is made zero for every iteration */ 00187 sum = 0; 00188 00189 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00190 k = count >> 2; 00191 00192 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00193 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00194 while(k > 0u) 00195 { 00196 /* x[0] , x[1] */ 00197 in1 = (q15_t) * px++; 00198 in2 = (q15_t) * px++; 00199 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00200 00201 /* y[srcBLen - 4] , y[srcBLen - 3] */ 00202 in1 = (q15_t) * py++; 00203 in2 = (q15_t) * py++; 00204 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00205 00206 /* x[0] * y[srcBLen - 4] */ 00207 /* x[1] * y[srcBLen - 3] */ 00208 sum = __SMLAD(input1, input2, sum); 00209 00210 /* x[2] , x[3] */ 00211 in1 = (q15_t) * px++; 00212 in2 = (q15_t) * px++; 00213 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00214 00215 /* y[srcBLen - 2] , y[srcBLen - 1] */ 00216 in1 = (q15_t) * py++; 00217 in2 = (q15_t) * py++; 00218 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00219 00220 /* x[2] * y[srcBLen - 2] */ 00221 /* x[3] * y[srcBLen - 1] */ 00222 sum = __SMLAD(input1, input2, sum); 00223 00224 00225 /* Decrement the loop counter */ 00226 k--; 00227 } 00228 00229 /* If the count is not a multiple of 4, compute any remaining MACs here. 
00230 ** No loop unrolling is used. */ 00231 k = count % 0x4u; 00232 00233 while(k > 0u) 00234 { 00235 /* Perform the multiply-accumulates */ 00236 /* x[0] * y[srcBLen - 1] */ 00237 sum += (q31_t) ((q15_t) * px++ * *py++); 00238 00239 /* Decrement the loop counter */ 00240 k--; 00241 } 00242 00243 /* Store the result in the accumulator in the destination buffer. */ 00244 *pOut = (q7_t) (__SSAT(sum >> 7, 8)); 00245 /* Destination pointer is updated according to the address modifier, inc */ 00246 pOut += inc; 00247 00248 /* Update the inputA and inputB pointers for next MAC calculation */ 00249 py = pSrc1 - count; 00250 px = pIn1; 00251 00252 /* Increment the MAC count */ 00253 count++; 00254 00255 /* Decrement the loop counter */ 00256 blockSize1--; 00257 } 00258 00259 /* -------------------------- 00260 * Initializations of stage2 00261 * ------------------------*/ 00262 00263 /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1] 00264 * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1] 00265 * .... 00266 * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00267 */ 00268 00269 /* Working pointer of inputA */ 00270 px = pIn1; 00271 00272 /* Working pointer of inputB */ 00273 py = pIn2; 00274 00275 /* count is index by which the pointer pIn1 to be incremented */ 00276 count = 1u; 00277 00278 /* ------------------- 00279 * Stage2 process 00280 * ------------------*/ 00281 00282 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 
00283 * So, to loop unroll over blockSize2, 00284 * srcBLen should be greater than or equal to 4 */ 00285 if(srcBLen >= 4u) 00286 { 00287 /* Loop unroll over blockSize2, by 4 */ 00288 blkCnt = blockSize2 >> 2u; 00289 00290 while(blkCnt > 0u) 00291 { 00292 /* Set all accumulators to zero */ 00293 acc0 = 0; 00294 acc1 = 0; 00295 acc2 = 0; 00296 acc3 = 0; 00297 00298 /* read x[0], x[1], x[2] samples */ 00299 x0 = *px++; 00300 x1 = *px++; 00301 x2 = *px++; 00302 00303 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00304 k = srcBLen >> 2u; 00305 00306 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00307 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00308 do 00309 { 00310 /* Read y[0] sample */ 00311 c0 = *py++; 00312 /* Read y[1] sample */ 00313 c1 = *py++; 00314 00315 /* Read x[3] sample */ 00316 x3 = *px++; 00317 00318 /* x[0] and x[1] are packed */ 00319 in1 = (q15_t) x0; 00320 in2 = (q15_t) x1; 00321 00322 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00323 00324 /* y[0] and y[1] are packed */ 00325 in1 = (q15_t) c0; 00326 in2 = (q15_t) c1; 00327 00328 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00329 00330 /* acc0 += x[0] * y[0] + x[1] * y[1] */ 00331 acc0 = __SMLAD(input1, input2, acc0); 00332 00333 /* x[1] and x[2] are packed */ 00334 in1 = (q15_t) x1; 00335 in2 = (q15_t) x2; 00336 00337 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00338 00339 /* acc1 += x[1] * y[0] + x[2] * y[1] */ 00340 acc1 = __SMLAD(input1, input2, acc1); 00341 00342 /* x[2] and x[3] are packed */ 00343 in1 = (q15_t) x2; 00344 in2 = (q15_t) x3; 00345 00346 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00347 00348 /* acc2 += x[2] * y[0] + x[3] * y[1] */ 00349 acc2 = __SMLAD(input1, input2, acc2); 00350 00351 /* Read x[4] sample */ 00352 x0 = *(px++); 00353 00354 /* x[3] and x[4] are packed */ 00355 in1 = (q15_t) x3; 00356 in2 = (q15_t) x0; 00357 00358 input1 = 
((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00359 00360 /* acc3 += x[3] * y[0] + x[4] * y[1] */ 00361 acc3 = __SMLAD(input1, input2, acc3); 00362 00363 /* Read y[2] sample */ 00364 c0 = *py++; 00365 /* Read y[3] sample */ 00366 c1 = *py++; 00367 00368 /* Read x[5] sample */ 00369 x1 = *px++; 00370 00371 /* x[2] and x[3] are packed */ 00372 in1 = (q15_t) x2; 00373 in2 = (q15_t) x3; 00374 00375 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00376 00377 /* y[2] and y[3] are packed */ 00378 in1 = (q15_t) c0; 00379 in2 = (q15_t) c1; 00380 00381 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00382 00383 /* acc0 += x[2] * y[2] + x[3] * y[3] */ 00384 acc0 = __SMLAD(input1, input2, acc0); 00385 00386 /* x[3] and x[4] are packed */ 00387 in1 = (q15_t) x3; 00388 in2 = (q15_t) x0; 00389 00390 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00391 00392 /* acc1 += x[3] * y[2] + x[4] * y[3] */ 00393 acc1 = __SMLAD(input1, input2, acc1); 00394 00395 /* x[4] and x[5] are packed */ 00396 in1 = (q15_t) x0; 00397 in2 = (q15_t) x1; 00398 00399 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00400 00401 /* acc2 += x[4] * y[2] + x[5] * y[3] */ 00402 acc2 = __SMLAD(input1, input2, acc2); 00403 00404 /* Read x[6] sample */ 00405 x2 = *px++; 00406 00407 /* x[5] and x[6] are packed */ 00408 in1 = (q15_t) x1; 00409 in2 = (q15_t) x2; 00410 00411 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00412 00413 /* acc3 += x[5] * y[2] + x[6] * y[3] */ 00414 acc3 = __SMLAD(input1, input2, acc3); 00415 00416 } while(--k); 00417 00418 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00419 ** No loop unrolling is used. 
*/ 00420 k = srcBLen % 0x4u; 00421 00422 while(k > 0u) 00423 { 00424 /* Read y[4] sample */ 00425 c0 = *py++; 00426 00427 /* Read x[7] sample */ 00428 x3 = *px++; 00429 00430 /* Perform the multiply-accumulates */ 00431 /* acc0 += x[4] * y[4] */ 00432 acc0 += ((q15_t) x0 * c0); 00433 /* acc1 += x[5] * y[4] */ 00434 acc1 += ((q15_t) x1 * c0); 00435 /* acc2 += x[6] * y[4] */ 00436 acc2 += ((q15_t) x2 * c0); 00437 /* acc3 += x[7] * y[4] */ 00438 acc3 += ((q15_t) x3 * c0); 00439 00440 /* Reuse the present samples for the next MAC */ 00441 x0 = x1; 00442 x1 = x2; 00443 x2 = x3; 00444 00445 /* Decrement the loop counter */ 00446 k--; 00447 } 00448 00449 /* Store the result in the accumulator in the destination buffer. */ 00450 *pOut = (q7_t) (__SSAT(acc0 >> 7, 8)); 00451 /* Destination pointer is updated according to the address modifier, inc */ 00452 pOut += inc; 00453 00454 *pOut = (q7_t) (__SSAT(acc1 >> 7, 8)); 00455 pOut += inc; 00456 00457 *pOut = (q7_t) (__SSAT(acc2 >> 7, 8)); 00458 pOut += inc; 00459 00460 *pOut = (q7_t) (__SSAT(acc3 >> 7, 8)); 00461 pOut += inc; 00462 00463 /* Update the inputA and inputB pointers for next MAC calculation */ 00464 px = pIn1 + (count * 4u); 00465 py = pIn2; 00466 00467 /* Increment the pointer pIn1 index, count by 1 */ 00468 count++; 00469 00470 /* Decrement the loop counter */ 00471 blkCnt--; 00472 } 00473 00474 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00475 ** No loop unrolling is used. */ 00476 blkCnt = blockSize2 % 0x4u; 00477 00478 while(blkCnt > 0u) 00479 { 00480 /* Accumulator is made zero for every iteration */ 00481 sum = 0; 00482 00483 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00484 k = srcBLen >> 2u; 00485 00486 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00487 ** a second loop below computes MACs for the remaining 1 to 3 samples. 
*/ 00488 while(k > 0u) 00489 { 00490 /* Reading two inputs of SrcA buffer and packing */ 00491 in1 = (q15_t) * px++; 00492 in2 = (q15_t) * px++; 00493 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00494 00495 /* Reading two inputs of SrcB buffer and packing */ 00496 in1 = (q15_t) * py++; 00497 in2 = (q15_t) * py++; 00498 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00499 00500 /* Perform the multiply-accumulates */ 00501 sum = __SMLAD(input1, input2, sum); 00502 00503 /* Reading two inputs of SrcA buffer and packing */ 00504 in1 = (q15_t) * px++; 00505 in2 = (q15_t) * px++; 00506 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00507 00508 /* Reading two inputs of SrcB buffer and packing */ 00509 in1 = (q15_t) * py++; 00510 in2 = (q15_t) * py++; 00511 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00512 00513 /* Perform the multiply-accumulates */ 00514 sum = __SMLAD(input1, input2, sum); 00515 00516 /* Decrement the loop counter */ 00517 k--; 00518 } 00519 00520 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00521 ** No loop unrolling is used. */ 00522 k = srcBLen % 0x4u; 00523 00524 while(k > 0u) 00525 { 00526 /* Perform the multiply-accumulates */ 00527 sum += ((q15_t) * px++ * *py++); 00528 00529 /* Decrement the loop counter */ 00530 k--; 00531 } 00532 00533 /* Store the result in the accumulator in the destination buffer. 
*/ 00534 *pOut = (q7_t) (__SSAT(sum >> 7, 8)); 00535 /* Destination pointer is updated according to the address modifier, inc */ 00536 pOut += inc; 00537 00538 /* Update the inputA and inputB pointers for next MAC calculation */ 00539 px = pIn1 + count; 00540 py = pIn2; 00541 00542 /* Increment the pointer pIn1 index, count by 1 */ 00543 count++; 00544 00545 /* Decrement the loop counter */ 00546 blkCnt--; 00547 } 00548 } 00549 else 00550 { 00551 /* If the srcBLen is not a multiple of 4, 00552 * the blockSize2 loop cannot be unrolled by 4 */ 00553 blkCnt = blockSize2; 00554 00555 while(blkCnt > 0u) 00556 { 00557 /* Accumulator is made zero for every iteration */ 00558 sum = 0; 00559 00560 /* Loop over srcBLen */ 00561 k = srcBLen; 00562 00563 while(k > 0u) 00564 { 00565 /* Perform the multiply-accumulate */ 00566 sum += ((q15_t) * px++ * *py++); 00567 00568 /* Decrement the loop counter */ 00569 k--; 00570 } 00571 00572 /* Store the result in the accumulator in the destination buffer. */ 00573 *pOut = (q7_t) (__SSAT(sum >> 7, 8)); 00574 /* Destination pointer is updated according to the address modifier, inc */ 00575 pOut += inc; 00576 00577 /* Update the inputA and inputB pointers for next MAC calculation */ 00578 px = pIn1 + count; 00579 py = pIn2; 00580 00581 /* Increment the MAC count */ 00582 count++; 00583 00584 /* Decrement the loop counter */ 00585 blkCnt--; 00586 } 00587 } 00588 00589 /* -------------------------- 00590 * Initializations of stage3 00591 * -------------------------*/ 00592 00593 /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00594 * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00595 * .... 00596 * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1] 00597 * sum += x[srcALen-1] * y[0] 00598 */ 00599 00600 /* In this stage the MAC operations are decreased by 1 for every iteration. 
00601 The count variable holds the number of MAC operations performed */ 00602 count = srcBLen - 1u; 00603 00604 /* Working pointer of inputA */ 00605 pSrc1 = pIn1 + (srcALen - (srcBLen - 1u)); 00606 px = pSrc1; 00607 00608 /* Working pointer of inputB */ 00609 py = pIn2; 00610 00611 /* ------------------- 00612 * Stage3 process 00613 * ------------------*/ 00614 00615 while(blockSize3 > 0u) 00616 { 00617 /* Accumulator is made zero for every iteration */ 00618 sum = 0; 00619 00620 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00621 k = count >> 2u; 00622 00623 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00624 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00625 while(k > 0u) 00626 { 00627 /* x[srcALen - srcBLen + 1] , x[srcALen - srcBLen + 2] */ 00628 in1 = (q15_t) * px++; 00629 in2 = (q15_t) * px++; 00630 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00631 00632 /* y[0] , y[1] */ 00633 in1 = (q15_t) * py++; 00634 in2 = (q15_t) * py++; 00635 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00636 00637 /* sum += x[srcALen - srcBLen + 1] * y[0] */ 00638 /* sum += x[srcALen - srcBLen + 2] * y[1] */ 00639 sum = __SMLAD(input1, input2, sum); 00640 00641 /* x[srcALen - srcBLen + 3] , x[srcALen - srcBLen + 4] */ 00642 in1 = (q15_t) * px++; 00643 in2 = (q15_t) * px++; 00644 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00645 00646 /* y[2] , y[3] */ 00647 in1 = (q15_t) * py++; 00648 in2 = (q15_t) * py++; 00649 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00650 00651 /* sum += x[srcALen - srcBLen + 3] * y[2] */ 00652 /* sum += x[srcALen - srcBLen + 4] * y[3] */ 00653 sum = __SMLAD(input1, input2, sum); 00654 00655 /* Decrement the loop counter */ 00656 k--; 00657 } 00658 00659 /* If the count is not a multiple of 4, compute any remaining MACs here. 00660 ** No loop unrolling is used. 
*/ 00661 k = count % 0x4u; 00662 00663 while(k > 0u) 00664 { 00665 /* Perform the multiply-accumulates */ 00666 sum += ((q15_t) * px++ * *py++); 00667 00668 /* Decrement the loop counter */ 00669 k--; 00670 } 00671 00672 /* Store the result in the accumulator in the destination buffer. */ 00673 *pOut = (q7_t) (__SSAT(sum >> 7, 8)); 00674 /* Destination pointer is updated according to the address modifier, inc */ 00675 pOut += inc; 00676 00677 /* Update the inputA and inputB pointers for next MAC calculation */ 00678 px = ++pSrc1; 00679 py = pIn2; 00680 00681 /* Decrement the MAC count */ 00682 count--; 00683 00684 /* Decrement the loop counter */ 00685 blockSize3--; 00686 } 00687 00688 } 00689 00690 /** 00691 * @} end of Corr group 00692 */
Generated on Tue Jul 12 2022 14:13:52 by Doxygen 1.7.2