Marco Zecchini
/
Example_RTOS
Rtos API example
Embed:
(wiki syntax)
Show/hide line numbers
arm_correlate_q7.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 19. March 2015 00005 * $Revision: V.1.4.5 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_correlate_q7.c 00009 * 00010 * Description: Correlation of Q7 sequences. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupFilters 00045 */ 00046 00047 /** 00048 * @addtogroup Corr 00049 * @{ 00050 */ 00051 00052 /** 00053 * @brief Correlation of Q7 sequences. 00054 * @param[in] *pSrcA points to the first input sequence. 00055 * @param[in] srcALen length of the first input sequence. 00056 * @param[in] *pSrcB points to the second input sequence. 00057 * @param[in] srcBLen length of the second input sequence. 00058 * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1. 00059 * @return none. 00060 * 00061 * @details 00062 * <b>Scaling and Overflow Behavior:</b> 00063 * 00064 * \par 00065 * The function is implemented using a 32-bit internal accumulator. 00066 * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result. 00067 * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format. 00068 * This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>. 00069 * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and saturated to 1.7 format. 00070 * 00071 * \par 00072 * Refer the function <code>arm_correlate_opt_q7()</code> for a faster implementation of this function. 
00073 * 00074 */ 00075 00076 void arm_correlate_q7( 00077 q7_t * pSrcA, 00078 uint32_t srcALen, 00079 q7_t * pSrcB, 00080 uint32_t srcBLen, 00081 q7_t * pDst) 00082 { 00083 00084 00085 #ifndef ARM_MATH_CM0_FAMILY 00086 00087 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00088 00089 q7_t *pIn1; /* inputA pointer */ 00090 q7_t *pIn2; /* inputB pointer */ 00091 q7_t *pOut = pDst; /* output pointer */ 00092 q7_t *px; /* Intermediate inputA pointer */ 00093 q7_t *py; /* Intermediate inputB pointer */ 00094 q7_t *pSrc1; /* Intermediate pointers */ 00095 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */ 00096 q31_t input1, input2; /* temporary variables */ 00097 q15_t in1, in2; /* temporary variables */ 00098 q7_t x0, x1, x2, x3, c0, c1; /* temporary variables for holding input and coefficient values */ 00099 uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */ 00100 int32_t inc = 1; 00101 00102 00103 /* The algorithm implementation is based on the lengths of the inputs. */ 00104 /* srcB is always made to slide across srcA. 
*/ 00105 /* So srcBLen is always considered as shorter or equal to srcALen */ 00106 /* But CORR(x, y) is reverse of CORR(y, x) */ 00107 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ 00108 /* and the destination pointer modifier, inc is set to -1 */ 00109 /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ 00110 /* But to improve the performance, 00111 * we include zeroes in the output instead of zero padding either of the the inputs*/ 00112 /* If srcALen > srcBLen, 00113 * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */ 00114 /* If srcALen < srcBLen, 00115 * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */ 00116 if(srcALen >= srcBLen) 00117 { 00118 /* Initialization of inputA pointer */ 00119 pIn1 = (pSrcA); 00120 00121 /* Initialization of inputB pointer */ 00122 pIn2 = (pSrcB); 00123 00124 /* Number of output samples is calculated */ 00125 outBlockSize = (2u * srcALen) - 1u; 00126 00127 /* When srcALen > srcBLen, zero padding is done to srcB 00128 * to make their lengths equal. 
00129 * Instead, (outBlockSize - (srcALen + srcBLen - 1)) 00130 * number of output samples are made zero */ 00131 j = outBlockSize - (srcALen + (srcBLen - 1u)); 00132 00133 /* Updating the pointer position to non zero value */ 00134 pOut += j; 00135 00136 } 00137 else 00138 { 00139 /* Initialization of inputA pointer */ 00140 pIn1 = (pSrcB); 00141 00142 /* Initialization of inputB pointer */ 00143 pIn2 = (pSrcA); 00144 00145 /* srcBLen is always considered as shorter or equal to srcALen */ 00146 j = srcBLen; 00147 srcBLen = srcALen; 00148 srcALen = j; 00149 00150 /* CORR(x, y) = Reverse order(CORR(y, x)) */ 00151 /* Hence set the destination pointer to point to the last output sample */ 00152 pOut = pDst + ((srcALen + srcBLen) - 2u); 00153 00154 /* Destination address modifier is set to -1 */ 00155 inc = -1; 00156 00157 } 00158 00159 /* The function is internally 00160 * divided into three parts according to the number of multiplications that has to be 00161 * taken place between inputA samples and inputB samples. In the first part of the 00162 * algorithm, the multiplications increase by one for every iteration. 00163 * In the second part of the algorithm, srcBLen number of multiplications are done. 00164 * In the third part of the algorithm, the multiplications decrease by one 00165 * for every iteration.*/ 00166 /* The algorithm is implemented in three stages. 00167 * The loop counters of each stage is initiated here. */ 00168 blockSize1 = srcBLen - 1u; 00169 blockSize2 = srcALen - (srcBLen - 1u); 00170 blockSize3 = blockSize1; 00171 00172 /* -------------------------- 00173 * Initializations of stage1 00174 * -------------------------*/ 00175 00176 /* sum = x[0] * y[srcBlen - 1] 00177 * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1] 00178 * .... 00179 * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1] 00180 */ 00181 00182 /* In this stage the MAC operations are increased by 1 for every iteration. 
00183 The count variable holds the number of MAC operations performed */ 00184 count = 1u; 00185 00186 /* Working pointer of inputA */ 00187 px = pIn1; 00188 00189 /* Working pointer of inputB */ 00190 pSrc1 = pIn2 + (srcBLen - 1u); 00191 py = pSrc1; 00192 00193 /* ------------------------ 00194 * Stage1 process 00195 * ----------------------*/ 00196 00197 /* The first stage starts here */ 00198 while(blockSize1 > 0u) 00199 { 00200 /* Accumulator is made zero for every iteration */ 00201 sum = 0; 00202 00203 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00204 k = count >> 2; 00205 00206 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00207 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00208 while(k > 0u) 00209 { 00210 /* x[0] , x[1] */ 00211 in1 = (q15_t) * px++; 00212 in2 = (q15_t) * px++; 00213 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00214 00215 /* y[srcBLen - 4] , y[srcBLen - 3] */ 00216 in1 = (q15_t) * py++; 00217 in2 = (q15_t) * py++; 00218 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00219 00220 /* x[0] * y[srcBLen - 4] */ 00221 /* x[1] * y[srcBLen - 3] */ 00222 sum = __SMLAD(input1, input2, sum); 00223 00224 /* x[2] , x[3] */ 00225 in1 = (q15_t) * px++; 00226 in2 = (q15_t) * px++; 00227 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00228 00229 /* y[srcBLen - 2] , y[srcBLen - 1] */ 00230 in1 = (q15_t) * py++; 00231 in2 = (q15_t) * py++; 00232 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00233 00234 /* x[2] * y[srcBLen - 2] */ 00235 /* x[3] * y[srcBLen - 1] */ 00236 sum = __SMLAD(input1, input2, sum); 00237 00238 00239 /* Decrement the loop counter */ 00240 k--; 00241 } 00242 00243 /* If the count is not a multiple of 4, compute any remaining MACs here. 00244 ** No loop unrolling is used. 
*/ 00245 k = count % 0x4u; 00246 00247 while(k > 0u) 00248 { 00249 /* Perform the multiply-accumulates */ 00250 /* x[0] * y[srcBLen - 1] */ 00251 sum += (q31_t) ((q15_t) * px++ * *py++); 00252 00253 /* Decrement the loop counter */ 00254 k--; 00255 } 00256 00257 /* Store the result in the accumulator in the destination buffer. */ 00258 *pOut = (q7_t) (__SSAT(sum >> 7, 8)); 00259 /* Destination pointer is updated according to the address modifier, inc */ 00260 pOut += inc; 00261 00262 /* Update the inputA and inputB pointers for next MAC calculation */ 00263 py = pSrc1 - count; 00264 px = pIn1; 00265 00266 /* Increment the MAC count */ 00267 count++; 00268 00269 /* Decrement the loop counter */ 00270 blockSize1--; 00271 } 00272 00273 /* -------------------------- 00274 * Initializations of stage2 00275 * ------------------------*/ 00276 00277 /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1] 00278 * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1] 00279 * .... 00280 * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00281 */ 00282 00283 /* Working pointer of inputA */ 00284 px = pIn1; 00285 00286 /* Working pointer of inputB */ 00287 py = pIn2; 00288 00289 /* count is index by which the pointer pIn1 to be incremented */ 00290 count = 0u; 00291 00292 /* ------------------- 00293 * Stage2 process 00294 * ------------------*/ 00295 00296 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 
00297 * So, to loop unroll over blockSize2, 00298 * srcBLen should be greater than or equal to 4 */ 00299 if(srcBLen >= 4u) 00300 { 00301 /* Loop unroll over blockSize2, by 4 */ 00302 blkCnt = blockSize2 >> 2u; 00303 00304 while(blkCnt > 0u) 00305 { 00306 /* Set all accumulators to zero */ 00307 acc0 = 0; 00308 acc1 = 0; 00309 acc2 = 0; 00310 acc3 = 0; 00311 00312 /* read x[0], x[1], x[2] samples */ 00313 x0 = *px++; 00314 x1 = *px++; 00315 x2 = *px++; 00316 00317 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00318 k = srcBLen >> 2u; 00319 00320 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00321 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00322 do 00323 { 00324 /* Read y[0] sample */ 00325 c0 = *py++; 00326 /* Read y[1] sample */ 00327 c1 = *py++; 00328 00329 /* Read x[3] sample */ 00330 x3 = *px++; 00331 00332 /* x[0] and x[1] are packed */ 00333 in1 = (q15_t) x0; 00334 in2 = (q15_t) x1; 00335 00336 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00337 00338 /* y[0] and y[1] are packed */ 00339 in1 = (q15_t) c0; 00340 in2 = (q15_t) c1; 00341 00342 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00343 00344 /* acc0 += x[0] * y[0] + x[1] * y[1] */ 00345 acc0 = __SMLAD(input1, input2, acc0); 00346 00347 /* x[1] and x[2] are packed */ 00348 in1 = (q15_t) x1; 00349 in2 = (q15_t) x2; 00350 00351 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00352 00353 /* acc1 += x[1] * y[0] + x[2] * y[1] */ 00354 acc1 = __SMLAD(input1, input2, acc1); 00355 00356 /* x[2] and x[3] are packed */ 00357 in1 = (q15_t) x2; 00358 in2 = (q15_t) x3; 00359 00360 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00361 00362 /* acc2 += x[2] * y[0] + x[3] * y[1] */ 00363 acc2 = __SMLAD(input1, input2, acc2); 00364 00365 /* Read x[4] sample */ 00366 x0 = *(px++); 00367 00368 /* x[3] and x[4] are packed */ 00369 in1 = (q15_t) x3; 00370 in2 = (q15_t) x0; 00371 00372 input1 = 
((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00373 00374 /* acc3 += x[3] * y[0] + x[4] * y[1] */ 00375 acc3 = __SMLAD(input1, input2, acc3); 00376 00377 /* Read y[2] sample */ 00378 c0 = *py++; 00379 /* Read y[3] sample */ 00380 c1 = *py++; 00381 00382 /* Read x[5] sample */ 00383 x1 = *px++; 00384 00385 /* x[2] and x[3] are packed */ 00386 in1 = (q15_t) x2; 00387 in2 = (q15_t) x3; 00388 00389 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00390 00391 /* y[2] and y[3] are packed */ 00392 in1 = (q15_t) c0; 00393 in2 = (q15_t) c1; 00394 00395 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00396 00397 /* acc0 += x[2] * y[2] + x[3] * y[3] */ 00398 acc0 = __SMLAD(input1, input2, acc0); 00399 00400 /* x[3] and x[4] are packed */ 00401 in1 = (q15_t) x3; 00402 in2 = (q15_t) x0; 00403 00404 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00405 00406 /* acc1 += x[3] * y[2] + x[4] * y[3] */ 00407 acc1 = __SMLAD(input1, input2, acc1); 00408 00409 /* x[4] and x[5] are packed */ 00410 in1 = (q15_t) x0; 00411 in2 = (q15_t) x1; 00412 00413 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00414 00415 /* acc2 += x[4] * y[2] + x[5] * y[3] */ 00416 acc2 = __SMLAD(input1, input2, acc2); 00417 00418 /* Read x[6] sample */ 00419 x2 = *px++; 00420 00421 /* x[5] and x[6] are packed */ 00422 in1 = (q15_t) x1; 00423 in2 = (q15_t) x2; 00424 00425 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00426 00427 /* acc3 += x[5] * y[2] + x[6] * y[3] */ 00428 acc3 = __SMLAD(input1, input2, acc3); 00429 00430 } while(--k); 00431 00432 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00433 ** No loop unrolling is used. 
*/ 00434 k = srcBLen % 0x4u; 00435 00436 while(k > 0u) 00437 { 00438 /* Read y[4] sample */ 00439 c0 = *py++; 00440 00441 /* Read x[7] sample */ 00442 x3 = *px++; 00443 00444 /* Perform the multiply-accumulates */ 00445 /* acc0 += x[4] * y[4] */ 00446 acc0 += ((q15_t) x0 * c0); 00447 /* acc1 += x[5] * y[4] */ 00448 acc1 += ((q15_t) x1 * c0); 00449 /* acc2 += x[6] * y[4] */ 00450 acc2 += ((q15_t) x2 * c0); 00451 /* acc3 += x[7] * y[4] */ 00452 acc3 += ((q15_t) x3 * c0); 00453 00454 /* Reuse the present samples for the next MAC */ 00455 x0 = x1; 00456 x1 = x2; 00457 x2 = x3; 00458 00459 /* Decrement the loop counter */ 00460 k--; 00461 } 00462 00463 /* Store the result in the accumulator in the destination buffer. */ 00464 *pOut = (q7_t) (__SSAT(acc0 >> 7, 8)); 00465 /* Destination pointer is updated according to the address modifier, inc */ 00466 pOut += inc; 00467 00468 *pOut = (q7_t) (__SSAT(acc1 >> 7, 8)); 00469 pOut += inc; 00470 00471 *pOut = (q7_t) (__SSAT(acc2 >> 7, 8)); 00472 pOut += inc; 00473 00474 *pOut = (q7_t) (__SSAT(acc3 >> 7, 8)); 00475 pOut += inc; 00476 00477 count += 4u; 00478 /* Update the inputA and inputB pointers for next MAC calculation */ 00479 px = pIn1 + count; 00480 py = pIn2; 00481 00482 /* Decrement the loop counter */ 00483 blkCnt--; 00484 } 00485 00486 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00487 ** No loop unrolling is used. */ 00488 blkCnt = blockSize2 % 0x4u; 00489 00490 while(blkCnt > 0u) 00491 { 00492 /* Accumulator is made zero for every iteration */ 00493 sum = 0; 00494 00495 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00496 k = srcBLen >> 2u; 00497 00498 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00499 ** a second loop below computes MACs for the remaining 1 to 3 samples. 
*/ 00500 while(k > 0u) 00501 { 00502 /* Reading two inputs of SrcA buffer and packing */ 00503 in1 = (q15_t) * px++; 00504 in2 = (q15_t) * px++; 00505 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00506 00507 /* Reading two inputs of SrcB buffer and packing */ 00508 in1 = (q15_t) * py++; 00509 in2 = (q15_t) * py++; 00510 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00511 00512 /* Perform the multiply-accumulates */ 00513 sum = __SMLAD(input1, input2, sum); 00514 00515 /* Reading two inputs of SrcA buffer and packing */ 00516 in1 = (q15_t) * px++; 00517 in2 = (q15_t) * px++; 00518 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00519 00520 /* Reading two inputs of SrcB buffer and packing */ 00521 in1 = (q15_t) * py++; 00522 in2 = (q15_t) * py++; 00523 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00524 00525 /* Perform the multiply-accumulates */ 00526 sum = __SMLAD(input1, input2, sum); 00527 00528 /* Decrement the loop counter */ 00529 k--; 00530 } 00531 00532 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00533 ** No loop unrolling is used. */ 00534 k = srcBLen % 0x4u; 00535 00536 while(k > 0u) 00537 { 00538 /* Perform the multiply-accumulates */ 00539 sum += ((q15_t) * px++ * *py++); 00540 00541 /* Decrement the loop counter */ 00542 k--; 00543 } 00544 00545 /* Store the result in the accumulator in the destination buffer. 
*/ 00546 *pOut = (q7_t) (__SSAT(sum >> 7, 8)); 00547 /* Destination pointer is updated according to the address modifier, inc */ 00548 pOut += inc; 00549 00550 /* Increment the pointer pIn1 index, count by 1 */ 00551 count++; 00552 00553 /* Update the inputA and inputB pointers for next MAC calculation */ 00554 px = pIn1 + count; 00555 py = pIn2; 00556 00557 /* Decrement the loop counter */ 00558 blkCnt--; 00559 } 00560 } 00561 else 00562 { 00563 /* If the srcBLen is not a multiple of 4, 00564 * the blockSize2 loop cannot be unrolled by 4 */ 00565 blkCnt = blockSize2; 00566 00567 while(blkCnt > 0u) 00568 { 00569 /* Accumulator is made zero for every iteration */ 00570 sum = 0; 00571 00572 /* Loop over srcBLen */ 00573 k = srcBLen; 00574 00575 while(k > 0u) 00576 { 00577 /* Perform the multiply-accumulate */ 00578 sum += ((q15_t) * px++ * *py++); 00579 00580 /* Decrement the loop counter */ 00581 k--; 00582 } 00583 00584 /* Store the result in the accumulator in the destination buffer. */ 00585 *pOut = (q7_t) (__SSAT(sum >> 7, 8)); 00586 /* Destination pointer is updated according to the address modifier, inc */ 00587 pOut += inc; 00588 00589 /* Increment the MAC count */ 00590 count++; 00591 00592 /* Update the inputA and inputB pointers for next MAC calculation */ 00593 px = pIn1 + count; 00594 py = pIn2; 00595 00596 00597 /* Decrement the loop counter */ 00598 blkCnt--; 00599 } 00600 } 00601 00602 /* -------------------------- 00603 * Initializations of stage3 00604 * -------------------------*/ 00605 00606 /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00607 * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00608 * .... 00609 * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1] 00610 * sum += x[srcALen-1] * y[0] 00611 */ 00612 00613 /* In this stage the MAC operations are decreased by 1 for every iteration. 
00614 The count variable holds the number of MAC operations performed */ 00615 count = srcBLen - 1u; 00616 00617 /* Working pointer of inputA */ 00618 pSrc1 = pIn1 + (srcALen - (srcBLen - 1u)); 00619 px = pSrc1; 00620 00621 /* Working pointer of inputB */ 00622 py = pIn2; 00623 00624 /* ------------------- 00625 * Stage3 process 00626 * ------------------*/ 00627 00628 while(blockSize3 > 0u) 00629 { 00630 /* Accumulator is made zero for every iteration */ 00631 sum = 0; 00632 00633 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00634 k = count >> 2u; 00635 00636 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00637 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00638 while(k > 0u) 00639 { 00640 /* x[srcALen - srcBLen + 1] , x[srcALen - srcBLen + 2] */ 00641 in1 = (q15_t) * px++; 00642 in2 = (q15_t) * px++; 00643 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00644 00645 /* y[0] , y[1] */ 00646 in1 = (q15_t) * py++; 00647 in2 = (q15_t) * py++; 00648 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00649 00650 /* sum += x[srcALen - srcBLen + 1] * y[0] */ 00651 /* sum += x[srcALen - srcBLen + 2] * y[1] */ 00652 sum = __SMLAD(input1, input2, sum); 00653 00654 /* x[srcALen - srcBLen + 3] , x[srcALen - srcBLen + 4] */ 00655 in1 = (q15_t) * px++; 00656 in2 = (q15_t) * px++; 00657 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00658 00659 /* y[2] , y[3] */ 00660 in1 = (q15_t) * py++; 00661 in2 = (q15_t) * py++; 00662 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00663 00664 /* sum += x[srcALen - srcBLen + 3] * y[2] */ 00665 /* sum += x[srcALen - srcBLen + 4] * y[3] */ 00666 sum = __SMLAD(input1, input2, sum); 00667 00668 /* Decrement the loop counter */ 00669 k--; 00670 } 00671 00672 /* If the count is not a multiple of 4, compute any remaining MACs here. 00673 ** No loop unrolling is used. 
*/ 00674 k = count % 0x4u; 00675 00676 while(k > 0u) 00677 { 00678 /* Perform the multiply-accumulates */ 00679 sum += ((q15_t) * px++ * *py++); 00680 00681 /* Decrement the loop counter */ 00682 k--; 00683 } 00684 00685 /* Store the result in the accumulator in the destination buffer. */ 00686 *pOut = (q7_t) (__SSAT(sum >> 7, 8)); 00687 /* Destination pointer is updated according to the address modifier, inc */ 00688 pOut += inc; 00689 00690 /* Update the inputA and inputB pointers for next MAC calculation */ 00691 px = ++pSrc1; 00692 py = pIn2; 00693 00694 /* Decrement the MAC count */ 00695 count--; 00696 00697 /* Decrement the loop counter */ 00698 blockSize3--; 00699 } 00700 00701 #else 00702 00703 /* Run the below code for Cortex-M0 */ 00704 00705 q7_t *pIn1 = pSrcA; /* inputA pointer */ 00706 q7_t *pIn2 = pSrcB + (srcBLen - 1u); /* inputB pointer */ 00707 q31_t sum; /* Accumulator */ 00708 uint32_t i = 0u, j; /* loop counters */ 00709 uint32_t inv = 0u; /* Reverse order flag */ 00710 uint32_t tot = 0u; /* Length */ 00711 00712 /* The algorithm implementation is based on the lengths of the inputs. */ 00713 /* srcB is always made to slide across srcA. */ 00714 /* So srcBLen is always considered as shorter or equal to srcALen */ 00715 /* But CORR(x, y) is reverse of CORR(y, x) */ 00716 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ 00717 /* and a varaible, inv is set to 1 */ 00718 /* If lengths are not equal then zero pad has to be done to make the two 00719 * inputs of same length. 
But to improve the performance, we include zeroes 00720 * in the output instead of zero padding either of the the inputs*/ 00721 /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the 00722 * starting of the output buffer */ 00723 /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the 00724 * ending of the output buffer */ 00725 /* Once the zero padding is done the remaining of the output is calcualted 00726 * using convolution but with the shorter signal time shifted. */ 00727 00728 /* Calculate the length of the remaining sequence */ 00729 tot = ((srcALen + srcBLen) - 2u); 00730 00731 if(srcALen > srcBLen) 00732 { 00733 /* Calculating the number of zeros to be padded to the output */ 00734 j = srcALen - srcBLen; 00735 00736 /* Initialise the pointer after zero padding */ 00737 pDst += j; 00738 } 00739 00740 else if(srcALen < srcBLen) 00741 { 00742 /* Initialization to inputB pointer */ 00743 pIn1 = pSrcB; 00744 00745 /* Initialization to the end of inputA pointer */ 00746 pIn2 = pSrcA + (srcALen - 1u); 00747 00748 /* Initialisation of the pointer after zero padding */ 00749 pDst = pDst + tot; 00750 00751 /* Swapping the lengths */ 00752 j = srcALen; 00753 srcALen = srcBLen; 00754 srcBLen = j; 00755 00756 /* Setting the reverse flag */ 00757 inv = 1; 00758 00759 } 00760 00761 /* Loop to calculate convolution for output length number of times */ 00762 for (i = 0u; i <= tot; i++) 00763 { 00764 /* Initialize sum with zero to carry on MAC operations */ 00765 sum = 0; 00766 00767 /* Loop to perform MAC operations according to convolution equation */ 00768 for (j = 0u; j <= i; j++) 00769 { 00770 /* Check the array limitations */ 00771 if((((i - j) < srcBLen) && (j < srcALen))) 00772 { 00773 /* z[i] += x[i-j] * y[j] */ 00774 sum += ((q15_t) pIn1[j] * pIn2[-((int32_t) i - j)]); 00775 } 00776 } 00777 /* Store the output in the destination buffer */ 00778 if(inv == 1) 00779 *pDst-- = (q7_t) __SSAT((sum >> 7u), 8u); 00780 else 00781 
*pDst++ = (q7_t) __SSAT((sum >> 7u), 8u); 00782 } 00783 00784 #endif /* #ifndef ARM_MATH_CM0_FAMILY */ 00785 00786 } 00787 00788 /** 00789 * @} end of Corr group 00790 */
Generated on Sun Jul 17 2022 08:25:19 by Doxygen 1.7.2