Added CMSIS5 DSP and NN folders. Needs some work.
Embed:
(wiki syntax)
Show/hide line numbers
arm_correlate_q7.c
00001 /* ---------------------------------------------------------------------- 00002 * Project: CMSIS DSP Library 00003 * Title: arm_correlate_q7.c 00004 * Description: Correlation of Q7 sequences 00005 * 00006 * $Date: 27. January 2017 00007 * $Revision: V.1.5.1 00008 * 00009 * Target Processor: Cortex-M cores 00010 * -------------------------------------------------------------------- */ 00011 /* 00012 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. 00013 * 00014 * SPDX-License-Identifier: Apache-2.0 00015 * 00016 * Licensed under the Apache License, Version 2.0 (the License); you may 00017 * not use this file except in compliance with the License. 00018 * You may obtain a copy of the License at 00019 * 00020 * www.apache.org/licenses/LICENSE-2.0 00021 * 00022 * Unless required by applicable law or agreed to in writing, software 00023 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 00024 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00025 * See the License for the specific language governing permissions and 00026 * limitations under the License. 00027 */ 00028 00029 #include "arm_math.h" 00030 00031 /** 00032 * @ingroup groupFilters 00033 */ 00034 00035 /** 00036 * @addtogroup Corr 00037 * @{ 00038 */ 00039 00040 /** 00041 * @brief Correlation of Q7 sequences. 00042 * @param[in] *pSrcA points to the first input sequence. 00043 * @param[in] srcALen length of the first input sequence. 00044 * @param[in] *pSrcB points to the second input sequence. 00045 * @param[in] srcBLen length of the second input sequence. 00046 * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1. 00047 * @return none. 00048 * 00049 * @details 00050 * <b>Scaling and Overflow Behavior:</b> 00051 * 00052 * \par 00053 * The function is implemented using a 32-bit internal accumulator. 
00054 * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result. 00055 * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format. 00056 * This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>. 00057 * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and saturated to 1.7 format. 00058 * 00059 * \par 00060 * Refer the function <code>arm_correlate_opt_q7()</code> for a faster implementation of this function. 00061 * 00062 */ 00063 00064 void arm_correlate_q7( 00065 q7_t * pSrcA, 00066 uint32_t srcALen, 00067 q7_t * pSrcB, 00068 uint32_t srcBLen, 00069 q7_t * pDst) 00070 { 00071 00072 00073 #if defined (ARM_MATH_DSP) 00074 00075 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00076 00077 q7_t *pIn1; /* inputA pointer */ 00078 q7_t *pIn2; /* inputB pointer */ 00079 q7_t *pOut = pDst; /* output pointer */ 00080 q7_t *px; /* Intermediate inputA pointer */ 00081 q7_t *py; /* Intermediate inputB pointer */ 00082 q7_t *pSrc1; /* Intermediate pointers */ 00083 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */ 00084 q31_t input1, input2; /* temporary variables */ 00085 q15_t in1, in2; /* temporary variables */ 00086 q7_t x0, x1, x2, x3, c0, c1; /* temporary variables for holding input and coefficient values */ 00087 uint32_t j, k = 0U, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */ 00088 int32_t inc = 1; 00089 00090 00091 /* The algorithm implementation is based on the lengths of the inputs. */ 00092 /* srcB is always made to slide across srcA. 
*/ 00093 /* So srcBLen is always considered as shorter or equal to srcALen */ 00094 /* But CORR(x, y) is reverse of CORR(y, x) */ 00095 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ 00096 /* and the destination pointer modifier, inc is set to -1 */ 00097 /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ 00098 /* But to improve the performance, 00099 * we include zeroes in the output instead of zero padding either of the the inputs*/ 00100 /* If srcALen > srcBLen, 00101 * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */ 00102 /* If srcALen < srcBLen, 00103 * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */ 00104 if (srcALen >= srcBLen) 00105 { 00106 /* Initialization of inputA pointer */ 00107 pIn1 = (pSrcA); 00108 00109 /* Initialization of inputB pointer */ 00110 pIn2 = (pSrcB); 00111 00112 /* Number of output samples is calculated */ 00113 outBlockSize = (2U * srcALen) - 1U; 00114 00115 /* When srcALen > srcBLen, zero padding is done to srcB 00116 * to make their lengths equal. 
00117 * Instead, (outBlockSize - (srcALen + srcBLen - 1)) 00118 * number of output samples are made zero */ 00119 j = outBlockSize - (srcALen + (srcBLen - 1U)); 00120 00121 /* Updating the pointer position to non zero value */ 00122 pOut += j; 00123 00124 } 00125 else 00126 { 00127 /* Initialization of inputA pointer */ 00128 pIn1 = (pSrcB); 00129 00130 /* Initialization of inputB pointer */ 00131 pIn2 = (pSrcA); 00132 00133 /* srcBLen is always considered as shorter or equal to srcALen */ 00134 j = srcBLen; 00135 srcBLen = srcALen; 00136 srcALen = j; 00137 00138 /* CORR(x, y) = Reverse order(CORR(y, x)) */ 00139 /* Hence set the destination pointer to point to the last output sample */ 00140 pOut = pDst + ((srcALen + srcBLen) - 2U); 00141 00142 /* Destination address modifier is set to -1 */ 00143 inc = -1; 00144 00145 } 00146 00147 /* The function is internally 00148 * divided into three parts according to the number of multiplications that has to be 00149 * taken place between inputA samples and inputB samples. In the first part of the 00150 * algorithm, the multiplications increase by one for every iteration. 00151 * In the second part of the algorithm, srcBLen number of multiplications are done. 00152 * In the third part of the algorithm, the multiplications decrease by one 00153 * for every iteration.*/ 00154 /* The algorithm is implemented in three stages. 00155 * The loop counters of each stage is initiated here. */ 00156 blockSize1 = srcBLen - 1U; 00157 blockSize2 = srcALen - (srcBLen - 1U); 00158 blockSize3 = blockSize1; 00159 00160 /* -------------------------- 00161 * Initializations of stage1 00162 * -------------------------*/ 00163 00164 /* sum = x[0] * y[srcBlen - 1] 00165 * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1] 00166 * .... 00167 * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1] 00168 */ 00169 00170 /* In this stage the MAC operations are increased by 1 for every iteration. 
00171 The count variable holds the number of MAC operations performed */ 00172 count = 1U; 00173 00174 /* Working pointer of inputA */ 00175 px = pIn1; 00176 00177 /* Working pointer of inputB */ 00178 pSrc1 = pIn2 + (srcBLen - 1U); 00179 py = pSrc1; 00180 00181 /* ------------------------ 00182 * Stage1 process 00183 * ----------------------*/ 00184 00185 /* The first stage starts here */ 00186 while (blockSize1 > 0U) 00187 { 00188 /* Accumulator is made zero for every iteration */ 00189 sum = 0; 00190 00191 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00192 k = count >> 2; 00193 00194 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00195 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00196 while (k > 0U) 00197 { 00198 /* x[0] , x[1] */ 00199 in1 = (q15_t) * px++; 00200 in2 = (q15_t) * px++; 00201 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00202 00203 /* y[srcBLen - 4] , y[srcBLen - 3] */ 00204 in1 = (q15_t) * py++; 00205 in2 = (q15_t) * py++; 00206 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00207 00208 /* x[0] * y[srcBLen - 4] */ 00209 /* x[1] * y[srcBLen - 3] */ 00210 sum = __SMLAD(input1, input2, sum); 00211 00212 /* x[2] , x[3] */ 00213 in1 = (q15_t) * px++; 00214 in2 = (q15_t) * px++; 00215 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00216 00217 /* y[srcBLen - 2] , y[srcBLen - 1] */ 00218 in1 = (q15_t) * py++; 00219 in2 = (q15_t) * py++; 00220 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00221 00222 /* x[2] * y[srcBLen - 2] */ 00223 /* x[3] * y[srcBLen - 1] */ 00224 sum = __SMLAD(input1, input2, sum); 00225 00226 00227 /* Decrement the loop counter */ 00228 k--; 00229 } 00230 00231 /* If the count is not a multiple of 4, compute any remaining MACs here. 00232 ** No loop unrolling is used. 
*/ 00233 k = count % 0x4U; 00234 00235 while (k > 0U) 00236 { 00237 /* Perform the multiply-accumulates */ 00238 /* x[0] * y[srcBLen - 1] */ 00239 sum += (q31_t) ((q15_t) * px++ * *py++); 00240 00241 /* Decrement the loop counter */ 00242 k--; 00243 } 00244 00245 /* Store the result in the accumulator in the destination buffer. */ 00246 *pOut = (q7_t) (__SSAT(sum >> 7, 8)); 00247 /* Destination pointer is updated according to the address modifier, inc */ 00248 pOut += inc; 00249 00250 /* Update the inputA and inputB pointers for next MAC calculation */ 00251 py = pSrc1 - count; 00252 px = pIn1; 00253 00254 /* Increment the MAC count */ 00255 count++; 00256 00257 /* Decrement the loop counter */ 00258 blockSize1--; 00259 } 00260 00261 /* -------------------------- 00262 * Initializations of stage2 00263 * ------------------------*/ 00264 00265 /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1] 00266 * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1] 00267 * .... 00268 * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00269 */ 00270 00271 /* Working pointer of inputA */ 00272 px = pIn1; 00273 00274 /* Working pointer of inputB */ 00275 py = pIn2; 00276 00277 /* count is index by which the pointer pIn1 to be incremented */ 00278 count = 0U; 00279 00280 /* ------------------- 00281 * Stage2 process 00282 * ------------------*/ 00283 00284 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 
00285 * So, to loop unroll over blockSize2, 00286 * srcBLen should be greater than or equal to 4 */ 00287 if (srcBLen >= 4U) 00288 { 00289 /* Loop unroll over blockSize2, by 4 */ 00290 blkCnt = blockSize2 >> 2U; 00291 00292 while (blkCnt > 0U) 00293 { 00294 /* Set all accumulators to zero */ 00295 acc0 = 0; 00296 acc1 = 0; 00297 acc2 = 0; 00298 acc3 = 0; 00299 00300 /* read x[0], x[1], x[2] samples */ 00301 x0 = *px++; 00302 x1 = *px++; 00303 x2 = *px++; 00304 00305 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00306 k = srcBLen >> 2U; 00307 00308 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00309 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00310 do 00311 { 00312 /* Read y[0] sample */ 00313 c0 = *py++; 00314 /* Read y[1] sample */ 00315 c1 = *py++; 00316 00317 /* Read x[3] sample */ 00318 x3 = *px++; 00319 00320 /* x[0] and x[1] are packed */ 00321 in1 = (q15_t) x0; 00322 in2 = (q15_t) x1; 00323 00324 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00325 00326 /* y[0] and y[1] are packed */ 00327 in1 = (q15_t) c0; 00328 in2 = (q15_t) c1; 00329 00330 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00331 00332 /* acc0 += x[0] * y[0] + x[1] * y[1] */ 00333 acc0 = __SMLAD(input1, input2, acc0); 00334 00335 /* x[1] and x[2] are packed */ 00336 in1 = (q15_t) x1; 00337 in2 = (q15_t) x2; 00338 00339 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00340 00341 /* acc1 += x[1] * y[0] + x[2] * y[1] */ 00342 acc1 = __SMLAD(input1, input2, acc1); 00343 00344 /* x[2] and x[3] are packed */ 00345 in1 = (q15_t) x2; 00346 in2 = (q15_t) x3; 00347 00348 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00349 00350 /* acc2 += x[2] * y[0] + x[3] * y[1] */ 00351 acc2 = __SMLAD(input1, input2, acc2); 00352 00353 /* Read x[4] sample */ 00354 x0 = *(px++); 00355 00356 /* x[3] and x[4] are packed */ 00357 in1 = (q15_t) x3; 00358 in2 = (q15_t) x0; 00359 00360 input1 = 
((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00361 00362 /* acc3 += x[3] * y[0] + x[4] * y[1] */ 00363 acc3 = __SMLAD(input1, input2, acc3); 00364 00365 /* Read y[2] sample */ 00366 c0 = *py++; 00367 /* Read y[3] sample */ 00368 c1 = *py++; 00369 00370 /* Read x[5] sample */ 00371 x1 = *px++; 00372 00373 /* x[2] and x[3] are packed */ 00374 in1 = (q15_t) x2; 00375 in2 = (q15_t) x3; 00376 00377 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00378 00379 /* y[2] and y[3] are packed */ 00380 in1 = (q15_t) c0; 00381 in2 = (q15_t) c1; 00382 00383 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00384 00385 /* acc0 += x[2] * y[2] + x[3] * y[3] */ 00386 acc0 = __SMLAD(input1, input2, acc0); 00387 00388 /* x[3] and x[4] are packed */ 00389 in1 = (q15_t) x3; 00390 in2 = (q15_t) x0; 00391 00392 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00393 00394 /* acc1 += x[3] * y[2] + x[4] * y[3] */ 00395 acc1 = __SMLAD(input1, input2, acc1); 00396 00397 /* x[4] and x[5] are packed */ 00398 in1 = (q15_t) x0; 00399 in2 = (q15_t) x1; 00400 00401 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00402 00403 /* acc2 += x[4] * y[2] + x[5] * y[3] */ 00404 acc2 = __SMLAD(input1, input2, acc2); 00405 00406 /* Read x[6] sample */ 00407 x2 = *px++; 00408 00409 /* x[5] and x[6] are packed */ 00410 in1 = (q15_t) x1; 00411 in2 = (q15_t) x2; 00412 00413 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00414 00415 /* acc3 += x[5] * y[2] + x[6] * y[3] */ 00416 acc3 = __SMLAD(input1, input2, acc3); 00417 00418 } while (--k); 00419 00420 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00421 ** No loop unrolling is used. 
*/ 00422 k = srcBLen % 0x4U; 00423 00424 while (k > 0U) 00425 { 00426 /* Read y[4] sample */ 00427 c0 = *py++; 00428 00429 /* Read x[7] sample */ 00430 x3 = *px++; 00431 00432 /* Perform the multiply-accumulates */ 00433 /* acc0 += x[4] * y[4] */ 00434 acc0 += ((q15_t) x0 * c0); 00435 /* acc1 += x[5] * y[4] */ 00436 acc1 += ((q15_t) x1 * c0); 00437 /* acc2 += x[6] * y[4] */ 00438 acc2 += ((q15_t) x2 * c0); 00439 /* acc3 += x[7] * y[4] */ 00440 acc3 += ((q15_t) x3 * c0); 00441 00442 /* Reuse the present samples for the next MAC */ 00443 x0 = x1; 00444 x1 = x2; 00445 x2 = x3; 00446 00447 /* Decrement the loop counter */ 00448 k--; 00449 } 00450 00451 /* Store the result in the accumulator in the destination buffer. */ 00452 *pOut = (q7_t) (__SSAT(acc0 >> 7, 8)); 00453 /* Destination pointer is updated according to the address modifier, inc */ 00454 pOut += inc; 00455 00456 *pOut = (q7_t) (__SSAT(acc1 >> 7, 8)); 00457 pOut += inc; 00458 00459 *pOut = (q7_t) (__SSAT(acc2 >> 7, 8)); 00460 pOut += inc; 00461 00462 *pOut = (q7_t) (__SSAT(acc3 >> 7, 8)); 00463 pOut += inc; 00464 00465 count += 4U; 00466 /* Update the inputA and inputB pointers for next MAC calculation */ 00467 px = pIn1 + count; 00468 py = pIn2; 00469 00470 /* Decrement the loop counter */ 00471 blkCnt--; 00472 } 00473 00474 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00475 ** No loop unrolling is used. */ 00476 blkCnt = blockSize2 % 0x4U; 00477 00478 while (blkCnt > 0U) 00479 { 00480 /* Accumulator is made zero for every iteration */ 00481 sum = 0; 00482 00483 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00484 k = srcBLen >> 2U; 00485 00486 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00487 ** a second loop below computes MACs for the remaining 1 to 3 samples. 
*/ 00488 while (k > 0U) 00489 { 00490 /* Reading two inputs of SrcA buffer and packing */ 00491 in1 = (q15_t) * px++; 00492 in2 = (q15_t) * px++; 00493 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00494 00495 /* Reading two inputs of SrcB buffer and packing */ 00496 in1 = (q15_t) * py++; 00497 in2 = (q15_t) * py++; 00498 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00499 00500 /* Perform the multiply-accumulates */ 00501 sum = __SMLAD(input1, input2, sum); 00502 00503 /* Reading two inputs of SrcA buffer and packing */ 00504 in1 = (q15_t) * px++; 00505 in2 = (q15_t) * px++; 00506 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00507 00508 /* Reading two inputs of SrcB buffer and packing */ 00509 in1 = (q15_t) * py++; 00510 in2 = (q15_t) * py++; 00511 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00512 00513 /* Perform the multiply-accumulates */ 00514 sum = __SMLAD(input1, input2, sum); 00515 00516 /* Decrement the loop counter */ 00517 k--; 00518 } 00519 00520 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00521 ** No loop unrolling is used. */ 00522 k = srcBLen % 0x4U; 00523 00524 while (k > 0U) 00525 { 00526 /* Perform the multiply-accumulates */ 00527 sum += ((q15_t) * px++ * *py++); 00528 00529 /* Decrement the loop counter */ 00530 k--; 00531 } 00532 00533 /* Store the result in the accumulator in the destination buffer. 
*/ 00534 *pOut = (q7_t) (__SSAT(sum >> 7, 8)); 00535 /* Destination pointer is updated according to the address modifier, inc */ 00536 pOut += inc; 00537 00538 /* Increment the pointer pIn1 index, count by 1 */ 00539 count++; 00540 00541 /* Update the inputA and inputB pointers for next MAC calculation */ 00542 px = pIn1 + count; 00543 py = pIn2; 00544 00545 /* Decrement the loop counter */ 00546 blkCnt--; 00547 } 00548 } 00549 else 00550 { 00551 /* If the srcBLen is not a multiple of 4, 00552 * the blockSize2 loop cannot be unrolled by 4 */ 00553 blkCnt = blockSize2; 00554 00555 while (blkCnt > 0U) 00556 { 00557 /* Accumulator is made zero for every iteration */ 00558 sum = 0; 00559 00560 /* Loop over srcBLen */ 00561 k = srcBLen; 00562 00563 while (k > 0U) 00564 { 00565 /* Perform the multiply-accumulate */ 00566 sum += ((q15_t) * px++ * *py++); 00567 00568 /* Decrement the loop counter */ 00569 k--; 00570 } 00571 00572 /* Store the result in the accumulator in the destination buffer. */ 00573 *pOut = (q7_t) (__SSAT(sum >> 7, 8)); 00574 /* Destination pointer is updated according to the address modifier, inc */ 00575 pOut += inc; 00576 00577 /* Increment the MAC count */ 00578 count++; 00579 00580 /* Update the inputA and inputB pointers for next MAC calculation */ 00581 px = pIn1 + count; 00582 py = pIn2; 00583 00584 00585 /* Decrement the loop counter */ 00586 blkCnt--; 00587 } 00588 } 00589 00590 /* -------------------------- 00591 * Initializations of stage3 00592 * -------------------------*/ 00593 00594 /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00595 * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00596 * .... 00597 * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1] 00598 * sum += x[srcALen-1] * y[0] 00599 */ 00600 00601 /* In this stage the MAC operations are decreased by 1 for every iteration. 
00602 The count variable holds the number of MAC operations performed */ 00603 count = srcBLen - 1U; 00604 00605 /* Working pointer of inputA */ 00606 pSrc1 = pIn1 + (srcALen - (srcBLen - 1U)); 00607 px = pSrc1; 00608 00609 /* Working pointer of inputB */ 00610 py = pIn2; 00611 00612 /* ------------------- 00613 * Stage3 process 00614 * ------------------*/ 00615 00616 while (blockSize3 > 0U) 00617 { 00618 /* Accumulator is made zero for every iteration */ 00619 sum = 0; 00620 00621 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00622 k = count >> 2U; 00623 00624 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00625 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00626 while (k > 0U) 00627 { 00628 /* x[srcALen - srcBLen + 1] , x[srcALen - srcBLen + 2] */ 00629 in1 = (q15_t) * px++; 00630 in2 = (q15_t) * px++; 00631 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00632 00633 /* y[0] , y[1] */ 00634 in1 = (q15_t) * py++; 00635 in2 = (q15_t) * py++; 00636 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00637 00638 /* sum += x[srcALen - srcBLen + 1] * y[0] */ 00639 /* sum += x[srcALen - srcBLen + 2] * y[1] */ 00640 sum = __SMLAD(input1, input2, sum); 00641 00642 /* x[srcALen - srcBLen + 3] , x[srcALen - srcBLen + 4] */ 00643 in1 = (q15_t) * px++; 00644 in2 = (q15_t) * px++; 00645 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00646 00647 /* y[2] , y[3] */ 00648 in1 = (q15_t) * py++; 00649 in2 = (q15_t) * py++; 00650 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00651 00652 /* sum += x[srcALen - srcBLen + 3] * y[2] */ 00653 /* sum += x[srcALen - srcBLen + 4] * y[3] */ 00654 sum = __SMLAD(input1, input2, sum); 00655 00656 /* Decrement the loop counter */ 00657 k--; 00658 } 00659 00660 /* If the count is not a multiple of 4, compute any remaining MACs here. 00661 ** No loop unrolling is used. 
*/ 00662 k = count % 0x4U; 00663 00664 while (k > 0U) 00665 { 00666 /* Perform the multiply-accumulates */ 00667 sum += ((q15_t) * px++ * *py++); 00668 00669 /* Decrement the loop counter */ 00670 k--; 00671 } 00672 00673 /* Store the result in the accumulator in the destination buffer. */ 00674 *pOut = (q7_t) (__SSAT(sum >> 7, 8)); 00675 /* Destination pointer is updated according to the address modifier, inc */ 00676 pOut += inc; 00677 00678 /* Update the inputA and inputB pointers for next MAC calculation */ 00679 px = ++pSrc1; 00680 py = pIn2; 00681 00682 /* Decrement the MAC count */ 00683 count--; 00684 00685 /* Decrement the loop counter */ 00686 blockSize3--; 00687 } 00688 00689 #else 00690 00691 /* Run the below code for Cortex-M0 */ 00692 00693 q7_t *pIn1 = pSrcA; /* inputA pointer */ 00694 q7_t *pIn2 = pSrcB + (srcBLen - 1U); /* inputB pointer */ 00695 q31_t sum; /* Accumulator */ 00696 uint32_t i = 0U, j; /* loop counters */ 00697 uint32_t inv = 0U; /* Reverse order flag */ 00698 uint32_t tot = 0U; /* Length */ 00699 00700 /* The algorithm implementation is based on the lengths of the inputs. */ 00701 /* srcB is always made to slide across srcA. */ 00702 /* So srcBLen is always considered as shorter or equal to srcALen */ 00703 /* But CORR(x, y) is reverse of CORR(y, x) */ 00704 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ 00705 /* and a varaible, inv is set to 1 */ 00706 /* If lengths are not equal then zero pad has to be done to make the two 00707 * inputs of same length. 
But to improve the performance, we include zeroes 00708 * in the output instead of zero padding either of the the inputs*/ 00709 /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the 00710 * starting of the output buffer */ 00711 /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the 00712 * ending of the output buffer */ 00713 /* Once the zero padding is done the remaining of the output is calcualted 00714 * using convolution but with the shorter signal time shifted. */ 00715 00716 /* Calculate the length of the remaining sequence */ 00717 tot = ((srcALen + srcBLen) - 2U); 00718 00719 if (srcALen > srcBLen) 00720 { 00721 /* Calculating the number of zeros to be padded to the output */ 00722 j = srcALen - srcBLen; 00723 00724 /* Initialise the pointer after zero padding */ 00725 pDst += j; 00726 } 00727 00728 else if (srcALen < srcBLen) 00729 { 00730 /* Initialization to inputB pointer */ 00731 pIn1 = pSrcB; 00732 00733 /* Initialization to the end of inputA pointer */ 00734 pIn2 = pSrcA + (srcALen - 1U); 00735 00736 /* Initialisation of the pointer after zero padding */ 00737 pDst = pDst + tot; 00738 00739 /* Swapping the lengths */ 00740 j = srcALen; 00741 srcALen = srcBLen; 00742 srcBLen = j; 00743 00744 /* Setting the reverse flag */ 00745 inv = 1; 00746 00747 } 00748 00749 /* Loop to calculate convolution for output length number of times */ 00750 for (i = 0U; i <= tot; i++) 00751 { 00752 /* Initialize sum with zero to carry on MAC operations */ 00753 sum = 0; 00754 00755 /* Loop to perform MAC operations according to convolution equation */ 00756 for (j = 0U; j <= i; j++) 00757 { 00758 /* Check the array limitations */ 00759 if ((((i - j) < srcBLen) && (j < srcALen))) 00760 { 00761 /* z[i] += x[i-j] * y[j] */ 00762 sum += ((q15_t) pIn1[j] * pIn2[-((int32_t) i - j)]); 00763 } 00764 } 00765 /* Store the output in the destination buffer */ 00766 if (inv == 1) 00767 *pDst-- = (q7_t) __SSAT((sum >> 7U), 8U); 00768 else 
00769 *pDst++ = (q7_t) __SSAT((sum >> 7U), 8U); 00770 } 00771 00772 #endif /* #if defined (ARM_MATH_DSP) */ 00773 00774 } 00775 00776 /** 00777 * @} end of Corr group 00778 */ 00779
Generated on Tue Jul 12 2022 16:46:23 by 1.7.2