CMSIS DSP library
Dependents: performance_timer Surfboard_ gps2rtty Capstone ... more
arm_correlate_fast_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 19. March 2015 00005 * $Revision: V.1.4.5 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_correlate_fast_q15.c 00009 * 00010 * Description: Fast Q15 Correlation. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupFilters 00045 */ 00046 00047 /** 00048 * @addtogroup Corr 00049 * @{ 00050 */ 00051 00052 /** 00053 * @brief Correlation of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4. 00054 * @param[in] *pSrcA points to the first input sequence. 00055 * @param[in] srcALen length of the first input sequence. 00056 * @param[in] *pSrcB points to the second input sequence. 00057 * @param[in] srcBLen length of the second input sequence. 00058 * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1. 00059 * @return none. 00060 * 00061 * <b>Scaling and Overflow Behavior:</b> 00062 * 00063 * \par 00064 * This fast version uses a 32-bit accumulator with 2.30 format. 00065 * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit. 00066 * There is no saturation on intermediate additions. 00067 * Thus, if the accumulator overflows it wraps around and distorts the result. 00068 * The input signals should be scaled down to avoid intermediate overflows. 00069 * Scale down one of the inputs by 1/min(srcALen, srcBLen) to avoid overflow, since at 00070 * most min(srcALen, srcBLen) additions are carried out internally. 
00071 * The 2.30 accumulator is right shifted by 15 bits and then narrowed to 1.15 format with a plain cast (no saturation is applied) to yield the final result. 00072 * 00073 * \par 00074 * See <code>arm_correlate_q15()</code> for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion. 00075 */ 00076 00077 void arm_correlate_fast_q15( 00078 q15_t * pSrcA, 00079 uint32_t srcALen, 00080 q15_t * pSrcB, 00081 uint32_t srcBLen, 00082 q15_t * pDst) 00083 { 00084 #ifndef UNALIGNED_SUPPORT_DISABLE 00085 00086 q15_t *pIn1; /* inputA pointer */ 00087 q15_t *pIn2; /* inputB pointer */ 00088 q15_t *pOut = pDst; /* output pointer */ 00089 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */ 00090 q15_t *px; /* Intermediate inputA pointer */ 00091 q15_t *py; /* Intermediate inputB pointer */ 00092 q15_t *pSrc1; /* Intermediate pointers */ 00093 q31_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */ 00094 uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */ 00095 int32_t inc = 1; /* Destination address modifier */ 00096 00097 00098 /* The algorithm implementation is based on the lengths of the inputs. */ 00099 /* srcB is always made to slide across srcA. 
*/ 00100 /* So srcBLen is always considered as shorter or equal to srcALen */ 00101 /* But CORR(x, y) is reverse of CORR(y, x) */ 00102 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ 00103 /* and the destination pointer modifier, inc is set to -1 */ 00104 /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ 00105 /* But to improve the performance, 00106 * we include zeroes in the output instead of zero padding either of the the inputs*/ 00107 /* If srcALen > srcBLen, 00108 * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */ 00109 /* If srcALen < srcBLen, 00110 * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */ 00111 if(srcALen >= srcBLen) 00112 { 00113 /* Initialization of inputA pointer */ 00114 pIn1 = (pSrcA); 00115 00116 /* Initialization of inputB pointer */ 00117 pIn2 = (pSrcB); 00118 00119 /* Number of output samples is calculated */ 00120 outBlockSize = (2u * srcALen) - 1u; 00121 00122 /* When srcALen > srcBLen, zero padding is done to srcB 00123 * to make their lengths equal. 
00124 * Instead, (outBlockSize - (srcALen + srcBLen - 1)) 00125 * number of output samples are made zero */ 00126 j = outBlockSize - (srcALen + (srcBLen - 1u)); 00127 00128 /* Updating the pointer position to non zero value */ 00129 pOut += j; 00130 00131 } 00132 else 00133 { 00134 /* Initialization of inputA pointer */ 00135 pIn1 = (pSrcB); 00136 00137 /* Initialization of inputB pointer */ 00138 pIn2 = (pSrcA); 00139 00140 /* srcBLen is always considered as shorter or equal to srcALen */ 00141 j = srcBLen; 00142 srcBLen = srcALen; 00143 srcALen = j; 00144 00145 /* CORR(x, y) = Reverse order(CORR(y, x)) */ 00146 /* Hence set the destination pointer to point to the last output sample */ 00147 pOut = pDst + ((srcALen + srcBLen) - 2u); 00148 00149 /* Destination address modifier is set to -1 */ 00150 inc = -1; 00151 00152 } 00153 00154 /* The function is internally 00155 * divided into three parts according to the number of multiplications that has to be 00156 * taken place between inputA samples and inputB samples. In the first part of the 00157 * algorithm, the multiplications increase by one for every iteration. 00158 * In the second part of the algorithm, srcBLen number of multiplications are done. 00159 * In the third part of the algorithm, the multiplications decrease by one 00160 * for every iteration.*/ 00161 /* The algorithm is implemented in three stages. 00162 * The loop counters of each stage is initiated here. */ 00163 blockSize1 = srcBLen - 1u; 00164 blockSize2 = srcALen - (srcBLen - 1u); 00165 blockSize3 = blockSize1; 00166 00167 /* -------------------------- 00168 * Initializations of stage1 00169 * -------------------------*/ 00170 00171 /* sum = x[0] * y[srcBlen - 1] 00172 * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1] 00173 * .... 00174 * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1] 00175 */ 00176 00177 /* In this stage the MAC operations are increased by 1 for every iteration. 
00178 The count variable holds the number of MAC operations performed */ 00179 count = 1u; 00180 00181 /* Working pointer of inputA */ 00182 px = pIn1; 00183 00184 /* Working pointer of inputB */ 00185 pSrc1 = pIn2 + (srcBLen - 1u); 00186 py = pSrc1; 00187 00188 /* ------------------------ 00189 * Stage1 process 00190 * ----------------------*/ 00191 00192 /* The first loop starts here */ 00193 while(blockSize1 > 0u) 00194 { 00195 /* Accumulator is made zero for every iteration */ 00196 sum = 0; 00197 00198 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00199 k = count >> 2; 00200 00201 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00202 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00203 while(k > 0u) 00204 { 00205 /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */ 00206 sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum); 00207 /* x[3] * y[srcBLen - 1] , x[2] * y[srcBLen - 2] */ 00208 sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum); 00209 00210 /* Decrement the loop counter */ 00211 k--; 00212 } 00213 00214 /* If the count is not a multiple of 4, compute any remaining MACs here. 00215 ** No loop unrolling is used. */ 00216 k = count % 0x4u; 00217 00218 while(k > 0u) 00219 { 00220 /* Perform the multiply-accumulates */ 00221 /* x[0] * y[srcBLen - 1] */ 00222 sum = __SMLAD(*px++, *py++, sum); 00223 00224 /* Decrement the loop counter */ 00225 k--; 00226 } 00227 00228 /* Store the result in the accumulator in the destination buffer. 
*/ 00229 *pOut = (q15_t) (sum >> 15); 00230 /* Destination pointer is updated according to the address modifier, inc */ 00231 pOut += inc; 00232 00233 /* Update the inputA and inputB pointers for next MAC calculation */ 00234 py = pSrc1 - count; 00235 px = pIn1; 00236 00237 /* Increment the MAC count */ 00238 count++; 00239 00240 /* Decrement the loop counter */ 00241 blockSize1--; 00242 } 00243 00244 /* -------------------------- 00245 * Initializations of stage2 00246 * ------------------------*/ 00247 00248 /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1] 00249 * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1] 00250 * .... 00251 * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00252 */ 00253 00254 /* Working pointer of inputA */ 00255 px = pIn1; 00256 00257 /* Working pointer of inputB */ 00258 py = pIn2; 00259 00260 /* count is index by which the pointer pIn1 to be incremented */ 00261 count = 0u; 00262 00263 /* ------------------- 00264 * Stage2 process 00265 * ------------------*/ 00266 00267 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00268 * So, to loop unroll over blockSize2, 00269 * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */ 00270 if(srcBLen >= 4u) 00271 { 00272 /* Loop unroll over blockSize2, by 4 */ 00273 blkCnt = blockSize2 >> 2u; 00274 00275 while(blkCnt > 0u) 00276 { 00277 /* Set all accumulators to zero */ 00278 acc0 = 0; 00279 acc1 = 0; 00280 acc2 = 0; 00281 acc3 = 0; 00282 00283 /* read x[0], x[1] samples */ 00284 x0 = *__SIMD32(px); 00285 /* read x[1], x[2] samples */ 00286 x1 = _SIMD32_OFFSET(px + 1); 00287 px += 2u; 00288 00289 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00290 k = srcBLen >> 2u; 00291 00292 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00293 ** a second loop below computes MACs for the remaining 1 to 3 samples. 
*/ 00294 do 00295 { 00296 /* Read the first two inputB samples using SIMD: 00297 * y[0] and y[1] */ 00298 c0 = *__SIMD32(py)++; 00299 00300 /* acc0 += x[0] * y[0] + x[1] * y[1] */ 00301 acc0 = __SMLAD(x0, c0, acc0); 00302 00303 /* acc1 += x[1] * y[0] + x[2] * y[1] */ 00304 acc1 = __SMLAD(x1, c0, acc1); 00305 00306 /* Read x[2], x[3] */ 00307 x2 = *__SIMD32(px); 00308 00309 /* Read x[3], x[4] */ 00310 x3 = _SIMD32_OFFSET(px + 1); 00311 00312 /* acc2 += x[2] * y[0] + x[3] * y[1] */ 00313 acc2 = __SMLAD(x2, c0, acc2); 00314 00315 /* acc3 += x[3] * y[0] + x[4] * y[1] */ 00316 acc3 = __SMLAD(x3, c0, acc3); 00317 00318 /* Read y[2] and y[3] */ 00319 c0 = *__SIMD32(py)++; 00320 00321 /* acc0 += x[2] * y[2] + x[3] * y[3] */ 00322 acc0 = __SMLAD(x2, c0, acc0); 00323 00324 /* acc1 += x[3] * y[2] + x[4] * y[3] */ 00325 acc1 = __SMLAD(x3, c0, acc1); 00326 00327 /* Read x[4], x[5] */ 00328 x0 = _SIMD32_OFFSET(px + 2); 00329 00330 /* Read x[5], x[6] */ 00331 x1 = _SIMD32_OFFSET(px + 3); 00332 px += 4u; 00333 00334 /* acc2 += x[4] * y[2] + x[5] * y[3] */ 00335 acc2 = __SMLAD(x0, c0, acc2); 00336 00337 /* acc3 += x[5] * y[2] + x[6] * y[3] */ 00338 acc3 = __SMLAD(x1, c0, acc3); 00339 00340 } while(--k); 00341 00342 /* For the next MAC operations, SIMD is not used 00343 * So, the 16 bit pointer if inputB, py is updated */ 00344 00345 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00346 ** No loop unrolling is used. 
*/ 00347 k = srcBLen % 0x4u; 00348 00349 if(k == 1u) 00350 { 00351 /* Read y[4] */ 00352 c0 = *py; 00353 #ifdef ARM_MATH_BIG_ENDIAN 00354 00355 c0 = c0 << 16u; 00356 00357 #else 00358 00359 c0 = c0 & 0x0000FFFF; 00360 00361 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 00362 00363 /* Read x[7] */ 00364 x3 = *__SIMD32(px); 00365 px++; 00366 00367 /* Perform the multiply-accumulates */ 00368 acc0 = __SMLAD(x0, c0, acc0); 00369 acc1 = __SMLAD(x1, c0, acc1); 00370 acc2 = __SMLADX(x1, c0, acc2); 00371 acc3 = __SMLADX(x3, c0, acc3); 00372 } 00373 00374 if(k == 2u) 00375 { 00376 /* Read y[4], y[5] */ 00377 c0 = *__SIMD32(py); 00378 00379 /* Read x[7], x[8] */ 00380 x3 = *__SIMD32(px); 00381 00382 /* Read x[9] */ 00383 x2 = _SIMD32_OFFSET(px + 1); 00384 px += 2u; 00385 00386 /* Perform the multiply-accumulates */ 00387 acc0 = __SMLAD(x0, c0, acc0); 00388 acc1 = __SMLAD(x1, c0, acc1); 00389 acc2 = __SMLAD(x3, c0, acc2); 00390 acc3 = __SMLAD(x2, c0, acc3); 00391 } 00392 00393 if(k == 3u) 00394 { 00395 /* Read y[4], y[5] */ 00396 c0 = *__SIMD32(py)++; 00397 00398 /* Read x[7], x[8] */ 00399 x3 = *__SIMD32(px); 00400 00401 /* Read x[9] */ 00402 x2 = _SIMD32_OFFSET(px + 1); 00403 00404 /* Perform the multiply-accumulates */ 00405 acc0 = __SMLAD(x0, c0, acc0); 00406 acc1 = __SMLAD(x1, c0, acc1); 00407 acc2 = __SMLAD(x3, c0, acc2); 00408 acc3 = __SMLAD(x2, c0, acc3); 00409 00410 c0 = (*py); 00411 /* Read y[6] */ 00412 #ifdef ARM_MATH_BIG_ENDIAN 00413 00414 c0 = c0 << 16u; 00415 #else 00416 00417 c0 = c0 & 0x0000FFFF; 00418 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 00419 00420 /* Read x[10] */ 00421 x3 = _SIMD32_OFFSET(px + 2); 00422 px += 3u; 00423 00424 /* Perform the multiply-accumulates */ 00425 acc0 = __SMLADX(x1, c0, acc0); 00426 acc1 = __SMLAD(x2, c0, acc1); 00427 acc2 = __SMLADX(x2, c0, acc2); 00428 acc3 = __SMLADX(x3, c0, acc3); 00429 } 00430 00431 /* Store the result in the accumulator in the destination buffer. 
*/ 00432 *pOut = (q15_t) (acc0 >> 15); 00433 /* Destination pointer is updated according to the address modifier, inc */ 00434 pOut += inc; 00435 00436 *pOut = (q15_t) (acc1 >> 15); 00437 pOut += inc; 00438 00439 *pOut = (q15_t) (acc2 >> 15); 00440 pOut += inc; 00441 00442 *pOut = (q15_t) (acc3 >> 15); 00443 pOut += inc; 00444 00445 /* Increment the pointer pIn1 index, count by 1 */ 00446 count += 4u; 00447 00448 /* Update the inputA and inputB pointers for next MAC calculation */ 00449 px = pIn1 + count; 00450 py = pIn2; 00451 00452 00453 /* Decrement the loop counter */ 00454 blkCnt--; 00455 } 00456 00457 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00458 ** No loop unrolling is used. */ 00459 blkCnt = blockSize2 % 0x4u; 00460 00461 while(blkCnt > 0u) 00462 { 00463 /* Accumulator is made zero for every iteration */ 00464 sum = 0; 00465 00466 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00467 k = srcBLen >> 2u; 00468 00469 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00470 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00471 while(k > 0u) 00472 { 00473 /* Perform the multiply-accumulates */ 00474 sum += ((q31_t) * px++ * *py++); 00475 sum += ((q31_t) * px++ * *py++); 00476 sum += ((q31_t) * px++ * *py++); 00477 sum += ((q31_t) * px++ * *py++); 00478 00479 /* Decrement the loop counter */ 00480 k--; 00481 } 00482 00483 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00484 ** No loop unrolling is used. */ 00485 k = srcBLen % 0x4u; 00486 00487 while(k > 0u) 00488 { 00489 /* Perform the multiply-accumulates */ 00490 sum += ((q31_t) * px++ * *py++); 00491 00492 /* Decrement the loop counter */ 00493 k--; 00494 } 00495 00496 /* Store the result in the accumulator in the destination buffer. 
*/ 00497 *pOut = (q15_t) (sum >> 15); 00498 /* Destination pointer is updated according to the address modifier, inc */ 00499 pOut += inc; 00500 00501 /* Increment the pointer pIn1 index, count by 1 */ 00502 count++; 00503 00504 /* Update the inputA and inputB pointers for next MAC calculation */ 00505 px = pIn1 + count; 00506 py = pIn2; 00507 00508 /* Decrement the loop counter */ 00509 blkCnt--; 00510 } 00511 } 00512 else 00513 { 00514 /* If the srcBLen is not a multiple of 4, 00515 * the blockSize2 loop cannot be unrolled by 4 */ 00516 blkCnt = blockSize2; 00517 00518 while(blkCnt > 0u) 00519 { 00520 /* Accumulator is made zero for every iteration */ 00521 sum = 0; 00522 00523 /* Loop over srcBLen */ 00524 k = srcBLen; 00525 00526 while(k > 0u) 00527 { 00528 /* Perform the multiply-accumulate */ 00529 sum += ((q31_t) * px++ * *py++); 00530 00531 /* Decrement the loop counter */ 00532 k--; 00533 } 00534 00535 /* Store the result in the accumulator in the destination buffer. */ 00536 *pOut = (q15_t) (sum >> 15); 00537 /* Destination pointer is updated according to the address modifier, inc */ 00538 pOut += inc; 00539 00540 /* Increment the MAC count */ 00541 count++; 00542 00543 /* Update the inputA and inputB pointers for next MAC calculation */ 00544 px = pIn1 + count; 00545 py = pIn2; 00546 00547 /* Decrement the loop counter */ 00548 blkCnt--; 00549 } 00550 } 00551 00552 /* -------------------------- 00553 * Initializations of stage3 00554 * -------------------------*/ 00555 00556 /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00557 * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00558 * .... 00559 * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1] 00560 * sum += x[srcALen-1] * y[0] 00561 */ 00562 00563 /* In this stage the MAC operations are decreased by 1 for every iteration. 
00564 The count variable holds the number of MAC operations performed */ 00565 count = srcBLen - 1u; 00566 00567 /* Working pointer of inputA */ 00568 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00569 px = pSrc1; 00570 00571 /* Working pointer of inputB */ 00572 py = pIn2; 00573 00574 /* ------------------- 00575 * Stage3 process 00576 * ------------------*/ 00577 00578 while(blockSize3 > 0u) 00579 { 00580 /* Accumulator is made zero for every iteration */ 00581 sum = 0; 00582 00583 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00584 k = count >> 2u; 00585 00586 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00587 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00588 while(k > 0u) 00589 { 00590 /* Perform the multiply-accumulates */ 00591 /* sum += x[srcALen - srcBLen + 4] * y[3] , sum += x[srcALen - srcBLen + 3] * y[2] */ 00592 sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum); 00593 /* sum += x[srcALen - srcBLen + 2] * y[1] , sum += x[srcALen - srcBLen + 1] * y[0] */ 00594 sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum); 00595 00596 /* Decrement the loop counter */ 00597 k--; 00598 } 00599 00600 /* If the count is not a multiple of 4, compute any remaining MACs here. 00601 ** No loop unrolling is used. */ 00602 k = count % 0x4u; 00603 00604 while(k > 0u) 00605 { 00606 /* Perform the multiply-accumulates */ 00607 sum = __SMLAD(*px++, *py++, sum); 00608 00609 /* Decrement the loop counter */ 00610 k--; 00611 } 00612 00613 /* Store the result in the accumulator in the destination buffer. 
*/ 00614 *pOut = (q15_t) (sum >> 15); 00615 /* Destination pointer is updated according to the address modifier, inc */ 00616 pOut += inc; 00617 00618 /* Update the inputA and inputB pointers for next MAC calculation */ 00619 px = ++pSrc1; 00620 py = pIn2; 00621 00622 /* Decrement the MAC count */ 00623 count--; 00624 00625 /* Decrement the loop counter */ 00626 blockSize3--; 00627 } 00628 00629 #else 00630 00631 q15_t *pIn1; /* inputA pointer */ 00632 q15_t *pIn2; /* inputB pointer */ 00633 q15_t *pOut = pDst; /* output pointer */ 00634 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */ 00635 q15_t *px; /* Intermediate inputA pointer */ 00636 q15_t *py; /* Intermediate inputB pointer */ 00637 q15_t *pSrc1; /* Intermediate pointers */ 00638 q31_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */ 00639 uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */ 00640 int32_t inc = 1; /* Destination address modifier */ 00641 q15_t a, b; 00642 00643 00644 /* The algorithm implementation is based on the lengths of the inputs. */ 00645 /* srcB is always made to slide across srcA. 
*/ 00646 /* So srcBLen is always considered as shorter or equal to srcALen */ 00647 /* But CORR(x, y) is reverse of CORR(y, x) */ 00648 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ 00649 /* and the destination pointer modifier, inc is set to -1 */ 00650 /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ 00651 /* But to improve the performance, 00652 * we include zeroes in the output instead of zero padding either of the the inputs*/ 00653 /* If srcALen > srcBLen, 00654 * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */ 00655 /* If srcALen < srcBLen, 00656 * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */ 00657 if(srcALen >= srcBLen) 00658 { 00659 /* Initialization of inputA pointer */ 00660 pIn1 = (pSrcA); 00661 00662 /* Initialization of inputB pointer */ 00663 pIn2 = (pSrcB); 00664 00665 /* Number of output samples is calculated */ 00666 outBlockSize = (2u * srcALen) - 1u; 00667 00668 /* When srcALen > srcBLen, zero padding is done to srcB 00669 * to make their lengths equal. 
00670 * Instead, (outBlockSize - (srcALen + srcBLen - 1)) 00671 * number of output samples are made zero */ 00672 j = outBlockSize - (srcALen + (srcBLen - 1u)); 00673 00674 /* Updating the pointer position to non zero value */ 00675 pOut += j; 00676 00677 } 00678 else 00679 { 00680 /* Initialization of inputA pointer */ 00681 pIn1 = (pSrcB); 00682 00683 /* Initialization of inputB pointer */ 00684 pIn2 = (pSrcA); 00685 00686 /* srcBLen is always considered as shorter or equal to srcALen */ 00687 j = srcBLen; 00688 srcBLen = srcALen; 00689 srcALen = j; 00690 00691 /* CORR(x, y) = Reverse order(CORR(y, x)) */ 00692 /* Hence set the destination pointer to point to the last output sample */ 00693 pOut = pDst + ((srcALen + srcBLen) - 2u); 00694 00695 /* Destination address modifier is set to -1 */ 00696 inc = -1; 00697 00698 } 00699 00700 /* The function is internally 00701 * divided into three parts according to the number of multiplications that has to be 00702 * taken place between inputA samples and inputB samples. In the first part of the 00703 * algorithm, the multiplications increase by one for every iteration. 00704 * In the second part of the algorithm, srcBLen number of multiplications are done. 00705 * In the third part of the algorithm, the multiplications decrease by one 00706 * for every iteration.*/ 00707 /* The algorithm is implemented in three stages. 00708 * The loop counters of each stage is initiated here. */ 00709 blockSize1 = srcBLen - 1u; 00710 blockSize2 = srcALen - (srcBLen - 1u); 00711 blockSize3 = blockSize1; 00712 00713 /* -------------------------- 00714 * Initializations of stage1 00715 * -------------------------*/ 00716 00717 /* sum = x[0] * y[srcBlen - 1] 00718 * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1] 00719 * .... 00720 * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1] 00721 */ 00722 00723 /* In this stage the MAC operations are increased by 1 for every iteration. 
00724 The count variable holds the number of MAC operations performed */ 00725 count = 1u; 00726 00727 /* Working pointer of inputA */ 00728 px = pIn1; 00729 00730 /* Working pointer of inputB */ 00731 pSrc1 = pIn2 + (srcBLen - 1u); 00732 py = pSrc1; 00733 00734 /* ------------------------ 00735 * Stage1 process 00736 * ----------------------*/ 00737 00738 /* The first loop starts here */ 00739 while(blockSize1 > 0u) 00740 { 00741 /* Accumulator is made zero for every iteration */ 00742 sum = 0; 00743 00744 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00745 k = count >> 2; 00746 00747 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00748 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00749 while(k > 0u) 00750 { 00751 /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */ 00752 sum += ((q31_t) * px++ * *py++); 00753 sum += ((q31_t) * px++ * *py++); 00754 sum += ((q31_t) * px++ * *py++); 00755 sum += ((q31_t) * px++ * *py++); 00756 00757 /* Decrement the loop counter */ 00758 k--; 00759 } 00760 00761 /* If the count is not a multiple of 4, compute any remaining MACs here. 00762 ** No loop unrolling is used. */ 00763 k = count % 0x4u; 00764 00765 while(k > 0u) 00766 { 00767 /* Perform the multiply-accumulates */ 00768 /* x[0] * y[srcBLen - 1] */ 00769 sum += ((q31_t) * px++ * *py++); 00770 00771 /* Decrement the loop counter */ 00772 k--; 00773 } 00774 00775 /* Store the result in the accumulator in the destination buffer. 
*/ 00776 *pOut = (q15_t) (sum >> 15); 00777 /* Destination pointer is updated according to the address modifier, inc */ 00778 pOut += inc; 00779 00780 /* Update the inputA and inputB pointers for next MAC calculation */ 00781 py = pSrc1 - count; 00782 px = pIn1; 00783 00784 /* Increment the MAC count */ 00785 count++; 00786 00787 /* Decrement the loop counter */ 00788 blockSize1--; 00789 } 00790 00791 /* -------------------------- 00792 * Initializations of stage2 00793 * ------------------------*/ 00794 00795 /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1] 00796 * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1] 00797 * .... 00798 * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00799 */ 00800 00801 /* Working pointer of inputA */ 00802 px = pIn1; 00803 00804 /* Working pointer of inputB */ 00805 py = pIn2; 00806 00807 /* count is index by which the pointer pIn1 to be incremented */ 00808 count = 0u; 00809 00810 /* ------------------- 00811 * Stage2 process 00812 * ------------------*/ 00813 00814 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 
00815 * So, to loop unroll over blockSize2, 00816 * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */ 00817 if(srcBLen >= 4u) 00818 { 00819 /* Loop unroll over blockSize2, by 4 */ 00820 blkCnt = blockSize2 >> 2u; 00821 00822 while(blkCnt > 0u) 00823 { 00824 /* Set all accumulators to zero */ 00825 acc0 = 0; 00826 acc1 = 0; 00827 acc2 = 0; 00828 acc3 = 0; 00829 00830 /* read x[0], x[1], x[2] samples */ 00831 a = *px; 00832 b = *(px + 1); 00833 00834 #ifndef ARM_MATH_BIG_ENDIAN 00835 00836 x0 = __PKHBT(a, b, 16); 00837 a = *(px + 2); 00838 x1 = __PKHBT(b, a, 16); 00839 00840 #else 00841 00842 x0 = __PKHBT(b, a, 16); 00843 a = *(px + 2); 00844 x1 = __PKHBT(a, b, 16); 00845 00846 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00847 00848 px += 2u; 00849 00850 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00851 k = srcBLen >> 2u; 00852 00853 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00854 ** a second loop below computes MACs for the remaining 1 to 3 samples. 
*/ 00855 do 00856 { 00857 /* Read the first two inputB samples using SIMD: 00858 * y[0] and y[1] */ 00859 a = *py; 00860 b = *(py + 1); 00861 00862 #ifndef ARM_MATH_BIG_ENDIAN 00863 00864 c0 = __PKHBT(a, b, 16); 00865 00866 #else 00867 00868 c0 = __PKHBT(b, a, 16); 00869 00870 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00871 00872 /* acc0 += x[0] * y[0] + x[1] * y[1] */ 00873 acc0 = __SMLAD(x0, c0, acc0); 00874 00875 /* acc1 += x[1] * y[0] + x[2] * y[1] */ 00876 acc1 = __SMLAD(x1, c0, acc1); 00877 00878 /* Read x[2], x[3], x[4] */ 00879 a = *px; 00880 b = *(px + 1); 00881 00882 #ifndef ARM_MATH_BIG_ENDIAN 00883 00884 x2 = __PKHBT(a, b, 16); 00885 a = *(px + 2); 00886 x3 = __PKHBT(b, a, 16); 00887 00888 #else 00889 00890 x2 = __PKHBT(b, a, 16); 00891 a = *(px + 2); 00892 x3 = __PKHBT(a, b, 16); 00893 00894 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00895 00896 /* acc2 += x[2] * y[0] + x[3] * y[1] */ 00897 acc2 = __SMLAD(x2, c0, acc2); 00898 00899 /* acc3 += x[3] * y[0] + x[4] * y[1] */ 00900 acc3 = __SMLAD(x3, c0, acc3); 00901 00902 /* Read y[2] and y[3] */ 00903 a = *(py + 2); 00904 b = *(py + 3); 00905 00906 py += 4u; 00907 00908 #ifndef ARM_MATH_BIG_ENDIAN 00909 00910 c0 = __PKHBT(a, b, 16); 00911 00912 #else 00913 00914 c0 = __PKHBT(b, a, 16); 00915 00916 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00917 00918 /* acc0 += x[2] * y[2] + x[3] * y[3] */ 00919 acc0 = __SMLAD(x2, c0, acc0); 00920 00921 /* acc1 += x[3] * y[2] + x[4] * y[3] */ 00922 acc1 = __SMLAD(x3, c0, acc1); 00923 00924 /* Read x[4], x[5], x[6] */ 00925 a = *(px + 2); 00926 b = *(px + 3); 00927 00928 #ifndef ARM_MATH_BIG_ENDIAN 00929 00930 x0 = __PKHBT(a, b, 16); 00931 a = *(px + 4); 00932 x1 = __PKHBT(b, a, 16); 00933 00934 #else 00935 00936 x0 = __PKHBT(b, a, 16); 00937 a = *(px + 4); 00938 x1 = __PKHBT(a, b, 16); 00939 00940 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00941 00942 px += 4u; 00943 00944 /* acc2 += x[4] * y[2] + x[5] * y[3] */ 00945 acc2 = __SMLAD(x0, c0, acc2); 00946 00947 /* acc3 += x[5] * 
y[2] + x[6] * y[3] */ 00948 acc3 = __SMLAD(x1, c0, acc3); 00949 00950 } while(--k); 00951 00952 /* For the next MAC operations, SIMD is not used 00953 * So, the 16 bit pointer if inputB, py is updated */ 00954 00955 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00956 ** No loop unrolling is used. */ 00957 k = srcBLen % 0x4u; 00958 00959 if(k == 1u) 00960 { 00961 /* Read y[4] */ 00962 c0 = *py; 00963 #ifdef ARM_MATH_BIG_ENDIAN 00964 00965 c0 = c0 << 16u; 00966 00967 #else 00968 00969 c0 = c0 & 0x0000FFFF; 00970 00971 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 00972 00973 /* Read x[7] */ 00974 a = *px; 00975 b = *(px + 1); 00976 00977 px++;; 00978 00979 #ifndef ARM_MATH_BIG_ENDIAN 00980 00981 x3 = __PKHBT(a, b, 16); 00982 00983 #else 00984 00985 x3 = __PKHBT(b, a, 16); 00986 00987 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00988 00989 px++; 00990 00991 /* Perform the multiply-accumulates */ 00992 acc0 = __SMLAD(x0, c0, acc0); 00993 acc1 = __SMLAD(x1, c0, acc1); 00994 acc2 = __SMLADX(x1, c0, acc2); 00995 acc3 = __SMLADX(x3, c0, acc3); 00996 } 00997 00998 if(k == 2u) 00999 { 01000 /* Read y[4], y[5] */ 01001 a = *py; 01002 b = *(py + 1); 01003 01004 #ifndef ARM_MATH_BIG_ENDIAN 01005 01006 c0 = __PKHBT(a, b, 16); 01007 01008 #else 01009 01010 c0 = __PKHBT(b, a, 16); 01011 01012 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01013 01014 /* Read x[7], x[8], x[9] */ 01015 a = *px; 01016 b = *(px + 1); 01017 01018 #ifndef ARM_MATH_BIG_ENDIAN 01019 01020 x3 = __PKHBT(a, b, 16); 01021 a = *(px + 2); 01022 x2 = __PKHBT(b, a, 16); 01023 01024 #else 01025 01026 x3 = __PKHBT(b, a, 16); 01027 a = *(px + 2); 01028 x2 = __PKHBT(a, b, 16); 01029 01030 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01031 01032 px += 2u; 01033 01034 /* Perform the multiply-accumulates */ 01035 acc0 = __SMLAD(x0, c0, acc0); 01036 acc1 = __SMLAD(x1, c0, acc1); 01037 acc2 = __SMLAD(x3, c0, acc2); 01038 acc3 = __SMLAD(x2, c0, acc3); 01039 } 01040 01041 if(k == 3u) 01042 { 01043 /* Read y[4], 
y[5] */ 01044 a = *py; 01045 b = *(py + 1); 01046 01047 #ifndef ARM_MATH_BIG_ENDIAN 01048 01049 c0 = __PKHBT(a, b, 16); 01050 01051 #else 01052 01053 c0 = __PKHBT(b, a, 16); 01054 01055 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01056 01057 py += 2u; 01058 01059 /* Read x[7], x[8], x[9] */ 01060 a = *px; 01061 b = *(px + 1); 01062 01063 #ifndef ARM_MATH_BIG_ENDIAN 01064 01065 x3 = __PKHBT(a, b, 16); 01066 a = *(px + 2); 01067 x2 = __PKHBT(b, a, 16); 01068 01069 #else 01070 01071 x3 = __PKHBT(b, a, 16); 01072 a = *(px + 2); 01073 x2 = __PKHBT(a, b, 16); 01074 01075 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01076 01077 /* Perform the multiply-accumulates */ 01078 acc0 = __SMLAD(x0, c0, acc0); 01079 acc1 = __SMLAD(x1, c0, acc1); 01080 acc2 = __SMLAD(x3, c0, acc2); 01081 acc3 = __SMLAD(x2, c0, acc3); 01082 01083 c0 = (*py); 01084 /* Read y[6] */ 01085 #ifdef ARM_MATH_BIG_ENDIAN 01086 01087 c0 = c0 << 16u; 01088 #else 01089 01090 c0 = c0 & 0x0000FFFF; 01091 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 01092 01093 /* Read x[10] */ 01094 b = *(px + 3); 01095 01096 #ifndef ARM_MATH_BIG_ENDIAN 01097 01098 x3 = __PKHBT(a, b, 16); 01099 01100 #else 01101 01102 x3 = __PKHBT(b, a, 16); 01103 01104 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 01105 01106 px += 3u; 01107 01108 /* Perform the multiply-accumulates */ 01109 acc0 = __SMLADX(x1, c0, acc0); 01110 acc1 = __SMLAD(x2, c0, acc1); 01111 acc2 = __SMLADX(x2, c0, acc2); 01112 acc3 = __SMLADX(x3, c0, acc3); 01113 } 01114 01115 /* Store the result in the accumulator in the destination buffer. 
*/ 01116 *pOut = (q15_t) (acc0 >> 15); 01117 /* Destination pointer is updated according to the address modifier, inc */ 01118 pOut += inc; 01119 01120 *pOut = (q15_t) (acc1 >> 15); 01121 pOut += inc; 01122 01123 *pOut = (q15_t) (acc2 >> 15); 01124 pOut += inc; 01125 01126 *pOut = (q15_t) (acc3 >> 15); 01127 pOut += inc; 01128 01129 /* Increment the pointer pIn1 index, count by 1 */ 01130 count += 4u; 01131 01132 /* Update the inputA and inputB pointers for next MAC calculation */ 01133 px = pIn1 + count; 01134 py = pIn2; 01135 01136 01137 /* Decrement the loop counter */ 01138 blkCnt--; 01139 } 01140 01141 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 01142 ** No loop unrolling is used. */ 01143 blkCnt = blockSize2 % 0x4u; 01144 01145 while(blkCnt > 0u) 01146 { 01147 /* Accumulator is made zero for every iteration */ 01148 sum = 0; 01149 01150 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 01151 k = srcBLen >> 2u; 01152 01153 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 01154 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 01155 while(k > 0u) 01156 { 01157 /* Perform the multiply-accumulates */ 01158 sum += ((q31_t) * px++ * *py++); 01159 sum += ((q31_t) * px++ * *py++); 01160 sum += ((q31_t) * px++ * *py++); 01161 sum += ((q31_t) * px++ * *py++); 01162 01163 /* Decrement the loop counter */ 01164 k--; 01165 } 01166 01167 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 01168 ** No loop unrolling is used. */ 01169 k = srcBLen % 0x4u; 01170 01171 while(k > 0u) 01172 { 01173 /* Perform the multiply-accumulates */ 01174 sum += ((q31_t) * px++ * *py++); 01175 01176 /* Decrement the loop counter */ 01177 k--; 01178 } 01179 01180 /* Store the result in the accumulator in the destination buffer. 
*/ 01181 *pOut = (q15_t) (sum >> 15); 01182 /* Destination pointer is updated according to the address modifier, inc */ 01183 pOut += inc; 01184 01185 /* Increment the pointer pIn1 index, count by 1 */ 01186 count++; 01187 01188 /* Update the inputA and inputB pointers for next MAC calculation */ 01189 px = pIn1 + count; 01190 py = pIn2; 01191 01192 /* Decrement the loop counter */ 01193 blkCnt--; 01194 } 01195 } 01196 else 01197 { 01198 /* If the srcBLen is not a multiple of 4, 01199 * the blockSize2 loop cannot be unrolled by 4 */ 01200 blkCnt = blockSize2; 01201 01202 while(blkCnt > 0u) 01203 { 01204 /* Accumulator is made zero for every iteration */ 01205 sum = 0; 01206 01207 /* Loop over srcBLen */ 01208 k = srcBLen; 01209 01210 while(k > 0u) 01211 { 01212 /* Perform the multiply-accumulate */ 01213 sum += ((q31_t) * px++ * *py++); 01214 01215 /* Decrement the loop counter */ 01216 k--; 01217 } 01218 01219 /* Store the result in the accumulator in the destination buffer. */ 01220 *pOut = (q15_t) (sum >> 15); 01221 /* Destination pointer is updated according to the address modifier, inc */ 01222 pOut += inc; 01223 01224 /* Increment the MAC count */ 01225 count++; 01226 01227 /* Update the inputA and inputB pointers for next MAC calculation */ 01228 px = pIn1 + count; 01229 py = pIn2; 01230 01231 /* Decrement the loop counter */ 01232 blkCnt--; 01233 } 01234 } 01235 01236 /* -------------------------- 01237 * Initializations of stage3 01238 * -------------------------*/ 01239 01240 /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 01241 * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 01242 * .... 01243 * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1] 01244 * sum += x[srcALen-1] * y[0] 01245 */ 01246 01247 /* In this stage the MAC operations are decreased by 1 for every iteration. 
01248 The count variable holds the number of MAC operations performed */ 01249 count = srcBLen - 1u; 01250 01251 /* Working pointer of inputA */ 01252 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 01253 px = pSrc1; 01254 01255 /* Working pointer of inputB */ 01256 py = pIn2; 01257 01258 /* ------------------- 01259 * Stage3 process 01260 * ------------------*/ 01261 01262 while(blockSize3 > 0u) 01263 { 01264 /* Accumulator is made zero for every iteration */ 01265 sum = 0; 01266 01267 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 01268 k = count >> 2u; 01269 01270 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 01271 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 01272 while(k > 0u) 01273 { 01274 /* Perform the multiply-accumulates */ 01275 sum += ((q31_t) * px++ * *py++); 01276 sum += ((q31_t) * px++ * *py++); 01277 sum += ((q31_t) * px++ * *py++); 01278 sum += ((q31_t) * px++ * *py++); 01279 01280 /* Decrement the loop counter */ 01281 k--; 01282 } 01283 01284 /* If the count is not a multiple of 4, compute any remaining MACs here. 01285 ** No loop unrolling is used. */ 01286 k = count % 0x4u; 01287 01288 while(k > 0u) 01289 { 01290 /* Perform the multiply-accumulates */ 01291 sum += ((q31_t) * px++ * *py++); 01292 01293 /* Decrement the loop counter */ 01294 k--; 01295 } 01296 01297 /* Store the result in the accumulator in the destination buffer. */ 01298 *pOut = (q15_t) (sum >> 15); 01299 /* Destination pointer is updated according to the address modifier, inc */ 01300 pOut += inc; 01301 01302 /* Update the inputA and inputB pointers for next MAC calculation */ 01303 px = ++pSrc1; 01304 py = pIn2; 01305 01306 /* Decrement the MAC count */ 01307 count--; 01308 01309 /* Decrement the loop counter */ 01310 blockSize3--; 01311 } 01312 01313 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 01314 01315 } 01316 01317 /** 01318 * @} end of Corr group 01319 */
Generated on Tue Jul 12 2022 11:59:16 by Doxygen 1.7.2