Important changes to repositories hosted on mbed.com
Mbed-hosted Mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software, download the repository ZIP archive or clone it locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
Fork of mbed-dsp by
arm_conv_fast_q31.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 17. January 2013 00005 * $Revision: V1.4.1 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_fast_q31.c 00009 * 00010 * Description: Q31 Convolution (fast version). 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupFilters 00045 */ 00046 00047 /** 00048 * @addtogroup Conv 00049 * @{ 00050 */ 00051 00052 /** 00053 * @param[in] *pSrcA points to the first input sequence. 00054 * @param[in] srcALen length of the first input sequence. 00055 * @param[in] *pSrcB points to the second input sequence. 00056 * @param[in] srcBLen length of the second input sequence. 00057 * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. 00058 * @return none. 00059 * 00060 * @details 00061 * <b>Scaling and Overflow Behavior:</b> 00062 * 00063 * \par 00064 * This function is optimized for speed at the expense of fixed-point precision and overflow protection. 00065 * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format. 00066 * These intermediate results are accumulated in a 32-bit register in 2.30 format. 00067 * Finally, the accumulator is saturated and converted to a 1.31 result. 00068 * 00069 * \par 00070 * The fast version has the same overflow behavior as the standard version but provides less precision since it discards the low 32 bits of each multiplication result. 00071 * In order to avoid overflows completely the input signals must be scaled down. 
00072 * Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows, 00073 * as maximum of min(srcALen, srcBLen) number of additions are carried internally. 00074 * 00075 * \par 00076 * See <code>arm_conv_q31()</code> for a slower implementation of this function which uses 64-bit accumulation to provide higher precision. 00077 */ 00078 00079 void arm_conv_fast_q31( 00080 q31_t * pSrcA, 00081 uint32_t srcALen, 00082 q31_t * pSrcB, 00083 uint32_t srcBLen, 00084 q31_t * pDst) 00085 { 00086 q31_t *pIn1; /* inputA pointer */ 00087 q31_t *pIn2; /* inputB pointer */ 00088 q31_t *pOut = pDst; /* output pointer */ 00089 q31_t *px; /* Intermediate inputA pointer */ 00090 q31_t *py; /* Intermediate inputB pointer */ 00091 q31_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00092 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00093 q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */ 00094 uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counter */ 00095 00096 /* The algorithm implementation is based on the lengths of the inputs. */ 00097 /* srcB is always made to slide across srcA. 
*/ 00098 /* So srcBLen is always considered as shorter or equal to srcALen */ 00099 if(srcALen >= srcBLen) 00100 { 00101 /* Initialization of inputA pointer */ 00102 pIn1 = pSrcA; 00103 00104 /* Initialization of inputB pointer */ 00105 pIn2 = pSrcB; 00106 } 00107 else 00108 { 00109 /* Initialization of inputA pointer */ 00110 pIn1 = pSrcB; 00111 00112 /* Initialization of inputB pointer */ 00113 pIn2 = pSrcA; 00114 00115 /* srcBLen is always considered as shorter or equal to srcALen */ 00116 j = srcBLen; 00117 srcBLen = srcALen; 00118 srcALen = j; 00119 } 00120 00121 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00122 /* The function is internally 00123 * divided into three stages according to the number of multiplications that has to be 00124 * taken place between inputA samples and inputB samples. In the first stage of the 00125 * algorithm, the multiplications increase by one for every iteration. 00126 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00127 * In the third stage of the algorithm, the multiplications decrease by one 00128 * for every iteration. */ 00129 00130 /* The algorithm is implemented in three stages. 00131 The loop counters of each stage is initiated here. */ 00132 blockSize1 = srcBLen - 1u; 00133 blockSize2 = srcALen - (srcBLen - 1u); 00134 blockSize3 = blockSize1; 00135 00136 /* -------------------------- 00137 * Initializations of stage1 00138 * -------------------------*/ 00139 00140 /* sum = x[0] * y[0] 00141 * sum = x[0] * y[1] + x[1] * y[0] 00142 * .... 00143 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00144 */ 00145 00146 /* In this stage the MAC operations are increased by 1 for every iteration. 
00147 The count variable holds the number of MAC operations performed */ 00148 count = 1u; 00149 00150 /* Working pointer of inputA */ 00151 px = pIn1; 00152 00153 /* Working pointer of inputB */ 00154 py = pIn2; 00155 00156 00157 /* ------------------------ 00158 * Stage1 process 00159 * ----------------------*/ 00160 00161 /* The first stage starts here */ 00162 while(blockSize1 > 0u) 00163 { 00164 /* Accumulator is made zero for every iteration */ 00165 sum = 0; 00166 00167 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00168 k = count >> 2u; 00169 00170 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00171 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00172 while(k > 0u) 00173 { 00174 /* x[0] * y[srcBLen - 1] */ 00175 sum = (q31_t) ((((q63_t) sum << 32) + 00176 ((q63_t) * px++ * (*py--))) >> 32); 00177 00178 /* x[1] * y[srcBLen - 2] */ 00179 sum = (q31_t) ((((q63_t) sum << 32) + 00180 ((q63_t) * px++ * (*py--))) >> 32); 00181 00182 /* x[2] * y[srcBLen - 3] */ 00183 sum = (q31_t) ((((q63_t) sum << 32) + 00184 ((q63_t) * px++ * (*py--))) >> 32); 00185 00186 /* x[3] * y[srcBLen - 4] */ 00187 sum = (q31_t) ((((q63_t) sum << 32) + 00188 ((q63_t) * px++ * (*py--))) >> 32); 00189 00190 /* Decrement the loop counter */ 00191 k--; 00192 } 00193 00194 /* If the count is not a multiple of 4, compute any remaining MACs here. 00195 ** No loop unrolling is used. */ 00196 k = count % 0x4u; 00197 00198 while(k > 0u) 00199 { 00200 /* Perform the multiply-accumulate */ 00201 sum = (q31_t) ((((q63_t) sum << 32) + 00202 ((q63_t) * px++ * (*py--))) >> 32); 00203 00204 /* Decrement the loop counter */ 00205 k--; 00206 } 00207 00208 /* Store the result in the accumulator in the destination buffer. 
*/ 00209 *pOut++ = sum << 1; 00210 00211 /* Update the inputA and inputB pointers for next MAC calculation */ 00212 py = pIn2 + count; 00213 px = pIn1; 00214 00215 /* Increment the MAC count */ 00216 count++; 00217 00218 /* Decrement the loop counter */ 00219 blockSize1--; 00220 } 00221 00222 /* -------------------------- 00223 * Initializations of stage2 00224 * ------------------------*/ 00225 00226 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00227 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00228 * .... 00229 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00230 */ 00231 00232 /* Working pointer of inputA */ 00233 px = pIn1; 00234 00235 /* Working pointer of inputB */ 00236 pSrc2 = pIn2 + (srcBLen - 1u); 00237 py = pSrc2; 00238 00239 /* count is index by which the pointer pIn1 to be incremented */ 00240 count = 0u; 00241 00242 /* ------------------- 00243 * Stage2 process 00244 * ------------------*/ 00245 00246 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00247 * So, to loop unroll over blockSize2, 00248 * srcBLen should be greater than or equal to 4 */ 00249 if(srcBLen >= 4u) 00250 { 00251 /* Loop unroll over blockSize2, by 4 */ 00252 blkCnt = blockSize2 >> 2u; 00253 00254 while(blkCnt > 0u) 00255 { 00256 /* Set all accumulators to zero */ 00257 acc0 = 0; 00258 acc1 = 0; 00259 acc2 = 0; 00260 acc3 = 0; 00261 00262 /* read x[0], x[1], x[2] samples */ 00263 x0 = *(px++); 00264 x1 = *(px++); 00265 x2 = *(px++); 00266 00267 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00268 k = srcBLen >> 2u; 00269 00270 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00271 ** a second loop below computes MACs for the remaining 1 to 3 samples. 
*/ 00272 do 00273 { 00274 /* Read y[srcBLen - 1] sample */ 00275 c0 = *(py--); 00276 00277 /* Read x[3] sample */ 00278 x3 = *(px++); 00279 00280 /* Perform the multiply-accumulates */ 00281 /* acc0 += x[0] * y[srcBLen - 1] */ 00282 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); 00283 00284 /* acc1 += x[1] * y[srcBLen - 1] */ 00285 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); 00286 00287 /* acc2 += x[2] * y[srcBLen - 1] */ 00288 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); 00289 00290 /* acc3 += x[3] * y[srcBLen - 1] */ 00291 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); 00292 00293 /* Read y[srcBLen - 2] sample */ 00294 c0 = *(py--); 00295 00296 /* Read x[4] sample */ 00297 x0 = *(px++); 00298 00299 /* Perform the multiply-accumulate */ 00300 /* acc0 += x[1] * y[srcBLen - 2] */ 00301 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32); 00302 /* acc1 += x[2] * y[srcBLen - 2] */ 00303 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32); 00304 /* acc2 += x[3] * y[srcBLen - 2] */ 00305 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32); 00306 /* acc3 += x[4] * y[srcBLen - 2] */ 00307 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32); 00308 00309 /* Read y[srcBLen - 3] sample */ 00310 c0 = *(py--); 00311 00312 /* Read x[5] sample */ 00313 x1 = *(px++); 00314 00315 /* Perform the multiply-accumulates */ 00316 /* acc0 += x[2] * y[srcBLen - 3] */ 00317 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32); 00318 /* acc1 += x[3] * y[srcBLen - 3] */ 00319 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32); 00320 /* acc2 += x[4] * y[srcBLen - 3] */ 00321 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32); 00322 /* acc3 += x[5] * y[srcBLen - 3] */ 00323 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32); 00324 00325 /* Read y[srcBLen - 4] sample */ 00326 c0 = *(py--); 00327 
00328 /* Read x[6] sample */ 00329 x2 = *(px++); 00330 00331 /* Perform the multiply-accumulates */ 00332 /* acc0 += x[3] * y[srcBLen - 4] */ 00333 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32); 00334 /* acc1 += x[4] * y[srcBLen - 4] */ 00335 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32); 00336 /* acc2 += x[5] * y[srcBLen - 4] */ 00337 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32); 00338 /* acc3 += x[6] * y[srcBLen - 4] */ 00339 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32); 00340 00341 00342 } while(--k); 00343 00344 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00345 ** No loop unrolling is used. */ 00346 k = srcBLen % 0x4u; 00347 00348 while(k > 0u) 00349 { 00350 /* Read y[srcBLen - 5] sample */ 00351 c0 = *(py--); 00352 00353 /* Read x[7] sample */ 00354 x3 = *(px++); 00355 00356 /* Perform the multiply-accumulates */ 00357 /* acc0 += x[4] * y[srcBLen - 5] */ 00358 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); 00359 /* acc1 += x[5] * y[srcBLen - 5] */ 00360 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); 00361 /* acc2 += x[6] * y[srcBLen - 5] */ 00362 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); 00363 /* acc3 += x[7] * y[srcBLen - 5] */ 00364 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); 00365 00366 /* Reuse the present samples for the next MAC */ 00367 x0 = x1; 00368 x1 = x2; 00369 x2 = x3; 00370 00371 /* Decrement the loop counter */ 00372 k--; 00373 } 00374 00375 /* Store the results in the accumulators in the destination buffer. 
*/ 00376 *pOut++ = (q31_t) (acc0 << 1); 00377 *pOut++ = (q31_t) (acc1 << 1); 00378 *pOut++ = (q31_t) (acc2 << 1); 00379 *pOut++ = (q31_t) (acc3 << 1); 00380 00381 /* Increment the pointer pIn1 index, count by 4 */ 00382 count += 4u; 00383 00384 /* Update the inputA and inputB pointers for next MAC calculation */ 00385 px = pIn1 + count; 00386 py = pSrc2; 00387 00388 /* Decrement the loop counter */ 00389 blkCnt--; 00390 } 00391 00392 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00393 ** No loop unrolling is used. */ 00394 blkCnt = blockSize2 % 0x4u; 00395 00396 while(blkCnt > 0u) 00397 { 00398 /* Accumulator is made zero for every iteration */ 00399 sum = 0; 00400 00401 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00402 k = srcBLen >> 2u; 00403 00404 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00405 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00406 while(k > 0u) 00407 { 00408 /* Perform the multiply-accumulates */ 00409 sum = (q31_t) ((((q63_t) sum << 32) + 00410 ((q63_t) * px++ * (*py--))) >> 32); 00411 sum = (q31_t) ((((q63_t) sum << 32) + 00412 ((q63_t) * px++ * (*py--))) >> 32); 00413 sum = (q31_t) ((((q63_t) sum << 32) + 00414 ((q63_t) * px++ * (*py--))) >> 32); 00415 sum = (q31_t) ((((q63_t) sum << 32) + 00416 ((q63_t) * px++ * (*py--))) >> 32); 00417 00418 /* Decrement the loop counter */ 00419 k--; 00420 } 00421 00422 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00423 ** No loop unrolling is used. */ 00424 k = srcBLen % 0x4u; 00425 00426 while(k > 0u) 00427 { 00428 /* Perform the multiply-accumulate */ 00429 sum = (q31_t) ((((q63_t) sum << 32) + 00430 ((q63_t) * px++ * (*py--))) >> 32); 00431 00432 /* Decrement the loop counter */ 00433 k--; 00434 } 00435 00436 /* Store the result in the accumulator in the destination buffer. 
*/ 00437 *pOut++ = sum << 1; 00438 00439 /* Increment the MAC count */ 00440 count++; 00441 00442 /* Update the inputA and inputB pointers for next MAC calculation */ 00443 px = pIn1 + count; 00444 py = pSrc2; 00445 00446 /* Decrement the loop counter */ 00447 blkCnt--; 00448 } 00449 } 00450 else 00451 { 00452 /* If the srcBLen is not a multiple of 4, 00453 * the blockSize2 loop cannot be unrolled by 4 */ 00454 blkCnt = blockSize2; 00455 00456 while(blkCnt > 0u) 00457 { 00458 /* Accumulator is made zero for every iteration */ 00459 sum = 0; 00460 00461 /* srcBLen number of MACS should be performed */ 00462 k = srcBLen; 00463 00464 while(k > 0u) 00465 { 00466 /* Perform the multiply-accumulate */ 00467 sum = (q31_t) ((((q63_t) sum << 32) + 00468 ((q63_t) * px++ * (*py--))) >> 32); 00469 00470 /* Decrement the loop counter */ 00471 k--; 00472 } 00473 00474 /* Store the result in the accumulator in the destination buffer. */ 00475 *pOut++ = sum << 1; 00476 00477 /* Increment the MAC count */ 00478 count++; 00479 00480 /* Update the inputA and inputB pointers for next MAC calculation */ 00481 px = pIn1 + count; 00482 py = pSrc2; 00483 00484 /* Decrement the loop counter */ 00485 blkCnt--; 00486 } 00487 } 00488 00489 00490 /* -------------------------- 00491 * Initializations of stage3 00492 * -------------------------*/ 00493 00494 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00495 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00496 * .... 00497 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00498 * sum += x[srcALen-1] * y[srcBLen-1] 00499 */ 00500 00501 /* In this stage the MAC operations are decreased by 1 for every iteration. 
00502 The blockSize3 variable holds the number of MAC operations performed */ 00503 00504 /* Working pointer of inputA */ 00505 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00506 px = pSrc1; 00507 00508 /* Working pointer of inputB */ 00509 pSrc2 = pIn2 + (srcBLen - 1u); 00510 py = pSrc2; 00511 00512 /* ------------------- 00513 * Stage3 process 00514 * ------------------*/ 00515 00516 while(blockSize3 > 0u) 00517 { 00518 /* Accumulator is made zero for every iteration */ 00519 sum = 0; 00520 00521 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00522 k = blockSize3 >> 2u; 00523 00524 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00525 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00526 while(k > 0u) 00527 { 00528 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ 00529 sum = (q31_t) ((((q63_t) sum << 32) + 00530 ((q63_t) * px++ * (*py--))) >> 32); 00531 00532 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ 00533 sum = (q31_t) ((((q63_t) sum << 32) + 00534 ((q63_t) * px++ * (*py--))) >> 32); 00535 00536 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ 00537 sum = (q31_t) ((((q63_t) sum << 32) + 00538 ((q63_t) * px++ * (*py--))) >> 32); 00539 00540 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ 00541 sum = (q31_t) ((((q63_t) sum << 32) + 00542 ((q63_t) * px++ * (*py--))) >> 32); 00543 00544 /* Decrement the loop counter */ 00545 k--; 00546 } 00547 00548 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. 00549 ** No loop unrolling is used. */ 00550 k = blockSize3 % 0x4u; 00551 00552 while(k > 0u) 00553 { 00554 /* Perform the multiply-accumulate */ 00555 sum = (q31_t) ((((q63_t) sum << 32) + 00556 ((q63_t) * px++ * (*py--))) >> 32); 00557 00558 /* Decrement the loop counter */ 00559 k--; 00560 } 00561 00562 /* Store the result in the accumulator in the destination buffer. 
*/ 00563 *pOut++ = sum << 1; 00564 00565 /* Update the inputA and inputB pointers for next MAC calculation */ 00566 px = ++pSrc1; 00567 py = pSrc2; 00568 00569 /* Decrement the loop counter */ 00570 blockSize3--; 00571 } 00572 00573 } 00574 00575 /** 00576 * @} end of Conv group 00577 */
Generated on Tue Jul 12 2022 18:44:08 by
1.7.2
