CMSIS DSP library
Dependents: performance_timer Surfboard_ gps2rtty Capstone ... more
arm_conv_partial_fast_q31.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 19. March 2015 00005 * $Revision: V.1.4.5 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_partial_fast_q31.c 00009 * 00010 * Description: Fast Q31 Partial convolution. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupFilters 00045 */ 00046 00047 /** 00048 * @addtogroup PartialConv 00049 * @{ 00050 */ 00051 00052 /** 00053 * @brief Partial convolution of Q31 sequences (fast version) for Cortex-M3 and Cortex-M4. 00054 * @param[in] *pSrcA points to the first input sequence. 00055 * @param[in] srcALen length of the first input sequence. 00056 * @param[in] *pSrcB points to the second input sequence. 00057 * @param[in] srcBLen length of the second input sequence. 00058 * @param[out] *pDst points to the location where the output result is written. 00059 * @param[in] firstIndex is the first output sample to start with. 00060 * @param[in] numPoints is the number of output points to be computed. 00061 * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. 00062 * 00063 * \par 00064 * See <code>arm_conv_partial_q31()</code> for a slower implementation of this function which uses a 64-bit accumulator to provide higher precision. 00065 */ 00066 00067 arm_status arm_conv_partial_fast_q31( 00068 q31_t * pSrcA, 00069 uint32_t srcALen, 00070 q31_t * pSrcB, 00071 uint32_t srcBLen, 00072 q31_t * pDst, 00073 uint32_t firstIndex, 00074 uint32_t numPoints) 00075 { 00076 q31_t *pIn1; /* inputA pointer */ 00077 q31_t *pIn2; /* inputB pointer */ 00078 q31_t *pOut = pDst; /* output pointer */ 00079 q31_t *px; /* Intermediate inputA pointer */ 00080 q31_t *py; /* Intermediate inputB pointer */ 00081 q31_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00082 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */ 00083 q31_t x0, x1, x2, x3, c0; 00084 uint32_t j, k, count, check, blkCnt; 00085 int32_t blockSize1, blockSize2, blockSize3; /* loop counters */ 00086 arm_status status; /* status of Partial convolution */ 00087 00088 00089 /* Check for range of output samples to be calculated */ 00090 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00091 { 00092 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00093 status = ARM_MATH_ARGUMENT_ERROR; 00094 } 00095 else 00096 { 00097 00098 /* The algorithm implementation is based on the lengths of the inputs. */ 00099 /* srcB is always made to slide across srcA. */ 00100 /* So srcBLen is always considered as shorter or equal to srcALen */ 00101 if(srcALen >= srcBLen) 00102 { 00103 /* Initialization of inputA pointer */ 00104 pIn1 = pSrcA; 00105 00106 /* Initialization of inputB pointer */ 00107 pIn2 = pSrcB; 00108 } 00109 else 00110 { 00111 /* Initialization of inputA pointer */ 00112 pIn1 = pSrcB; 00113 00114 /* Initialization of inputB pointer */ 00115 pIn2 = pSrcA; 00116 00117 /* srcBLen is always considered as shorter or equal to srcALen */ 00118 j = srcBLen; 00119 srcBLen = srcALen; 00120 srcALen = j; 00121 } 00122 00123 /* Conditions to check which loopCounter holds 00124 * the first and last indices of the output samples to be calculated. */ 00125 check = firstIndex + numPoints; 00126 blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0; 00127 blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3; 00128 blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); 00129 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 : 00130 (int32_t) numPoints) : 0; 00131 blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + 00132 (int32_t) firstIndex); 00133 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; 00134 00135 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00136 /* The function is internally 00137 * divided into three stages according to the number of multiplications that has to be 00138 * taken place between inputA samples and inputB samples. In the first stage of the 00139 * algorithm, the multiplications increase by one for every iteration. 00140 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00141 * In the third stage of the algorithm, the multiplications decrease by one 00142 * for every iteration. */ 00143 00144 /* Set the output pointer to point to the firstIndex 00145 * of the output sample to be calculated. */ 00146 pOut = pDst + firstIndex; 00147 00148 /* -------------------------- 00149 * Initializations of stage1 00150 * -------------------------*/ 00151 00152 /* sum = x[0] * y[0] 00153 * sum = x[0] * y[1] + x[1] * y[0] 00154 * .... 00155 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00156 */ 00157 00158 /* In this stage the MAC operations are increased by 1 for every iteration. 00159 The count variable holds the number of MAC operations performed. 00160 Since the partial convolution starts from firstIndex 00161 Number of Macs to be performed is firstIndex + 1 */ 00162 count = 1u + firstIndex; 00163 00164 /* Working pointer of inputA */ 00165 px = pIn1; 00166 00167 /* Working pointer of inputB */ 00168 pSrc2 = pIn2 + firstIndex; 00169 py = pSrc2; 00170 00171 /* ------------------------ 00172 * Stage1 process 00173 * ----------------------*/ 00174 00175 /* The first loop starts here */ 00176 while(blockSize1 > 0) 00177 { 00178 /* Accumulator is made zero for every iteration */ 00179 sum = 0; 00180 00181 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00182 k = count >> 2u; 00183 00184 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00185 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00186 while(k > 0u) 00187 { 00188 /* x[0] * y[srcBLen - 1] */ 00189 sum = (q31_t) ((((q63_t) sum << 32) + 00190 ((q63_t) * px++ * (*py--))) >> 32); 00191 00192 /* x[1] * y[srcBLen - 2] */ 00193 sum = (q31_t) ((((q63_t) sum << 32) + 00194 ((q63_t) * px++ * (*py--))) >> 32); 00195 00196 /* x[2] * y[srcBLen - 3] */ 00197 sum = (q31_t) ((((q63_t) sum << 32) + 00198 ((q63_t) * px++ * (*py--))) >> 32); 00199 00200 /* x[3] * y[srcBLen - 4] */ 00201 sum = (q31_t) ((((q63_t) sum << 32) + 00202 ((q63_t) * px++ * (*py--))) >> 32); 00203 00204 /* Decrement the loop counter */ 00205 k--; 00206 } 00207 00208 /* If the count is not a multiple of 4, compute any remaining MACs here. 00209 ** No loop unrolling is used. */ 00210 k = count % 0x4u; 00211 00212 while(k > 0u) 00213 { 00214 /* Perform the multiply-accumulates */ 00215 sum = (q31_t) ((((q63_t) sum << 32) + 00216 ((q63_t) * px++ * (*py--))) >> 32); 00217 00218 /* Decrement the loop counter */ 00219 k--; 00220 } 00221 00222 /* Store the result in the accumulator in the destination buffer. */ 00223 *pOut++ = sum << 1; 00224 00225 /* Update the inputA and inputB pointers for next MAC calculation */ 00226 py = ++pSrc2; 00227 px = pIn1; 00228 00229 /* Increment the MAC count */ 00230 count++; 00231 00232 /* Decrement the loop counter */ 00233 blockSize1--; 00234 } 00235 00236 /* -------------------------- 00237 * Initializations of stage2 00238 * ------------------------*/ 00239 00240 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00241 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00242 * .... 00243 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00244 */ 00245 00246 /* Working pointer of inputA */ 00247 if((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) 00248 { 00249 px = pIn1 + firstIndex - srcBLen + 1; 00250 } 00251 else 00252 { 00253 px = pIn1; 00254 } 00255 00256 /* Working pointer of inputB */ 00257 pSrc2 = pIn2 + (srcBLen - 1u); 00258 py = pSrc2; 00259 00260 /* count is index by which the pointer pIn1 to be incremented */ 00261 count = 0u; 00262 00263 /* ------------------- 00264 * Stage2 process 00265 * ------------------*/ 00266 00267 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00268 * So, to loop unroll over blockSize2, 00269 * srcBLen should be greater than or equal to 4 */ 00270 if(srcBLen >= 4u) 00271 { 00272 /* Loop unroll over blockSize2 */ 00273 blkCnt = ((uint32_t) blockSize2 >> 2u); 00274 00275 while(blkCnt > 0u) 00276 { 00277 /* Set all accumulators to zero */ 00278 acc0 = 0; 00279 acc1 = 0; 00280 acc2 = 0; 00281 acc3 = 0; 00282 00283 /* read x[0], x[1], x[2] samples */ 00284 x0 = *(px++); 00285 x1 = *(px++); 00286 x2 = *(px++); 00287 00288 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00289 k = srcBLen >> 2u; 00290 00291 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00292 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00293 do 00294 { 00295 /* Read y[srcBLen - 1] sample */ 00296 c0 = *(py--); 00297 00298 /* Read x[3] sample */ 00299 x3 = *(px++); 00300 00301 /* Perform the multiply-accumulate */ 00302 /* acc0 += x[0] * y[srcBLen - 1] */ 00303 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); 00304 00305 /* acc1 += x[1] * y[srcBLen - 1] */ 00306 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); 00307 00308 /* acc2 += x[2] * y[srcBLen - 1] */ 00309 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); 00310 00311 /* acc3 += x[3] * y[srcBLen - 1] */ 00312 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); 00313 00314 /* Read y[srcBLen - 2] sample */ 00315 c0 = *(py--); 00316 00317 /* Read x[4] sample */ 00318 x0 = *(px++); 00319 00320 /* Perform the multiply-accumulate */ 00321 /* acc0 += x[1] * y[srcBLen - 2] */ 00322 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32); 00323 /* acc1 += x[2] * y[srcBLen - 2] */ 00324 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32); 00325 /* acc2 += x[3] * y[srcBLen - 2] */ 00326 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32); 00327 /* acc3 += x[4] * y[srcBLen - 2] */ 00328 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32); 00329 00330 /* Read y[srcBLen - 3] sample */ 00331 c0 = *(py--); 00332 00333 /* Read x[5] sample */ 00334 x1 = *(px++); 00335 00336 /* Perform the multiply-accumulates */ 00337 /* acc0 += x[2] * y[srcBLen - 3] */ 00338 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32); 00339 /* acc1 += x[3] * y[srcBLen - 2] */ 00340 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32); 00341 /* acc2 += x[4] * y[srcBLen - 2] */ 00342 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32); 00343 /* acc3 += x[5] * y[srcBLen - 2] */ 00344 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32); 00345 00346 /* Read y[srcBLen - 4] sample */ 00347 c0 = *(py--); 00348 00349 /* Read x[6] sample */ 00350 x2 = *(px++); 00351 00352 /* Perform the multiply-accumulates */ 00353 /* acc0 += x[3] * y[srcBLen - 4] */ 00354 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32); 00355 /* acc1 += x[4] * y[srcBLen - 4] */ 00356 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32); 00357 /* acc2 += x[5] * y[srcBLen - 4] */ 00358 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32); 00359 /* acc3 += x[6] * y[srcBLen - 4] */ 00360 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32); 00361 00362 00363 } while(--k); 00364 00365 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00366 ** No loop unrolling is used. */ 00367 k = srcBLen % 0x4u; 00368 00369 while(k > 0u) 00370 { 00371 /* Read y[srcBLen - 5] sample */ 00372 c0 = *(py--); 00373 00374 /* Read x[7] sample */ 00375 x3 = *(px++); 00376 00377 /* Perform the multiply-accumulates */ 00378 /* acc0 += x[4] * y[srcBLen - 5] */ 00379 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); 00380 /* acc1 += x[5] * y[srcBLen - 5] */ 00381 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); 00382 /* acc2 += x[6] * y[srcBLen - 5] */ 00383 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); 00384 /* acc3 += x[7] * y[srcBLen - 5] */ 00385 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); 00386 00387 /* Reuse the present samples for the next MAC */ 00388 x0 = x1; 00389 x1 = x2; 00390 x2 = x3; 00391 00392 /* Decrement the loop counter */ 00393 k--; 00394 } 00395 00396 /* Store the result in the accumulator in the destination buffer. */ 00397 *pOut++ = (q31_t) (acc0 << 1); 00398 *pOut++ = (q31_t) (acc1 << 1); 00399 *pOut++ = (q31_t) (acc2 << 1); 00400 *pOut++ = (q31_t) (acc3 << 1); 00401 00402 /* Increment the pointer pIn1 index, count by 4 */ 00403 count += 4u; 00404 00405 /* Update the inputA and inputB pointers for next MAC calculation */ 00406 px = pIn1 + count; 00407 py = pSrc2; 00408 00409 /* Decrement the loop counter */ 00410 blkCnt--; 00411 } 00412 00413 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00414 ** No loop unrolling is used. */ 00415 blkCnt = (uint32_t) blockSize2 % 0x4u; 00416 00417 while(blkCnt > 0u) 00418 { 00419 /* Accumulator is made zero for every iteration */ 00420 sum = 0; 00421 00422 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00423 k = srcBLen >> 2u; 00424 00425 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00426 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00427 while(k > 0u) 00428 { 00429 /* Perform the multiply-accumulates */ 00430 sum = (q31_t) ((((q63_t) sum << 32) + 00431 ((q63_t) * px++ * (*py--))) >> 32); 00432 sum = (q31_t) ((((q63_t) sum << 32) + 00433 ((q63_t) * px++ * (*py--))) >> 32); 00434 sum = (q31_t) ((((q63_t) sum << 32) + 00435 ((q63_t) * px++ * (*py--))) >> 32); 00436 sum = (q31_t) ((((q63_t) sum << 32) + 00437 ((q63_t) * px++ * (*py--))) >> 32); 00438 00439 /* Decrement the loop counter */ 00440 k--; 00441 } 00442 00443 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00444 ** No loop unrolling is used. */ 00445 k = srcBLen % 0x4u; 00446 00447 while(k > 0u) 00448 { 00449 /* Perform the multiply-accumulate */ 00450 sum = (q31_t) ((((q63_t) sum << 32) + 00451 ((q63_t) * px++ * (*py--))) >> 32); 00452 00453 /* Decrement the loop counter */ 00454 k--; 00455 } 00456 00457 /* Store the result in the accumulator in the destination buffer. */ 00458 *pOut++ = sum << 1; 00459 00460 /* Increment the MAC count */ 00461 count++; 00462 00463 /* Update the inputA and inputB pointers for next MAC calculation */ 00464 px = pIn1 + count; 00465 py = pSrc2; 00466 00467 /* Decrement the loop counter */ 00468 blkCnt--; 00469 } 00470 } 00471 else 00472 { 00473 /* If the srcBLen is not a multiple of 4, 00474 * the blockSize2 loop cannot be unrolled by 4 */ 00475 blkCnt = (uint32_t) blockSize2; 00476 00477 while(blkCnt > 0u) 00478 { 00479 /* Accumulator is made zero for every iteration */ 00480 sum = 0; 00481 00482 /* srcBLen number of MACS should be performed */ 00483 k = srcBLen; 00484 00485 while(k > 0u) 00486 { 00487 /* Perform the multiply-accumulate */ 00488 sum = (q31_t) ((((q63_t) sum << 32) + 00489 ((q63_t) * px++ * (*py--))) >> 32); 00490 00491 /* Decrement the loop counter */ 00492 k--; 00493 } 00494 00495 /* Store the result in the accumulator in the destination buffer. */ 00496 *pOut++ = sum << 1; 00497 00498 /* Increment the MAC count */ 00499 count++; 00500 00501 /* Update the inputA and inputB pointers for next MAC calculation */ 00502 px = pIn1 + count; 00503 py = pSrc2; 00504 00505 /* Decrement the loop counter */ 00506 blkCnt--; 00507 } 00508 } 00509 00510 00511 /* -------------------------- 00512 * Initializations of stage3 00513 * -------------------------*/ 00514 00515 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00516 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00517 * .... 00518 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00519 * sum += x[srcALen-1] * y[srcBLen-1] 00520 */ 00521 00522 /* In this stage the MAC operations are decreased by 1 for every iteration. 00523 The count variable holds the number of MAC operations performed */ 00524 count = srcBLen - 1u; 00525 00526 /* Working pointer of inputA */ 00527 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00528 px = pSrc1; 00529 00530 /* Working pointer of inputB */ 00531 pSrc2 = pIn2 + (srcBLen - 1u); 00532 py = pSrc2; 00533 00534 /* ------------------- 00535 * Stage3 process 00536 * ------------------*/ 00537 00538 while(blockSize3 > 0) 00539 { 00540 /* Accumulator is made zero for every iteration */ 00541 sum = 0; 00542 00543 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00544 k = count >> 2u; 00545 00546 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00547 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00548 while(k > 0u) 00549 { 00550 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ 00551 sum = (q31_t) ((((q63_t) sum << 32) + 00552 ((q63_t) * px++ * (*py--))) >> 32); 00553 00554 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ 00555 sum = (q31_t) ((((q63_t) sum << 32) + 00556 ((q63_t) * px++ * (*py--))) >> 32); 00557 00558 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ 00559 sum = (q31_t) ((((q63_t) sum << 32) + 00560 ((q63_t) * px++ * (*py--))) >> 32); 00561 00562 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ 00563 sum = (q31_t) ((((q63_t) sum << 32) + 00564 ((q63_t) * px++ * (*py--))) >> 32); 00565 00566 /* Decrement the loop counter */ 00567 k--; 00568 } 00569 00570 /* If the count is not a multiple of 4, compute any remaining MACs here. 00571 ** No loop unrolling is used. */ 00572 k = count % 0x4u; 00573 00574 while(k > 0u) 00575 { 00576 /* Perform the multiply-accumulates */ 00577 /* sum += x[srcALen-1] * y[srcBLen-1] */ 00578 sum = (q31_t) ((((q63_t) sum << 32) + 00579 ((q63_t) * px++ * (*py--))) >> 32); 00580 00581 /* Decrement the loop counter */ 00582 k--; 00583 } 00584 00585 /* Store the result in the accumulator in the destination buffer. */ 00586 *pOut++ = sum << 1; 00587 00588 /* Update the inputA and inputB pointers for next MAC calculation */ 00589 px = ++pSrc1; 00590 py = pSrc2; 00591 00592 /* Decrement the MAC count */ 00593 count--; 00594 00595 /* Decrement the loop counter */ 00596 blockSize3--; 00597 00598 } 00599 00600 /* set status as ARM_MATH_SUCCESS */ 00601 status = ARM_MATH_SUCCESS; 00602 } 00603 00604 /* Return to application */ 00605 return (status); 00606 00607 } 00608 00609 /** 00610 * @} end of PartialConv group 00611 */
Generated on Tue Jul 12 2022 11:59:16 by 1.7.2