CMSIS DSP library
Dependents: KL25Z_FFT_Demo Hat_Board_v5_1 KL25Z_FFT_Demo_tony KL25Z_FFT_Demo_tony ... more
Fork of mbed-dsp by
arm_conv_partial_fast_q31.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 17. January 2013 00005 * $Revision: V1.4.1 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_partial_fast_q31.c 00009 * 00010 * Description: Fast Q31 Partial convolution. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupFilters 00045 */ 00046 00047 /** 00048 * @addtogroup PartialConv 00049 * @{ 00050 */ 00051 00052 /** 00053 * @brief Partial convolution of Q31 sequences (fast version) for Cortex-M3 and Cortex-M4. 00054 * @param[in] *pSrcA points to the first input sequence. 00055 * @param[in] srcALen length of the first input sequence. 00056 * @param[in] *pSrcB points to the second input sequence. 00057 * @param[in] srcBLen length of the second input sequence. 00058 * @param[out] *pDst points to the location where the output result is written. 00059 * @param[in] firstIndex is the first output sample to start with. 00060 * @param[in] numPoints is the number of output points to be computed. 00061 * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. 00062 * 00063 * \par 00064 * See <code>arm_conv_partial_q31()</code> for a slower implementation of this function which uses a 64-bit accumulator to provide higher precision. 00065 */ 00066 00067 arm_status arm_conv_partial_fast_q31( 00068 q31_t * pSrcA, 00069 uint32_t srcALen, 00070 q31_t * pSrcB, 00071 uint32_t srcBLen, 00072 q31_t * pDst, 00073 uint32_t firstIndex, 00074 uint32_t numPoints) 00075 { 00076 q31_t *pIn1; /* inputA pointer */ 00077 q31_t *pIn2; /* inputB pointer */ 00078 q31_t *pOut = pDst; /* output pointer */ 00079 q31_t *px; /* Intermediate inputA pointer */ 00080 q31_t *py; /* Intermediate inputB pointer */ 00081 q31_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00082 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */ 00083 q31_t x0, x1, x2, x3, c0; 00084 uint32_t j, k, count, check, blkCnt; 00085 int32_t blockSize1, blockSize2, blockSize3; /* loop counters */ 00086 arm_status status; /* status of Partial convolution */ 00087 00088 00089 /* Check for range of output samples to be calculated */ 00090 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00091 { 00092 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00093 status = ARM_MATH_ARGUMENT_ERROR; 00094 } 00095 else 00096 { 00097 00098 /* The algorithm implementation is based on the lengths of the inputs. */ 00099 /* srcB is always made to slide across srcA. */ 00100 /* So srcBLen is always considered as shorter or equal to srcALen */ 00101 if(srcALen >= srcBLen) 00102 { 00103 /* Initialization of inputA pointer */ 00104 pIn1 = pSrcA; 00105 00106 /* Initialization of inputB pointer */ 00107 pIn2 = pSrcB; 00108 } 00109 else 00110 { 00111 /* Initialization of inputA pointer */ 00112 pIn1 = pSrcB; 00113 00114 /* Initialization of inputB pointer */ 00115 pIn2 = pSrcA; 00116 00117 /* srcBLen is always considered as shorter or equal to srcALen */ 00118 j = srcBLen; 00119 srcBLen = srcALen; 00120 srcALen = j; 00121 } 00122 00123 /* Conditions to check which loopCounter holds 00124 * the first and last indices of the output samples to be calculated. */ 00125 check = firstIndex + numPoints; 00126 blockSize3 = ((int32_t) check - (int32_t) srcALen); 00127 blockSize3 = (blockSize3 > 0) ? blockSize3 : 0; 00128 blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); 00129 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 : 00130 (int32_t) numPoints) : 0; 00131 blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + 00132 (int32_t) firstIndex); 00133 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; 00134 00135 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00136 /* The function is internally 00137 * divided into three stages according to the number of multiplications that has to be 00138 * taken place between inputA samples and inputB samples. In the first stage of the 00139 * algorithm, the multiplications increase by one for every iteration. 00140 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00141 * In the third stage of the algorithm, the multiplications decrease by one 00142 * for every iteration. */ 00143 00144 /* Set the output pointer to point to the firstIndex 00145 * of the output sample to be calculated. */ 00146 pOut = pDst + firstIndex; 00147 00148 /* -------------------------- 00149 * Initializations of stage1 00150 * -------------------------*/ 00151 00152 /* sum = x[0] * y[0] 00153 * sum = x[0] * y[1] + x[1] * y[0] 00154 * .... 00155 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00156 */ 00157 00158 /* In this stage the MAC operations are increased by 1 for every iteration. 00159 The count variable holds the number of MAC operations performed. 00160 Since the partial convolution starts from firstIndex 00161 Number of Macs to be performed is firstIndex + 1 */ 00162 count = 1u + firstIndex; 00163 00164 /* Working pointer of inputA */ 00165 px = pIn1; 00166 00167 /* Working pointer of inputB */ 00168 pSrc2 = pIn2 + firstIndex; 00169 py = pSrc2; 00170 00171 /* ------------------------ 00172 * Stage1 process 00173 * ----------------------*/ 00174 00175 /* The first loop starts here */ 00176 while(blockSize1 > 0) 00177 { 00178 /* Accumulator is made zero for every iteration */ 00179 sum = 0; 00180 00181 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00182 k = count >> 2u; 00183 00184 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00185 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00186 while(k > 0u) 00187 { 00188 /* x[0] * y[srcBLen - 1] */ 00189 sum = (q31_t) ((((q63_t) sum << 32) + 00190 ((q63_t) * px++ * (*py--))) >> 32); 00191 00192 /* x[1] * y[srcBLen - 2] */ 00193 sum = (q31_t) ((((q63_t) sum << 32) + 00194 ((q63_t) * px++ * (*py--))) >> 32); 00195 00196 /* x[2] * y[srcBLen - 3] */ 00197 sum = (q31_t) ((((q63_t) sum << 32) + 00198 ((q63_t) * px++ * (*py--))) >> 32); 00199 00200 /* x[3] * y[srcBLen - 4] */ 00201 sum = (q31_t) ((((q63_t) sum << 32) + 00202 ((q63_t) * px++ * (*py--))) >> 32); 00203 00204 /* Decrement the loop counter */ 00205 k--; 00206 } 00207 00208 /* If the count is not a multiple of 4, compute any remaining MACs here. 00209 ** No loop unrolling is used. */ 00210 k = count % 0x4u; 00211 00212 while(k > 0u) 00213 { 00214 /* Perform the multiply-accumulates */ 00215 sum = (q31_t) ((((q63_t) sum << 32) + 00216 ((q63_t) * px++ * (*py--))) >> 32); 00217 00218 /* Decrement the loop counter */ 00219 k--; 00220 } 00221 00222 /* Store the result in the accumulator in the destination buffer. */ 00223 *pOut++ = sum << 1; 00224 00225 /* Update the inputA and inputB pointers for next MAC calculation */ 00226 py = ++pSrc2; 00227 px = pIn1; 00228 00229 /* Increment the MAC count */ 00230 count++; 00231 00232 /* Decrement the loop counter */ 00233 blockSize1--; 00234 } 00235 00236 /* -------------------------- 00237 * Initializations of stage2 00238 * ------------------------*/ 00239 00240 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00241 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00242 * .... 00243 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00244 */ 00245 00246 /* Working pointer of inputA */ 00247 px = pIn1; 00248 00249 /* Working pointer of inputB */ 00250 pSrc2 = pIn2 + (srcBLen - 1u); 00251 py = pSrc2; 00252 00253 /* count is index by which the pointer pIn1 to be incremented */ 00254 count = 0u; 00255 00256 /* ------------------- 00257 * Stage2 process 00258 * ------------------*/ 00259 00260 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00261 * So, to loop unroll over blockSize2, 00262 * srcBLen should be greater than or equal to 4 */ 00263 if(srcBLen >= 4u) 00264 { 00265 /* Loop unroll over blockSize2 */ 00266 blkCnt = ((uint32_t) blockSize2 >> 2u); 00267 00268 while(blkCnt > 0u) 00269 { 00270 /* Set all accumulators to zero */ 00271 acc0 = 0; 00272 acc1 = 0; 00273 acc2 = 0; 00274 acc3 = 0; 00275 00276 /* read x[0], x[1], x[2] samples */ 00277 x0 = *(px++); 00278 x1 = *(px++); 00279 x2 = *(px++); 00280 00281 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00282 k = srcBLen >> 2u; 00283 00284 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00285 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00286 do 00287 { 00288 /* Read y[srcBLen - 1] sample */ 00289 c0 = *(py--); 00290 00291 /* Read x[3] sample */ 00292 x3 = *(px++); 00293 00294 /* Perform the multiply-accumulate */ 00295 /* acc0 += x[0] * y[srcBLen - 1] */ 00296 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); 00297 00298 /* acc1 += x[1] * y[srcBLen - 1] */ 00299 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); 00300 00301 /* acc2 += x[2] * y[srcBLen - 1] */ 00302 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); 00303 00304 /* acc3 += x[3] * y[srcBLen - 1] */ 00305 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); 00306 00307 /* Read y[srcBLen - 2] sample */ 00308 c0 = *(py--); 00309 00310 /* Read x[4] sample */ 00311 x0 = *(px++); 00312 00313 /* Perform the multiply-accumulate */ 00314 /* acc0 += x[1] * y[srcBLen - 2] */ 00315 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32); 00316 /* acc1 += x[2] * y[srcBLen - 2] */ 00317 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32); 00318 /* acc2 += x[3] * y[srcBLen - 2] */ 00319 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32); 00320 /* acc3 += x[4] * y[srcBLen - 2] */ 00321 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32); 00322 00323 /* Read y[srcBLen - 3] sample */ 00324 c0 = *(py--); 00325 00326 /* Read x[5] sample */ 00327 x1 = *(px++); 00328 00329 /* Perform the multiply-accumulates */ 00330 /* acc0 += x[2] * y[srcBLen - 3] */ 00331 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32); 00332 /* acc1 += x[3] * y[srcBLen - 2] */ 00333 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32); 00334 /* acc2 += x[4] * y[srcBLen - 2] */ 00335 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32); 00336 /* acc3 += x[5] * y[srcBLen - 2] */ 00337 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32); 00338 00339 /* Read y[srcBLen - 4] sample */ 00340 c0 = *(py--); 00341 00342 /* Read x[6] sample */ 00343 x2 = *(px++); 00344 00345 /* Perform the multiply-accumulates */ 00346 /* acc0 += x[3] * y[srcBLen - 4] */ 00347 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32); 00348 /* acc1 += x[4] * y[srcBLen - 4] */ 00349 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32); 00350 /* acc2 += x[5] * y[srcBLen - 4] */ 00351 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32); 00352 /* acc3 += x[6] * y[srcBLen - 4] */ 00353 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32); 00354 00355 00356 } while(--k); 00357 00358 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00359 ** No loop unrolling is used. */ 00360 k = srcBLen % 0x4u; 00361 00362 while(k > 0u) 00363 { 00364 /* Read y[srcBLen - 5] sample */ 00365 c0 = *(py--); 00366 00367 /* Read x[7] sample */ 00368 x3 = *(px++); 00369 00370 /* Perform the multiply-accumulates */ 00371 /* acc0 += x[4] * y[srcBLen - 5] */ 00372 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); 00373 /* acc1 += x[5] * y[srcBLen - 5] */ 00374 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); 00375 /* acc2 += x[6] * y[srcBLen - 5] */ 00376 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); 00377 /* acc3 += x[7] * y[srcBLen - 5] */ 00378 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); 00379 00380 /* Reuse the present samples for the next MAC */ 00381 x0 = x1; 00382 x1 = x2; 00383 x2 = x3; 00384 00385 /* Decrement the loop counter */ 00386 k--; 00387 } 00388 00389 /* Store the result in the accumulator in the destination buffer. */ 00390 *pOut++ = (q31_t) (acc0 << 1); 00391 *pOut++ = (q31_t) (acc1 << 1); 00392 *pOut++ = (q31_t) (acc2 << 1); 00393 *pOut++ = (q31_t) (acc3 << 1); 00394 00395 /* Increment the pointer pIn1 index, count by 4 */ 00396 count += 4u; 00397 00398 /* Update the inputA and inputB pointers for next MAC calculation */ 00399 px = pIn1 + count; 00400 py = pSrc2; 00401 00402 /* Decrement the loop counter */ 00403 blkCnt--; 00404 } 00405 00406 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00407 ** No loop unrolling is used. */ 00408 blkCnt = (uint32_t) blockSize2 % 0x4u; 00409 00410 while(blkCnt > 0u) 00411 { 00412 /* Accumulator is made zero for every iteration */ 00413 sum = 0; 00414 00415 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00416 k = srcBLen >> 2u; 00417 00418 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00419 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00420 while(k > 0u) 00421 { 00422 /* Perform the multiply-accumulates */ 00423 sum = (q31_t) ((((q63_t) sum << 32) + 00424 ((q63_t) * px++ * (*py--))) >> 32); 00425 sum = (q31_t) ((((q63_t) sum << 32) + 00426 ((q63_t) * px++ * (*py--))) >> 32); 00427 sum = (q31_t) ((((q63_t) sum << 32) + 00428 ((q63_t) * px++ * (*py--))) >> 32); 00429 sum = (q31_t) ((((q63_t) sum << 32) + 00430 ((q63_t) * px++ * (*py--))) >> 32); 00431 00432 /* Decrement the loop counter */ 00433 k--; 00434 } 00435 00436 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00437 ** No loop unrolling is used. */ 00438 k = srcBLen % 0x4u; 00439 00440 while(k > 0u) 00441 { 00442 /* Perform the multiply-accumulate */ 00443 sum = (q31_t) ((((q63_t) sum << 32) + 00444 ((q63_t) * px++ * (*py--))) >> 32); 00445 00446 /* Decrement the loop counter */ 00447 k--; 00448 } 00449 00450 /* Store the result in the accumulator in the destination buffer. */ 00451 *pOut++ = sum << 1; 00452 00453 /* Increment the MAC count */ 00454 count++; 00455 00456 /* Update the inputA and inputB pointers for next MAC calculation */ 00457 px = pIn1 + count; 00458 py = pSrc2; 00459 00460 /* Decrement the loop counter */ 00461 blkCnt--; 00462 } 00463 } 00464 else 00465 { 00466 /* If the srcBLen is not a multiple of 4, 00467 * the blockSize2 loop cannot be unrolled by 4 */ 00468 blkCnt = (uint32_t) blockSize2; 00469 00470 while(blkCnt > 0u) 00471 { 00472 /* Accumulator is made zero for every iteration */ 00473 sum = 0; 00474 00475 /* srcBLen number of MACS should be performed */ 00476 k = srcBLen; 00477 00478 while(k > 0u) 00479 { 00480 /* Perform the multiply-accumulate */ 00481 sum = (q31_t) ((((q63_t) sum << 32) + 00482 ((q63_t) * px++ * (*py--))) >> 32); 00483 00484 /* Decrement the loop counter */ 00485 k--; 00486 } 00487 00488 /* Store the result in the accumulator in the destination buffer. */ 00489 *pOut++ = sum << 1; 00490 00491 /* Increment the MAC count */ 00492 count++; 00493 00494 /* Update the inputA and inputB pointers for next MAC calculation */ 00495 px = pIn1 + count; 00496 py = pSrc2; 00497 00498 /* Decrement the loop counter */ 00499 blkCnt--; 00500 } 00501 } 00502 00503 00504 /* -------------------------- 00505 * Initializations of stage3 00506 * -------------------------*/ 00507 00508 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00509 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00510 * .... 00511 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00512 * sum += x[srcALen-1] * y[srcBLen-1] 00513 */ 00514 00515 /* In this stage the MAC operations are decreased by 1 for every iteration. 00516 The count variable holds the number of MAC operations performed */ 00517 count = srcBLen - 1u; 00518 00519 /* Working pointer of inputA */ 00520 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00521 px = pSrc1; 00522 00523 /* Working pointer of inputB */ 00524 pSrc2 = pIn2 + (srcBLen - 1u); 00525 py = pSrc2; 00526 00527 /* ------------------- 00528 * Stage3 process 00529 * ------------------*/ 00530 00531 while(blockSize3 > 0) 00532 { 00533 /* Accumulator is made zero for every iteration */ 00534 sum = 0; 00535 00536 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00537 k = count >> 2u; 00538 00539 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00540 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00541 while(k > 0u) 00542 { 00543 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ 00544 sum = (q31_t) ((((q63_t) sum << 32) + 00545 ((q63_t) * px++ * (*py--))) >> 32); 00546 00547 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ 00548 sum = (q31_t) ((((q63_t) sum << 32) + 00549 ((q63_t) * px++ * (*py--))) >> 32); 00550 00551 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ 00552 sum = (q31_t) ((((q63_t) sum << 32) + 00553 ((q63_t) * px++ * (*py--))) >> 32); 00554 00555 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ 00556 sum = (q31_t) ((((q63_t) sum << 32) + 00557 ((q63_t) * px++ * (*py--))) >> 32); 00558 00559 /* Decrement the loop counter */ 00560 k--; 00561 } 00562 00563 /* If the count is not a multiple of 4, compute any remaining MACs here. 00564 ** No loop unrolling is used. */ 00565 k = count % 0x4u; 00566 00567 while(k > 0u) 00568 { 00569 /* Perform the multiply-accumulates */ 00570 /* sum += x[srcALen-1] * y[srcBLen-1] */ 00571 sum = (q31_t) ((((q63_t) sum << 32) + 00572 ((q63_t) * px++ * (*py--))) >> 32); 00573 00574 /* Decrement the loop counter */ 00575 k--; 00576 } 00577 00578 /* Store the result in the accumulator in the destination buffer. */ 00579 *pOut++ = sum << 1; 00580 00581 /* Update the inputA and inputB pointers for next MAC calculation */ 00582 px = ++pSrc1; 00583 py = pSrc2; 00584 00585 /* Decrement the MAC count */ 00586 count--; 00587 00588 /* Decrement the loop counter */ 00589 blockSize3--; 00590 00591 } 00592 00593 /* set status as ARM_MATH_SUCCESS */ 00594 status = ARM_MATH_SUCCESS; 00595 } 00596 00597 /* Return to application */ 00598 return (status); 00599 00600 } 00601 00602 /** 00603 * @} end of PartialConv group 00604 */
Generated on Tue Jul 12 2022 12:36:54 by 1.7.2