CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details
Dependents: K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more
arm_conv_partial_q7.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 29. November 2010 00005 * $Revision: V1.0.3 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_partial_q7.c 00009 * 00010 * Description: Q7 Partial convolution. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Version 1.0.3 2010/11/29 00015 * Re-organized the CMSIS folders and updated documentation. 00016 * 00017 * Version 1.0.2 2010/11/11 00018 * Documentation updated. 00019 * 00020 * Version 1.0.1 2010/10/05 00021 * Production release and review comments incorporated. 00022 * 00023 * Version 1.0.0 2010/09/20 00024 * Production release and review comments incorporated 00025 * 00026 * Version 0.0.7 2010/06/10 00027 * Misra-C changes done 00028 * 00029 * -------------------------------------------------------------------- */ 00030 00031 #include "arm_math.h" 00032 00033 /** 00034 * @ingroup groupFilters 00035 */ 00036 00037 /** 00038 * @addtogroup PartialConv 00039 * @{ 00040 */ 00041 00042 /** 00043 * @brief Partial convolution of Q7 sequences 00044 * @param[in] *pSrcA points to the first input sequence. 00045 * @param[in] srcALen length of the first input sequence. 00046 * @param[in] *pSrcB points to the second input sequence. 00047 * @param[in] srcBLen length of the second input sequence. 00048 * @param[out] *pDst points to the location where the output result is written. 00049 * @param[in] firstIndex is the first output sample to start with. 00050 * @param[in] numPoints is the number of output points to be computed. 00051 * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. 00052 * 00053 */ 00054 00055 arm_status arm_conv_partial_q7( 00056 q7_t * pSrcA, 00057 uint32_t srcALen, 00058 q7_t * pSrcB, 00059 uint32_t srcBLen, 00060 q7_t * pDst, 00061 uint32_t firstIndex, 00062 uint32_t numPoints) 00063 { 00064 q7_t *pIn1; /* inputA pointer */ 00065 q7_t *pIn2; /* inputB pointer */ 00066 q7_t *pOut = pDst; /* output pointer */ 00067 q7_t *px; /* Intermediate inputA pointer */ 00068 q7_t *py; /* Intermediate inputB pointer */ 00069 q7_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00070 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00071 q31_t input1, input2; 00072 q15_t in1, in2; 00073 q7_t x0, x1, x2, x3, c0, c1; 00074 uint32_t j, k, count, check, blkCnt; 00075 int32_t blockSize1, blockSize2, blockSize3; /* loop counter */ 00076 arm_status status; 00077 00078 00079 /* Check for range of output samples to be calculated */ 00080 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00081 { 00082 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00083 status = ARM_MATH_ARGUMENT_ERROR; 00084 } 00085 else 00086 { 00087 00088 /* The algorithm implementation is based on the lengths of the inputs. */ 00089 /* srcB is always made to slide across srcA. */ 00090 /* So srcBLen is always considered as shorter or equal to srcALen */ 00091 if(srcALen >= srcBLen) 00092 { 00093 /* Initialization of inputA pointer */ 00094 pIn1 = pSrcA; 00095 00096 /* Initialization of inputB pointer */ 00097 pIn2 = pSrcB; 00098 } 00099 else 00100 { 00101 /* Initialization of inputA pointer */ 00102 pIn1 = pSrcB; 00103 00104 /* Initialization of inputB pointer */ 00105 pIn2 = pSrcA; 00106 00107 /* srcBLen is always considered as shorter or equal to srcALen */ 00108 j = srcBLen; 00109 srcBLen = srcALen; 00110 srcALen = j; 00111 } 00112 00113 /* Conditions to check which loopCounter holds 00114 * the first and last indices of the output samples to be calculated. */ 00115 check = firstIndex + numPoints; 00116 blockSize3 = ((int32_t) check - (int32_t) srcALen); 00117 blockSize3 = (blockSize3 > 0) ? blockSize3 : 0; 00118 blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); 00119 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 : 00120 (int32_t) numPoints) : 0; 00121 blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + 00122 (int32_t) firstIndex); 00123 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; 00124 00125 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00126 /* The function is internally 00127 * divided into three stages according to the number of multiplications that has to be 00128 * taken place between inputA samples and inputB samples. In the first stage of the 00129 * algorithm, the multiplications increase by one for every iteration. 00130 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00131 * In the third stage of the algorithm, the multiplications decrease by one 00132 * for every iteration. */ 00133 00134 /* Set the output pointer to point to the firstIndex 00135 * of the output sample to be calculated. */ 00136 pOut = pDst + firstIndex; 00137 00138 /* -------------------------- 00139 * Initializations of stage1 00140 * -------------------------*/ 00141 00142 /* sum = x[0] * y[0] 00143 * sum = x[0] * y[1] + x[1] * y[0] 00144 * .... 00145 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00146 */ 00147 00148 /* In this stage the MAC operations are increased by 1 for every iteration. 00149 The count variable holds the number of MAC operations performed. 00150 Since the partial convolution starts from from firstIndex 00151 Number of Macs to be performed is firstIndex + 1 */ 00152 count = 1u + firstIndex; 00153 00154 /* Working pointer of inputA */ 00155 px = pIn1; 00156 00157 /* Working pointer of inputB */ 00158 pSrc2 = pIn2 + firstIndex; 00159 py = pSrc2; 00160 00161 /* ------------------------ 00162 * Stage1 process 00163 * ----------------------*/ 00164 00165 /* The first stage starts here */ 00166 while(blockSize1 > 0) 00167 { 00168 /* Accumulator is made zero for every iteration */ 00169 sum = 0; 00170 00171 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00172 k = count >> 2u; 00173 00174 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00175 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00176 while(k > 0u) 00177 { 00178 /* x[0] , x[1] */ 00179 in1 = (q15_t) * px++; 00180 in2 = (q15_t) * px++; 00181 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00182 00183 /* y[srcBLen - 1] , y[srcBLen - 2] */ 00184 in1 = (q15_t) * py--; 00185 in2 = (q15_t) * py--; 00186 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00187 00188 /* x[0] * y[srcBLen - 1] */ 00189 /* x[1] * y[srcBLen - 2] */ 00190 sum = __SMLAD(input1, input2, sum); 00191 00192 /* x[2] , x[3] */ 00193 in1 = (q15_t) * px++; 00194 in2 = (q15_t) * px++; 00195 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00196 00197 /* y[srcBLen - 3] , y[srcBLen - 4] */ 00198 in1 = (q15_t) * py--; 00199 in2 = (q15_t) * py--; 00200 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00201 00202 /* x[2] * y[srcBLen - 3] */ 00203 /* x[3] * y[srcBLen - 4] */ 00204 sum = __SMLAD(input1, input2, sum); 00205 00206 /* Decrement the loop counter */ 00207 k--; 00208 } 00209 00210 /* If the count is not a multiple of 4, compute any remaining MACs here. 00211 ** No loop unrolling is used. */ 00212 k = count % 0x4u; 00213 00214 while(k > 0u) 00215 { 00216 /* Perform the multiply-accumulates */ 00217 sum += ((q31_t) * px++ * *py--); 00218 00219 /* Decrement the loop counter */ 00220 k--; 00221 } 00222 00223 /* Store the result in the accumulator in the destination buffer. */ 00224 *pOut++ = (q7_t) (__SSAT(sum >> 7, 8)); 00225 00226 /* Update the inputA and inputB pointers for next MAC calculation */ 00227 py = ++pSrc2; 00228 px = pIn1; 00229 00230 /* Increment the MAC count */ 00231 count++; 00232 00233 /* Decrement the loop counter */ 00234 blockSize1--; 00235 } 00236 00237 /* -------------------------- 00238 * Initializations of stage2 00239 * ------------------------*/ 00240 00241 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00242 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00243 * .... 00244 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00245 */ 00246 00247 /* Working pointer of inputA */ 00248 px = pIn1; 00249 00250 /* Working pointer of inputB */ 00251 pSrc2 = pIn2 + (srcBLen - 1u); 00252 py = pSrc2; 00253 00254 /* count is index by which the pointer pIn1 to be incremented */ 00255 count = 1u; 00256 00257 /* ------------------- 00258 * Stage2 process 00259 * ------------------*/ 00260 00261 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00262 * So, to loop unroll over blockSize2, 00263 * srcBLen should be greater than or equal to 4 */ 00264 if(srcBLen >= 4u) 00265 { 00266 /* Loop unroll over blockSize2, by 4 */ 00267 blkCnt = ((uint32_t) blockSize2 >> 2u); 00268 00269 while(blkCnt > 0u) 00270 { 00271 /* Set all accumulators to zero */ 00272 acc0 = 0; 00273 acc1 = 0; 00274 acc2 = 0; 00275 acc3 = 0; 00276 00277 /* read x[0], x[1], x[2] samples */ 00278 x0 = *(px++); 00279 x1 = *(px++); 00280 x2 = *(px++); 00281 00282 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00283 k = srcBLen >> 2u; 00284 00285 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00286 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00287 do 00288 { 00289 /* Read y[srcBLen - 1] sample */ 00290 c0 = *(py--); 00291 /* Read y[srcBLen - 2] sample */ 00292 c1 = *(py--); 00293 00294 /* Read x[3] sample */ 00295 x3 = *(px++); 00296 00297 /* x[0] and x[1] are packed */ 00298 in1 = (q15_t) x0; 00299 in2 = (q15_t) x1; 00300 00301 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00302 00303 /* y[srcBLen - 1] and y[srcBLen - 2] are packed */ 00304 in1 = (q15_t) c0; 00305 in2 = (q15_t) c1; 00306 00307 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00308 00309 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ 00310 acc0 = __SMLAD(input1, input2, acc0); 00311 00312 /* x[1] and x[2] are packed */ 00313 in1 = (q15_t) x1; 00314 in2 = (q15_t) x2; 00315 00316 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00317 00318 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ 00319 acc1 = __SMLAD(input1, input2, acc1); 00320 00321 /* x[2] and x[3] are packed */ 00322 in1 = (q15_t) x2; 00323 in2 = (q15_t) x3; 00324 00325 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00326 00327 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ 00328 acc2 = __SMLAD(input1, input2, acc2); 00329 00330 /* Read x[4] sample */ 00331 x0 = *(px++); 00332 00333 /* x[3] and x[4] are packed */ 00334 in1 = (q15_t) x3; 00335 in2 = (q15_t) x0; 00336 00337 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00338 00339 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ 00340 acc3 = __SMLAD(input1, input2, acc3); 00341 00342 /* Read y[srcBLen - 3] sample */ 00343 c0 = *(py--); 00344 /* Read y[srcBLen - 4] sample */ 00345 c1 = *(py--); 00346 00347 /* Read x[5] sample */ 00348 x1 = *(px++); 00349 00350 /* x[2] and x[3] are packed */ 00351 in1 = (q15_t) x2; 00352 in2 = (q15_t) x3; 00353 00354 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00355 00356 /* y[srcBLen - 3] and y[srcBLen - 4] are packed */ 00357 in1 = (q15_t) c0; 00358 in2 = (q15_t) c1; 00359 00360 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00361 00362 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ 00363 acc0 = __SMLAD(input1, input2, acc0); 00364 00365 /* x[3] and x[4] are packed */ 00366 in1 = (q15_t) x3; 00367 in2 = (q15_t) x0; 00368 00369 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00370 00371 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ 00372 acc1 = __SMLAD(input1, input2, acc1); 00373 00374 /* x[4] and x[5] are packed */ 00375 in1 = (q15_t) x0; 00376 in2 = (q15_t) x1; 00377 00378 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00379 00380 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ 00381 acc2 = __SMLAD(input1, input2, acc2); 00382 00383 /* Read x[6] sample */ 00384 x2 = *(px++); 00385 00386 /* x[5] and x[6] are packed */ 00387 in1 = (q15_t) x1; 00388 in2 = (q15_t) x2; 00389 00390 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00391 00392 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ 00393 acc3 = __SMLAD(input1, input2, acc3); 00394 00395 } while(--k); 00396 00397 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00398 ** No loop unrolling is used. */ 00399 k = srcBLen % 0x4u; 00400 00401 while(k > 0u) 00402 { 00403 /* Read y[srcBLen - 5] sample */ 00404 c0 = *(py--); 00405 00406 /* Read x[7] sample */ 00407 x3 = *(px++); 00408 00409 /* Perform the multiply-accumulates */ 00410 /* acc0 += x[4] * y[srcBLen - 5] */ 00411 acc0 += ((q31_t) x0 * c0); 00412 /* acc1 += x[5] * y[srcBLen - 5] */ 00413 acc1 += ((q31_t) x1 * c0); 00414 /* acc2 += x[6] * y[srcBLen - 5] */ 00415 acc2 += ((q31_t) x2 * c0); 00416 /* acc3 += x[7] * y[srcBLen - 5] */ 00417 acc3 += ((q31_t) x3 * c0); 00418 00419 /* Reuse the present samples for the next MAC */ 00420 x0 = x1; 00421 x1 = x2; 00422 x2 = x3; 00423 00424 /* Decrement the loop counter */ 00425 k--; 00426 } 00427 00428 /* Store the result in the accumulator in the destination buffer. */ 00429 *pOut++ = (q7_t) (__SSAT(acc0 >> 7, 8)); 00430 *pOut++ = (q7_t) (__SSAT(acc1 >> 7, 8)); 00431 *pOut++ = (q7_t) (__SSAT(acc2 >> 7, 8)); 00432 *pOut++ = (q7_t) (__SSAT(acc3 >> 7, 8)); 00433 00434 /* Update the inputA and inputB pointers for next MAC calculation */ 00435 px = pIn1 + count * 4u; 00436 py = pSrc2; 00437 00438 /* Increment the pointer pIn1 index, count by 1 */ 00439 count++; 00440 00441 /* Decrement the loop counter */ 00442 blkCnt--; 00443 } 00444 00445 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00446 ** No loop unrolling is used. */ 00447 blkCnt = (uint32_t) blockSize2 % 0x4u; 00448 00449 while(blkCnt > 0u) 00450 { 00451 /* Accumulator is made zero for every iteration */ 00452 sum = 0; 00453 00454 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00455 k = srcBLen >> 2u; 00456 00457 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00458 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00459 while(k > 0u) 00460 { 00461 00462 /* Reading two inputs of SrcA buffer and packing */ 00463 in1 = (q15_t) * px++; 00464 in2 = (q15_t) * px++; 00465 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00466 00467 /* Reading two inputs of SrcB buffer and packing */ 00468 in1 = (q15_t) * py--; 00469 in2 = (q15_t) * py--; 00470 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00471 00472 /* Perform the multiply-accumulates */ 00473 sum = __SMLAD(input1, input2, sum); 00474 00475 /* Reading two inputs of SrcA buffer and packing */ 00476 in1 = (q15_t) * px++; 00477 in2 = (q15_t) * px++; 00478 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00479 00480 /* Reading two inputs of SrcB buffer and packing */ 00481 in1 = (q15_t) * py--; 00482 in2 = (q15_t) * py--; 00483 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00484 00485 /* Perform the multiply-accumulates */ 00486 sum = __SMLAD(input1, input2, sum); 00487 00488 /* Decrement the loop counter */ 00489 k--; 00490 } 00491 00492 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00493 ** No loop unrolling is used. */ 00494 k = srcBLen % 0x4u; 00495 00496 while(k > 0u) 00497 { 00498 /* Perform the multiply-accumulates */ 00499 sum += ((q31_t) * px++ * *py--); 00500 00501 /* Decrement the loop counter */ 00502 k--; 00503 } 00504 00505 /* Store the result in the accumulator in the destination buffer. */ 00506 *pOut++ = (q7_t) (__SSAT(sum >> 7, 8)); 00507 00508 /* Update the inputA and inputB pointers for next MAC calculation */ 00509 px = pIn1 + count; 00510 py = pSrc2; 00511 00512 /* Increment the pointer pIn1 index, count by 1 */ 00513 count++; 00514 00515 /* Decrement the loop counter */ 00516 blkCnt--; 00517 } 00518 } 00519 else 00520 { 00521 /* If the srcBLen is not a multiple of 4, 00522 * the blockSize2 loop cannot be unrolled by 4 */ 00523 blkCnt = (uint32_t) blockSize2; 00524 00525 while(blkCnt > 0u) 00526 { 00527 /* Accumulator is made zero for every iteration */ 00528 sum = 0; 00529 00530 /* srcBLen number of MACS should be performed */ 00531 k = srcBLen; 00532 00533 while(k > 0u) 00534 { 00535 /* Perform the multiply-accumulate */ 00536 sum += ((q31_t) * px++ * *py--); 00537 00538 /* Decrement the loop counter */ 00539 k--; 00540 } 00541 00542 /* Store the result in the accumulator in the destination buffer. */ 00543 *pOut++ = (q7_t) (__SSAT(sum >> 7, 8)); 00544 00545 /* Update the inputA and inputB pointers for next MAC calculation */ 00546 px = pIn1 + count; 00547 py = pSrc2; 00548 00549 /* Increment the MAC count */ 00550 count++; 00551 00552 /* Decrement the loop counter */ 00553 blkCnt--; 00554 } 00555 } 00556 00557 00558 /* -------------------------- 00559 * Initializations of stage3 00560 * -------------------------*/ 00561 00562 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00563 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00564 * .... 00565 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00566 * sum += x[srcALen-1] * y[srcBLen-1] 00567 */ 00568 00569 /* In this stage the MAC operations are decreased by 1 for every iteration. 00570 The count variable holds the number of MAC operations performed */ 00571 count = srcBLen - 1u; 00572 00573 /* Working pointer of inputA */ 00574 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00575 px = pSrc1; 00576 00577 /* Working pointer of inputB */ 00578 pSrc2 = pIn2 + (srcBLen - 1u); 00579 py = pSrc2; 00580 00581 /* ------------------- 00582 * Stage3 process 00583 * ------------------*/ 00584 00585 while(blockSize3 > 0) 00586 { 00587 /* Accumulator is made zero for every iteration */ 00588 sum = 0; 00589 00590 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00591 k = count >> 2u; 00592 00593 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00594 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00595 while(k > 0u) 00596 { 00597 /* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */ 00598 in1 = (q15_t) * px++; 00599 in2 = (q15_t) * px++; 00600 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00601 00602 /* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */ 00603 in1 = (q15_t) * py--; 00604 in2 = (q15_t) * py--; 00605 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00606 00607 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ 00608 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ 00609 sum = __SMLAD(input1, input2, sum); 00610 00611 /* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */ 00612 in1 = (q15_t) * px++; 00613 in2 = (q15_t) * px++; 00614 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00615 00616 /* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */ 00617 in1 = (q15_t) * py--; 00618 in2 = (q15_t) * py--; 00619 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00620 00621 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ 00622 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ 00623 sum = __SMLAD(input1, input2, sum); 00624 00625 /* Decrement the loop counter */ 00626 k--; 00627 } 00628 00629 /* If the count is not a multiple of 4, compute any remaining MACs here. 00630 ** No loop unrolling is used. */ 00631 k = count % 0x4u; 00632 00633 while(k > 0u) 00634 { 00635 /* Perform the multiply-accumulates */ 00636 /* sum += x[srcALen-1] * y[srcBLen-1] */ 00637 sum += ((q31_t) * px++ * *py--); 00638 00639 /* Decrement the loop counter */ 00640 k--; 00641 } 00642 00643 /* Store the result in the accumulator in the destination buffer. */ 00644 *pOut++ = (q7_t) (__SSAT(sum >> 7, 8)); 00645 00646 /* Update the inputA and inputB pointers for next MAC calculation */ 00647 px = ++pSrc1; 00648 py = pSrc2; 00649 00650 /* Decrement the MAC count */ 00651 count--; 00652 00653 /* Decrement the loop counter */ 00654 blockSize3--; 00655 00656 } 00657 00658 /* set status as ARM_MATH_SUCCESS */ 00659 status = ARM_MATH_SUCCESS; 00660 } 00661 00662 /* Return to application */ 00663 return (status); 00664 00665 } 00666 00667 /** 00668 * @} end of PartialConv group 00669 */
Generated on Tue Jul 12 2022 14:13:52 by 1.7.2