CMSIS DSP library
Dependents: performance_timer Surfboard_ gps2rtty Capstone ... more
arm_conv_partial_q7.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 19. March 2015 00005 * $Revision: V.1.4.5 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_partial_q7.c 00009 * 00010 * Description: Partial convolution of Q7 sequences. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupFilters 00045 */ 00046 00047 /** 00048 * @addtogroup PartialConv 00049 * @{ 00050 */ 00051 00052 /** 00053 * @brief Partial convolution of Q7 sequences. 00054 * @param[in] *pSrcA points to the first input sequence. 00055 * @param[in] srcALen length of the first input sequence. 00056 * @param[in] *pSrcB points to the second input sequence. 00057 * @param[in] srcBLen length of the second input sequence. 00058 * @param[out] *pDst points to the location where the output result is written. 00059 * @param[in] firstIndex is the first output sample to start with. 00060 * @param[in] numPoints is the number of output points to be computed. 00061 * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. 00062 * 00063 * \par 00064 * Refer the function <code>arm_conv_partial_opt_q7()</code> for a faster implementation of this function. 00065 * 00066 */ 00067 00068 arm_status arm_conv_partial_q7( 00069 q7_t * pSrcA, 00070 uint32_t srcALen, 00071 q7_t * pSrcB, 00072 uint32_t srcBLen, 00073 q7_t * pDst, 00074 uint32_t firstIndex, 00075 uint32_t numPoints) 00076 { 00077 00078 00079 #ifndef ARM_MATH_CM0_FAMILY 00080 00081 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00082 00083 q7_t *pIn1; /* inputA pointer */ 00084 q7_t *pIn2; /* inputB pointer */ 00085 q7_t *pOut = pDst; /* output pointer */ 00086 q7_t *px; /* Intermediate inputA pointer */ 00087 q7_t *py; /* Intermediate inputB pointer */ 00088 q7_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00089 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00090 q31_t input1, input2; 00091 q15_t in1, in2; 00092 q7_t x0, x1, x2, x3, c0, c1; 00093 uint32_t j, k, count, check, blkCnt; 00094 int32_t blockSize1, blockSize2, blockSize3; /* loop counter */ 00095 arm_status status; 00096 00097 00098 /* Check for range of output samples to be calculated */ 00099 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00100 { 00101 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00102 status = ARM_MATH_ARGUMENT_ERROR; 00103 } 00104 else 00105 { 00106 00107 /* The algorithm implementation is based on the lengths of the inputs. */ 00108 /* srcB is always made to slide across srcA. */ 00109 /* So srcBLen is always considered as shorter or equal to srcALen */ 00110 if(srcALen >= srcBLen) 00111 { 00112 /* Initialization of inputA pointer */ 00113 pIn1 = pSrcA; 00114 00115 /* Initialization of inputB pointer */ 00116 pIn2 = pSrcB; 00117 } 00118 else 00119 { 00120 /* Initialization of inputA pointer */ 00121 pIn1 = pSrcB; 00122 00123 /* Initialization of inputB pointer */ 00124 pIn2 = pSrcA; 00125 00126 /* srcBLen is always considered as shorter or equal to srcALen */ 00127 j = srcBLen; 00128 srcBLen = srcALen; 00129 srcALen = j; 00130 } 00131 00132 /* Conditions to check which loopCounter holds 00133 * the first and last indices of the output samples to be calculated. */ 00134 check = firstIndex + numPoints; 00135 blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0; 00136 blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3; 00137 blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); 00138 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 : 00139 (int32_t) numPoints) : 0; 00140 blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + 00141 (int32_t) firstIndex); 00142 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; 00143 00144 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00145 /* The function is internally 00146 * divided into three stages according to the number of multiplications that has to be 00147 * taken place between inputA samples and inputB samples. In the first stage of the 00148 * algorithm, the multiplications increase by one for every iteration. 00149 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00150 * In the third stage of the algorithm, the multiplications decrease by one 00151 * for every iteration. */ 00152 00153 /* Set the output pointer to point to the firstIndex 00154 * of the output sample to be calculated. */ 00155 pOut = pDst + firstIndex; 00156 00157 /* -------------------------- 00158 * Initializations of stage1 00159 * -------------------------*/ 00160 00161 /* sum = x[0] * y[0] 00162 * sum = x[0] * y[1] + x[1] * y[0] 00163 * .... 00164 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00165 */ 00166 00167 /* In this stage the MAC operations are increased by 1 for every iteration. 00168 The count variable holds the number of MAC operations performed. 00169 Since the partial convolution starts from from firstIndex 00170 Number of Macs to be performed is firstIndex + 1 */ 00171 count = 1u + firstIndex; 00172 00173 /* Working pointer of inputA */ 00174 px = pIn1; 00175 00176 /* Working pointer of inputB */ 00177 pSrc2 = pIn2 + firstIndex; 00178 py = pSrc2; 00179 00180 /* ------------------------ 00181 * Stage1 process 00182 * ----------------------*/ 00183 00184 /* The first stage starts here */ 00185 while(blockSize1 > 0) 00186 { 00187 /* Accumulator is made zero for every iteration */ 00188 sum = 0; 00189 00190 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00191 k = count >> 2u; 00192 00193 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00194 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00195 while(k > 0u) 00196 { 00197 /* x[0] , x[1] */ 00198 in1 = (q15_t) * px++; 00199 in2 = (q15_t) * px++; 00200 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00201 00202 /* y[srcBLen - 1] , y[srcBLen - 2] */ 00203 in1 = (q15_t) * py--; 00204 in2 = (q15_t) * py--; 00205 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00206 00207 /* x[0] * y[srcBLen - 1] */ 00208 /* x[1] * y[srcBLen - 2] */ 00209 sum = __SMLAD(input1, input2, sum); 00210 00211 /* x[2] , x[3] */ 00212 in1 = (q15_t) * px++; 00213 in2 = (q15_t) * px++; 00214 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00215 00216 /* y[srcBLen - 3] , y[srcBLen - 4] */ 00217 in1 = (q15_t) * py--; 00218 in2 = (q15_t) * py--; 00219 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00220 00221 /* x[2] * y[srcBLen - 3] */ 00222 /* x[3] * y[srcBLen - 4] */ 00223 sum = __SMLAD(input1, input2, sum); 00224 00225 /* Decrement the loop counter */ 00226 k--; 00227 } 00228 00229 /* If the count is not a multiple of 4, compute any remaining MACs here. 00230 ** No loop unrolling is used. */ 00231 k = count % 0x4u; 00232 00233 while(k > 0u) 00234 { 00235 /* Perform the multiply-accumulates */ 00236 sum += ((q31_t) * px++ * *py--); 00237 00238 /* Decrement the loop counter */ 00239 k--; 00240 } 00241 00242 /* Store the result in the accumulator in the destination buffer. */ 00243 *pOut++ = (q7_t) (__SSAT(sum >> 7, 8)); 00244 00245 /* Update the inputA and inputB pointers for next MAC calculation */ 00246 py = ++pSrc2; 00247 px = pIn1; 00248 00249 /* Increment the MAC count */ 00250 count++; 00251 00252 /* Decrement the loop counter */ 00253 blockSize1--; 00254 } 00255 00256 /* -------------------------- 00257 * Initializations of stage2 00258 * ------------------------*/ 00259 00260 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00261 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00262 * .... 00263 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00264 */ 00265 00266 /* Working pointer of inputA */ 00267 if((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0) 00268 { 00269 px = pIn1 + firstIndex - srcBLen + 1; 00270 } 00271 else 00272 { 00273 px = pIn1; 00274 } 00275 00276 /* Working pointer of inputB */ 00277 pSrc2 = pIn2 + (srcBLen - 1u); 00278 py = pSrc2; 00279 00280 /* count is index by which the pointer pIn1 to be incremented */ 00281 count = 0u; 00282 00283 /* ------------------- 00284 * Stage2 process 00285 * ------------------*/ 00286 00287 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00288 * So, to loop unroll over blockSize2, 00289 * srcBLen should be greater than or equal to 4 */ 00290 if(srcBLen >= 4u) 00291 { 00292 /* Loop unroll over blockSize2, by 4 */ 00293 blkCnt = ((uint32_t) blockSize2 >> 2u); 00294 00295 while(blkCnt > 0u) 00296 { 00297 /* Set all accumulators to zero */ 00298 acc0 = 0; 00299 acc1 = 0; 00300 acc2 = 0; 00301 acc3 = 0; 00302 00303 /* read x[0], x[1], x[2] samples */ 00304 x0 = *(px++); 00305 x1 = *(px++); 00306 x2 = *(px++); 00307 00308 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00309 k = srcBLen >> 2u; 00310 00311 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00312 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00313 do 00314 { 00315 /* Read y[srcBLen - 1] sample */ 00316 c0 = *(py--); 00317 /* Read y[srcBLen - 2] sample */ 00318 c1 = *(py--); 00319 00320 /* Read x[3] sample */ 00321 x3 = *(px++); 00322 00323 /* x[0] and x[1] are packed */ 00324 in1 = (q15_t) x0; 00325 in2 = (q15_t) x1; 00326 00327 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00328 00329 /* y[srcBLen - 1] and y[srcBLen - 2] are packed */ 00330 in1 = (q15_t) c0; 00331 in2 = (q15_t) c1; 00332 00333 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00334 00335 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ 00336 acc0 = __SMLAD(input1, input2, acc0); 00337 00338 /* x[1] and x[2] are packed */ 00339 in1 = (q15_t) x1; 00340 in2 = (q15_t) x2; 00341 00342 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00343 00344 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ 00345 acc1 = __SMLAD(input1, input2, acc1); 00346 00347 /* x[2] and x[3] are packed */ 00348 in1 = (q15_t) x2; 00349 in2 = (q15_t) x3; 00350 00351 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00352 00353 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ 00354 acc2 = __SMLAD(input1, input2, acc2); 00355 00356 /* Read x[4] sample */ 00357 x0 = *(px++); 00358 00359 /* x[3] and x[4] are packed */ 00360 in1 = (q15_t) x3; 00361 in2 = (q15_t) x0; 00362 00363 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00364 00365 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ 00366 acc3 = __SMLAD(input1, input2, acc3); 00367 00368 /* Read y[srcBLen - 3] sample */ 00369 c0 = *(py--); 00370 /* Read y[srcBLen - 4] sample */ 00371 c1 = *(py--); 00372 00373 /* Read x[5] sample */ 00374 x1 = *(px++); 00375 00376 /* x[2] and x[3] are packed */ 00377 in1 = (q15_t) x2; 00378 in2 = (q15_t) x3; 00379 00380 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00381 00382 /* y[srcBLen - 3] and y[srcBLen - 4] are packed */ 00383 in1 = (q15_t) c0; 00384 in2 = (q15_t) c1; 00385 00386 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00387 00388 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ 00389 acc0 = __SMLAD(input1, input2, acc0); 00390 00391 /* x[3] and x[4] are packed */ 00392 in1 = (q15_t) x3; 00393 in2 = (q15_t) x0; 00394 00395 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00396 00397 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ 00398 acc1 = __SMLAD(input1, input2, acc1); 00399 00400 /* x[4] and x[5] are packed */ 00401 in1 = (q15_t) x0; 00402 in2 = (q15_t) x1; 00403 00404 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00405 00406 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ 00407 acc2 = __SMLAD(input1, input2, acc2); 00408 00409 /* Read x[6] sample */ 00410 x2 = *(px++); 00411 00412 /* x[5] and x[6] are packed */ 00413 in1 = (q15_t) x1; 00414 in2 = (q15_t) x2; 00415 00416 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00417 00418 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ 00419 acc3 = __SMLAD(input1, input2, acc3); 00420 00421 } while(--k); 00422 00423 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00424 ** No loop unrolling is used. */ 00425 k = srcBLen % 0x4u; 00426 00427 while(k > 0u) 00428 { 00429 /* Read y[srcBLen - 5] sample */ 00430 c0 = *(py--); 00431 00432 /* Read x[7] sample */ 00433 x3 = *(px++); 00434 00435 /* Perform the multiply-accumulates */ 00436 /* acc0 += x[4] * y[srcBLen - 5] */ 00437 acc0 += ((q31_t) x0 * c0); 00438 /* acc1 += x[5] * y[srcBLen - 5] */ 00439 acc1 += ((q31_t) x1 * c0); 00440 /* acc2 += x[6] * y[srcBLen - 5] */ 00441 acc2 += ((q31_t) x2 * c0); 00442 /* acc3 += x[7] * y[srcBLen - 5] */ 00443 acc3 += ((q31_t) x3 * c0); 00444 00445 /* Reuse the present samples for the next MAC */ 00446 x0 = x1; 00447 x1 = x2; 00448 x2 = x3; 00449 00450 /* Decrement the loop counter */ 00451 k--; 00452 } 00453 00454 /* Store the result in the accumulator in the destination buffer. */ 00455 *pOut++ = (q7_t) (__SSAT(acc0 >> 7, 8)); 00456 *pOut++ = (q7_t) (__SSAT(acc1 >> 7, 8)); 00457 *pOut++ = (q7_t) (__SSAT(acc2 >> 7, 8)); 00458 *pOut++ = (q7_t) (__SSAT(acc3 >> 7, 8)); 00459 00460 /* Increment the pointer pIn1 index, count by 4 */ 00461 count += 4u; 00462 00463 /* Update the inputA and inputB pointers for next MAC calculation */ 00464 px = pIn1 + count; 00465 py = pSrc2; 00466 00467 00468 /* Decrement the loop counter */ 00469 blkCnt--; 00470 } 00471 00472 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00473 ** No loop unrolling is used. */ 00474 blkCnt = (uint32_t) blockSize2 % 0x4u; 00475 00476 while(blkCnt > 0u) 00477 { 00478 /* Accumulator is made zero for every iteration */ 00479 sum = 0; 00480 00481 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00482 k = srcBLen >> 2u; 00483 00484 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00485 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00486 while(k > 0u) 00487 { 00488 00489 /* Reading two inputs of SrcA buffer and packing */ 00490 in1 = (q15_t) * px++; 00491 in2 = (q15_t) * px++; 00492 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00493 00494 /* Reading two inputs of SrcB buffer and packing */ 00495 in1 = (q15_t) * py--; 00496 in2 = (q15_t) * py--; 00497 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00498 00499 /* Perform the multiply-accumulates */ 00500 sum = __SMLAD(input1, input2, sum); 00501 00502 /* Reading two inputs of SrcA buffer and packing */ 00503 in1 = (q15_t) * px++; 00504 in2 = (q15_t) * px++; 00505 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00506 00507 /* Reading two inputs of SrcB buffer and packing */ 00508 in1 = (q15_t) * py--; 00509 in2 = (q15_t) * py--; 00510 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00511 00512 /* Perform the multiply-accumulates */ 00513 sum = __SMLAD(input1, input2, sum); 00514 00515 /* Decrement the loop counter */ 00516 k--; 00517 } 00518 00519 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00520 ** No loop unrolling is used. */ 00521 k = srcBLen % 0x4u; 00522 00523 while(k > 0u) 00524 { 00525 /* Perform the multiply-accumulates */ 00526 sum += ((q31_t) * px++ * *py--); 00527 00528 /* Decrement the loop counter */ 00529 k--; 00530 } 00531 00532 /* Store the result in the accumulator in the destination buffer. */ 00533 *pOut++ = (q7_t) (__SSAT(sum >> 7, 8)); 00534 00535 /* Increment the pointer pIn1 index, count by 1 */ 00536 count++; 00537 00538 /* Update the inputA and inputB pointers for next MAC calculation */ 00539 px = pIn1 + count; 00540 py = pSrc2; 00541 00542 /* Decrement the loop counter */ 00543 blkCnt--; 00544 } 00545 } 00546 else 00547 { 00548 /* If the srcBLen is not a multiple of 4, 00549 * the blockSize2 loop cannot be unrolled by 4 */ 00550 blkCnt = (uint32_t) blockSize2; 00551 00552 while(blkCnt > 0u) 00553 { 00554 /* Accumulator is made zero for every iteration */ 00555 sum = 0; 00556 00557 /* srcBLen number of MACS should be performed */ 00558 k = srcBLen; 00559 00560 while(k > 0u) 00561 { 00562 /* Perform the multiply-accumulate */ 00563 sum += ((q31_t) * px++ * *py--); 00564 00565 /* Decrement the loop counter */ 00566 k--; 00567 } 00568 00569 /* Store the result in the accumulator in the destination buffer. */ 00570 *pOut++ = (q7_t) (__SSAT(sum >> 7, 8)); 00571 00572 /* Increment the MAC count */ 00573 count++; 00574 00575 /* Update the inputA and inputB pointers for next MAC calculation */ 00576 px = pIn1 + count; 00577 py = pSrc2; 00578 00579 /* Decrement the loop counter */ 00580 blkCnt--; 00581 } 00582 } 00583 00584 00585 /* -------------------------- 00586 * Initializations of stage3 00587 * -------------------------*/ 00588 00589 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00590 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00591 * .... 00592 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00593 * sum += x[srcALen-1] * y[srcBLen-1] 00594 */ 00595 00596 /* In this stage the MAC operations are decreased by 1 for every iteration. 00597 The count variable holds the number of MAC operations performed */ 00598 count = srcBLen - 1u; 00599 00600 /* Working pointer of inputA */ 00601 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00602 px = pSrc1; 00603 00604 /* Working pointer of inputB */ 00605 pSrc2 = pIn2 + (srcBLen - 1u); 00606 py = pSrc2; 00607 00608 /* ------------------- 00609 * Stage3 process 00610 * ------------------*/ 00611 00612 while(blockSize3 > 0) 00613 { 00614 /* Accumulator is made zero for every iteration */ 00615 sum = 0; 00616 00617 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00618 k = count >> 2u; 00619 00620 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00621 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00622 while(k > 0u) 00623 { 00624 /* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */ 00625 in1 = (q15_t) * px++; 00626 in2 = (q15_t) * px++; 00627 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00628 00629 /* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */ 00630 in1 = (q15_t) * py--; 00631 in2 = (q15_t) * py--; 00632 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00633 00634 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ 00635 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ 00636 sum = __SMLAD(input1, input2, sum); 00637 00638 /* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */ 00639 in1 = (q15_t) * px++; 00640 in2 = (q15_t) * px++; 00641 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00642 00643 /* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */ 00644 in1 = (q15_t) * py--; 00645 in2 = (q15_t) * py--; 00646 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00647 00648 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ 00649 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ 00650 sum = __SMLAD(input1, input2, sum); 00651 00652 /* Decrement the loop counter */ 00653 k--; 00654 } 00655 00656 /* If the count is not a multiple of 4, compute any remaining MACs here. 00657 ** No loop unrolling is used. */ 00658 k = count % 0x4u; 00659 00660 while(k > 0u) 00661 { 00662 /* Perform the multiply-accumulates */ 00663 /* sum += x[srcALen-1] * y[srcBLen-1] */ 00664 sum += ((q31_t) * px++ * *py--); 00665 00666 /* Decrement the loop counter */ 00667 k--; 00668 } 00669 00670 /* Store the result in the accumulator in the destination buffer. */ 00671 *pOut++ = (q7_t) (__SSAT(sum >> 7, 8)); 00672 00673 /* Update the inputA and inputB pointers for next MAC calculation */ 00674 px = ++pSrc1; 00675 py = pSrc2; 00676 00677 /* Decrement the MAC count */ 00678 count--; 00679 00680 /* Decrement the loop counter */ 00681 blockSize3--; 00682 00683 } 00684 00685 /* set status as ARM_MATH_SUCCESS */ 00686 status = ARM_MATH_SUCCESS; 00687 } 00688 00689 /* Return to application */ 00690 return (status); 00691 00692 #else 00693 00694 /* Run the below code for Cortex-M0 */ 00695 00696 q7_t *pIn1 = pSrcA; /* inputA pointer */ 00697 q7_t *pIn2 = pSrcB; /* inputB pointer */ 00698 q31_t sum; /* Accumulator */ 00699 uint32_t i, j; /* loop counters */ 00700 arm_status status; /* status of Partial convolution */ 00701 00702 /* Check for range of output samples to be calculated */ 00703 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00704 { 00705 /* Set status as ARM_ARGUMENT_ERROR */ 00706 status = ARM_MATH_ARGUMENT_ERROR; 00707 } 00708 else 00709 { 00710 /* Loop to calculate convolution for output length number of values */ 00711 for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++) 00712 { 00713 /* Initialize sum with zero to carry on MAC operations */ 00714 sum = 0; 00715 00716 /* Loop to perform MAC operations according to convolution equation */ 00717 for (j = 0; j <= i; j++) 00718 { 00719 /* Check the array limitations */ 00720 if(((i - j) < srcBLen) && (j < srcALen)) 00721 { 00722 /* z[i] += x[i-j] * y[j] */ 00723 sum += ((q15_t) pIn1[j] * (pIn2[i - j])); 00724 } 00725 } 00726 00727 /* Store the output in the destination buffer */ 00728 pDst[i] = (q7_t) __SSAT((sum >> 7u), 8u); 00729 } 00730 /* set status as ARM_SUCCESS as there are no argument errors */ 00731 status = ARM_MATH_SUCCESS; 00732 } 00733 return (status); 00734 00735 #endif /* #ifndef ARM_MATH_CM0_FAMILY */ 00736 00737 } 00738 00739 /** 00740 * @} end of PartialConv group 00741 */
Generated on Tue Jul 12 2022 11:59:16 by 1.7.2