CMSIS DSP library
Dependents: KL25Z_FFT_Demo Hat_Board_v5_1 KL25Z_FFT_Demo_tony KL25Z_FFT_Demo_tony ... more
Fork of mbed-dsp by
arm_conv_q7.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 17. January 2013 00005 * $Revision: V1.4.1 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_q7.c 00009 * 00010 * Description: Convolution of Q7 sequences. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupFilters 00045 */ 00046 00047 /** 00048 * @addtogroup Conv 00049 * @{ 00050 */ 00051 00052 /** 00053 * @brief Convolution of Q7 sequences. 00054 * @param[in] *pSrcA points to the first input sequence. 00055 * @param[in] srcALen length of the first input sequence. 00056 * @param[in] *pSrcB points to the second input sequence. 00057 * @param[in] srcBLen length of the second input sequence. 00058 * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. 00059 * @return none. 00060 * 00061 * @details 00062 * <b>Scaling and Overflow Behavior:</b> 00063 * 00064 * \par 00065 * The function is implemented using a 32-bit internal accumulator. 00066 * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result. 00067 * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format. 00068 * This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>. 00069 * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format. 00070 * 00071 * \par 00072 * Refer the function <code>arm_conv_opt_q7()</code> for a faster implementation of this function. 00073 * 00074 */ 00075 00076 void arm_conv_q7( 00077 q7_t * pSrcA, 00078 uint32_t srcALen, 00079 q7_t * pSrcB, 00080 uint32_t srcBLen, 00081 q7_t * pDst) 00082 { 00083 00084 00085 #ifndef ARM_MATH_CM0_FAMILY 00086 00087 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00088 00089 q7_t *pIn1; /* inputA pointer */ 00090 q7_t *pIn2; /* inputB pointer */ 00091 q7_t *pOut = pDst; /* output pointer */ 00092 q7_t *px; /* Intermediate inputA pointer */ 00093 q7_t *py; /* Intermediate inputB pointer */ 00094 q7_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00095 q7_t x0, x1, x2, x3, c0, c1; /* Temporary variables to hold state and coefficient values */ 00096 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00097 q31_t input1, input2; /* Temporary input variables */ 00098 q15_t in1, in2; /* Temporary input variables */ 00099 uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counter */ 00100 00101 /* The algorithm implementation is based on the lengths of the inputs. */ 00102 /* srcB is always made to slide across srcA. */ 00103 /* So srcBLen is always considered as shorter or equal to srcALen */ 00104 if(srcALen >= srcBLen) 00105 { 00106 /* Initialization of inputA pointer */ 00107 pIn1 = pSrcA; 00108 00109 /* Initialization of inputB pointer */ 00110 pIn2 = pSrcB; 00111 } 00112 else 00113 { 00114 /* Initialization of inputA pointer */ 00115 pIn1 = pSrcB; 00116 00117 /* Initialization of inputB pointer */ 00118 pIn2 = pSrcA; 00119 00120 /* srcBLen is always considered as shorter or equal to srcALen */ 00121 j = srcBLen; 00122 srcBLen = srcALen; 00123 srcALen = j; 00124 } 00125 00126 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00127 /* The function is internally 00128 * divided into three stages according to the number of multiplications that has to be 00129 * taken place between inputA samples and inputB samples. In the first stage of the 00130 * algorithm, the multiplications increase by one for every iteration. 00131 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00132 * In the third stage of the algorithm, the multiplications decrease by one 00133 * for every iteration. */ 00134 00135 /* The algorithm is implemented in three stages. 00136 The loop counters of each stage is initiated here. */ 00137 blockSize1 = srcBLen - 1u; 00138 blockSize2 = (srcALen - srcBLen) + 1u; 00139 blockSize3 = blockSize1; 00140 00141 /* -------------------------- 00142 * Initializations of stage1 00143 * -------------------------*/ 00144 00145 /* sum = x[0] * y[0] 00146 * sum = x[0] * y[1] + x[1] * y[0] 00147 * .... 00148 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00149 */ 00150 00151 /* In this stage the MAC operations are increased by 1 for every iteration. 00152 The count variable holds the number of MAC operations performed */ 00153 count = 1u; 00154 00155 /* Working pointer of inputA */ 00156 px = pIn1; 00157 00158 /* Working pointer of inputB */ 00159 py = pIn2; 00160 00161 00162 /* ------------------------ 00163 * Stage1 process 00164 * ----------------------*/ 00165 00166 /* The first stage starts here */ 00167 while(blockSize1 > 0u) 00168 { 00169 /* Accumulator is made zero for every iteration */ 00170 sum = 0; 00171 00172 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00173 k = count >> 2u; 00174 00175 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00176 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00177 while(k > 0u) 00178 { 00179 /* x[0] , x[1] */ 00180 in1 = (q15_t) * px++; 00181 in2 = (q15_t) * px++; 00182 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00183 00184 /* y[srcBLen - 1] , y[srcBLen - 2] */ 00185 in1 = (q15_t) * py--; 00186 in2 = (q15_t) * py--; 00187 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00188 00189 /* x[0] * y[srcBLen - 1] */ 00190 /* x[1] * y[srcBLen - 2] */ 00191 sum = __SMLAD(input1, input2, sum); 00192 00193 /* x[2] , x[3] */ 00194 in1 = (q15_t) * px++; 00195 in2 = (q15_t) * px++; 00196 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00197 00198 /* y[srcBLen - 3] , y[srcBLen - 4] */ 00199 in1 = (q15_t) * py--; 00200 in2 = (q15_t) * py--; 00201 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00202 00203 /* x[2] * y[srcBLen - 3] */ 00204 /* x[3] * y[srcBLen - 4] */ 00205 sum = __SMLAD(input1, input2, sum); 00206 00207 /* Decrement the loop counter */ 00208 k--; 00209 } 00210 00211 /* If the count is not a multiple of 4, compute any remaining MACs here. 00212 ** No loop unrolling is used. */ 00213 k = count % 0x4u; 00214 00215 while(k > 0u) 00216 { 00217 /* Perform the multiply-accumulates */ 00218 sum += ((q15_t) * px++ * *py--); 00219 00220 /* Decrement the loop counter */ 00221 k--; 00222 } 00223 00224 /* Store the result in the accumulator in the destination buffer. */ 00225 *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8)); 00226 00227 /* Update the inputA and inputB pointers for next MAC calculation */ 00228 py = pIn2 + count; 00229 px = pIn1; 00230 00231 /* Increment the MAC count */ 00232 count++; 00233 00234 /* Decrement the loop counter */ 00235 blockSize1--; 00236 } 00237 00238 /* -------------------------- 00239 * Initializations of stage2 00240 * ------------------------*/ 00241 00242 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00243 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00244 * .... 00245 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00246 */ 00247 00248 /* Working pointer of inputA */ 00249 px = pIn1; 00250 00251 /* Working pointer of inputB */ 00252 pSrc2 = pIn2 + (srcBLen - 1u); 00253 py = pSrc2; 00254 00255 /* count is index by which the pointer pIn1 to be incremented */ 00256 count = 0u; 00257 00258 /* ------------------- 00259 * Stage2 process 00260 * ------------------*/ 00261 00262 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00263 * So, to loop unroll over blockSize2, 00264 * srcBLen should be greater than or equal to 4 */ 00265 if(srcBLen >= 4u) 00266 { 00267 /* Loop unroll over blockSize2, by 4 */ 00268 blkCnt = blockSize2 >> 2u; 00269 00270 while(blkCnt > 0u) 00271 { 00272 /* Set all accumulators to zero */ 00273 acc0 = 0; 00274 acc1 = 0; 00275 acc2 = 0; 00276 acc3 = 0; 00277 00278 /* read x[0], x[1], x[2] samples */ 00279 x0 = *(px++); 00280 x1 = *(px++); 00281 x2 = *(px++); 00282 00283 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00284 k = srcBLen >> 2u; 00285 00286 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00287 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00288 do 00289 { 00290 /* Read y[srcBLen - 1] sample */ 00291 c0 = *(py--); 00292 /* Read y[srcBLen - 2] sample */ 00293 c1 = *(py--); 00294 00295 /* Read x[3] sample */ 00296 x3 = *(px++); 00297 00298 /* x[0] and x[1] are packed */ 00299 in1 = (q15_t) x0; 00300 in2 = (q15_t) x1; 00301 00302 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00303 00304 /* y[srcBLen - 1] and y[srcBLen - 2] are packed */ 00305 in1 = (q15_t) c0; 00306 in2 = (q15_t) c1; 00307 00308 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00309 00310 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ 00311 acc0 = __SMLAD(input1, input2, acc0); 00312 00313 /* x[1] and x[2] are packed */ 00314 in1 = (q15_t) x1; 00315 in2 = (q15_t) x2; 00316 00317 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00318 00319 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ 00320 acc1 = __SMLAD(input1, input2, acc1); 00321 00322 /* x[2] and x[3] are packed */ 00323 in1 = (q15_t) x2; 00324 in2 = (q15_t) x3; 00325 00326 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00327 00328 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ 00329 acc2 = __SMLAD(input1, input2, acc2); 00330 00331 /* Read x[4] sample */ 00332 x0 = *(px++); 00333 00334 /* x[3] and x[4] are packed */ 00335 in1 = (q15_t) x3; 00336 in2 = (q15_t) x0; 00337 00338 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00339 00340 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ 00341 acc3 = __SMLAD(input1, input2, acc3); 00342 00343 /* Read y[srcBLen - 3] sample */ 00344 c0 = *(py--); 00345 /* Read y[srcBLen - 4] sample */ 00346 c1 = *(py--); 00347 00348 /* Read x[5] sample */ 00349 x1 = *(px++); 00350 00351 /* x[2] and x[3] are packed */ 00352 in1 = (q15_t) x2; 00353 in2 = (q15_t) x3; 00354 00355 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00356 00357 /* y[srcBLen - 3] and y[srcBLen - 4] are packed */ 00358 in1 = (q15_t) c0; 00359 in2 = (q15_t) c1; 00360 00361 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00362 00363 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ 00364 acc0 = __SMLAD(input1, input2, acc0); 00365 00366 /* x[3] and x[4] are packed */ 00367 in1 = (q15_t) x3; 00368 in2 = (q15_t) x0; 00369 00370 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00371 00372 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ 00373 acc1 = __SMLAD(input1, input2, acc1); 00374 00375 /* x[4] and x[5] are packed */ 00376 in1 = (q15_t) x0; 00377 in2 = (q15_t) x1; 00378 00379 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00380 00381 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ 00382 acc2 = __SMLAD(input1, input2, acc2); 00383 00384 /* Read x[6] sample */ 00385 x2 = *(px++); 00386 00387 /* x[5] and x[6] are packed */ 00388 in1 = (q15_t) x1; 00389 in2 = (q15_t) x2; 00390 00391 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00392 00393 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ 00394 acc3 = __SMLAD(input1, input2, acc3); 00395 00396 } while(--k); 00397 00398 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00399 ** No loop unrolling is used. */ 00400 k = srcBLen % 0x4u; 00401 00402 while(k > 0u) 00403 { 00404 /* Read y[srcBLen - 5] sample */ 00405 c0 = *(py--); 00406 00407 /* Read x[7] sample */ 00408 x3 = *(px++); 00409 00410 /* Perform the multiply-accumulates */ 00411 /* acc0 += x[4] * y[srcBLen - 5] */ 00412 acc0 += ((q15_t) x0 * c0); 00413 /* acc1 += x[5] * y[srcBLen - 5] */ 00414 acc1 += ((q15_t) x1 * c0); 00415 /* acc2 += x[6] * y[srcBLen - 5] */ 00416 acc2 += ((q15_t) x2 * c0); 00417 /* acc3 += x[7] * y[srcBLen - 5] */ 00418 acc3 += ((q15_t) x3 * c0); 00419 00420 /* Reuse the present samples for the next MAC */ 00421 x0 = x1; 00422 x1 = x2; 00423 x2 = x3; 00424 00425 /* Decrement the loop counter */ 00426 k--; 00427 } 00428 00429 00430 /* Store the result in the accumulator in the destination buffer. */ 00431 *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8)); 00432 *pOut++ = (q7_t) (__SSAT(acc1 >> 7u, 8)); 00433 *pOut++ = (q7_t) (__SSAT(acc2 >> 7u, 8)); 00434 *pOut++ = (q7_t) (__SSAT(acc3 >> 7u, 8)); 00435 00436 /* Increment the pointer pIn1 index, count by 4 */ 00437 count += 4u; 00438 00439 /* Update the inputA and inputB pointers for next MAC calculation */ 00440 px = pIn1 + count; 00441 py = pSrc2; 00442 00443 /* Decrement the loop counter */ 00444 blkCnt--; 00445 } 00446 00447 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00448 ** No loop unrolling is used. */ 00449 blkCnt = blockSize2 % 0x4u; 00450 00451 while(blkCnt > 0u) 00452 { 00453 /* Accumulator is made zero for every iteration */ 00454 sum = 0; 00455 00456 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00457 k = srcBLen >> 2u; 00458 00459 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00460 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00461 while(k > 0u) 00462 { 00463 00464 /* Reading two inputs of SrcA buffer and packing */ 00465 in1 = (q15_t) * px++; 00466 in2 = (q15_t) * px++; 00467 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00468 00469 /* Reading two inputs of SrcB buffer and packing */ 00470 in1 = (q15_t) * py--; 00471 in2 = (q15_t) * py--; 00472 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00473 00474 /* Perform the multiply-accumulates */ 00475 sum = __SMLAD(input1, input2, sum); 00476 00477 /* Reading two inputs of SrcA buffer and packing */ 00478 in1 = (q15_t) * px++; 00479 in2 = (q15_t) * px++; 00480 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00481 00482 /* Reading two inputs of SrcB buffer and packing */ 00483 in1 = (q15_t) * py--; 00484 in2 = (q15_t) * py--; 00485 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00486 00487 /* Perform the multiply-accumulates */ 00488 sum = __SMLAD(input1, input2, sum); 00489 00490 /* Decrement the loop counter */ 00491 k--; 00492 } 00493 00494 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00495 ** No loop unrolling is used. */ 00496 k = srcBLen % 0x4u; 00497 00498 while(k > 0u) 00499 { 00500 /* Perform the multiply-accumulates */ 00501 sum += ((q15_t) * px++ * *py--); 00502 00503 /* Decrement the loop counter */ 00504 k--; 00505 } 00506 00507 /* Store the result in the accumulator in the destination buffer. */ 00508 *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8)); 00509 00510 /* Increment the pointer pIn1 index, count by 1 */ 00511 count++; 00512 00513 /* Update the inputA and inputB pointers for next MAC calculation */ 00514 px = pIn1 + count; 00515 py = pSrc2; 00516 00517 /* Decrement the loop counter */ 00518 blkCnt--; 00519 } 00520 } 00521 else 00522 { 00523 /* If the srcBLen is not a multiple of 4, 00524 * the blockSize2 loop cannot be unrolled by 4 */ 00525 blkCnt = blockSize2; 00526 00527 while(blkCnt > 0u) 00528 { 00529 /* Accumulator is made zero for every iteration */ 00530 sum = 0; 00531 00532 /* srcBLen number of MACS should be performed */ 00533 k = srcBLen; 00534 00535 while(k > 0u) 00536 { 00537 /* Perform the multiply-accumulate */ 00538 sum += ((q15_t) * px++ * *py--); 00539 00540 /* Decrement the loop counter */ 00541 k--; 00542 } 00543 00544 /* Store the result in the accumulator in the destination buffer. */ 00545 *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8)); 00546 00547 /* Increment the MAC count */ 00548 count++; 00549 00550 /* Update the inputA and inputB pointers for next MAC calculation */ 00551 px = pIn1 + count; 00552 py = pSrc2; 00553 00554 /* Decrement the loop counter */ 00555 blkCnt--; 00556 } 00557 } 00558 00559 00560 /* -------------------------- 00561 * Initializations of stage3 00562 * -------------------------*/ 00563 00564 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00565 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00566 * .... 00567 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00568 * sum += x[srcALen-1] * y[srcBLen-1] 00569 */ 00570 00571 /* In this stage the MAC operations are decreased by 1 for every iteration. 00572 The blockSize3 variable holds the number of MAC operations performed */ 00573 00574 /* Working pointer of inputA */ 00575 pSrc1 = pIn1 + (srcALen - (srcBLen - 1u)); 00576 px = pSrc1; 00577 00578 /* Working pointer of inputB */ 00579 pSrc2 = pIn2 + (srcBLen - 1u); 00580 py = pSrc2; 00581 00582 /* ------------------- 00583 * Stage3 process 00584 * ------------------*/ 00585 00586 while(blockSize3 > 0u) 00587 { 00588 /* Accumulator is made zero for every iteration */ 00589 sum = 0; 00590 00591 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00592 k = blockSize3 >> 2u; 00593 00594 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00595 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00596 while(k > 0u) 00597 { 00598 /* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */ 00599 in1 = (q15_t) * px++; 00600 in2 = (q15_t) * px++; 00601 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00602 00603 /* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */ 00604 in1 = (q15_t) * py--; 00605 in2 = (q15_t) * py--; 00606 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00607 00608 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ 00609 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ 00610 sum = __SMLAD(input1, input2, sum); 00611 00612 /* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */ 00613 in1 = (q15_t) * px++; 00614 in2 = (q15_t) * px++; 00615 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00616 00617 /* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */ 00618 in1 = (q15_t) * py--; 00619 in2 = (q15_t) * py--; 00620 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00621 00622 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ 00623 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ 00624 sum = __SMLAD(input1, input2, sum); 00625 00626 /* Decrement the loop counter */ 00627 k--; 00628 } 00629 00630 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. 00631 ** No loop unrolling is used. */ 00632 k = blockSize3 % 0x4u; 00633 00634 while(k > 0u) 00635 { 00636 /* Perform the multiply-accumulates */ 00637 sum += ((q15_t) * px++ * *py--); 00638 00639 /* Decrement the loop counter */ 00640 k--; 00641 } 00642 00643 /* Store the result in the accumulator in the destination buffer. */ 00644 *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8)); 00645 00646 /* Update the inputA and inputB pointers for next MAC calculation */ 00647 px = ++pSrc1; 00648 py = pSrc2; 00649 00650 /* Decrement the loop counter */ 00651 blockSize3--; 00652 } 00653 00654 #else 00655 00656 /* Run the below code for Cortex-M0 */ 00657 00658 q7_t *pIn1 = pSrcA; /* input pointer */ 00659 q7_t *pIn2 = pSrcB; /* coefficient pointer */ 00660 q31_t sum; /* Accumulator */ 00661 uint32_t i, j; /* loop counter */ 00662 00663 /* Loop to calculate output of convolution for output length number of times */ 00664 for (i = 0; i < (srcALen + srcBLen - 1); i++) 00665 { 00666 /* Initialize sum with zero to carry on MAC operations */ 00667 sum = 0; 00668 00669 /* Loop to perform MAC operations according to convolution equation */ 00670 for (j = 0; j <= i; j++) 00671 { 00672 /* Check the array limitations */ 00673 if(((i - j) < srcBLen) && (j < srcALen)) 00674 { 00675 /* z[i] += x[i-j] * y[j] */ 00676 sum += (q15_t) pIn1[j] * (pIn2[i - j]); 00677 } 00678 } 00679 00680 /* Store the output in the destination buffer */ 00681 pDst[i] = (q7_t) __SSAT((sum >> 7u), 8u); 00682 } 00683 00684 #endif /* #ifndef ARM_MATH_CM0_FAMILY */ 00685 00686 } 00687 00688 /** 00689 * @} end of Conv group 00690 */
Generated on Tue Jul 12 2022 12:36:54 by 1.7.2