Important changes to repositories hosted on mbed.com
Mbed-hosted Mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software, download the repository Zip archive or clone it locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
Fork of mbed-dsp by
arm_conv_q7.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 17. January 2013 00005 * $Revision: V1.4.1 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_q7.c 00009 * 00010 * Description: Convolution of Q7 sequences. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupFilters 00045 */ 00046 00047 /** 00048 * @addtogroup Conv 00049 * @{ 00050 */ 00051 00052 /** 00053 * @brief Convolution of Q7 sequences. 00054 * @param[in] *pSrcA points to the first input sequence. 00055 * @param[in] srcALen length of the first input sequence. 00056 * @param[in] *pSrcB points to the second input sequence. 00057 * @param[in] srcBLen length of the second input sequence. 00058 * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. 00059 * @return none. 00060 * 00061 * @details 00062 * <b>Scaling and Overflow Behavior:</b> 00063 * 00064 * \par 00065 * The function is implemented using a 32-bit internal accumulator. 00066 * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result. 00067 * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format. 00068 * This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>. 00069 * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format. 00070 * 00071 * \par 00072 * Refer the function <code>arm_conv_opt_q7()</code> for a faster implementation of this function. 
00073 * 00074 */ 00075 00076 void arm_conv_q7( 00077 q7_t * pSrcA, 00078 uint32_t srcALen, 00079 q7_t * pSrcB, 00080 uint32_t srcBLen, 00081 q7_t * pDst) 00082 { 00083 00084 00085 #ifndef ARM_MATH_CM0_FAMILY 00086 00087 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00088 00089 q7_t *pIn1; /* inputA pointer */ 00090 q7_t *pIn2; /* inputB pointer */ 00091 q7_t *pOut = pDst; /* output pointer */ 00092 q7_t *px; /* Intermediate inputA pointer */ 00093 q7_t *py; /* Intermediate inputB pointer */ 00094 q7_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00095 q7_t x0, x1, x2, x3, c0, c1; /* Temporary variables to hold state and coefficient values */ 00096 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00097 q31_t input1, input2; /* Temporary input variables */ 00098 q15_t in1, in2; /* Temporary input variables */ 00099 uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counter */ 00100 00101 /* The algorithm implementation is based on the lengths of the inputs. */ 00102 /* srcB is always made to slide across srcA. */ 00103 /* So srcBLen is always considered as shorter or equal to srcALen */ 00104 if(srcALen >= srcBLen) 00105 { 00106 /* Initialization of inputA pointer */ 00107 pIn1 = pSrcA; 00108 00109 /* Initialization of inputB pointer */ 00110 pIn2 = pSrcB; 00111 } 00112 else 00113 { 00114 /* Initialization of inputA pointer */ 00115 pIn1 = pSrcB; 00116 00117 /* Initialization of inputB pointer */ 00118 pIn2 = pSrcA; 00119 00120 /* srcBLen is always considered as shorter or equal to srcALen */ 00121 j = srcBLen; 00122 srcBLen = srcALen; 00123 srcALen = j; 00124 } 00125 00126 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00127 /* The function is internally 00128 * divided into three stages according to the number of multiplications that has to be 00129 * taken place between inputA samples and inputB samples. 
In the first stage of the 00130 * algorithm, the multiplications increase by one for every iteration. 00131 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00132 * In the third stage of the algorithm, the multiplications decrease by one 00133 * for every iteration. */ 00134 00135 /* The algorithm is implemented in three stages. 00136 The loop counters of each stage is initiated here. */ 00137 blockSize1 = srcBLen - 1u; 00138 blockSize2 = (srcALen - srcBLen) + 1u; 00139 blockSize3 = blockSize1; 00140 00141 /* -------------------------- 00142 * Initializations of stage1 00143 * -------------------------*/ 00144 00145 /* sum = x[0] * y[0] 00146 * sum = x[0] * y[1] + x[1] * y[0] 00147 * .... 00148 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00149 */ 00150 00151 /* In this stage the MAC operations are increased by 1 for every iteration. 00152 The count variable holds the number of MAC operations performed */ 00153 count = 1u; 00154 00155 /* Working pointer of inputA */ 00156 px = pIn1; 00157 00158 /* Working pointer of inputB */ 00159 py = pIn2; 00160 00161 00162 /* ------------------------ 00163 * Stage1 process 00164 * ----------------------*/ 00165 00166 /* The first stage starts here */ 00167 while(blockSize1 > 0u) 00168 { 00169 /* Accumulator is made zero for every iteration */ 00170 sum = 0; 00171 00172 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00173 k = count >> 2u; 00174 00175 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00176 ** a second loop below computes MACs for the remaining 1 to 3 samples. 
*/ 00177 while(k > 0u) 00178 { 00179 /* x[0] , x[1] */ 00180 in1 = (q15_t) * px++; 00181 in2 = (q15_t) * px++; 00182 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00183 00184 /* y[srcBLen - 1] , y[srcBLen - 2] */ 00185 in1 = (q15_t) * py--; 00186 in2 = (q15_t) * py--; 00187 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00188 00189 /* x[0] * y[srcBLen - 1] */ 00190 /* x[1] * y[srcBLen - 2] */ 00191 sum = __SMLAD(input1, input2, sum); 00192 00193 /* x[2] , x[3] */ 00194 in1 = (q15_t) * px++; 00195 in2 = (q15_t) * px++; 00196 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00197 00198 /* y[srcBLen - 3] , y[srcBLen - 4] */ 00199 in1 = (q15_t) * py--; 00200 in2 = (q15_t) * py--; 00201 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00202 00203 /* x[2] * y[srcBLen - 3] */ 00204 /* x[3] * y[srcBLen - 4] */ 00205 sum = __SMLAD(input1, input2, sum); 00206 00207 /* Decrement the loop counter */ 00208 k--; 00209 } 00210 00211 /* If the count is not a multiple of 4, compute any remaining MACs here. 00212 ** No loop unrolling is used. */ 00213 k = count % 0x4u; 00214 00215 while(k > 0u) 00216 { 00217 /* Perform the multiply-accumulates */ 00218 sum += ((q15_t) * px++ * *py--); 00219 00220 /* Decrement the loop counter */ 00221 k--; 00222 } 00223 00224 /* Store the result in the accumulator in the destination buffer. */ 00225 *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8)); 00226 00227 /* Update the inputA and inputB pointers for next MAC calculation */ 00228 py = pIn2 + count; 00229 px = pIn1; 00230 00231 /* Increment the MAC count */ 00232 count++; 00233 00234 /* Decrement the loop counter */ 00235 blockSize1--; 00236 } 00237 00238 /* -------------------------- 00239 * Initializations of stage2 00240 * ------------------------*/ 00241 00242 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00243 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00244 * .... 
00245 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00246 */ 00247 00248 /* Working pointer of inputA */ 00249 px = pIn1; 00250 00251 /* Working pointer of inputB */ 00252 pSrc2 = pIn2 + (srcBLen - 1u); 00253 py = pSrc2; 00254 00255 /* count is index by which the pointer pIn1 to be incremented */ 00256 count = 0u; 00257 00258 /* ------------------- 00259 * Stage2 process 00260 * ------------------*/ 00261 00262 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00263 * So, to loop unroll over blockSize2, 00264 * srcBLen should be greater than or equal to 4 */ 00265 if(srcBLen >= 4u) 00266 { 00267 /* Loop unroll over blockSize2, by 4 */ 00268 blkCnt = blockSize2 >> 2u; 00269 00270 while(blkCnt > 0u) 00271 { 00272 /* Set all accumulators to zero */ 00273 acc0 = 0; 00274 acc1 = 0; 00275 acc2 = 0; 00276 acc3 = 0; 00277 00278 /* read x[0], x[1], x[2] samples */ 00279 x0 = *(px++); 00280 x1 = *(px++); 00281 x2 = *(px++); 00282 00283 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00284 k = srcBLen >> 2u; 00285 00286 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00287 ** a second loop below computes MACs for the remaining 1 to 3 samples. 
*/ 00288 do 00289 { 00290 /* Read y[srcBLen - 1] sample */ 00291 c0 = *(py--); 00292 /* Read y[srcBLen - 2] sample */ 00293 c1 = *(py--); 00294 00295 /* Read x[3] sample */ 00296 x3 = *(px++); 00297 00298 /* x[0] and x[1] are packed */ 00299 in1 = (q15_t) x0; 00300 in2 = (q15_t) x1; 00301 00302 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00303 00304 /* y[srcBLen - 1] and y[srcBLen - 2] are packed */ 00305 in1 = (q15_t) c0; 00306 in2 = (q15_t) c1; 00307 00308 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00309 00310 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ 00311 acc0 = __SMLAD(input1, input2, acc0); 00312 00313 /* x[1] and x[2] are packed */ 00314 in1 = (q15_t) x1; 00315 in2 = (q15_t) x2; 00316 00317 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00318 00319 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ 00320 acc1 = __SMLAD(input1, input2, acc1); 00321 00322 /* x[2] and x[3] are packed */ 00323 in1 = (q15_t) x2; 00324 in2 = (q15_t) x3; 00325 00326 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00327 00328 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ 00329 acc2 = __SMLAD(input1, input2, acc2); 00330 00331 /* Read x[4] sample */ 00332 x0 = *(px++); 00333 00334 /* x[3] and x[4] are packed */ 00335 in1 = (q15_t) x3; 00336 in2 = (q15_t) x0; 00337 00338 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00339 00340 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ 00341 acc3 = __SMLAD(input1, input2, acc3); 00342 00343 /* Read y[srcBLen - 3] sample */ 00344 c0 = *(py--); 00345 /* Read y[srcBLen - 4] sample */ 00346 c1 = *(py--); 00347 00348 /* Read x[5] sample */ 00349 x1 = *(px++); 00350 00351 /* x[2] and x[3] are packed */ 00352 in1 = (q15_t) x2; 00353 in2 = (q15_t) x3; 00354 00355 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00356 00357 /* y[srcBLen - 3] and y[srcBLen - 4] are packed */ 00358 in1 = (q15_t) c0; 00359 in2 = (q15_t) c1; 
00360 00361 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00362 00363 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ 00364 acc0 = __SMLAD(input1, input2, acc0); 00365 00366 /* x[3] and x[4] are packed */ 00367 in1 = (q15_t) x3; 00368 in2 = (q15_t) x0; 00369 00370 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00371 00372 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ 00373 acc1 = __SMLAD(input1, input2, acc1); 00374 00375 /* x[4] and x[5] are packed */ 00376 in1 = (q15_t) x0; 00377 in2 = (q15_t) x1; 00378 00379 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00380 00381 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ 00382 acc2 = __SMLAD(input1, input2, acc2); 00383 00384 /* Read x[6] sample */ 00385 x2 = *(px++); 00386 00387 /* x[5] and x[6] are packed */ 00388 in1 = (q15_t) x1; 00389 in2 = (q15_t) x2; 00390 00391 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00392 00393 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ 00394 acc3 = __SMLAD(input1, input2, acc3); 00395 00396 } while(--k); 00397 00398 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00399 ** No loop unrolling is used. 
*/ 00400 k = srcBLen % 0x4u; 00401 00402 while(k > 0u) 00403 { 00404 /* Read y[srcBLen - 5] sample */ 00405 c0 = *(py--); 00406 00407 /* Read x[7] sample */ 00408 x3 = *(px++); 00409 00410 /* Perform the multiply-accumulates */ 00411 /* acc0 += x[4] * y[srcBLen - 5] */ 00412 acc0 += ((q15_t) x0 * c0); 00413 /* acc1 += x[5] * y[srcBLen - 5] */ 00414 acc1 += ((q15_t) x1 * c0); 00415 /* acc2 += x[6] * y[srcBLen - 5] */ 00416 acc2 += ((q15_t) x2 * c0); 00417 /* acc3 += x[7] * y[srcBLen - 5] */ 00418 acc3 += ((q15_t) x3 * c0); 00419 00420 /* Reuse the present samples for the next MAC */ 00421 x0 = x1; 00422 x1 = x2; 00423 x2 = x3; 00424 00425 /* Decrement the loop counter */ 00426 k--; 00427 } 00428 00429 00430 /* Store the result in the accumulator in the destination buffer. */ 00431 *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8)); 00432 *pOut++ = (q7_t) (__SSAT(acc1 >> 7u, 8)); 00433 *pOut++ = (q7_t) (__SSAT(acc2 >> 7u, 8)); 00434 *pOut++ = (q7_t) (__SSAT(acc3 >> 7u, 8)); 00435 00436 /* Increment the pointer pIn1 index, count by 4 */ 00437 count += 4u; 00438 00439 /* Update the inputA and inputB pointers for next MAC calculation */ 00440 px = pIn1 + count; 00441 py = pSrc2; 00442 00443 /* Decrement the loop counter */ 00444 blkCnt--; 00445 } 00446 00447 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00448 ** No loop unrolling is used. */ 00449 blkCnt = blockSize2 % 0x4u; 00450 00451 while(blkCnt > 0u) 00452 { 00453 /* Accumulator is made zero for every iteration */ 00454 sum = 0; 00455 00456 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00457 k = srcBLen >> 2u; 00458 00459 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00460 ** a second loop below computes MACs for the remaining 1 to 3 samples. 
*/ 00461 while(k > 0u) 00462 { 00463 00464 /* Reading two inputs of SrcA buffer and packing */ 00465 in1 = (q15_t) * px++; 00466 in2 = (q15_t) * px++; 00467 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00468 00469 /* Reading two inputs of SrcB buffer and packing */ 00470 in1 = (q15_t) * py--; 00471 in2 = (q15_t) * py--; 00472 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00473 00474 /* Perform the multiply-accumulates */ 00475 sum = __SMLAD(input1, input2, sum); 00476 00477 /* Reading two inputs of SrcA buffer and packing */ 00478 in1 = (q15_t) * px++; 00479 in2 = (q15_t) * px++; 00480 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00481 00482 /* Reading two inputs of SrcB buffer and packing */ 00483 in1 = (q15_t) * py--; 00484 in2 = (q15_t) * py--; 00485 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00486 00487 /* Perform the multiply-accumulates */ 00488 sum = __SMLAD(input1, input2, sum); 00489 00490 /* Decrement the loop counter */ 00491 k--; 00492 } 00493 00494 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00495 ** No loop unrolling is used. */ 00496 k = srcBLen % 0x4u; 00497 00498 while(k > 0u) 00499 { 00500 /* Perform the multiply-accumulates */ 00501 sum += ((q15_t) * px++ * *py--); 00502 00503 /* Decrement the loop counter */ 00504 k--; 00505 } 00506 00507 /* Store the result in the accumulator in the destination buffer. 
*/ 00508 *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8)); 00509 00510 /* Increment the pointer pIn1 index, count by 1 */ 00511 count++; 00512 00513 /* Update the inputA and inputB pointers for next MAC calculation */ 00514 px = pIn1 + count; 00515 py = pSrc2; 00516 00517 /* Decrement the loop counter */ 00518 blkCnt--; 00519 } 00520 } 00521 else 00522 { 00523 /* If the srcBLen is not a multiple of 4, 00524 * the blockSize2 loop cannot be unrolled by 4 */ 00525 blkCnt = blockSize2; 00526 00527 while(blkCnt > 0u) 00528 { 00529 /* Accumulator is made zero for every iteration */ 00530 sum = 0; 00531 00532 /* srcBLen number of MACS should be performed */ 00533 k = srcBLen; 00534 00535 while(k > 0u) 00536 { 00537 /* Perform the multiply-accumulate */ 00538 sum += ((q15_t) * px++ * *py--); 00539 00540 /* Decrement the loop counter */ 00541 k--; 00542 } 00543 00544 /* Store the result in the accumulator in the destination buffer. */ 00545 *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8)); 00546 00547 /* Increment the MAC count */ 00548 count++; 00549 00550 /* Update the inputA and inputB pointers for next MAC calculation */ 00551 px = pIn1 + count; 00552 py = pSrc2; 00553 00554 /* Decrement the loop counter */ 00555 blkCnt--; 00556 } 00557 } 00558 00559 00560 /* -------------------------- 00561 * Initializations of stage3 00562 * -------------------------*/ 00563 00564 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00565 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00566 * .... 00567 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00568 * sum += x[srcALen-1] * y[srcBLen-1] 00569 */ 00570 00571 /* In this stage the MAC operations are decreased by 1 for every iteration. 
00572 The blockSize3 variable holds the number of MAC operations performed */ 00573 00574 /* Working pointer of inputA */ 00575 pSrc1 = pIn1 + (srcALen - (srcBLen - 1u)); 00576 px = pSrc1; 00577 00578 /* Working pointer of inputB */ 00579 pSrc2 = pIn2 + (srcBLen - 1u); 00580 py = pSrc2; 00581 00582 /* ------------------- 00583 * Stage3 process 00584 * ------------------*/ 00585 00586 while(blockSize3 > 0u) 00587 { 00588 /* Accumulator is made zero for every iteration */ 00589 sum = 0; 00590 00591 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00592 k = blockSize3 >> 2u; 00593 00594 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00595 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00596 while(k > 0u) 00597 { 00598 /* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */ 00599 in1 = (q15_t) * px++; 00600 in2 = (q15_t) * px++; 00601 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00602 00603 /* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */ 00604 in1 = (q15_t) * py--; 00605 in2 = (q15_t) * py--; 00606 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00607 00608 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ 00609 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ 00610 sum = __SMLAD(input1, input2, sum); 00611 00612 /* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */ 00613 in1 = (q15_t) * px++; 00614 in2 = (q15_t) * px++; 00615 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00616 00617 /* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */ 00618 in1 = (q15_t) * py--; 00619 in2 = (q15_t) * py--; 00620 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); 00621 00622 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ 00623 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen 
- 4] */ 00624 sum = __SMLAD(input1, input2, sum); 00625 00626 /* Decrement the loop counter */ 00627 k--; 00628 } 00629 00630 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. 00631 ** No loop unrolling is used. */ 00632 k = blockSize3 % 0x4u; 00633 00634 while(k > 0u) 00635 { 00636 /* Perform the multiply-accumulates */ 00637 sum += ((q15_t) * px++ * *py--); 00638 00639 /* Decrement the loop counter */ 00640 k--; 00641 } 00642 00643 /* Store the result in the accumulator in the destination buffer. */ 00644 *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8)); 00645 00646 /* Update the inputA and inputB pointers for next MAC calculation */ 00647 px = ++pSrc1; 00648 py = pSrc2; 00649 00650 /* Decrement the loop counter */ 00651 blockSize3--; 00652 } 00653 00654 #else 00655 00656 /* Run the below code for Cortex-M0 */ 00657 00658 q7_t *pIn1 = pSrcA; /* input pointer */ 00659 q7_t *pIn2 = pSrcB; /* coefficient pointer */ 00660 q31_t sum; /* Accumulator */ 00661 uint32_t i, j; /* loop counter */ 00662 00663 /* Loop to calculate output of convolution for output length number of times */ 00664 for (i = 0; i < (srcALen + srcBLen - 1); i++) 00665 { 00666 /* Initialize sum with zero to carry on MAC operations */ 00667 sum = 0; 00668 00669 /* Loop to perform MAC operations according to convolution equation */ 00670 for (j = 0; j <= i; j++) 00671 { 00672 /* Check the array limitations */ 00673 if(((i - j) < srcBLen) && (j < srcALen)) 00674 { 00675 /* z[i] += x[i-j] * y[j] */ 00676 sum += (q15_t) pIn1[j] * (pIn2[i - j]); 00677 } 00678 } 00679 00680 /* Store the output in the destination buffer */ 00681 pDst[i] = (q7_t) __SSAT((sum >> 7u), 8u); 00682 } 00683 00684 #endif /* #ifndef ARM_MATH_CM0_FAMILY */ 00685 00686 } 00687 00688 /** 00689 * @} end of Conv group 00690 */
Generated on Tue Jul 12 2022 18:44:08 by
 1.7.2
 1.7.2 
    