Important changes to repositories hosted on mbed.com
Mbed-hosted Mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software, download the repository Zip archive or clone it locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
arm_conv_opt_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Project: CMSIS DSP Library 00003 * Title: arm_conv_opt_q15.c 00004 * Description: Convolution of Q15 sequences 00005 * 00006 * $Date: 27. January 2017 00007 * $Revision: V.1.5.1 00008 * 00009 * Target Processor: Cortex-M cores 00010 * -------------------------------------------------------------------- */ 00011 /* 00012 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. 00013 * 00014 * SPDX-License-Identifier: Apache-2.0 00015 * 00016 * Licensed under the Apache License, Version 2.0 (the License); you may 00017 * not use this file except in compliance with the License. 00018 * You may obtain a copy of the License at 00019 * 00020 * www.apache.org/licenses/LICENSE-2.0 00021 * 00022 * Unless required by applicable law or agreed to in writing, software 00023 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 00024 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00025 * See the License for the specific language governing permissions and 00026 * limitations under the License. 00027 */ 00028 00029 #include "arm_math.h" 00030 00031 /** 00032 * @ingroup groupFilters 00033 */ 00034 00035 /** 00036 * @addtogroup Conv 00037 * @{ 00038 */ 00039 00040 /** 00041 * @brief Convolution of Q15 sequences. 00042 * @param[in] *pSrcA points to the first input sequence. 00043 * @param[in] srcALen length of the first input sequence. 00044 * @param[in] *pSrcB points to the second input sequence. 00045 * @param[in] srcBLen length of the second input sequence. 00046 * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. 00047 * @param[in] *pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. 00048 * @param[in] *pScratch2 points to scratch buffer of size min(srcALen, srcBLen). 00049 * @return none. 
00050 * 00051 * \par Restrictions 00052 * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE 00053 * In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit 00054 * 00055 * 00056 * @details 00057 * <b>Scaling and Overflow Behavior:</b> 00058 * 00059 * \par 00060 * The function is implemented using a 64-bit internal accumulator. 00061 * Both inputs are in 1.15 format and multiplications yield a 2.30 result. 00062 * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format. 00063 * This approach provides 33 guard bits and there is no risk of overflow. 00064 * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format. 00065 * 00066 * 00067 * \par 00068 * Refer to <code>arm_conv_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4. 00069 * 00070 * 00071 */ 00072 00073 void arm_conv_opt_q15( 00074 q15_t * pSrcA, 00075 uint32_t srcALen, 00076 q15_t * pSrcB, 00077 uint32_t srcBLen, 00078 q15_t * pDst, 00079 q15_t * pScratch1, 00080 q15_t * pScratch2) 00081 { 00082 q63_t acc0, acc1, acc2, acc3; /* Accumulator */ 00083 q31_t x1, x2, x3; /* Temporary variables to hold state and coefficient values */ 00084 q31_t y1, y2; /* State variables */ 00085 q15_t *pOut = pDst; /* output pointer */ 00086 q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */ 00087 q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */ 00088 q15_t *pIn1; /* inputA pointer */ 00089 q15_t *pIn2; /* inputB pointer */ 00090 q15_t *px; /* Intermediate inputA pointer */ 00091 q15_t *py; /* Intermediate inputB pointer */ 00092 uint32_t j, k, blkCnt; /* loop counter */ 00093 uint32_t tapCnt; /* loop count */ 00094 #ifdef UNALIGNED_SUPPORT_DISABLE 00095 00096 q15_t a, b; 00097 00098 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00099 00100 /* The algorithm implementation is based on the 
lengths of the inputs. */ 00101 /* srcB is always made to slide across srcA. */ 00102 /* So srcBLen is always considered as shorter or equal to srcALen */ 00103 if (srcALen >= srcBLen) 00104 { 00105 /* Initialization of inputA pointer */ 00106 pIn1 = pSrcA; 00107 00108 /* Initialization of inputB pointer */ 00109 pIn2 = pSrcB; 00110 00111 } 00112 else 00113 { 00114 /* Initialization of inputA pointer */ 00115 pIn1 = pSrcB; 00116 00117 /* Initialization of inputB pointer */ 00118 pIn2 = pSrcA; 00119 00120 /* srcBLen is always considered as shorter or equal to srcALen */ 00121 j = srcBLen; 00122 srcBLen = srcALen; 00123 srcALen = j; 00124 } 00125 00126 /* pointer to take end of scratch2 buffer */ 00127 pScr2 = pScratch2 + srcBLen - 1; 00128 00129 /* points to smaller length sequence */ 00130 px = pIn2; 00131 00132 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00133 k = srcBLen >> 2U; 00134 00135 /* First part of the processing with loop unrolling copies 4 data points at a time. 00136 ** a second loop below copies for the remaining 1 to 3 samples. */ 00137 /* Copy smaller length input sequence in reverse order into second scratch buffer */ 00138 while (k > 0U) 00139 { 00140 /* copy second buffer in reversal manner */ 00141 *pScr2-- = *px++; 00142 *pScr2-- = *px++; 00143 *pScr2-- = *px++; 00144 *pScr2-- = *px++; 00145 00146 /* Decrement the loop counter */ 00147 k--; 00148 } 00149 00150 /* If the count is not a multiple of 4, copy remaining samples here. 00151 ** No loop unrolling is used. 
*/ 00152 k = srcBLen % 0x4U; 00153 00154 while (k > 0U) 00155 { 00156 /* copy second buffer in reversal manner for remaining samples */ 00157 *pScr2-- = *px++; 00158 00159 /* Decrement the loop counter */ 00160 k--; 00161 } 00162 00163 /* Initialze temporary scratch pointer */ 00164 pScr1 = pScratch1; 00165 00166 /* Assuming scratch1 buffer is aligned by 32-bit */ 00167 /* Fill (srcBLen - 1U) zeros in scratch buffer */ 00168 arm_fill_q15(0, pScr1, (srcBLen - 1U)); 00169 00170 /* Update temporary scratch pointer */ 00171 pScr1 += (srcBLen - 1U); 00172 00173 /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */ 00174 00175 #ifndef UNALIGNED_SUPPORT_DISABLE 00176 00177 /* Copy (srcALen) samples in scratch buffer */ 00178 arm_copy_q15(pIn1, pScr1, srcALen); 00179 00180 /* Update pointers */ 00181 pScr1 += srcALen; 00182 00183 #else 00184 00185 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00186 k = srcALen >> 2U; 00187 00188 /* First part of the processing with loop unrolling copies 4 data points at a time. 00189 ** a second loop below copies for the remaining 1 to 3 samples. */ 00190 while (k > 0U) 00191 { 00192 /* copy second buffer in reversal manner */ 00193 *pScr1++ = *pIn1++; 00194 *pScr1++ = *pIn1++; 00195 *pScr1++ = *pIn1++; 00196 *pScr1++ = *pIn1++; 00197 00198 /* Decrement the loop counter */ 00199 k--; 00200 } 00201 00202 /* If the count is not a multiple of 4, copy remaining samples here. 00203 ** No loop unrolling is used. 
*/ 00204 k = srcALen % 0x4U; 00205 00206 while (k > 0U) 00207 { 00208 /* copy second buffer in reversal manner for remaining samples */ 00209 *pScr1++ = *pIn1++; 00210 00211 /* Decrement the loop counter */ 00212 k--; 00213 } 00214 00215 #endif 00216 00217 00218 #ifndef UNALIGNED_SUPPORT_DISABLE 00219 00220 /* Fill (srcBLen - 1U) zeros at end of scratch buffer */ 00221 arm_fill_q15(0, pScr1, (srcBLen - 1U)); 00222 00223 /* Update pointer */ 00224 pScr1 += (srcBLen - 1U); 00225 00226 #else 00227 00228 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00229 k = (srcBLen - 1U) >> 2U; 00230 00231 /* First part of the processing with loop unrolling copies 4 data points at a time. 00232 ** a second loop below copies for the remaining 1 to 3 samples. */ 00233 while (k > 0U) 00234 { 00235 /* copy second buffer in reversal manner */ 00236 *pScr1++ = 0; 00237 *pScr1++ = 0; 00238 *pScr1++ = 0; 00239 *pScr1++ = 0; 00240 00241 /* Decrement the loop counter */ 00242 k--; 00243 } 00244 00245 /* If the count is not a multiple of 4, copy remaining samples here. 00246 ** No loop unrolling is used. */ 00247 k = (srcBLen - 1U) % 0x4U; 00248 00249 while (k > 0U) 00250 { 00251 /* copy second buffer in reversal manner for remaining samples */ 00252 *pScr1++ = 0; 00253 00254 /* Decrement the loop counter */ 00255 k--; 00256 } 00257 00258 #endif 00259 00260 /* Temporary pointer for scratch2 */ 00261 py = pScratch2; 00262 00263 00264 /* Initialization of pIn2 pointer */ 00265 pIn2 = py; 00266 00267 /* First part of the processing with loop unrolling process 4 data points at a time. 00268 ** a second loop below process for the remaining 1 to 3 samples. 
*/ 00269 00270 /* Actual convolution process starts here */ 00271 blkCnt = (srcALen + srcBLen - 1U) >> 2; 00272 00273 while (blkCnt > 0) 00274 { 00275 /* Initialze temporary scratch pointer as scratch1 */ 00276 pScr1 = pScratch1; 00277 00278 /* Clear Accumlators */ 00279 acc0 = 0; 00280 acc1 = 0; 00281 acc2 = 0; 00282 acc3 = 0; 00283 00284 /* Read two samples from scratch1 buffer */ 00285 x1 = *__SIMD32(pScr1)++; 00286 00287 /* Read next two samples from scratch1 buffer */ 00288 x2 = *__SIMD32(pScr1)++; 00289 00290 tapCnt = (srcBLen) >> 2U; 00291 00292 while (tapCnt > 0U) 00293 { 00294 00295 #ifndef UNALIGNED_SUPPORT_DISABLE 00296 00297 /* Read four samples from smaller buffer */ 00298 y1 = _SIMD32_OFFSET(pIn2); 00299 y2 = _SIMD32_OFFSET(pIn2 + 2U); 00300 00301 /* multiply and accumlate */ 00302 acc0 = __SMLALD(x1, y1, acc0); 00303 acc2 = __SMLALD(x2, y1, acc2); 00304 00305 /* pack input data */ 00306 #ifndef ARM_MATH_BIG_ENDIAN 00307 x3 = __PKHBT(x2, x1, 0); 00308 #else 00309 x3 = __PKHBT(x1, x2, 0); 00310 #endif 00311 00312 /* multiply and accumlate */ 00313 acc1 = __SMLALDX(x3, y1, acc1); 00314 00315 /* Read next two samples from scratch1 buffer */ 00316 x1 = _SIMD32_OFFSET(pScr1); 00317 00318 /* multiply and accumlate */ 00319 acc0 = __SMLALD(x2, y2, acc0); 00320 acc2 = __SMLALD(x1, y2, acc2); 00321 00322 /* pack input data */ 00323 #ifndef ARM_MATH_BIG_ENDIAN 00324 x3 = __PKHBT(x1, x2, 0); 00325 #else 00326 x3 = __PKHBT(x2, x1, 0); 00327 #endif 00328 00329 acc3 = __SMLALDX(x3, y1, acc3); 00330 acc1 = __SMLALDX(x3, y2, acc1); 00331 00332 x2 = _SIMD32_OFFSET(pScr1 + 2U); 00333 00334 #ifndef ARM_MATH_BIG_ENDIAN 00335 x3 = __PKHBT(x2, x1, 0); 00336 #else 00337 x3 = __PKHBT(x1, x2, 0); 00338 #endif 00339 00340 acc3 = __SMLALDX(x3, y2, acc3); 00341 00342 #else 00343 00344 /* Read four samples from smaller buffer */ 00345 a = *pIn2; 00346 b = *(pIn2 + 1); 00347 00348 #ifndef ARM_MATH_BIG_ENDIAN 00349 y1 = __PKHBT(a, b, 16); 00350 #else 00351 y1 = __PKHBT(b, a, 16); 
00352 #endif 00353 00354 a = *(pIn2 + 2); 00355 b = *(pIn2 + 3); 00356 #ifndef ARM_MATH_BIG_ENDIAN 00357 y2 = __PKHBT(a, b, 16); 00358 #else 00359 y2 = __PKHBT(b, a, 16); 00360 #endif 00361 00362 acc0 = __SMLALD(x1, y1, acc0); 00363 00364 acc2 = __SMLALD(x2, y1, acc2); 00365 00366 #ifndef ARM_MATH_BIG_ENDIAN 00367 x3 = __PKHBT(x2, x1, 0); 00368 #else 00369 x3 = __PKHBT(x1, x2, 0); 00370 #endif 00371 00372 acc1 = __SMLALDX(x3, y1, acc1); 00373 00374 a = *pScr1; 00375 b = *(pScr1 + 1); 00376 00377 #ifndef ARM_MATH_BIG_ENDIAN 00378 x1 = __PKHBT(a, b, 16); 00379 #else 00380 x1 = __PKHBT(b, a, 16); 00381 #endif 00382 00383 acc0 = __SMLALD(x2, y2, acc0); 00384 00385 acc2 = __SMLALD(x1, y2, acc2); 00386 00387 #ifndef ARM_MATH_BIG_ENDIAN 00388 x3 = __PKHBT(x1, x2, 0); 00389 #else 00390 x3 = __PKHBT(x2, x1, 0); 00391 #endif 00392 00393 acc3 = __SMLALDX(x3, y1, acc3); 00394 00395 acc1 = __SMLALDX(x3, y2, acc1); 00396 00397 a = *(pScr1 + 2); 00398 b = *(pScr1 + 3); 00399 00400 #ifndef ARM_MATH_BIG_ENDIAN 00401 x2 = __PKHBT(a, b, 16); 00402 #else 00403 x2 = __PKHBT(b, a, 16); 00404 #endif 00405 00406 #ifndef ARM_MATH_BIG_ENDIAN 00407 x3 = __PKHBT(x2, x1, 0); 00408 #else 00409 x3 = __PKHBT(x1, x2, 0); 00410 #endif 00411 00412 acc3 = __SMLALDX(x3, y2, acc3); 00413 00414 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00415 00416 pIn2 += 4U; 00417 pScr1 += 4U; 00418 00419 00420 /* Decrement the loop counter */ 00421 tapCnt--; 00422 } 00423 00424 /* Update scratch pointer for remaining samples of smaller length sequence */ 00425 pScr1 -= 4U; 00426 00427 /* apply same above for remaining samples of smaller length sequence */ 00428 tapCnt = (srcBLen) & 3U; 00429 00430 while (tapCnt > 0U) 00431 { 00432 00433 /* accumlate the results */ 00434 acc0 += (*pScr1++ * *pIn2); 00435 acc1 += (*pScr1++ * *pIn2); 00436 acc2 += (*pScr1++ * *pIn2); 00437 acc3 += (*pScr1++ * *pIn2++); 00438 00439 pScr1 -= 3U; 00440 00441 /* Decrement the loop counter */ 00442 tapCnt--; 00443 } 00444 00445 blkCnt--; 
00446 00447 00448 /* Store the results in the accumulators in the destination buffer. */ 00449 00450 #ifndef ARM_MATH_BIG_ENDIAN 00451 00452 *__SIMD32(pOut)++ = 00453 __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); 00454 00455 *__SIMD32(pOut)++ = 00456 __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); 00457 00458 #else 00459 00460 *__SIMD32(pOut)++ = 00461 __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16); 00462 00463 *__SIMD32(pOut)++ = 00464 __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16); 00465 00466 00467 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00468 00469 /* Initialization of inputB pointer */ 00470 pIn2 = py; 00471 00472 pScratch1 += 4U; 00473 00474 } 00475 00476 00477 blkCnt = (srcALen + srcBLen - 1U) & 0x3; 00478 00479 /* Calculate convolution for remaining samples of Bigger length sequence */ 00480 while (blkCnt > 0) 00481 { 00482 /* Initialze temporary scratch pointer as scratch1 */ 00483 pScr1 = pScratch1; 00484 00485 /* Clear Accumlators */ 00486 acc0 = 0; 00487 00488 tapCnt = (srcBLen) >> 1U; 00489 00490 while (tapCnt > 0U) 00491 { 00492 00493 /* Read next two samples from scratch1 buffer */ 00494 acc0 += (*pScr1++ * *pIn2++); 00495 acc0 += (*pScr1++ * *pIn2++); 00496 00497 /* Decrement the loop counter */ 00498 tapCnt--; 00499 } 00500 00501 tapCnt = (srcBLen) & 1U; 00502 00503 /* apply same above for remaining samples of smaller length sequence */ 00504 while (tapCnt > 0U) 00505 { 00506 00507 /* accumlate the results */ 00508 acc0 += (*pScr1++ * *pIn2++); 00509 00510 /* Decrement the loop counter */ 00511 tapCnt--; 00512 } 00513 00514 blkCnt--; 00515 00516 /* The result is in 2.30 format. Convert to 1.15 with saturation. 00517 ** Then store the output in the destination buffer. 
*/ 00518 *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16)); 00519 00520 00521 /* Initialization of inputB pointer */ 00522 pIn2 = py; 00523 00524 pScratch1 += 1U; 00525 00526 } 00527 00528 } 00529 00530 00531 /** 00532 * @} end of Conv group 00533 */ 00534
Generated on Tue Jul 12 2022 16:46:23 by
1.7.2