arm_conv_opt_q15.c
/* ----------------------------------------------------------------------
 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
 *
 * $Date:        17. January 2013
 * $Revision:    V1.4.1
 *
 * Project:      CMSIS DSP Library
 * Title:        arm_conv_opt_q15.c
 *
 * Description:  Convolution of Q15 sequences.
 *
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 * -------------------------------------------------------------------- */

#include "arm_math.h"

/**
 * @ingroup groupFilters
 */

/**
 * @addtogroup Conv
 * @{
 */

/**
 * @brief Convolution of Q15 sequences.
 * @param[in]  *pSrcA     points to the first input sequence.
 * @param[in]  srcALen    length of the first input sequence.
 * @param[in]  *pSrcB     points to the second input sequence.
 * @param[in]  srcBLen    length of the second input sequence.
 * @param[out] *pDst      points to the location where the output result is written. Length srcALen+srcBLen-1.
 * @param[in]  *pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
 * @param[in]  *pScratch2 points to scratch buffer of size min(srcALen, srcBLen).
 * @return none.
 *
 * \par Restrictions
 * If the silicon does not support unaligned memory access, enable the macro UNALIGNED_SUPPORT_DISABLE.
 * In this case the input, output, scratch1 and scratch2 buffers should be 32-bit aligned.
 *
 * @details
 * <b>Scaling and Overflow Behavior:</b>
 *
 * \par
 * The function is implemented using a 64-bit internal accumulator.
 * Both inputs are in 1.15 format and multiplications yield a 2.30 result.
 * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
 * This approach provides 33 guard bits and there is no risk of overflow.
 * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
 *
 * \par
 * Refer to <code>arm_conv_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
 */

void arm_conv_opt_q15(
  q15_t * pSrcA,
  uint32_t srcALen,
  q15_t * pSrcB,
  uint32_t srcBLen,
  q15_t * pDst,
  q15_t * pScratch1,
  q15_t * pScratch2)
{
  q63_t acc0, acc1, acc2, acc3;       /* Accumulators */
  q31_t x1, x2, x3;                   /* Temporary variables to hold state and coefficient values */
  q31_t y1, y2;                       /* State variables */
  q15_t *pOut = pDst;                 /* Output pointer */
  q15_t *pScr1 = pScratch1;           /* Temporary pointer for scratch1 */
  q15_t *pScr2 = pScratch2;           /* Temporary pointer for scratch2 */
  q15_t *pIn1;                        /* InputA pointer */
  q15_t *pIn2;                        /* InputB pointer */
  q15_t *px;                          /* Intermediate inputA pointer */
  q15_t *py;                          /* Intermediate inputB pointer */
  uint32_t j, k, blkCnt;              /* Loop counters */
  uint32_t tapCnt;                    /* Loop count */

#ifdef UNALIGNED_SUPPORT_DISABLE

  q15_t a, b;

#endif /* #ifdef UNALIGNED_SUPPORT_DISABLE */

  /* The algorithm implementation is based on the lengths of the inputs. */
  /* srcB is always made to slide across srcA. */
  /* So srcBLen is always considered as shorter than or equal to srcALen. */
  if(srcALen >= srcBLen)
  {
    /* Initialization of inputA pointer */
    pIn1 = pSrcA;

    /* Initialization of inputB pointer */
    pIn2 = pSrcB;
  }
  else
  {
    /* Initialization of inputA pointer */
    pIn1 = pSrcB;

    /* Initialization of inputB pointer */
    pIn2 = pSrcA;

    /* srcBLen is always considered as shorter than or equal to srcALen */
    j = srcBLen;
    srcBLen = srcALen;
    srcALen = j;
  }

  /* Pointer to the end of the scratch2 buffer */
  pScr2 = pScratch2 + srcBLen - 1;

  /* Points to the smaller length sequence */
  px = pIn2;

  /* Apply loop unrolling and do 4 copies simultaneously. */
  k = srcBLen >> 2u;

  /* First part of the processing with loop unrolling copies 4 data points at a time.
   ** A second loop below copies the remaining 1 to 3 samples. */
  /* Copy the smaller length input sequence in reverse order into the second scratch buffer */
  while(k > 0u)
  {
    /* Copy second buffer in reverse order */
    *pScr2-- = *px++;
    *pScr2-- = *px++;
    *pScr2-- = *px++;
    *pScr2-- = *px++;

    /* Decrement the loop counter */
    k--;
  }

  /* If the count is not a multiple of 4, copy the remaining samples here.
   ** No loop unrolling is used. */
  k = srcBLen % 0x4u;

  while(k > 0u)
  {
    /* Copy second buffer in reverse order for remaining samples */
    *pScr2-- = *px++;

    /* Decrement the loop counter */
    k--;
  }

  /* Initialize temporary scratch pointer */
  pScr1 = pScratch1;

  /* Assuming the scratch1 buffer is 32-bit aligned */
  /* Fill (srcBLen - 1u) zeros in the scratch buffer */
  arm_fill_q15(0, pScr1, (srcBLen - 1u));

  /* Update temporary scratch pointer */
  pScr1 += (srcBLen - 1u);

  /* Copy the bigger length sequence (srcALen samples) into the scratch1 buffer */

#ifndef UNALIGNED_SUPPORT_DISABLE

  /* Copy (srcALen) samples into the scratch buffer */
  arm_copy_q15(pIn1, pScr1, srcALen);

  /* Update pointers */
  pScr1 += srcALen;

#else

  /* Apply loop unrolling and do 4 copies simultaneously. */
  k = srcALen >> 2u;

  /* First part of the processing with loop unrolling copies 4 data points at a time.
   ** A second loop below copies the remaining 1 to 3 samples. */
  while(k > 0u)
  {
    /* Copy the bigger length sequence */
    *pScr1++ = *pIn1++;
    *pScr1++ = *pIn1++;
    *pScr1++ = *pIn1++;
    *pScr1++ = *pIn1++;

    /* Decrement the loop counter */
    k--;
  }

  /* If the count is not a multiple of 4, copy the remaining samples here.
   ** No loop unrolling is used. */
  k = srcALen % 0x4u;

  while(k > 0u)
  {
    /* Copy the bigger length sequence for remaining samples */
    *pScr1++ = *pIn1++;

    /* Decrement the loop counter */
    k--;
  }

#endif


#ifndef UNALIGNED_SUPPORT_DISABLE

  /* Fill (srcBLen - 1u) zeros at the end of the scratch buffer */
  arm_fill_q15(0, pScr1, (srcBLen - 1u));

  /* Update pointer */
  pScr1 += (srcBLen - 1u);

#else

  /* Apply loop unrolling and do 4 copies simultaneously. */
  k = (srcBLen - 1u) >> 2u;

  /* First part of the processing with loop unrolling fills 4 zeros at a time.
   ** A second loop below fills the remaining 1 to 3 samples. */
  while(k > 0u)
  {
    /* Fill zeros at the end of the scratch buffer */
    *pScr1++ = 0;
    *pScr1++ = 0;
    *pScr1++ = 0;
    *pScr1++ = 0;

    /* Decrement the loop counter */
    k--;
  }

  /* If the count is not a multiple of 4, fill the remaining zeros here.
   ** No loop unrolling is used. */
  k = (srcBLen - 1u) % 0x4u;

  while(k > 0u)
  {
    /* Fill zeros for remaining samples */
    *pScr1++ = 0;

    /* Decrement the loop counter */
    k--;
  }

#endif

  /* Temporary pointer for scratch2 */
  py = pScratch2;

  /* Initialization of pIn2 pointer */
  pIn2 = py;

  /* First part of the processing with loop unrolling processes 4 data points at a time.
   ** A second loop below processes the remaining 1 to 3 samples. */

  /* Actual convolution process starts here */
  blkCnt = (srcALen + srcBLen - 1u) >> 2;

  while(blkCnt > 0)
  {
    /* Initialize temporary scratch pointer as scratch1 */
    pScr1 = pScratch1;

    /* Clear accumulators */
    acc0 = 0;
    acc1 = 0;
    acc2 = 0;
    acc3 = 0;

    /* Read two samples from scratch1 buffer */
    x1 = *__SIMD32(pScr1)++;

    /* Read next two samples from scratch1 buffer */
    x2 = *__SIMD32(pScr1)++;

    tapCnt = (srcBLen) >> 2u;

    while(tapCnt > 0u)
    {

#ifndef UNALIGNED_SUPPORT_DISABLE

      /* Read four samples from the smaller buffer */
      y1 = _SIMD32_OFFSET(pIn2);
      y2 = _SIMD32_OFFSET(pIn2 + 2u);

      /* Multiply and accumulate */
      acc0 = __SMLALD(x1, y1, acc0);
      acc2 = __SMLALD(x2, y1, acc2);

      /* Pack input data */
#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x2, x1, 0);
#else
      x3 = __PKHBT(x1, x2, 0);
#endif

      /* Multiply and accumulate */
      acc1 = __SMLALDX(x3, y1, acc1);

      /* Read next two samples from scratch1 buffer */
      x1 = _SIMD32_OFFSET(pScr1);

      /* Multiply and accumulate */
      acc0 = __SMLALD(x2, y2, acc0);
      acc2 = __SMLALD(x1, y2, acc2);

      /* Pack input data */
#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x1, x2, 0);
#else
      x3 = __PKHBT(x2, x1, 0);
#endif

      acc3 = __SMLALDX(x3, y1, acc3);
      acc1 = __SMLALDX(x3, y2, acc1);

      x2 = _SIMD32_OFFSET(pScr1 + 2u);

#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x2, x1, 0);
#else
      x3 = __PKHBT(x1, x2, 0);
#endif

      acc3 = __SMLALDX(x3, y2, acc3);

#else

      /* Read four samples from the smaller buffer */
      a = *pIn2;
      b = *(pIn2 + 1);

#ifndef ARM_MATH_BIG_ENDIAN
      y1 = __PKHBT(a, b, 16);
#else
      y1 = __PKHBT(b, a, 16);
#endif

      a = *(pIn2 + 2);
      b = *(pIn2 + 3);
#ifndef ARM_MATH_BIG_ENDIAN
      y2 = __PKHBT(a, b, 16);
#else
      y2 = __PKHBT(b, a, 16);
#endif

      acc0 = __SMLALD(x1, y1, acc0);

      acc2 = __SMLALD(x2, y1, acc2);

#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x2, x1, 0);
#else
      x3 = __PKHBT(x1, x2, 0);
#endif

      acc1 = __SMLALDX(x3, y1, acc1);

      a = *pScr1;
      b = *(pScr1 + 1);

#ifndef ARM_MATH_BIG_ENDIAN
      x1 = __PKHBT(a, b, 16);
#else
      x1 = __PKHBT(b, a, 16);
#endif

      acc0 = __SMLALD(x2, y2, acc0);

      acc2 = __SMLALD(x1, y2, acc2);

#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x1, x2, 0);
#else
      x3 = __PKHBT(x2, x1, 0);
#endif

      acc3 = __SMLALDX(x3, y1, acc3);

      acc1 = __SMLALDX(x3, y2, acc1);

      a = *(pScr1 + 2);
      b = *(pScr1 + 3);

#ifndef ARM_MATH_BIG_ENDIAN
      x2 = __PKHBT(a, b, 16);
#else
      x2 = __PKHBT(b, a, 16);
#endif

#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x2, x1, 0);
#else
      x3 = __PKHBT(x1, x2, 0);
#endif

      acc3 = __SMLALDX(x3, y2, acc3);

#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */

      pIn2 += 4u;
      pScr1 += 4u;

      /* Decrement the loop counter */
      tapCnt--;
    }

    /* Update scratch pointer for the remaining samples of the smaller length sequence */
    pScr1 -= 4u;

    /* Apply the same as above for the remaining samples of the smaller length sequence */
    tapCnt = (srcBLen) & 3u;

    while(tapCnt > 0u)
    {
      /* Accumulate the results */
      acc0 += (*pScr1++ * *pIn2);
      acc1 += (*pScr1++ * *pIn2);
      acc2 += (*pScr1++ * *pIn2);
      acc3 += (*pScr1++ * *pIn2++);

      pScr1 -= 3u;

      /* Decrement the loop counter */
      tapCnt--;
    }

    blkCnt--;

    /* Store the results of the accumulators in the destination buffer. */

#ifndef ARM_MATH_BIG_ENDIAN

    *__SIMD32(pOut)++ =
      __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);

    *__SIMD32(pOut)++ =
      __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);

#else

    *__SIMD32(pOut)++ =
      __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);

    *__SIMD32(pOut)++ =
      __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);

#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    /* Initialization of inputB pointer */
    pIn2 = py;

    pScratch1 += 4u;
  }

  blkCnt = (srcALen + srcBLen - 1u) & 0x3;

  /* Calculate convolution for the remaining samples of the bigger length sequence */
  while(blkCnt > 0)
  {
    /* Initialize temporary scratch pointer as scratch1 */
    pScr1 = pScratch1;

    /* Clear accumulator */
    acc0 = 0;

    tapCnt = (srcBLen) >> 1u;

    while(tapCnt > 0u)
    {
      /* Read next two samples from scratch1 buffer and accumulate */
      acc0 += (*pScr1++ * *pIn2++);
      acc0 += (*pScr1++ * *pIn2++);

      /* Decrement the loop counter */
      tapCnt--;
    }

    tapCnt = (srcBLen) & 1u;

    /* Apply the same as above for the remaining sample of the smaller length sequence */
    while(tapCnt > 0u)
    {
      /* Accumulate the result */
      acc0 += (*pScr1++ * *pIn2++);

      /* Decrement the loop counter */
      tapCnt--;
    }

    blkCnt--;

    /* The result is in 2.30 format. Convert to 1.15 with saturation.
     ** Then store the output in the destination buffer. */
    *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));

    /* Initialization of inputB pointer */
    pIn2 = py;

    pScratch1 += 1u;
  }
}

/**
 * @} end of Conv group
 */
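The scaling path described in the header comment (1.15 inputs, 2.30 products, 64-bit accumulation, then a right shift by 15 and saturation to 1.15) can be written out as a plain-C reference model. The sketch below is for clarity only and is not part of the library; the helper name conv_point_q15 is hypothetical, and the final conversion mirrors the library's own __SSAT((acc >> 15), 16) step.

/* Reference model of one output point: x points into the zero-padded
 * longer sequence, yRev is the shorter sequence stored in reverse order
 * (as in pScratch2 above), n is the shorter length. */
static q15_t conv_point_q15(const q15_t *x, const q15_t *yRev, uint32_t n)
{
  q63_t acc = 0;                           /* 34.30 accumulator: 33 guard bits */
  uint32_t k;

  for (k = 0u; k < n; k++)
  {
    acc += (q31_t) x[k] * yRev[k];         /* 1.15 * 1.15 -> 2.30 product */
  }

  /* Discard the low 15 bits (34.30 -> 34.15), then saturate to 1.15,
   * exactly as the optimized code does for its accumulators. */
  return (q15_t) (__SSAT((acc >> 15), 16));
}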
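Sizing the scratch areas correctly is the main pitfall when calling this function. A minimal calling sketch, assuming a CMSIS-DSP build environment; the array names, lengths and test values are illustrative and not part of the library.

#include "arm_math.h"

#define SRCA_LEN  32u                          /* longer input sequence */
#define SRCB_LEN   8u                          /* shorter input sequence */

static q15_t srcA[SRCA_LEN];                   /* first input, 1.15 format */
static q15_t srcB[SRCB_LEN];                   /* second input, 1.15 format */
static q15_t dst[SRCA_LEN + SRCB_LEN - 1u];    /* output length srcALen + srcBLen - 1 */

/* Scratch sizes per the parameter documentation above:
 *   scratch1: max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2
 *   scratch2: min(srcALen, srcBLen)
 * With UNALIGNED_SUPPORT_DISABLE defined, these and the input/output
 * buffers must additionally be 32-bit aligned. */
static q15_t scratch1[SRCA_LEN + 2u * SRCB_LEN - 2u];
static q15_t scratch2[SRCB_LEN];

int main(void)
{
  uint32_t i;

  /* Ramp input and a constant 0.0625 kernel, purely for illustration. */
  for (i = 0u; i < SRCA_LEN; i++) srcA[i] = (q15_t) (i << 8);
  for (i = 0u; i < SRCB_LEN; i++) srcB[i] = (q15_t) 0x0800;

  arm_conv_opt_q15(srcA, SRCA_LEN, srcB, SRCB_LEN, dst, scratch1, scratch2);

  return 0;
}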