Important changes to repositories hosted on mbed.com
Mbed-hosted Mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software, download the repository Zip archive or clone it locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
Fork of mbed-dsp by
arm_conv_fast_opt_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 17. January 2013 00005 * $Revision: V1.4.1 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_fast_opt_q15.c 00009 * 00010 * Description: Fast Q15 Convolution. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupFilters 00045 */ 00046 00047 /** 00048 * @addtogroup Conv 00049 * @{ 00050 */ 00051 00052 /** 00053 * @brief Convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4. 00054 * @param[in] *pSrcA points to the first input sequence. 00055 * @param[in] srcALen length of the first input sequence. 00056 * @param[in] *pSrcB points to the second input sequence. 00057 * @param[in] srcBLen length of the second input sequence. 00058 * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. 00059 * @param[in] *pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. 00060 * @param[in] *pScratch2 points to scratch buffer of size min(srcALen, srcBLen). 00061 * @return none. 00062 * 00063 * \par Restrictions 00064 * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE 00065 * In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit 00066 * 00067 * <b>Scaling and Overflow Behavior:</b> 00068 * 00069 * \par 00070 * This fast version uses a 32-bit accumulator with 2.30 format. 
00071 * The accumulator maintains full precision of the intermediate multiplication results 00072 * but provides only a single guard bit. There is no saturation on intermediate additions. 00073 * Thus, if the accumulator overflows it wraps around and distorts the result. 00074 * The input signals should be scaled down to avoid intermediate overflows. 00075 * Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows, 00076 * as maximum of min(srcALen, srcBLen) number of additions are carried internally. 00077 * The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result. 00078 * 00079 * \par 00080 * See <code>arm_conv_q15()</code> for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion. 00081 */ 00082 00083 void arm_conv_fast_opt_q15( 00084 q15_t * pSrcA, 00085 uint32_t srcALen, 00086 q15_t * pSrcB, 00087 uint32_t srcBLen, 00088 q15_t * pDst, 00089 q15_t * pScratch1, 00090 q15_t * pScratch2) 00091 { 00092 q31_t acc0, acc1, acc2, acc3; /* Accumulators */ 00093 q31_t x1, x2, x3; /* Temporary variables to hold state and coefficient values */ 00094 q31_t y1, y2; /* State variables */ 00095 q15_t *pOut = pDst; /* output pointer */ 00096 q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */ 00097 q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */ 00098 q15_t *pIn1; /* inputA pointer */ 00099 q15_t *pIn2; /* inputB pointer */ 00100 q15_t *px; /* Intermediate inputA pointer */ 00101 q15_t *py; /* Intermediate inputB pointer */ 00102 uint32_t j, k, blkCnt; /* loop counter */ 00103 uint32_t tapCnt; /* loop count */ 00104 #ifdef UNALIGNED_SUPPORT_DISABLE 00105 00106 q15_t a, b; 00107 00108 #endif /* #ifdef UNALIGNED_SUPPORT_DISABLE */ 00109 00110 /* The algorithm implementation is based on the lengths of the inputs. */ 00111 /* srcB is always made to slide across srcA. 
*/ 00112 /* So srcBLen is always considered as shorter or equal to srcALen */ 00113 if(srcALen >= srcBLen) 00114 { 00115 /* Initialization of inputA pointer */ 00116 pIn1 = pSrcA; 00117 00118 /* Initialization of inputB pointer */ 00119 pIn2 = pSrcB; 00120 } 00121 else 00122 { 00123 /* Initialization of inputA pointer */ 00124 pIn1 = pSrcB; 00125 00126 /* Initialization of inputB pointer */ 00127 pIn2 = pSrcA; 00128 00129 /* srcBLen is always considered as shorter or equal to srcALen */ 00130 j = srcBLen; 00131 srcBLen = srcALen; 00132 srcALen = j; 00133 } 00134 00135 /* Pointer to take end of scratch2 buffer */ 00136 pScr2 = pScratch2 + srcBLen - 1; 00137 00138 /* points to smaller length sequence */ 00139 px = pIn2; 00140 00141 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00142 k = srcBLen >> 2u; 00143 00144 /* First part of the processing with loop unrolling copies 4 data points at a time. 00145 ** a second loop below copies for the remaining 1 to 3 samples. */ 00146 00147 /* Copy smaller length input sequence in reverse order into second scratch buffer */ 00148 while(k > 0u) 00149 { 00150 /* copy second buffer in reversal manner */ 00151 *pScr2-- = *px++; 00152 *pScr2-- = *px++; 00153 *pScr2-- = *px++; 00154 *pScr2-- = *px++; 00155 00156 /* Decrement the loop counter */ 00157 k--; 00158 } 00159 00160 /* If the count is not a multiple of 4, copy remaining samples here. 00161 ** No loop unrolling is used. 
*/ 00162 k = srcBLen % 0x4u; 00163 00164 while(k > 0u) 00165 { 00166 /* copy second buffer in reversal manner for remaining samples */ 00167 *pScr2-- = *px++; 00168 00169 /* Decrement the loop counter */ 00170 k--; 00171 } 00172 00173 /* Initialze temporary scratch pointer */ 00174 pScr1 = pScratch1; 00175 00176 /* Assuming scratch1 buffer is aligned by 32-bit */ 00177 /* Fill (srcBLen - 1u) zeros in scratch1 buffer */ 00178 arm_fill_q15(0, pScr1, (srcBLen - 1u)); 00179 00180 /* Update temporary scratch pointer */ 00181 pScr1 += (srcBLen - 1u); 00182 00183 /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */ 00184 00185 #ifndef UNALIGNED_SUPPORT_DISABLE 00186 00187 /* Copy (srcALen) samples in scratch buffer */ 00188 arm_copy_q15(pIn1, pScr1, srcALen); 00189 00190 /* Update pointers */ 00191 pScr1 += srcALen; 00192 00193 #else 00194 00195 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00196 k = srcALen >> 2u; 00197 00198 /* First part of the processing with loop unrolling copies 4 data points at a time. 00199 ** a second loop below copies for the remaining 1 to 3 samples. */ 00200 while(k > 0u) 00201 { 00202 /* copy second buffer in reversal manner */ 00203 *pScr1++ = *pIn1++; 00204 *pScr1++ = *pIn1++; 00205 *pScr1++ = *pIn1++; 00206 *pScr1++ = *pIn1++; 00207 00208 /* Decrement the loop counter */ 00209 k--; 00210 } 00211 00212 /* If the count is not a multiple of 4, copy remaining samples here. 00213 ** No loop unrolling is used. 
*/ 00214 k = srcALen % 0x4u; 00215 00216 while(k > 0u) 00217 { 00218 /* copy second buffer in reversal manner for remaining samples */ 00219 *pScr1++ = *pIn1++; 00220 00221 /* Decrement the loop counter */ 00222 k--; 00223 } 00224 00225 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00226 00227 00228 #ifndef UNALIGNED_SUPPORT_DISABLE 00229 00230 /* Fill (srcBLen - 1u) zeros at end of scratch buffer */ 00231 arm_fill_q15(0, pScr1, (srcBLen - 1u)); 00232 00233 /* Update pointer */ 00234 pScr1 += (srcBLen - 1u); 00235 00236 #else 00237 00238 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00239 k = (srcBLen - 1u) >> 2u; 00240 00241 /* First part of the processing with loop unrolling copies 4 data points at a time. 00242 ** a second loop below copies for the remaining 1 to 3 samples. */ 00243 while(k > 0u) 00244 { 00245 /* copy second buffer in reversal manner */ 00246 *pScr1++ = 0; 00247 *pScr1++ = 0; 00248 *pScr1++ = 0; 00249 *pScr1++ = 0; 00250 00251 /* Decrement the loop counter */ 00252 k--; 00253 } 00254 00255 /* If the count is not a multiple of 4, copy remaining samples here. 00256 ** No loop unrolling is used. */ 00257 k = (srcBLen - 1u) % 0x4u; 00258 00259 while(k > 0u) 00260 { 00261 /* copy second buffer in reversal manner for remaining samples */ 00262 *pScr1++ = 0; 00263 00264 /* Decrement the loop counter */ 00265 k--; 00266 } 00267 00268 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00269 00270 /* Temporary pointer for scratch2 */ 00271 py = pScratch2; 00272 00273 00274 /* Initialization of pIn2 pointer */ 00275 pIn2 = py; 00276 00277 /* First part of the processing with loop unrolling process 4 data points at a time. 00278 ** a second loop below process for the remaining 1 to 3 samples. 
*/ 00279 00280 /* Actual convolution process starts here */ 00281 blkCnt = (srcALen + srcBLen - 1u) >> 2; 00282 00283 while(blkCnt > 0) 00284 { 00285 /* Initialze temporary scratch pointer as scratch1 */ 00286 pScr1 = pScratch1; 00287 00288 /* Clear Accumlators */ 00289 acc0 = 0; 00290 acc1 = 0; 00291 acc2 = 0; 00292 acc3 = 0; 00293 00294 /* Read two samples from scratch1 buffer */ 00295 x1 = *__SIMD32(pScr1)++; 00296 00297 /* Read next two samples from scratch1 buffer */ 00298 x2 = *__SIMD32(pScr1)++; 00299 00300 tapCnt = (srcBLen) >> 2u; 00301 00302 while(tapCnt > 0u) 00303 { 00304 00305 #ifndef UNALIGNED_SUPPORT_DISABLE 00306 00307 /* Read four samples from smaller buffer */ 00308 y1 = _SIMD32_OFFSET(pIn2); 00309 y2 = _SIMD32_OFFSET(pIn2 + 2u); 00310 00311 /* multiply and accumlate */ 00312 acc0 = __SMLAD(x1, y1, acc0); 00313 acc2 = __SMLAD(x2, y1, acc2); 00314 00315 /* pack input data */ 00316 #ifndef ARM_MATH_BIG_ENDIAN 00317 x3 = __PKHBT(x2, x1, 0); 00318 #else 00319 x3 = __PKHBT(x1, x2, 0); 00320 #endif 00321 00322 /* multiply and accumlate */ 00323 acc1 = __SMLADX(x3, y1, acc1); 00324 00325 /* Read next two samples from scratch1 buffer */ 00326 x1 = _SIMD32_OFFSET(pScr1); 00327 00328 /* multiply and accumlate */ 00329 acc0 = __SMLAD(x2, y2, acc0); 00330 acc2 = __SMLAD(x1, y2, acc2); 00331 00332 /* pack input data */ 00333 #ifndef ARM_MATH_BIG_ENDIAN 00334 x3 = __PKHBT(x1, x2, 0); 00335 #else 00336 x3 = __PKHBT(x2, x1, 0); 00337 #endif 00338 00339 acc3 = __SMLADX(x3, y1, acc3); 00340 acc1 = __SMLADX(x3, y2, acc1); 00341 00342 x2 = _SIMD32_OFFSET(pScr1 + 2u); 00343 00344 #ifndef ARM_MATH_BIG_ENDIAN 00345 x3 = __PKHBT(x2, x1, 0); 00346 #else 00347 x3 = __PKHBT(x1, x2, 0); 00348 #endif 00349 00350 acc3 = __SMLADX(x3, y2, acc3); 00351 00352 #else 00353 00354 /* Read four samples from smaller buffer */ 00355 a = *pIn2; 00356 b = *(pIn2 + 1); 00357 00358 #ifndef ARM_MATH_BIG_ENDIAN 00359 y1 = __PKHBT(a, b, 16); 00360 #else 00361 y1 = __PKHBT(b, a, 16); 00362 
#endif 00363 00364 a = *(pIn2 + 2); 00365 b = *(pIn2 + 3); 00366 #ifndef ARM_MATH_BIG_ENDIAN 00367 y2 = __PKHBT(a, b, 16); 00368 #else 00369 y2 = __PKHBT(b, a, 16); 00370 #endif 00371 00372 acc0 = __SMLAD(x1, y1, acc0); 00373 00374 acc2 = __SMLAD(x2, y1, acc2); 00375 00376 #ifndef ARM_MATH_BIG_ENDIAN 00377 x3 = __PKHBT(x2, x1, 0); 00378 #else 00379 x3 = __PKHBT(x1, x2, 0); 00380 #endif 00381 00382 acc1 = __SMLADX(x3, y1, acc1); 00383 00384 a = *pScr1; 00385 b = *(pScr1 + 1); 00386 00387 #ifndef ARM_MATH_BIG_ENDIAN 00388 x1 = __PKHBT(a, b, 16); 00389 #else 00390 x1 = __PKHBT(b, a, 16); 00391 #endif 00392 00393 acc0 = __SMLAD(x2, y2, acc0); 00394 00395 acc2 = __SMLAD(x1, y2, acc2); 00396 00397 #ifndef ARM_MATH_BIG_ENDIAN 00398 x3 = __PKHBT(x1, x2, 0); 00399 #else 00400 x3 = __PKHBT(x2, x1, 0); 00401 #endif 00402 00403 acc3 = __SMLADX(x3, y1, acc3); 00404 00405 acc1 = __SMLADX(x3, y2, acc1); 00406 00407 a = *(pScr1 + 2); 00408 b = *(pScr1 + 3); 00409 00410 #ifndef ARM_MATH_BIG_ENDIAN 00411 x2 = __PKHBT(a, b, 16); 00412 #else 00413 x2 = __PKHBT(b, a, 16); 00414 #endif 00415 00416 #ifndef ARM_MATH_BIG_ENDIAN 00417 x3 = __PKHBT(x2, x1, 0); 00418 #else 00419 x3 = __PKHBT(x1, x2, 0); 00420 #endif 00421 00422 acc3 = __SMLADX(x3, y2, acc3); 00423 00424 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00425 00426 /* update scratch pointers */ 00427 pIn2 += 4u; 00428 pScr1 += 4u; 00429 00430 00431 /* Decrement the loop counter */ 00432 tapCnt--; 00433 } 00434 00435 /* Update scratch pointer for remaining samples of smaller length sequence */ 00436 pScr1 -= 4u; 00437 00438 /* apply same above for remaining samples of smaller length sequence */ 00439 tapCnt = (srcBLen) & 3u; 00440 00441 while(tapCnt > 0u) 00442 { 00443 00444 /* accumlate the results */ 00445 acc0 += (*pScr1++ * *pIn2); 00446 acc1 += (*pScr1++ * *pIn2); 00447 acc2 += (*pScr1++ * *pIn2); 00448 acc3 += (*pScr1++ * *pIn2++); 00449 00450 pScr1 -= 3u; 00451 00452 /* Decrement the loop counter */ 00453 tapCnt--; 00454 } 
00455 00456 blkCnt--; 00457 00458 00459 /* Store the results in the accumulators in the destination buffer. */ 00460 00461 #ifndef ARM_MATH_BIG_ENDIAN 00462 00463 *__SIMD32(pOut)++ = 00464 __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); 00465 00466 *__SIMD32(pOut)++ = 00467 __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); 00468 00469 00470 #else 00471 00472 *__SIMD32(pOut)++ = 00473 __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16); 00474 00475 *__SIMD32(pOut)++ = 00476 __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16); 00477 00478 00479 00480 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00481 00482 /* Initialization of inputB pointer */ 00483 pIn2 = py; 00484 00485 pScratch1 += 4u; 00486 00487 } 00488 00489 00490 blkCnt = (srcALen + srcBLen - 1u) & 0x3; 00491 00492 /* Calculate convolution for remaining samples of Bigger length sequence */ 00493 while(blkCnt > 0) 00494 { 00495 /* Initialze temporary scratch pointer as scratch1 */ 00496 pScr1 = pScratch1; 00497 00498 /* Clear Accumlators */ 00499 acc0 = 0; 00500 00501 tapCnt = (srcBLen) >> 1u; 00502 00503 while(tapCnt > 0u) 00504 { 00505 00506 acc0 += (*pScr1++ * *pIn2++); 00507 acc0 += (*pScr1++ * *pIn2++); 00508 00509 /* Decrement the loop counter */ 00510 tapCnt--; 00511 } 00512 00513 tapCnt = (srcBLen) & 1u; 00514 00515 /* apply same above for remaining samples of smaller length sequence */ 00516 while(tapCnt > 0u) 00517 { 00518 00519 /* accumlate the results */ 00520 acc0 += (*pScr1++ * *pIn2++); 00521 00522 /* Decrement the loop counter */ 00523 tapCnt--; 00524 } 00525 00526 blkCnt--; 00527 00528 /* The result is in 2.30 format. Convert to 1.15 with saturation. 00529 ** Then store the output in the destination buffer. */ 00530 *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16)); 00531 00532 /* Initialization of inputB pointer */ 00533 pIn2 = py; 00534 00535 pScratch1 += 1u; 00536 00537 } 00538 00539 } 00540 00541 /** 00542 * @} end of Conv group 00543 */
Generated on Tue Jul 12 2022 18:44:08 by
1.7.2
