CMSIS DSP library
Dependents: KL25Z_FFT_Demo Hat_Board_v5_1 KL25Z_FFT_Demo_tony KL25Z_FFT_Demo_tony ... more
Fork of mbed-dsp by
arm_conv_fast_opt_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 17. January 2013 00005 * $Revision: V1.4.1 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_fast_opt_q15.c 00009 * 00010 * Description: Fast Q15 Convolution. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupFilters 00045 */ 00046 00047 /** 00048 * @addtogroup Conv 00049 * @{ 00050 */ 00051 00052 /** 00053 * @brief Convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4. 00054 * @param[in] *pSrcA points to the first input sequence. 00055 * @param[in] srcALen length of the first input sequence. 00056 * @param[in] *pSrcB points to the second input sequence. 00057 * @param[in] srcBLen length of the second input sequence. 00058 * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. 00059 * @param[in] *pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. 00060 * @param[in] *pScratch2 points to scratch buffer of size min(srcALen, srcBLen). 00061 * @return none. 00062 * 00063 * \par Restrictions 00064 * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE 00065 * In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit 00066 * 00067 * <b>Scaling and Overflow Behavior:</b> 00068 * 00069 * \par 00070 * This fast version uses a 32-bit accumulator with 2.30 format. 00071 * The accumulator maintains full precision of the intermediate multiplication results 00072 * but provides only a single guard bit. There is no saturation on intermediate additions. 00073 * Thus, if the accumulator overflows it wraps around and distorts the result. 00074 * The input signals should be scaled down to avoid intermediate overflows. 00075 * Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows, 00076 * as maximum of min(srcALen, srcBLen) number of additions are carried internally. 00077 * The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result. 00078 * 00079 * \par 00080 * See <code>arm_conv_q15()</code> for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion. 00081 */ 00082 00083 void arm_conv_fast_opt_q15( 00084 q15_t * pSrcA, 00085 uint32_t srcALen, 00086 q15_t * pSrcB, 00087 uint32_t srcBLen, 00088 q15_t * pDst, 00089 q15_t * pScratch1, 00090 q15_t * pScratch2) 00091 { 00092 q31_t acc0, acc1, acc2, acc3; /* Accumulators */ 00093 q31_t x1, x2, x3; /* Temporary variables to hold state and coefficient values */ 00094 q31_t y1, y2; /* State variables */ 00095 q15_t *pOut = pDst; /* output pointer */ 00096 q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */ 00097 q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */ 00098 q15_t *pIn1; /* inputA pointer */ 00099 q15_t *pIn2; /* inputB pointer */ 00100 q15_t *px; /* Intermediate inputA pointer */ 00101 q15_t *py; /* Intermediate inputB pointer */ 00102 uint32_t j, k, blkCnt; /* loop counter */ 00103 uint32_t tapCnt; /* loop count */ 00104 #ifdef UNALIGNED_SUPPORT_DISABLE 00105 00106 q15_t a, b; 00107 00108 #endif /* #ifdef UNALIGNED_SUPPORT_DISABLE */ 00109 00110 /* The algorithm implementation is based on the lengths of the inputs. */ 00111 /* srcB is always made to slide across srcA. */ 00112 /* So srcBLen is always considered as shorter or equal to srcALen */ 00113 if(srcALen >= srcBLen) 00114 { 00115 /* Initialization of inputA pointer */ 00116 pIn1 = pSrcA; 00117 00118 /* Initialization of inputB pointer */ 00119 pIn2 = pSrcB; 00120 } 00121 else 00122 { 00123 /* Initialization of inputA pointer */ 00124 pIn1 = pSrcB; 00125 00126 /* Initialization of inputB pointer */ 00127 pIn2 = pSrcA; 00128 00129 /* srcBLen is always considered as shorter or equal to srcALen */ 00130 j = srcBLen; 00131 srcBLen = srcALen; 00132 srcALen = j; 00133 } 00134 00135 /* Pointer to take end of scratch2 buffer */ 00136 pScr2 = pScratch2 + srcBLen - 1; 00137 00138 /* points to smaller length sequence */ 00139 px = pIn2; 00140 00141 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00142 k = srcBLen >> 2u; 00143 00144 /* First part of the processing with loop unrolling copies 4 data points at a time. 00145 ** a second loop below copies for the remaining 1 to 3 samples. */ 00146 00147 /* Copy smaller length input sequence in reverse order into second scratch buffer */ 00148 while(k > 0u) 00149 { 00150 /* copy second buffer in reversal manner */ 00151 *pScr2-- = *px++; 00152 *pScr2-- = *px++; 00153 *pScr2-- = *px++; 00154 *pScr2-- = *px++; 00155 00156 /* Decrement the loop counter */ 00157 k--; 00158 } 00159 00160 /* If the count is not a multiple of 4, copy remaining samples here. 00161 ** No loop unrolling is used. */ 00162 k = srcBLen % 0x4u; 00163 00164 while(k > 0u) 00165 { 00166 /* copy second buffer in reversal manner for remaining samples */ 00167 *pScr2-- = *px++; 00168 00169 /* Decrement the loop counter */ 00170 k--; 00171 } 00172 00173 /* Initialze temporary scratch pointer */ 00174 pScr1 = pScratch1; 00175 00176 /* Assuming scratch1 buffer is aligned by 32-bit */ 00177 /* Fill (srcBLen - 1u) zeros in scratch1 buffer */ 00178 arm_fill_q15(0, pScr1, (srcBLen - 1u)); 00179 00180 /* Update temporary scratch pointer */ 00181 pScr1 += (srcBLen - 1u); 00182 00183 /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */ 00184 00185 #ifndef UNALIGNED_SUPPORT_DISABLE 00186 00187 /* Copy (srcALen) samples in scratch buffer */ 00188 arm_copy_q15(pIn1, pScr1, srcALen); 00189 00190 /* Update pointers */ 00191 pScr1 += srcALen; 00192 00193 #else 00194 00195 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00196 k = srcALen >> 2u; 00197 00198 /* First part of the processing with loop unrolling copies 4 data points at a time. 00199 ** a second loop below copies for the remaining 1 to 3 samples. */ 00200 while(k > 0u) 00201 { 00202 /* copy second buffer in reversal manner */ 00203 *pScr1++ = *pIn1++; 00204 *pScr1++ = *pIn1++; 00205 *pScr1++ = *pIn1++; 00206 *pScr1++ = *pIn1++; 00207 00208 /* Decrement the loop counter */ 00209 k--; 00210 } 00211 00212 /* If the count is not a multiple of 4, copy remaining samples here. 00213 ** No loop unrolling is used. */ 00214 k = srcALen % 0x4u; 00215 00216 while(k > 0u) 00217 { 00218 /* copy second buffer in reversal manner for remaining samples */ 00219 *pScr1++ = *pIn1++; 00220 00221 /* Decrement the loop counter */ 00222 k--; 00223 } 00224 00225 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00226 00227 00228 #ifndef UNALIGNED_SUPPORT_DISABLE 00229 00230 /* Fill (srcBLen - 1u) zeros at end of scratch buffer */ 00231 arm_fill_q15(0, pScr1, (srcBLen - 1u)); 00232 00233 /* Update pointer */ 00234 pScr1 += (srcBLen - 1u); 00235 00236 #else 00237 00238 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00239 k = (srcBLen - 1u) >> 2u; 00240 00241 /* First part of the processing with loop unrolling copies 4 data points at a time. 00242 ** a second loop below copies for the remaining 1 to 3 samples. */ 00243 while(k > 0u) 00244 { 00245 /* copy second buffer in reversal manner */ 00246 *pScr1++ = 0; 00247 *pScr1++ = 0; 00248 *pScr1++ = 0; 00249 *pScr1++ = 0; 00250 00251 /* Decrement the loop counter */ 00252 k--; 00253 } 00254 00255 /* If the count is not a multiple of 4, copy remaining samples here. 00256 ** No loop unrolling is used. */ 00257 k = (srcBLen - 1u) % 0x4u; 00258 00259 while(k > 0u) 00260 { 00261 /* copy second buffer in reversal manner for remaining samples */ 00262 *pScr1++ = 0; 00263 00264 /* Decrement the loop counter */ 00265 k--; 00266 } 00267 00268 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00269 00270 /* Temporary pointer for scratch2 */ 00271 py = pScratch2; 00272 00273 00274 /* Initialization of pIn2 pointer */ 00275 pIn2 = py; 00276 00277 /* First part of the processing with loop unrolling process 4 data points at a time. 00278 ** a second loop below process for the remaining 1 to 3 samples. */ 00279 00280 /* Actual convolution process starts here */ 00281 blkCnt = (srcALen + srcBLen - 1u) >> 2; 00282 00283 while(blkCnt > 0) 00284 { 00285 /* Initialze temporary scratch pointer as scratch1 */ 00286 pScr1 = pScratch1; 00287 00288 /* Clear Accumlators */ 00289 acc0 = 0; 00290 acc1 = 0; 00291 acc2 = 0; 00292 acc3 = 0; 00293 00294 /* Read two samples from scratch1 buffer */ 00295 x1 = *__SIMD32(pScr1)++; 00296 00297 /* Read next two samples from scratch1 buffer */ 00298 x2 = *__SIMD32(pScr1)++; 00299 00300 tapCnt = (srcBLen) >> 2u; 00301 00302 while(tapCnt > 0u) 00303 { 00304 00305 #ifndef UNALIGNED_SUPPORT_DISABLE 00306 00307 /* Read four samples from smaller buffer */ 00308 y1 = _SIMD32_OFFSET(pIn2); 00309 y2 = _SIMD32_OFFSET(pIn2 + 2u); 00310 00311 /* multiply and accumlate */ 00312 acc0 = __SMLAD(x1, y1, acc0); 00313 acc2 = __SMLAD(x2, y1, acc2); 00314 00315 /* pack input data */ 00316 #ifndef ARM_MATH_BIG_ENDIAN 00317 x3 = __PKHBT(x2, x1, 0); 00318 #else 00319 x3 = __PKHBT(x1, x2, 0); 00320 #endif 00321 00322 /* multiply and accumlate */ 00323 acc1 = __SMLADX(x3, y1, acc1); 00324 00325 /* Read next two samples from scratch1 buffer */ 00326 x1 = _SIMD32_OFFSET(pScr1); 00327 00328 /* multiply and accumlate */ 00329 acc0 = __SMLAD(x2, y2, acc0); 00330 acc2 = __SMLAD(x1, y2, acc2); 00331 00332 /* pack input data */ 00333 #ifndef ARM_MATH_BIG_ENDIAN 00334 x3 = __PKHBT(x1, x2, 0); 00335 #else 00336 x3 = __PKHBT(x2, x1, 0); 00337 #endif 00338 00339 acc3 = __SMLADX(x3, y1, acc3); 00340 acc1 = __SMLADX(x3, y2, acc1); 00341 00342 x2 = _SIMD32_OFFSET(pScr1 + 2u); 00343 00344 #ifndef ARM_MATH_BIG_ENDIAN 00345 x3 = __PKHBT(x2, x1, 0); 00346 #else 00347 x3 = __PKHBT(x1, x2, 0); 00348 #endif 00349 00350 acc3 = __SMLADX(x3, y2, acc3); 00351 00352 #else 00353 00354 /* Read four samples from smaller buffer */ 00355 a = *pIn2; 00356 b = *(pIn2 + 1); 00357 00358 #ifndef ARM_MATH_BIG_ENDIAN 00359 y1 = __PKHBT(a, b, 16); 00360 #else 00361 y1 = __PKHBT(b, a, 16); 00362 #endif 00363 00364 a = *(pIn2 + 2); 00365 b = *(pIn2 + 3); 00366 #ifndef ARM_MATH_BIG_ENDIAN 00367 y2 = __PKHBT(a, b, 16); 00368 #else 00369 y2 = __PKHBT(b, a, 16); 00370 #endif 00371 00372 acc0 = __SMLAD(x1, y1, acc0); 00373 00374 acc2 = __SMLAD(x2, y1, acc2); 00375 00376 #ifndef ARM_MATH_BIG_ENDIAN 00377 x3 = __PKHBT(x2, x1, 0); 00378 #else 00379 x3 = __PKHBT(x1, x2, 0); 00380 #endif 00381 00382 acc1 = __SMLADX(x3, y1, acc1); 00383 00384 a = *pScr1; 00385 b = *(pScr1 + 1); 00386 00387 #ifndef ARM_MATH_BIG_ENDIAN 00388 x1 = __PKHBT(a, b, 16); 00389 #else 00390 x1 = __PKHBT(b, a, 16); 00391 #endif 00392 00393 acc0 = __SMLAD(x2, y2, acc0); 00394 00395 acc2 = __SMLAD(x1, y2, acc2); 00396 00397 #ifndef ARM_MATH_BIG_ENDIAN 00398 x3 = __PKHBT(x1, x2, 0); 00399 #else 00400 x3 = __PKHBT(x2, x1, 0); 00401 #endif 00402 00403 acc3 = __SMLADX(x3, y1, acc3); 00404 00405 acc1 = __SMLADX(x3, y2, acc1); 00406 00407 a = *(pScr1 + 2); 00408 b = *(pScr1 + 3); 00409 00410 #ifndef ARM_MATH_BIG_ENDIAN 00411 x2 = __PKHBT(a, b, 16); 00412 #else 00413 x2 = __PKHBT(b, a, 16); 00414 #endif 00415 00416 #ifndef ARM_MATH_BIG_ENDIAN 00417 x3 = __PKHBT(x2, x1, 0); 00418 #else 00419 x3 = __PKHBT(x1, x2, 0); 00420 #endif 00421 00422 acc3 = __SMLADX(x3, y2, acc3); 00423 00424 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00425 00426 /* update scratch pointers */ 00427 pIn2 += 4u; 00428 pScr1 += 4u; 00429 00430 00431 /* Decrement the loop counter */ 00432 tapCnt--; 00433 } 00434 00435 /* Update scratch pointer for remaining samples of smaller length sequence */ 00436 pScr1 -= 4u; 00437 00438 /* apply same above for remaining samples of smaller length sequence */ 00439 tapCnt = (srcBLen) & 3u; 00440 00441 while(tapCnt > 0u) 00442 { 00443 00444 /* accumlate the results */ 00445 acc0 += (*pScr1++ * *pIn2); 00446 acc1 += (*pScr1++ * *pIn2); 00447 acc2 += (*pScr1++ * *pIn2); 00448 acc3 += (*pScr1++ * *pIn2++); 00449 00450 pScr1 -= 3u; 00451 00452 /* Decrement the loop counter */ 00453 tapCnt--; 00454 } 00455 00456 blkCnt--; 00457 00458 00459 /* Store the results in the accumulators in the destination buffer. */ 00460 00461 #ifndef ARM_MATH_BIG_ENDIAN 00462 00463 *__SIMD32(pOut)++ = 00464 __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); 00465 00466 *__SIMD32(pOut)++ = 00467 __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); 00468 00469 00470 #else 00471 00472 *__SIMD32(pOut)++ = 00473 __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16); 00474 00475 *__SIMD32(pOut)++ = 00476 __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16); 00477 00478 00479 00480 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00481 00482 /* Initialization of inputB pointer */ 00483 pIn2 = py; 00484 00485 pScratch1 += 4u; 00486 00487 } 00488 00489 00490 blkCnt = (srcALen + srcBLen - 1u) & 0x3; 00491 00492 /* Calculate convolution for remaining samples of Bigger length sequence */ 00493 while(blkCnt > 0) 00494 { 00495 /* Initialze temporary scratch pointer as scratch1 */ 00496 pScr1 = pScratch1; 00497 00498 /* Clear Accumlators */ 00499 acc0 = 0; 00500 00501 tapCnt = (srcBLen) >> 1u; 00502 00503 while(tapCnt > 0u) 00504 { 00505 00506 acc0 += (*pScr1++ * *pIn2++); 00507 acc0 += (*pScr1++ * *pIn2++); 00508 00509 /* Decrement the loop counter */ 00510 tapCnt--; 00511 } 00512 00513 tapCnt = (srcBLen) & 1u; 00514 00515 /* apply same above for remaining samples of smaller length sequence */ 00516 while(tapCnt > 0u) 00517 { 00518 00519 /* accumlate the results */ 00520 acc0 += (*pScr1++ * *pIn2++); 00521 00522 /* Decrement the loop counter */ 00523 tapCnt--; 00524 } 00525 00526 blkCnt--; 00527 00528 /* The result is in 2.30 format. Convert to 1.15 with saturation. 00529 ** Then store the output in the destination buffer. */ 00530 *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16)); 00531 00532 /* Initialization of inputB pointer */ 00533 pIn2 = py; 00534 00535 pScratch1 += 1u; 00536 00537 } 00538 00539 } 00540 00541 /** 00542 * @} end of Conv group 00543 */
Generated on Tue Jul 12 2022 12:36:54 by 1.7.2