CMSIS DSP library
Dependents: KL25Z_FFT_Demo Hat_Board_v5_1 KL25Z_FFT_Demo_tony KL25Z_FFT_Demo_tony ... more
Fork of mbed-dsp by
arm_conv_opt_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 17. January 2013 00005 * $Revision: V1.4.1 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_opt_q15.c 00009 * 00010 * Description: Convolution of Q15 sequences. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupFilters 00045 */ 00046 00047 /** 00048 * @addtogroup Conv 00049 * @{ 00050 */ 00051 00052 /** 00053 * @brief Convolution of Q15 sequences. 00054 * @param[in] *pSrcA points to the first input sequence. 00055 * @param[in] srcALen length of the first input sequence. 00056 * @param[in] *pSrcB points to the second input sequence. 00057 * @param[in] srcBLen length of the second input sequence. 00058 * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. 00059 * @param[in] *pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. 00060 * @param[in] *pScratch2 points to scratch buffer of size min(srcALen, srcBLen). 00061 * @return none. 00062 * 00063 * \par Restrictions 00064 * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE 00065 * In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit 00066 * 00067 * 00068 * @details 00069 * <b>Scaling and Overflow Behavior:</b> 00070 * 00071 * \par 00072 * The function is implemented using a 64-bit internal accumulator. 00073 * Both inputs are in 1.15 format and multiplications yield a 2.30 result. 00074 * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format. 00075 * This approach provides 33 guard bits and there is no risk of overflow. 00076 * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format. 00077 * 00078 * 00079 * \par 00080 * Refer to <code>arm_conv_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4. 00081 * 00082 * 00083 */ 00084 00085 void arm_conv_opt_q15( 00086 q15_t * pSrcA, 00087 uint32_t srcALen, 00088 q15_t * pSrcB, 00089 uint32_t srcBLen, 00090 q15_t * pDst, 00091 q15_t * pScratch1, 00092 q15_t * pScratch2) 00093 { 00094 q63_t acc0, acc1, acc2, acc3; /* Accumulator */ 00095 q31_t x1, x2, x3; /* Temporary variables to hold state and coefficient values */ 00096 q31_t y1, y2; /* State variables */ 00097 q15_t *pOut = pDst; /* output pointer */ 00098 q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */ 00099 q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */ 00100 q15_t *pIn1; /* inputA pointer */ 00101 q15_t *pIn2; /* inputB pointer */ 00102 q15_t *px; /* Intermediate inputA pointer */ 00103 q15_t *py; /* Intermediate inputB pointer */ 00104 uint32_t j, k, blkCnt; /* loop counter */ 00105 uint32_t tapCnt; /* loop count */ 00106 #ifdef UNALIGNED_SUPPORT_DISABLE 00107 00108 q15_t a, b; 00109 00110 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00111 00112 /* The algorithm implementation is based on the lengths of the inputs. */ 00113 /* srcB is always made to slide across srcA. */ 00114 /* So srcBLen is always considered as shorter or equal to srcALen */ 00115 if(srcALen >= srcBLen) 00116 { 00117 /* Initialization of inputA pointer */ 00118 pIn1 = pSrcA; 00119 00120 /* Initialization of inputB pointer */ 00121 pIn2 = pSrcB; 00122 00123 } 00124 else 00125 { 00126 /* Initialization of inputA pointer */ 00127 pIn1 = pSrcB; 00128 00129 /* Initialization of inputB pointer */ 00130 pIn2 = pSrcA; 00131 00132 /* srcBLen is always considered as shorter or equal to srcALen */ 00133 j = srcBLen; 00134 srcBLen = srcALen; 00135 srcALen = j; 00136 } 00137 00138 /* pointer to take end of scratch2 buffer */ 00139 pScr2 = pScratch2 + srcBLen - 1; 00140 00141 /* points to smaller length sequence */ 00142 px = pIn2; 00143 00144 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00145 k = srcBLen >> 2u; 00146 00147 /* First part of the processing with loop unrolling copies 4 data points at a time. 00148 ** a second loop below copies for the remaining 1 to 3 samples. */ 00149 /* Copy smaller length input sequence in reverse order into second scratch buffer */ 00150 while(k > 0u) 00151 { 00152 /* copy second buffer in reversal manner */ 00153 *pScr2-- = *px++; 00154 *pScr2-- = *px++; 00155 *pScr2-- = *px++; 00156 *pScr2-- = *px++; 00157 00158 /* Decrement the loop counter */ 00159 k--; 00160 } 00161 00162 /* If the count is not a multiple of 4, copy remaining samples here. 00163 ** No loop unrolling is used. */ 00164 k = srcBLen % 0x4u; 00165 00166 while(k > 0u) 00167 { 00168 /* copy second buffer in reversal manner for remaining samples */ 00169 *pScr2-- = *px++; 00170 00171 /* Decrement the loop counter */ 00172 k--; 00173 } 00174 00175 /* Initialze temporary scratch pointer */ 00176 pScr1 = pScratch1; 00177 00178 /* Assuming scratch1 buffer is aligned by 32-bit */ 00179 /* Fill (srcBLen - 1u) zeros in scratch buffer */ 00180 arm_fill_q15(0, pScr1, (srcBLen - 1u)); 00181 00182 /* Update temporary scratch pointer */ 00183 pScr1 += (srcBLen - 1u); 00184 00185 /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */ 00186 00187 #ifndef UNALIGNED_SUPPORT_DISABLE 00188 00189 /* Copy (srcALen) samples in scratch buffer */ 00190 arm_copy_q15(pIn1, pScr1, srcALen); 00191 00192 /* Update pointers */ 00193 pScr1 += srcALen; 00194 00195 #else 00196 00197 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00198 k = srcALen >> 2u; 00199 00200 /* First part of the processing with loop unrolling copies 4 data points at a time. 00201 ** a second loop below copies for the remaining 1 to 3 samples. */ 00202 while(k > 0u) 00203 { 00204 /* copy second buffer in reversal manner */ 00205 *pScr1++ = *pIn1++; 00206 *pScr1++ = *pIn1++; 00207 *pScr1++ = *pIn1++; 00208 *pScr1++ = *pIn1++; 00209 00210 /* Decrement the loop counter */ 00211 k--; 00212 } 00213 00214 /* If the count is not a multiple of 4, copy remaining samples here. 00215 ** No loop unrolling is used. */ 00216 k = srcALen % 0x4u; 00217 00218 while(k > 0u) 00219 { 00220 /* copy second buffer in reversal manner for remaining samples */ 00221 *pScr1++ = *pIn1++; 00222 00223 /* Decrement the loop counter */ 00224 k--; 00225 } 00226 00227 #endif 00228 00229 00230 #ifndef UNALIGNED_SUPPORT_DISABLE 00231 00232 /* Fill (srcBLen - 1u) zeros at end of scratch buffer */ 00233 arm_fill_q15(0, pScr1, (srcBLen - 1u)); 00234 00235 /* Update pointer */ 00236 pScr1 += (srcBLen - 1u); 00237 00238 #else 00239 00240 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00241 k = (srcBLen - 1u) >> 2u; 00242 00243 /* First part of the processing with loop unrolling copies 4 data points at a time. 00244 ** a second loop below copies for the remaining 1 to 3 samples. */ 00245 while(k > 0u) 00246 { 00247 /* copy second buffer in reversal manner */ 00248 *pScr1++ = 0; 00249 *pScr1++ = 0; 00250 *pScr1++ = 0; 00251 *pScr1++ = 0; 00252 00253 /* Decrement the loop counter */ 00254 k--; 00255 } 00256 00257 /* If the count is not a multiple of 4, copy remaining samples here. 00258 ** No loop unrolling is used. */ 00259 k = (srcBLen - 1u) % 0x4u; 00260 00261 while(k > 0u) 00262 { 00263 /* copy second buffer in reversal manner for remaining samples */ 00264 *pScr1++ = 0; 00265 00266 /* Decrement the loop counter */ 00267 k--; 00268 } 00269 00270 #endif 00271 00272 /* Temporary pointer for scratch2 */ 00273 py = pScratch2; 00274 00275 00276 /* Initialization of pIn2 pointer */ 00277 pIn2 = py; 00278 00279 /* First part of the processing with loop unrolling process 4 data points at a time. 00280 ** a second loop below process for the remaining 1 to 3 samples. */ 00281 00282 /* Actual convolution process starts here */ 00283 blkCnt = (srcALen + srcBLen - 1u) >> 2; 00284 00285 while(blkCnt > 0) 00286 { 00287 /* Initialze temporary scratch pointer as scratch1 */ 00288 pScr1 = pScratch1; 00289 00290 /* Clear Accumlators */ 00291 acc0 = 0; 00292 acc1 = 0; 00293 acc2 = 0; 00294 acc3 = 0; 00295 00296 /* Read two samples from scratch1 buffer */ 00297 x1 = *__SIMD32(pScr1)++; 00298 00299 /* Read next two samples from scratch1 buffer */ 00300 x2 = *__SIMD32(pScr1)++; 00301 00302 tapCnt = (srcBLen) >> 2u; 00303 00304 while(tapCnt > 0u) 00305 { 00306 00307 #ifndef UNALIGNED_SUPPORT_DISABLE 00308 00309 /* Read four samples from smaller buffer */ 00310 y1 = _SIMD32_OFFSET(pIn2); 00311 y2 = _SIMD32_OFFSET(pIn2 + 2u); 00312 00313 /* multiply and accumlate */ 00314 acc0 = __SMLALD(x1, y1, acc0); 00315 acc2 = __SMLALD(x2, y1, acc2); 00316 00317 /* pack input data */ 00318 #ifndef ARM_MATH_BIG_ENDIAN 00319 x3 = __PKHBT(x2, x1, 0); 00320 #else 00321 x3 = __PKHBT(x1, x2, 0); 00322 #endif 00323 00324 /* multiply and accumlate */ 00325 acc1 = __SMLALDX(x3, y1, acc1); 00326 00327 /* Read next two samples from scratch1 buffer */ 00328 x1 = _SIMD32_OFFSET(pScr1); 00329 00330 /* multiply and accumlate */ 00331 acc0 = __SMLALD(x2, y2, acc0); 00332 acc2 = __SMLALD(x1, y2, acc2); 00333 00334 /* pack input data */ 00335 #ifndef ARM_MATH_BIG_ENDIAN 00336 x3 = __PKHBT(x1, x2, 0); 00337 #else 00338 x3 = __PKHBT(x2, x1, 0); 00339 #endif 00340 00341 acc3 = __SMLALDX(x3, y1, acc3); 00342 acc1 = __SMLALDX(x3, y2, acc1); 00343 00344 x2 = _SIMD32_OFFSET(pScr1 + 2u); 00345 00346 #ifndef ARM_MATH_BIG_ENDIAN 00347 x3 = __PKHBT(x2, x1, 0); 00348 #else 00349 x3 = __PKHBT(x1, x2, 0); 00350 #endif 00351 00352 acc3 = __SMLALDX(x3, y2, acc3); 00353 00354 #else 00355 00356 /* Read four samples from smaller buffer */ 00357 a = *pIn2; 00358 b = *(pIn2 + 1); 00359 00360 #ifndef ARM_MATH_BIG_ENDIAN 00361 y1 = __PKHBT(a, b, 16); 00362 #else 00363 y1 = __PKHBT(b, a, 16); 00364 #endif 00365 00366 a = *(pIn2 + 2); 00367 b = *(pIn2 + 3); 00368 #ifndef ARM_MATH_BIG_ENDIAN 00369 y2 = __PKHBT(a, b, 16); 00370 #else 00371 y2 = __PKHBT(b, a, 16); 00372 #endif 00373 00374 acc0 = __SMLALD(x1, y1, acc0); 00375 00376 acc2 = __SMLALD(x2, y1, acc2); 00377 00378 #ifndef ARM_MATH_BIG_ENDIAN 00379 x3 = __PKHBT(x2, x1, 0); 00380 #else 00381 x3 = __PKHBT(x1, x2, 0); 00382 #endif 00383 00384 acc1 = __SMLALDX(x3, y1, acc1); 00385 00386 a = *pScr1; 00387 b = *(pScr1 + 1); 00388 00389 #ifndef ARM_MATH_BIG_ENDIAN 00390 x1 = __PKHBT(a, b, 16); 00391 #else 00392 x1 = __PKHBT(b, a, 16); 00393 #endif 00394 00395 acc0 = __SMLALD(x2, y2, acc0); 00396 00397 acc2 = __SMLALD(x1, y2, acc2); 00398 00399 #ifndef ARM_MATH_BIG_ENDIAN 00400 x3 = __PKHBT(x1, x2, 0); 00401 #else 00402 x3 = __PKHBT(x2, x1, 0); 00403 #endif 00404 00405 acc3 = __SMLALDX(x3, y1, acc3); 00406 00407 acc1 = __SMLALDX(x3, y2, acc1); 00408 00409 a = *(pScr1 + 2); 00410 b = *(pScr1 + 3); 00411 00412 #ifndef ARM_MATH_BIG_ENDIAN 00413 x2 = __PKHBT(a, b, 16); 00414 #else 00415 x2 = __PKHBT(b, a, 16); 00416 #endif 00417 00418 #ifndef ARM_MATH_BIG_ENDIAN 00419 x3 = __PKHBT(x2, x1, 0); 00420 #else 00421 x3 = __PKHBT(x1, x2, 0); 00422 #endif 00423 00424 acc3 = __SMLALDX(x3, y2, acc3); 00425 00426 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00427 00428 pIn2 += 4u; 00429 pScr1 += 4u; 00430 00431 00432 /* Decrement the loop counter */ 00433 tapCnt--; 00434 } 00435 00436 /* Update scratch pointer for remaining samples of smaller length sequence */ 00437 pScr1 -= 4u; 00438 00439 /* apply same above for remaining samples of smaller length sequence */ 00440 tapCnt = (srcBLen) & 3u; 00441 00442 while(tapCnt > 0u) 00443 { 00444 00445 /* accumlate the results */ 00446 acc0 += (*pScr1++ * *pIn2); 00447 acc1 += (*pScr1++ * *pIn2); 00448 acc2 += (*pScr1++ * *pIn2); 00449 acc3 += (*pScr1++ * *pIn2++); 00450 00451 pScr1 -= 3u; 00452 00453 /* Decrement the loop counter */ 00454 tapCnt--; 00455 } 00456 00457 blkCnt--; 00458 00459 00460 /* Store the results in the accumulators in the destination buffer. */ 00461 00462 #ifndef ARM_MATH_BIG_ENDIAN 00463 00464 *__SIMD32(pOut)++ = 00465 __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); 00466 00467 *__SIMD32(pOut)++ = 00468 __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); 00469 00470 #else 00471 00472 *__SIMD32(pOut)++ = 00473 __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16); 00474 00475 *__SIMD32(pOut)++ = 00476 __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16); 00477 00478 00479 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00480 00481 /* Initialization of inputB pointer */ 00482 pIn2 = py; 00483 00484 pScratch1 += 4u; 00485 00486 } 00487 00488 00489 blkCnt = (srcALen + srcBLen - 1u) & 0x3; 00490 00491 /* Calculate convolution for remaining samples of Bigger length sequence */ 00492 while(blkCnt > 0) 00493 { 00494 /* Initialze temporary scratch pointer as scratch1 */ 00495 pScr1 = pScratch1; 00496 00497 /* Clear Accumlators */ 00498 acc0 = 0; 00499 00500 tapCnt = (srcBLen) >> 1u; 00501 00502 while(tapCnt > 0u) 00503 { 00504 00505 /* Read next two samples from scratch1 buffer */ 00506 acc0 += (*pScr1++ * *pIn2++); 00507 acc0 += (*pScr1++ * *pIn2++); 00508 00509 /* Decrement the loop counter */ 00510 tapCnt--; 00511 } 00512 00513 tapCnt = (srcBLen) & 1u; 00514 00515 /* apply same above for remaining samples of smaller length sequence */ 00516 while(tapCnt > 0u) 00517 { 00518 00519 /* accumlate the results */ 00520 acc0 += (*pScr1++ * *pIn2++); 00521 00522 /* Decrement the loop counter */ 00523 tapCnt--; 00524 } 00525 00526 blkCnt--; 00527 00528 /* The result is in 2.30 format. Convert to 1.15 with saturation. 00529 ** Then store the output in the destination buffer. */ 00530 *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16)); 00531 00532 00533 /* Initialization of inputB pointer */ 00534 pIn2 = py; 00535 00536 pScratch1 += 1u; 00537 00538 } 00539 00540 } 00541 00542 00543 /** 00544 * @} end of Conv group 00545 */
Generated on Tue Jul 12 2022 12:36:54 by 1.7.2