CMSIS DSP library
Dependents: KL25Z_FFT_Demo Hat_Board_v5_1 KL25Z_FFT_Demo_tony KL25Z_FFT_Demo_tony ... more
Fork of mbed-dsp by
arm_correlate_opt_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 17. January 2013 00005 * $Revision: V1.4.1 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_correlate_opt_q15.c 00009 * 00010 * Description: Correlation of Q15 sequences. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupFilters 00045 */ 00046 00047 /** 00048 * @addtogroup Corr 00049 * @{ 00050 */ 00051 00052 /** 00053 * @brief Correlation of Q15 sequences. 00054 * @param[in] *pSrcA points to the first input sequence. 00055 * @param[in] srcALen length of the first input sequence. 00056 * @param[in] *pSrcB points to the second input sequence. 00057 * @param[in] srcBLen length of the second input sequence. 00058 * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1. 00059 * @param[in] *pScratch points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. 00060 * @return none. 00061 * 00062 * \par Restrictions 00063 * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE 00064 * In this case input, output, scratch buffers should be aligned by 32-bit 00065 * 00066 * @details 00067 * <b>Scaling and Overflow Behavior:</b> 00068 * 00069 * \par 00070 * The function is implemented using a 64-bit internal accumulator. 00071 * Both inputs are in 1.15 format and multiplications yield a 2.30 result. 00072 * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format. 00073 * This approach provides 33 guard bits and there is no risk of overflow. 00074 * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format. 00075 * 00076 * \par 00077 * Refer to <code>arm_correlate_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4. 00078 * 00079 * 00080 */ 00081 00082 00083 void arm_correlate_opt_q15( 00084 q15_t * pSrcA, 00085 uint32_t srcALen, 00086 q15_t * pSrcB, 00087 uint32_t srcBLen, 00088 q15_t * pDst, 00089 q15_t * pScratch) 00090 { 00091 q15_t *pIn1; /* inputA pointer */ 00092 q15_t *pIn2; /* inputB pointer */ 00093 q63_t acc0, acc1, acc2, acc3; /* Accumulators */ 00094 q15_t *py; /* Intermediate inputB pointer */ 00095 q31_t x1, x2, x3; /* temporary variables for holding input1 and input2 values */ 00096 uint32_t j, blkCnt, outBlockSize; /* loop counter */ 00097 int32_t inc = 1; /* output pointer increment */ 00098 uint32_t tapCnt; 00099 q31_t y1, y2; 00100 q15_t *pScr; /* Intermediate pointers */ 00101 q15_t *pOut = pDst; /* output pointer */ 00102 #ifdef UNALIGNED_SUPPORT_DISABLE 00103 00104 q15_t a, b; 00105 00106 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00107 00108 /* The algorithm implementation is based on the lengths of the inputs. */ 00109 /* srcB is always made to slide across srcA. */ 00110 /* So srcBLen is always considered as shorter or equal to srcALen */ 00111 /* But CORR(x, y) is reverse of CORR(y, x) */ 00112 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ 00113 /* and the destination pointer modifier, inc is set to -1 */ 00114 /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ 00115 /* But to improve the performance, 00116 * we include zeroes in the output instead of zero padding either of the the inputs*/ 00117 /* If srcALen > srcBLen, 00118 * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */ 00119 /* If srcALen < srcBLen, 00120 * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */ 00121 if(srcALen >= srcBLen) 00122 { 00123 /* Initialization of inputA pointer */ 00124 pIn1 = (pSrcA); 00125 00126 /* Initialization of inputB pointer */ 00127 pIn2 = (pSrcB); 00128 00129 /* Number of output samples is calculated */ 00130 outBlockSize = (2u * srcALen) - 1u; 00131 00132 /* When srcALen > srcBLen, zero padding is done to srcB 00133 * to make their lengths equal. 00134 * Instead, (outBlockSize - (srcALen + srcBLen - 1)) 00135 * number of output samples are made zero */ 00136 j = outBlockSize - (srcALen + (srcBLen - 1u)); 00137 00138 /* Updating the pointer position to non zero value */ 00139 pOut += j; 00140 00141 } 00142 else 00143 { 00144 /* Initialization of inputA pointer */ 00145 pIn1 = (pSrcB); 00146 00147 /* Initialization of inputB pointer */ 00148 pIn2 = (pSrcA); 00149 00150 /* srcBLen is always considered as shorter or equal to srcALen */ 00151 j = srcBLen; 00152 srcBLen = srcALen; 00153 srcALen = j; 00154 00155 /* CORR(x, y) = Reverse order(CORR(y, x)) */ 00156 /* Hence set the destination pointer to point to the last output sample */ 00157 pOut = pDst + ((srcALen + srcBLen) - 2u); 00158 00159 /* Destination address modifier is set to -1 */ 00160 inc = -1; 00161 00162 } 00163 00164 pScr = pScratch; 00165 00166 /* Fill (srcBLen - 1u) zeros in scratch buffer */ 00167 arm_fill_q15(0, pScr, (srcBLen - 1u)); 00168 00169 /* Update temporary scratch pointer */ 00170 pScr += (srcBLen - 1u); 00171 00172 #ifndef UNALIGNED_SUPPORT_DISABLE 00173 00174 /* Copy (srcALen) samples in scratch buffer */ 00175 arm_copy_q15(pIn1, pScr, srcALen); 00176 00177 /* Update pointers */ 00178 //pIn1 += srcALen; 00179 pScr += srcALen; 00180 00181 #else 00182 00183 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00184 j = srcALen >> 2u; 00185 00186 /* First part of the processing with loop unrolling copies 4 data points at a time. 00187 ** a second loop below copies for the remaining 1 to 3 samples. */ 00188 while(j > 0u) 00189 { 00190 /* copy second buffer in reversal manner */ 00191 *pScr++ = *pIn1++; 00192 *pScr++ = *pIn1++; 00193 *pScr++ = *pIn1++; 00194 *pScr++ = *pIn1++; 00195 00196 /* Decrement the loop counter */ 00197 j--; 00198 } 00199 00200 /* If the count is not a multiple of 4, copy remaining samples here. 00201 ** No loop unrolling is used. */ 00202 j = srcALen % 0x4u; 00203 00204 while(j > 0u) 00205 { 00206 /* copy second buffer in reversal manner for remaining samples */ 00207 *pScr++ = *pIn1++; 00208 00209 /* Decrement the loop counter */ 00210 j--; 00211 } 00212 00213 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00214 00215 #ifndef UNALIGNED_SUPPORT_DISABLE 00216 00217 /* Fill (srcBLen - 1u) zeros at end of scratch buffer */ 00218 arm_fill_q15(0, pScr, (srcBLen - 1u)); 00219 00220 /* Update pointer */ 00221 pScr += (srcBLen - 1u); 00222 00223 #else 00224 00225 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00226 j = (srcBLen - 1u) >> 2u; 00227 00228 /* First part of the processing with loop unrolling copies 4 data points at a time. 00229 ** a second loop below copies for the remaining 1 to 3 samples. */ 00230 while(j > 0u) 00231 { 00232 /* copy second buffer in reversal manner */ 00233 *pScr++ = 0; 00234 *pScr++ = 0; 00235 *pScr++ = 0; 00236 *pScr++ = 0; 00237 00238 /* Decrement the loop counter */ 00239 j--; 00240 } 00241 00242 /* If the count is not a multiple of 4, copy remaining samples here. 00243 ** No loop unrolling is used. */ 00244 j = (srcBLen - 1u) % 0x4u; 00245 00246 while(j > 0u) 00247 { 00248 /* copy second buffer in reversal manner for remaining samples */ 00249 *pScr++ = 0; 00250 00251 /* Decrement the loop counter */ 00252 j--; 00253 } 00254 00255 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00256 00257 /* Temporary pointer for scratch2 */ 00258 py = pIn2; 00259 00260 00261 /* Actual correlation process starts here */ 00262 blkCnt = (srcALen + srcBLen - 1u) >> 2; 00263 00264 while(blkCnt > 0) 00265 { 00266 /* Initialze temporary scratch pointer as scratch1 */ 00267 pScr = pScratch; 00268 00269 /* Clear Accumlators */ 00270 acc0 = 0; 00271 acc1 = 0; 00272 acc2 = 0; 00273 acc3 = 0; 00274 00275 /* Read four samples from scratch1 buffer */ 00276 x1 = *__SIMD32(pScr)++; 00277 00278 /* Read next four samples from scratch1 buffer */ 00279 x2 = *__SIMD32(pScr)++; 00280 00281 tapCnt = (srcBLen) >> 2u; 00282 00283 while(tapCnt > 0u) 00284 { 00285 00286 #ifndef UNALIGNED_SUPPORT_DISABLE 00287 00288 /* Read four samples from smaller buffer */ 00289 y1 = _SIMD32_OFFSET(pIn2); 00290 y2 = _SIMD32_OFFSET(pIn2 + 2u); 00291 00292 acc0 = __SMLALD(x1, y1, acc0); 00293 00294 acc2 = __SMLALD(x2, y1, acc2); 00295 00296 #ifndef ARM_MATH_BIG_ENDIAN 00297 x3 = __PKHBT(x2, x1, 0); 00298 #else 00299 x3 = __PKHBT(x1, x2, 0); 00300 #endif 00301 00302 acc1 = __SMLALDX(x3, y1, acc1); 00303 00304 x1 = _SIMD32_OFFSET(pScr); 00305 00306 acc0 = __SMLALD(x2, y2, acc0); 00307 00308 acc2 = __SMLALD(x1, y2, acc2); 00309 00310 #ifndef ARM_MATH_BIG_ENDIAN 00311 x3 = __PKHBT(x1, x2, 0); 00312 #else 00313 x3 = __PKHBT(x2, x1, 0); 00314 #endif 00315 00316 acc3 = __SMLALDX(x3, y1, acc3); 00317 00318 acc1 = __SMLALDX(x3, y2, acc1); 00319 00320 x2 = _SIMD32_OFFSET(pScr + 2u); 00321 00322 #ifndef ARM_MATH_BIG_ENDIAN 00323 x3 = __PKHBT(x2, x1, 0); 00324 #else 00325 x3 = __PKHBT(x1, x2, 0); 00326 #endif 00327 00328 acc3 = __SMLALDX(x3, y2, acc3); 00329 00330 #else 00331 00332 /* Read four samples from smaller buffer */ 00333 a = *pIn2; 00334 b = *(pIn2 + 1); 00335 00336 #ifndef ARM_MATH_BIG_ENDIAN 00337 y1 = __PKHBT(a, b, 16); 00338 #else 00339 y1 = __PKHBT(b, a, 16); 00340 #endif 00341 00342 a = *(pIn2 + 2); 00343 b = *(pIn2 + 3); 00344 #ifndef ARM_MATH_BIG_ENDIAN 00345 y2 = __PKHBT(a, b, 16); 00346 #else 00347 y2 = __PKHBT(b, a, 16); 00348 #endif 00349 00350 acc0 = __SMLALD(x1, y1, acc0); 00351 00352 acc2 = __SMLALD(x2, y1, acc2); 00353 00354 #ifndef ARM_MATH_BIG_ENDIAN 00355 x3 = __PKHBT(x2, x1, 0); 00356 #else 00357 x3 = __PKHBT(x1, x2, 0); 00358 #endif 00359 00360 acc1 = __SMLALDX(x3, y1, acc1); 00361 00362 a = *pScr; 00363 b = *(pScr + 1); 00364 00365 #ifndef ARM_MATH_BIG_ENDIAN 00366 x1 = __PKHBT(a, b, 16); 00367 #else 00368 x1 = __PKHBT(b, a, 16); 00369 #endif 00370 00371 acc0 = __SMLALD(x2, y2, acc0); 00372 00373 acc2 = __SMLALD(x1, y2, acc2); 00374 00375 #ifndef ARM_MATH_BIG_ENDIAN 00376 x3 = __PKHBT(x1, x2, 0); 00377 #else 00378 x3 = __PKHBT(x2, x1, 0); 00379 #endif 00380 00381 acc3 = __SMLALDX(x3, y1, acc3); 00382 00383 acc1 = __SMLALDX(x3, y2, acc1); 00384 00385 a = *(pScr + 2); 00386 b = *(pScr + 3); 00387 00388 #ifndef ARM_MATH_BIG_ENDIAN 00389 x2 = __PKHBT(a, b, 16); 00390 #else 00391 x2 = __PKHBT(b, a, 16); 00392 #endif 00393 00394 #ifndef ARM_MATH_BIG_ENDIAN 00395 x3 = __PKHBT(x2, x1, 0); 00396 #else 00397 x3 = __PKHBT(x1, x2, 0); 00398 #endif 00399 00400 acc3 = __SMLALDX(x3, y2, acc3); 00401 00402 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00403 00404 pIn2 += 4u; 00405 00406 pScr += 4u; 00407 00408 00409 /* Decrement the loop counter */ 00410 tapCnt--; 00411 } 00412 00413 00414 00415 /* Update scratch pointer for remaining samples of smaller length sequence */ 00416 pScr -= 4u; 00417 00418 00419 /* apply same above for remaining samples of smaller length sequence */ 00420 tapCnt = (srcBLen) & 3u; 00421 00422 while(tapCnt > 0u) 00423 { 00424 00425 /* accumlate the results */ 00426 acc0 += (*pScr++ * *pIn2); 00427 acc1 += (*pScr++ * *pIn2); 00428 acc2 += (*pScr++ * *pIn2); 00429 acc3 += (*pScr++ * *pIn2++); 00430 00431 pScr -= 3u; 00432 00433 /* Decrement the loop counter */ 00434 tapCnt--; 00435 } 00436 00437 blkCnt--; 00438 00439 00440 /* Store the results in the accumulators in the destination buffer. */ 00441 *pOut = (__SSAT(acc0 >> 15u, 16)); 00442 pOut += inc; 00443 *pOut = (__SSAT(acc1 >> 15u, 16)); 00444 pOut += inc; 00445 *pOut = (__SSAT(acc2 >> 15u, 16)); 00446 pOut += inc; 00447 *pOut = (__SSAT(acc3 >> 15u, 16)); 00448 pOut += inc; 00449 00450 /* Initialization of inputB pointer */ 00451 pIn2 = py; 00452 00453 pScratch += 4u; 00454 00455 } 00456 00457 00458 blkCnt = (srcALen + srcBLen - 1u) & 0x3; 00459 00460 /* Calculate correlation for remaining samples of Bigger length sequence */ 00461 while(blkCnt > 0) 00462 { 00463 /* Initialze temporary scratch pointer as scratch1 */ 00464 pScr = pScratch; 00465 00466 /* Clear Accumlators */ 00467 acc0 = 0; 00468 00469 tapCnt = (srcBLen) >> 1u; 00470 00471 while(tapCnt > 0u) 00472 { 00473 00474 acc0 += (*pScr++ * *pIn2++); 00475 acc0 += (*pScr++ * *pIn2++); 00476 00477 /* Decrement the loop counter */ 00478 tapCnt--; 00479 } 00480 00481 tapCnt = (srcBLen) & 1u; 00482 00483 /* apply same above for remaining samples of smaller length sequence */ 00484 while(tapCnt > 0u) 00485 { 00486 00487 /* accumlate the results */ 00488 acc0 += (*pScr++ * *pIn2++); 00489 00490 /* Decrement the loop counter */ 00491 tapCnt--; 00492 } 00493 00494 blkCnt--; 00495 00496 /* Store the result in the accumulator in the destination buffer. */ 00497 *pOut = (q15_t) (__SSAT((acc0 >> 15), 16)); 00498 00499 pOut += inc; 00500 00501 /* Initialization of inputB pointer */ 00502 pIn2 = py; 00503 00504 pScratch += 1u; 00505 00506 } 00507 00508 00509 } 00510 00511 /** 00512 * @} end of Corr group 00513 */
Generated on Tue Jul 12 2022 12:36:54 by 1.7.2