Added CMSIS5 DSP and NN folders. Needs some work.
Embed:
(wiki syntax)
Show/hide line numbers
arm_correlate_opt_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Project: CMSIS DSP Library 00003 * Title: arm_correlate_opt_q15.c 00004 * Description: Correlation of Q15 sequences 00005 * 00006 * $Date: 27. January 2017 00007 * $Revision: V.1.5.1 00008 * 00009 * Target Processor: Cortex-M cores 00010 * -------------------------------------------------------------------- */ 00011 /* 00012 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. 00013 * 00014 * SPDX-License-Identifier: Apache-2.0 00015 * 00016 * Licensed under the Apache License, Version 2.0 (the License); you may 00017 * not use this file except in compliance with the License. 00018 * You may obtain a copy of the License at 00019 * 00020 * www.apache.org/licenses/LICENSE-2.0 00021 * 00022 * Unless required by applicable law or agreed to in writing, software 00023 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 00024 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00025 * See the License for the specific language governing permissions and 00026 * limitations under the License. 00027 */ 00028 00029 #include "arm_math.h" 00030 00031 /** 00032 * @ingroup groupFilters 00033 */ 00034 00035 /** 00036 * @addtogroup Corr 00037 * @{ 00038 */ 00039 00040 /** 00041 * @brief Correlation of Q15 sequences. 00042 * @param[in] *pSrcA points to the first input sequence. 00043 * @param[in] srcALen length of the first input sequence. 00044 * @param[in] *pSrcB points to the second input sequence. 00045 * @param[in] srcBLen length of the second input sequence. 00046 * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1. 00047 * @param[in] *pScratch points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. 00048 * @return none. 
00049 * 00050 * \par Restrictions 00051 * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE 00052 * In this case input, output, scratch buffers should be aligned by 32-bit 00053 * 00054 * @details 00055 * <b>Scaling and Overflow Behavior:</b> 00056 * 00057 * \par 00058 * The function is implemented using a 64-bit internal accumulator. 00059 * Both inputs are in 1.15 format and multiplications yield a 2.30 result. 00060 * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format. 00061 * This approach provides 33 guard bits and there is no risk of overflow. 00062 * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format. 00063 * 00064 * \par 00065 * Refer to <code>arm_correlate_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4. 00066 * 00067 * 00068 */ 00069 00070 00071 void arm_correlate_opt_q15( 00072 q15_t * pSrcA, 00073 uint32_t srcALen, 00074 q15_t * pSrcB, 00075 uint32_t srcBLen, 00076 q15_t * pDst, 00077 q15_t * pScratch) 00078 { 00079 q15_t *pIn1; /* inputA pointer */ 00080 q15_t *pIn2; /* inputB pointer */ 00081 q63_t acc0, acc1, acc2, acc3; /* Accumulators */ 00082 q15_t *py; /* Intermediate inputB pointer */ 00083 q31_t x1, x2, x3; /* temporary variables for holding input1 and input2 values */ 00084 uint32_t j, blkCnt, outBlockSize; /* loop counter */ 00085 int32_t inc = 1; /* output pointer increment */ 00086 uint32_t tapCnt; 00087 q31_t y1, y2; 00088 q15_t *pScr; /* Intermediate pointers */ 00089 q15_t *pOut = pDst; /* output pointer */ 00090 #ifdef UNALIGNED_SUPPORT_DISABLE 00091 00092 q15_t a, b; 00093 00094 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00095 00096 /* The algorithm implementation is based on the lengths of the inputs. */ 00097 /* srcB is always made to slide across srcA. 
*/ 00098 /* So srcBLen is always considered as shorter or equal to srcALen */ 00099 /* But CORR(x, y) is reverse of CORR(y, x) */ 00100 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ 00101 /* and the destination pointer modifier, inc is set to -1 */ 00102 /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ 00103 /* But to improve the performance, 00104 * we include zeroes in the output instead of zero padding either of the the inputs*/ 00105 /* If srcALen > srcBLen, 00106 * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */ 00107 /* If srcALen < srcBLen, 00108 * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */ 00109 if (srcALen >= srcBLen) 00110 { 00111 /* Initialization of inputA pointer */ 00112 pIn1 = (pSrcA); 00113 00114 /* Initialization of inputB pointer */ 00115 pIn2 = (pSrcB); 00116 00117 /* Number of output samples is calculated */ 00118 outBlockSize = (2U * srcALen) - 1U; 00119 00120 /* When srcALen > srcBLen, zero padding is done to srcB 00121 * to make their lengths equal. 
00122 * Instead, (outBlockSize - (srcALen + srcBLen - 1)) 00123 * number of output samples are made zero */ 00124 j = outBlockSize - (srcALen + (srcBLen - 1U)); 00125 00126 /* Updating the pointer position to non zero value */ 00127 pOut += j; 00128 00129 } 00130 else 00131 { 00132 /* Initialization of inputA pointer */ 00133 pIn1 = (pSrcB); 00134 00135 /* Initialization of inputB pointer */ 00136 pIn2 = (pSrcA); 00137 00138 /* srcBLen is always considered as shorter or equal to srcALen */ 00139 j = srcBLen; 00140 srcBLen = srcALen; 00141 srcALen = j; 00142 00143 /* CORR(x, y) = Reverse order(CORR(y, x)) */ 00144 /* Hence set the destination pointer to point to the last output sample */ 00145 pOut = pDst + ((srcALen + srcBLen) - 2U); 00146 00147 /* Destination address modifier is set to -1 */ 00148 inc = -1; 00149 00150 } 00151 00152 pScr = pScratch; 00153 00154 /* Fill (srcBLen - 1U) zeros in scratch buffer */ 00155 arm_fill_q15(0, pScr, (srcBLen - 1U)); 00156 00157 /* Update temporary scratch pointer */ 00158 pScr += (srcBLen - 1U); 00159 00160 #ifndef UNALIGNED_SUPPORT_DISABLE 00161 00162 /* Copy (srcALen) samples in scratch buffer */ 00163 arm_copy_q15(pIn1, pScr, srcALen); 00164 00165 /* Update pointers */ 00166 //pIn1 += srcALen; 00167 pScr += srcALen; 00168 00169 #else 00170 00171 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00172 j = srcALen >> 2U; 00173 00174 /* First part of the processing with loop unrolling copies 4 data points at a time. 00175 ** a second loop below copies for the remaining 1 to 3 samples. */ 00176 while (j > 0U) 00177 { 00178 /* copy second buffer in reversal manner */ 00179 *pScr++ = *pIn1++; 00180 *pScr++ = *pIn1++; 00181 *pScr++ = *pIn1++; 00182 *pScr++ = *pIn1++; 00183 00184 /* Decrement the loop counter */ 00185 j--; 00186 } 00187 00188 /* If the count is not a multiple of 4, copy remaining samples here. 00189 ** No loop unrolling is used. 
*/ 00190 j = srcALen % 0x4U; 00191 00192 while (j > 0U) 00193 { 00194 /* copy second buffer in reversal manner for remaining samples */ 00195 *pScr++ = *pIn1++; 00196 00197 /* Decrement the loop counter */ 00198 j--; 00199 } 00200 00201 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00202 00203 #ifndef UNALIGNED_SUPPORT_DISABLE 00204 00205 /* Fill (srcBLen - 1U) zeros at end of scratch buffer */ 00206 arm_fill_q15(0, pScr, (srcBLen - 1U)); 00207 00208 /* Update pointer */ 00209 pScr += (srcBLen - 1U); 00210 00211 #else 00212 00213 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00214 j = (srcBLen - 1U) >> 2U; 00215 00216 /* First part of the processing with loop unrolling copies 4 data points at a time. 00217 ** a second loop below copies for the remaining 1 to 3 samples. */ 00218 while (j > 0U) 00219 { 00220 /* copy second buffer in reversal manner */ 00221 *pScr++ = 0; 00222 *pScr++ = 0; 00223 *pScr++ = 0; 00224 *pScr++ = 0; 00225 00226 /* Decrement the loop counter */ 00227 j--; 00228 } 00229 00230 /* If the count is not a multiple of 4, copy remaining samples here. 00231 ** No loop unrolling is used. 
*/ 00232 j = (srcBLen - 1U) % 0x4U; 00233 00234 while (j > 0U) 00235 { 00236 /* copy second buffer in reversal manner for remaining samples */ 00237 *pScr++ = 0; 00238 00239 /* Decrement the loop counter */ 00240 j--; 00241 } 00242 00243 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00244 00245 /* Temporary pointer for scratch2 */ 00246 py = pIn2; 00247 00248 00249 /* Actual correlation process starts here */ 00250 blkCnt = (srcALen + srcBLen - 1U) >> 2; 00251 00252 while (blkCnt > 0) 00253 { 00254 /* Initialze temporary scratch pointer as scratch1 */ 00255 pScr = pScratch; 00256 00257 /* Clear Accumlators */ 00258 acc0 = 0; 00259 acc1 = 0; 00260 acc2 = 0; 00261 acc3 = 0; 00262 00263 /* Read four samples from scratch1 buffer */ 00264 x1 = *__SIMD32(pScr)++; 00265 00266 /* Read next four samples from scratch1 buffer */ 00267 x2 = *__SIMD32(pScr)++; 00268 00269 tapCnt = (srcBLen) >> 2U; 00270 00271 while (tapCnt > 0U) 00272 { 00273 00274 #ifndef UNALIGNED_SUPPORT_DISABLE 00275 00276 /* Read four samples from smaller buffer */ 00277 y1 = _SIMD32_OFFSET(pIn2); 00278 y2 = _SIMD32_OFFSET(pIn2 + 2U); 00279 00280 acc0 = __SMLALD(x1, y1, acc0); 00281 00282 acc2 = __SMLALD(x2, y1, acc2); 00283 00284 #ifndef ARM_MATH_BIG_ENDIAN 00285 x3 = __PKHBT(x2, x1, 0); 00286 #else 00287 x3 = __PKHBT(x1, x2, 0); 00288 #endif 00289 00290 acc1 = __SMLALDX(x3, y1, acc1); 00291 00292 x1 = _SIMD32_OFFSET(pScr); 00293 00294 acc0 = __SMLALD(x2, y2, acc0); 00295 00296 acc2 = __SMLALD(x1, y2, acc2); 00297 00298 #ifndef ARM_MATH_BIG_ENDIAN 00299 x3 = __PKHBT(x1, x2, 0); 00300 #else 00301 x3 = __PKHBT(x2, x1, 0); 00302 #endif 00303 00304 acc3 = __SMLALDX(x3, y1, acc3); 00305 00306 acc1 = __SMLALDX(x3, y2, acc1); 00307 00308 x2 = _SIMD32_OFFSET(pScr + 2U); 00309 00310 #ifndef ARM_MATH_BIG_ENDIAN 00311 x3 = __PKHBT(x2, x1, 0); 00312 #else 00313 x3 = __PKHBT(x1, x2, 0); 00314 #endif 00315 00316 acc3 = __SMLALDX(x3, y2, acc3); 00317 00318 #else 00319 00320 /* Read four samples from smaller buffer */ 
00321 a = *pIn2; 00322 b = *(pIn2 + 1); 00323 00324 #ifndef ARM_MATH_BIG_ENDIAN 00325 y1 = __PKHBT(a, b, 16); 00326 #else 00327 y1 = __PKHBT(b, a, 16); 00328 #endif 00329 00330 a = *(pIn2 + 2); 00331 b = *(pIn2 + 3); 00332 #ifndef ARM_MATH_BIG_ENDIAN 00333 y2 = __PKHBT(a, b, 16); 00334 #else 00335 y2 = __PKHBT(b, a, 16); 00336 #endif 00337 00338 acc0 = __SMLALD(x1, y1, acc0); 00339 00340 acc2 = __SMLALD(x2, y1, acc2); 00341 00342 #ifndef ARM_MATH_BIG_ENDIAN 00343 x3 = __PKHBT(x2, x1, 0); 00344 #else 00345 x3 = __PKHBT(x1, x2, 0); 00346 #endif 00347 00348 acc1 = __SMLALDX(x3, y1, acc1); 00349 00350 a = *pScr; 00351 b = *(pScr + 1); 00352 00353 #ifndef ARM_MATH_BIG_ENDIAN 00354 x1 = __PKHBT(a, b, 16); 00355 #else 00356 x1 = __PKHBT(b, a, 16); 00357 #endif 00358 00359 acc0 = __SMLALD(x2, y2, acc0); 00360 00361 acc2 = __SMLALD(x1, y2, acc2); 00362 00363 #ifndef ARM_MATH_BIG_ENDIAN 00364 x3 = __PKHBT(x1, x2, 0); 00365 #else 00366 x3 = __PKHBT(x2, x1, 0); 00367 #endif 00368 00369 acc3 = __SMLALDX(x3, y1, acc3); 00370 00371 acc1 = __SMLALDX(x3, y2, acc1); 00372 00373 a = *(pScr + 2); 00374 b = *(pScr + 3); 00375 00376 #ifndef ARM_MATH_BIG_ENDIAN 00377 x2 = __PKHBT(a, b, 16); 00378 #else 00379 x2 = __PKHBT(b, a, 16); 00380 #endif 00381 00382 #ifndef ARM_MATH_BIG_ENDIAN 00383 x3 = __PKHBT(x2, x1, 0); 00384 #else 00385 x3 = __PKHBT(x1, x2, 0); 00386 #endif 00387 00388 acc3 = __SMLALDX(x3, y2, acc3); 00389 00390 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00391 00392 pIn2 += 4U; 00393 00394 pScr += 4U; 00395 00396 00397 /* Decrement the loop counter */ 00398 tapCnt--; 00399 } 00400 00401 00402 00403 /* Update scratch pointer for remaining samples of smaller length sequence */ 00404 pScr -= 4U; 00405 00406 00407 /* apply same above for remaining samples of smaller length sequence */ 00408 tapCnt = (srcBLen) & 3U; 00409 00410 while (tapCnt > 0U) 00411 { 00412 00413 /* accumlate the results */ 00414 acc0 += (*pScr++ * *pIn2); 00415 acc1 += (*pScr++ * *pIn2); 00416 acc2 += 
(*pScr++ * *pIn2); 00417 acc3 += (*pScr++ * *pIn2++); 00418 00419 pScr -= 3U; 00420 00421 /* Decrement the loop counter */ 00422 tapCnt--; 00423 } 00424 00425 blkCnt--; 00426 00427 00428 /* Store the results in the accumulators in the destination buffer. */ 00429 *pOut = (__SSAT(acc0 >> 15U, 16)); 00430 pOut += inc; 00431 *pOut = (__SSAT(acc1 >> 15U, 16)); 00432 pOut += inc; 00433 *pOut = (__SSAT(acc2 >> 15U, 16)); 00434 pOut += inc; 00435 *pOut = (__SSAT(acc3 >> 15U, 16)); 00436 pOut += inc; 00437 00438 /* Initialization of inputB pointer */ 00439 pIn2 = py; 00440 00441 pScratch += 4U; 00442 00443 } 00444 00445 00446 blkCnt = (srcALen + srcBLen - 1U) & 0x3; 00447 00448 /* Calculate correlation for remaining samples of Bigger length sequence */ 00449 while (blkCnt > 0) 00450 { 00451 /* Initialze temporary scratch pointer as scratch1 */ 00452 pScr = pScratch; 00453 00454 /* Clear Accumlators */ 00455 acc0 = 0; 00456 00457 tapCnt = (srcBLen) >> 1U; 00458 00459 while (tapCnt > 0U) 00460 { 00461 00462 acc0 += (*pScr++ * *pIn2++); 00463 acc0 += (*pScr++ * *pIn2++); 00464 00465 /* Decrement the loop counter */ 00466 tapCnt--; 00467 } 00468 00469 tapCnt = (srcBLen) & 1U; 00470 00471 /* apply same above for remaining samples of smaller length sequence */ 00472 while (tapCnt > 0U) 00473 { 00474 00475 /* accumlate the results */ 00476 acc0 += (*pScr++ * *pIn2++); 00477 00478 /* Decrement the loop counter */ 00479 tapCnt--; 00480 } 00481 00482 blkCnt--; 00483 00484 /* Store the result in the accumulator in the destination buffer. */ 00485 *pOut = (q15_t) (__SSAT((acc0 >> 15), 16)); 00486 00487 pOut += inc; 00488 00489 /* Initialization of inputB pointer */ 00490 pIn2 = py; 00491 00492 pScratch += 1U; 00493 00494 } 00495 00496 00497 } 00498 00499 /** 00500 * @} end of Corr group 00501 */ 00502
Generated on Tue Jul 12 2022 16:46:23 by 1.7.2