Added CMSIS5 DSP and NN folder. Needs some work
Embed:
(wiki syntax)
Show/hide line numbers
arm_correlate_opt_q7.c
00001 /* ---------------------------------------------------------------------- 00002 * Project: CMSIS DSP Library 00003 * Title: arm_correlate_opt_q7.c 00004 * Description: Correlation of Q7 sequences 00005 * 00006 * $Date: 27. January 2017 00007 * $Revision: V.1.5.1 00008 * 00009 * Target Processor: Cortex-M cores 00010 * -------------------------------------------------------------------- */ 00011 /* 00012 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. 00013 * 00014 * SPDX-License-Identifier: Apache-2.0 00015 * 00016 * Licensed under the Apache License, Version 2.0 (the License); you may 00017 * not use this file except in compliance with the License. 00018 * You may obtain a copy of the License at 00019 * 00020 * www.apache.org/licenses/LICENSE-2.0 00021 * 00022 * Unless required by applicable law or agreed to in writing, software 00023 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 00024 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00025 * See the License for the specific language governing permissions and 00026 * limitations under the License. 00027 */ 00028 00029 #include "arm_math.h" 00030 00031 /** 00032 * @ingroup groupFilters 00033 */ 00034 00035 /** 00036 * @addtogroup Corr 00037 * @{ 00038 */ 00039 00040 /** 00041 * @brief Correlation of Q7 sequences. 00042 * @param[in] *pSrcA points to the first input sequence. 00043 * @param[in] srcALen length of the first input sequence. 00044 * @param[in] *pSrcB points to the second input sequence. 00045 * @param[in] srcBLen length of the second input sequence. 00046 * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1. 00047 * @param[in] *pScratch1 points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. 00048 * @param[in] *pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen). 00049 * @return none. 
00050 * 00051 * 00052 * \par Restrictions 00053 * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE 00054 * In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit 00055 * 00056 * @details 00057 * <b>Scaling and Overflow Behavior:</b> 00058 * 00059 * \par 00060 * The function is implemented using a 32-bit internal accumulator. 00061 * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result. 00062 * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format. 00063 * This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>. 00064 * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and saturated to 1.7 format. 00065 * 00066 * 00067 */ 00068 00069 00070 00071 void arm_correlate_opt_q7( 00072 q7_t * pSrcA, 00073 uint32_t srcALen, 00074 q7_t * pSrcB, 00075 uint32_t srcBLen, 00076 q7_t * pDst, 00077 q15_t * pScratch1, 00078 q15_t * pScratch2) 00079 { 00080 q7_t *pOut = pDst; /* output pointer */ 00081 q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch */ 00082 q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch */ 00083 q7_t *pIn1; /* inputA pointer */ 00084 q7_t *pIn2; /* inputB pointer */ 00085 q15_t *py; /* Intermediate inputB pointer */ 00086 q31_t acc0, acc1, acc2, acc3; /* Accumulators */ 00087 uint32_t j, k = 0U, blkCnt; /* loop counter */ 00088 int32_t inc = 1; /* output pointer increment */ 00089 uint32_t outBlockSize; /* loop counter */ 00090 q15_t x4; /* Temporary input variable */ 00091 uint32_t tapCnt; /* loop counter */ 00092 q31_t x1, x2, x3, y1; /* Temporary input variables */ 00093 00094 /* The algorithm implementation is based on the lengths of the inputs. */ 00095 /* srcB is always made to slide across srcA. 
*/ 00096 /* So srcBLen is always considered as shorter or equal to srcALen */ 00097 /* But CORR(x, y) is reverse of CORR(y, x) */ 00098 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ 00099 /* and the destination pointer modifier, inc is set to -1 */ 00100 /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ 00101 /* But to improve the performance, 00102 * we include zeroes in the output instead of zero padding either of the the inputs*/ 00103 /* If srcALen > srcBLen, 00104 * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */ 00105 /* If srcALen < srcBLen, 00106 * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */ 00107 if (srcALen >= srcBLen) 00108 { 00109 /* Initialization of inputA pointer */ 00110 pIn1 = (pSrcA); 00111 00112 /* Initialization of inputB pointer */ 00113 pIn2 = (pSrcB); 00114 00115 /* Number of output samples is calculated */ 00116 outBlockSize = (2U * srcALen) - 1U; 00117 00118 /* When srcALen > srcBLen, zero padding is done to srcB 00119 * to make their lengths equal. 
00120 * Instead, (outBlockSize - (srcALen + srcBLen - 1)) 00121 * number of output samples are made zero */ 00122 j = outBlockSize - (srcALen + (srcBLen - 1U)); 00123 00124 /* Updating the pointer position to non zero value */ 00125 pOut += j; 00126 00127 } 00128 else 00129 { 00130 /* Initialization of inputA pointer */ 00131 pIn1 = (pSrcB); 00132 00133 /* Initialization of inputB pointer */ 00134 pIn2 = (pSrcA); 00135 00136 /* srcBLen is always considered as shorter or equal to srcALen */ 00137 j = srcBLen; 00138 srcBLen = srcALen; 00139 srcALen = j; 00140 00141 /* CORR(x, y) = Reverse order(CORR(y, x)) */ 00142 /* Hence set the destination pointer to point to the last output sample */ 00143 pOut = pDst + ((srcALen + srcBLen) - 2U); 00144 00145 /* Destination address modifier is set to -1 */ 00146 inc = -1; 00147 00148 } 00149 00150 00151 /* Copy (srcBLen) samples in scratch buffer */ 00152 k = srcBLen >> 2U; 00153 00154 /* First part of the processing with loop unrolling copies 4 data points at a time. 00155 ** a second loop below copies for the remaining 1 to 3 samples. */ 00156 while (k > 0U) 00157 { 00158 /* copy second buffer in reversal manner */ 00159 x4 = (q15_t) * pIn2++; 00160 *pScr2++ = x4; 00161 x4 = (q15_t) * pIn2++; 00162 *pScr2++ = x4; 00163 x4 = (q15_t) * pIn2++; 00164 *pScr2++ = x4; 00165 x4 = (q15_t) * pIn2++; 00166 *pScr2++ = x4; 00167 00168 /* Decrement the loop counter */ 00169 k--; 00170 } 00171 00172 /* If the count is not a multiple of 4, copy remaining samples here. 00173 ** No loop unrolling is used. 
*/ 00174 k = srcBLen % 0x4U; 00175 00176 while (k > 0U) 00177 { 00178 /* copy second buffer in reversal manner for remaining samples */ 00179 x4 = (q15_t) * pIn2++; 00180 *pScr2++ = x4; 00181 00182 /* Decrement the loop counter */ 00183 k--; 00184 } 00185 00186 /* Fill (srcBLen - 1U) zeros in scratch buffer */ 00187 arm_fill_q15(0, pScr1, (srcBLen - 1U)); 00188 00189 /* Update temporary scratch pointer */ 00190 pScr1 += (srcBLen - 1U); 00191 00192 /* Copy (srcALen) samples in scratch buffer */ 00193 k = srcALen >> 2U; 00194 00195 /* First part of the processing with loop unrolling copies 4 data points at a time. 00196 ** a second loop below copies for the remaining 1 to 3 samples. */ 00197 while (k > 0U) 00198 { 00199 /* copy second buffer in reversal manner */ 00200 x4 = (q15_t) * pIn1++; 00201 *pScr1++ = x4; 00202 x4 = (q15_t) * pIn1++; 00203 *pScr1++ = x4; 00204 x4 = (q15_t) * pIn1++; 00205 *pScr1++ = x4; 00206 x4 = (q15_t) * pIn1++; 00207 *pScr1++ = x4; 00208 00209 /* Decrement the loop counter */ 00210 k--; 00211 } 00212 00213 /* If the count is not a multiple of 4, copy remaining samples here. 00214 ** No loop unrolling is used. */ 00215 k = srcALen % 0x4U; 00216 00217 while (k > 0U) 00218 { 00219 /* copy second buffer in reversal manner for remaining samples */ 00220 x4 = (q15_t) * pIn1++; 00221 *pScr1++ = x4; 00222 00223 /* Decrement the loop counter */ 00224 k--; 00225 } 00226 00227 #ifndef UNALIGNED_SUPPORT_DISABLE 00228 00229 /* Fill (srcBLen - 1U) zeros at end of scratch buffer */ 00230 arm_fill_q15(0, pScr1, (srcBLen - 1U)); 00231 00232 /* Update pointer */ 00233 pScr1 += (srcBLen - 1U); 00234 00235 #else 00236 00237 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00238 k = (srcBLen - 1U) >> 2U; 00239 00240 /* First part of the processing with loop unrolling copies 4 data points at a time. 00241 ** a second loop below copies for the remaining 1 to 3 samples. 
*/ 00242 while (k > 0U) 00243 { 00244 /* copy second buffer in reversal manner */ 00245 *pScr1++ = 0; 00246 *pScr1++ = 0; 00247 *pScr1++ = 0; 00248 *pScr1++ = 0; 00249 00250 /* Decrement the loop counter */ 00251 k--; 00252 } 00253 00254 /* If the count is not a multiple of 4, copy remaining samples here. 00255 ** No loop unrolling is used. */ 00256 k = (srcBLen - 1U) % 0x4U; 00257 00258 while (k > 0U) 00259 { 00260 /* copy second buffer in reversal manner for remaining samples */ 00261 *pScr1++ = 0; 00262 00263 /* Decrement the loop counter */ 00264 k--; 00265 } 00266 00267 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00268 00269 /* Temporary pointer for second sequence */ 00270 py = pScratch2; 00271 00272 /* Initialization of pScr2 pointer */ 00273 pScr2 = pScratch2; 00274 00275 /* Actual correlation process starts here */ 00276 blkCnt = (srcALen + srcBLen - 1U) >> 2; 00277 00278 while (blkCnt > 0) 00279 { 00280 /* Initialze temporary scratch pointer as scratch1 */ 00281 pScr1 = pScratch1; 00282 00283 /* Clear Accumlators */ 00284 acc0 = 0; 00285 acc1 = 0; 00286 acc2 = 0; 00287 acc3 = 0; 00288 00289 /* Read two samples from scratch1 buffer */ 00290 x1 = *__SIMD32(pScr1)++; 00291 00292 /* Read next two samples from scratch1 buffer */ 00293 x2 = *__SIMD32(pScr1)++; 00294 00295 tapCnt = (srcBLen) >> 2U; 00296 00297 while (tapCnt > 0U) 00298 { 00299 00300 /* Read four samples from smaller buffer */ 00301 y1 = _SIMD32_OFFSET(pScr2); 00302 00303 /* multiply and accumlate */ 00304 acc0 = __SMLAD(x1, y1, acc0); 00305 acc2 = __SMLAD(x2, y1, acc2); 00306 00307 /* pack input data */ 00308 #ifndef ARM_MATH_BIG_ENDIAN 00309 x3 = __PKHBT(x2, x1, 0); 00310 #else 00311 x3 = __PKHBT(x1, x2, 0); 00312 #endif 00313 00314 /* multiply and accumlate */ 00315 acc1 = __SMLADX(x3, y1, acc1); 00316 00317 /* Read next two samples from scratch1 buffer */ 00318 x1 = *__SIMD32(pScr1)++; 00319 00320 /* pack input data */ 00321 #ifndef ARM_MATH_BIG_ENDIAN 00322 x3 = __PKHBT(x1, x2, 0); 00323 
#else 00324 x3 = __PKHBT(x2, x1, 0); 00325 #endif 00326 00327 acc3 = __SMLADX(x3, y1, acc3); 00328 00329 /* Read four samples from smaller buffer */ 00330 y1 = _SIMD32_OFFSET(pScr2 + 2U); 00331 00332 acc0 = __SMLAD(x2, y1, acc0); 00333 00334 acc2 = __SMLAD(x1, y1, acc2); 00335 00336 acc1 = __SMLADX(x3, y1, acc1); 00337 00338 x2 = *__SIMD32(pScr1)++; 00339 00340 #ifndef ARM_MATH_BIG_ENDIAN 00341 x3 = __PKHBT(x2, x1, 0); 00342 #else 00343 x3 = __PKHBT(x1, x2, 0); 00344 #endif 00345 00346 acc3 = __SMLADX(x3, y1, acc3); 00347 00348 pScr2 += 4U; 00349 00350 00351 /* Decrement the loop counter */ 00352 tapCnt--; 00353 } 00354 00355 00356 00357 /* Update scratch pointer for remaining samples of smaller length sequence */ 00358 pScr1 -= 4U; 00359 00360 00361 /* apply same above for remaining samples of smaller length sequence */ 00362 tapCnt = (srcBLen) & 3U; 00363 00364 while (tapCnt > 0U) 00365 { 00366 00367 /* accumlate the results */ 00368 acc0 += (*pScr1++ * *pScr2); 00369 acc1 += (*pScr1++ * *pScr2); 00370 acc2 += (*pScr1++ * *pScr2); 00371 acc3 += (*pScr1++ * *pScr2++); 00372 00373 pScr1 -= 3U; 00374 00375 /* Decrement the loop counter */ 00376 tapCnt--; 00377 } 00378 00379 blkCnt--; 00380 00381 /* Store the result in the accumulator in the destination buffer. 
*/ 00382 *pOut = (q7_t) (__SSAT(acc0 >> 7U, 8)); 00383 pOut += inc; 00384 *pOut = (q7_t) (__SSAT(acc1 >> 7U, 8)); 00385 pOut += inc; 00386 *pOut = (q7_t) (__SSAT(acc2 >> 7U, 8)); 00387 pOut += inc; 00388 *pOut = (q7_t) (__SSAT(acc3 >> 7U, 8)); 00389 pOut += inc; 00390 00391 /* Initialization of inputB pointer */ 00392 pScr2 = py; 00393 00394 pScratch1 += 4U; 00395 00396 } 00397 00398 00399 blkCnt = (srcALen + srcBLen - 1U) & 0x3; 00400 00401 /* Calculate correlation for remaining samples of Bigger length sequence */ 00402 while (blkCnt > 0) 00403 { 00404 /* Initialze temporary scratch pointer as scratch1 */ 00405 pScr1 = pScratch1; 00406 00407 /* Clear Accumlators */ 00408 acc0 = 0; 00409 00410 tapCnt = (srcBLen) >> 1U; 00411 00412 while (tapCnt > 0U) 00413 { 00414 acc0 += (*pScr1++ * *pScr2++); 00415 acc0 += (*pScr1++ * *pScr2++); 00416 00417 /* Decrement the loop counter */ 00418 tapCnt--; 00419 } 00420 00421 tapCnt = (srcBLen) & 1U; 00422 00423 /* apply same above for remaining samples of smaller length sequence */ 00424 while (tapCnt > 0U) 00425 { 00426 00427 /* accumlate the results */ 00428 acc0 += (*pScr1++ * *pScr2++); 00429 00430 /* Decrement the loop counter */ 00431 tapCnt--; 00432 } 00433 00434 blkCnt--; 00435 00436 /* Store the result in the accumulator in the destination buffer. */ 00437 *pOut = (q7_t) (__SSAT(acc0 >> 7U, 8)); 00438 00439 pOut += inc; 00440 00441 /* Initialization of inputB pointer */ 00442 pScr2 = py; 00443 00444 pScratch1 += 1U; 00445 00446 } 00447 00448 } 00449 00450 /** 00451 * @} end of Corr group 00452 */ 00453
Generated on Tue Jul 12 2022 16:46:23 by 1.7.2