Marco Zecchini
/
Example_RTOS
Rtos API example
Embed:
(wiki syntax)
Show/hide line numbers
arm_correlate_fast_opt_q15.c
/* ----------------------------------------------------------------------
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
 *
 * $Date:        19. March 2015
 * $Revision:    V.1.4.5
 *
 * Project:      CMSIS DSP Library
 * Title:        arm_correlate_fast_opt_q15.c
 *
 * Description:  Fast Q15 Correlation.
 *
 * Target Processor: Cortex-M4/Cortex-M3
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 * -------------------------------------------------------------------- */

#include "arm_math.h"

/**
 * @ingroup groupFilters
 */

/**
 * @addtogroup Corr
 * @{
 */

/**
 * @brief Correlation of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
 * @param[in]  *pSrcA    points to the first input sequence.
 * @param[in]  srcALen   length of the first input sequence.
 * @param[in]  *pSrcB    points to the second input sequence.
 * @param[in]  srcBLen   length of the second input sequence.
 * @param[out] *pDst     points to the location where the output result is written.
 *                       Length 2 * max(srcALen, srcBLen) - 1.
 * @param[in]  *pScratch points to scratch buffer of size
 *                       max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
 * @return none.
 *
 * \par Restrictions
 * If the silicon does not support unaligned memory access, enable the macro
 * UNALIGNED_SUPPORT_DISABLE. In that case the input, output and scratch
 * buffers must all be 32-bit aligned.
 *
 * <b>Scaling and Overflow Behavior:</b>
 *
 * \par
 * This fast version uses a 32-bit accumulator in 2.30 format.
 * The accumulator maintains full precision of the intermediate multiplication
 * results but provides only a single guard bit. There is no saturation on
 * intermediate additions, so if the accumulator overflows it wraps around and
 * distorts the result. Scale one of the inputs down by 1/min(srcALen, srcBLen)
 * to avoid overflow, since at most min(srcALen, srcBLen) additions are carried
 * out internally. The 2.30 accumulator is right-shifted by 15 bits and then
 * saturated to 1.15 format to yield the final result.
 *
 * \par
 * See <code>arm_correlate_q15()</code> for a slower implementation of this
 * function which uses a 64-bit accumulator to avoid wrap-around distortion.
 */

void arm_correlate_fast_opt_q15(
  q15_t * pSrcA,
  uint32_t srcALen,
  q15_t * pSrcB,
  uint32_t srcBLen,
  q15_t * pDst,
  q15_t * pScratch)
{
  q15_t *pIn1;                                 /* inputA pointer (longer sequence after the swap below) */
  q15_t *pIn2;                                 /* inputB pointer (shorter sequence after the swap below) */
  q31_t acc0, acc1, acc2, acc3;                /* Accumulators for 4 consecutive output lags */
  q15_t *py;                                   /* Saved start of inputB, restored each outer iteration */
  q31_t x1, x2, x3;                            /* Temporaries holding packed pairs of scratch samples */
  uint32_t j, blkCnt, outBlockSize;            /* Loop counters / output block size */
  int32_t inc = 1;                             /* Destination address modifier: +1 forward, -1 reversed */
  uint32_t tapCnt;                             /* Inner (tap) loop counter */
  q31_t y1, y2;                                /* Packed pairs of inputB samples */
  q15_t *pScr;                                 /* Working pointer into the scratch buffer */
  q15_t *pOut = pDst;                          /* Output pointer */
#ifdef UNALIGNED_SUPPORT_DISABLE

  q15_t a, b;                                  /* Halfword temporaries used to build packed words manually */

#endif /* #ifdef UNALIGNED_SUPPORT_DISABLE */

  /* The algorithm implementation is based on the lengths of the inputs. */
  /* srcB is always made to slide across srcA,
   * so srcBLen is always considered the shorter (or equal) length. */
  /* But CORR(x, y) is the reverse of CORR(y, x), so when srcBLen > srcALen
   * the output pointer is made to point to the end of the output buffer
   * and the destination pointer modifier, inc, is set to -1. */
  /* If srcALen > srcBLen, srcB would need zero padding to match lengths;
   * to improve performance, zeroes are written directly into the output
   * instead of zero padding either of the inputs:
   * - srcALen > srcBLen: (srcALen - srcBLen) zeroes at the start of the output
   * - srcALen < srcBLen: (srcBLen - srcALen) zeroes at the end of the output */
  if(srcALen >= srcBLen)
  {
    /* Initialization of inputA pointer */
    pIn1 = (pSrcA);

    /* Initialization of inputB pointer */
    pIn2 = (pSrcB);

    /* Number of output samples is calculated */
    outBlockSize = (2u * srcALen) - 1u;

    /* When srcALen > srcBLen, instead of zero padding srcB,
     * (outBlockSize - (srcALen + srcBLen - 1)) output samples are left zero. */
    j = outBlockSize - (srcALen + (srcBLen - 1u));

    /* Advance the output pointer past the leading zero region */
    pOut += j;

  }
  else
  {
    /* Initialization of inputA pointer (the longer sequence) */
    pIn1 = (pSrcB);

    /* Initialization of inputB pointer (the shorter sequence) */
    pIn2 = (pSrcA);

    /* Swap the lengths so srcBLen <= srcALen from here on */
    j = srcBLen;
    srcBLen = srcALen;
    srcALen = j;

    /* CORR(x, y) = Reverse order(CORR(y, x)):
     * set the destination pointer to the last output sample */
    pOut = pDst + ((srcALen + srcBLen) - 2u);

    /* Destination address modifier is set to -1 so results are written in reverse */
    inc = -1;

  }

  pScr = pScratch;

  /* Scratch layout: (srcBLen-1) zeros | srcALen samples of pIn1 | (srcBLen-1) zeros.
   * The zero borders let the inner loops run without edge-case branches. */

  /* Fill (srcBLen - 1u) zeros in scratch buffer */
  arm_fill_q15(0, pScr, (srcBLen - 1u));

  /* Update temporary scratch pointer */
  pScr += (srcBLen - 1u);

#ifndef UNALIGNED_SUPPORT_DISABLE

  /* Copy (srcALen) samples into the scratch buffer */
  arm_copy_q15(pIn1, pScr, srcALen);

  /* Update pointers */
  pScr += srcALen;

#else

  /* Apply loop unrolling and do 4 copies simultaneously. */
  j = srcALen >> 2u;

  /* First part of the processing with loop unrolling copies 4 data points at a time.
   * A second loop below copies the remaining 1 to 3 samples. */
  while(j > 0u)
  {
    /* Copy 4 input samples into the scratch buffer */
    *pScr++ = *pIn1++;
    *pScr++ = *pIn1++;
    *pScr++ = *pIn1++;
    *pScr++ = *pIn1++;

    /* Decrement the loop counter */
    j--;
  }

  /* If the count is not a multiple of 4, copy remaining samples here.
   * No loop unrolling is used. */
  j = srcALen % 0x4u;

  while(j > 0u)
  {
    /* Copy the remaining input samples */
    *pScr++ = *pIn1++;

    /* Decrement the loop counter */
    j--;
  }

#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */

#ifndef UNALIGNED_SUPPORT_DISABLE

  /* Fill (srcBLen - 1u) zeros at end of scratch buffer */
  arm_fill_q15(0, pScr, (srcBLen - 1u));

  /* Update pointer */
  pScr += (srcBLen - 1u);

#else

  /* Apply loop unrolling and write 4 zeros simultaneously. */
  j = (srcBLen - 1u) >> 2u;

  /* First part of the processing with loop unrolling writes 4 zeros at a time.
   * A second loop below writes the remaining 1 to 3 zeros. */
  while(j > 0u)
  {
    /* Write trailing zero padding */
    *pScr++ = 0;
    *pScr++ = 0;
    *pScr++ = 0;
    *pScr++ = 0;

    /* Decrement the loop counter */
    j--;
  }

  /* If the count is not a multiple of 4, write remaining zeros here.
   * No loop unrolling is used. */
  j = (srcBLen - 1u) % 0x4u;

  while(j > 0u)
  {
    /* Write the remaining zero padding */
    *pScr++ = 0;

    /* Decrement the loop counter */
    j--;
  }

#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */

  /* Save the start of inputB so it can be restored for each output block */
  py = pIn2;


  /* Actual correlation process starts here:
   * compute 4 output samples per outer iteration. */
  blkCnt = (srcALen + srcBLen - 1u) >> 2;

  while(blkCnt > 0)
  {
    /* Initialize temporary scratch pointer to the current window start */
    pScr = pScratch;

    /* Clear accumulators */
    acc0 = 0;
    acc1 = 0;
    acc2 = 0;
    acc3 = 0;

    /* Read four samples (two packed words) from the scratch buffer.
     * NOTE(review): __SIMD32 performs a 32-bit access on a q15_t pointer;
     * alignment is guaranteed here only by the buffer layout / build config. */
    x1 = *__SIMD32(pScr)++;

    /* Read next four samples from the scratch buffer */
    x2 = *__SIMD32(pScr)++;

    tapCnt = (srcBLen) >> 2u;

    while(tapCnt > 0u)
    {

#ifndef UNALIGNED_SUPPORT_DISABLE

      /* Read four samples from the shorter buffer (possibly unaligned 32-bit loads) */
      y1 = _SIMD32_OFFSET(pIn2);
      y2 = _SIMD32_OFFSET(pIn2 + 2u);

      /* acc0/acc2 use aligned pairs; acc1/acc3 need pairs offset by one
       * sample, built with __PKHBT and accumulated with the cross
       * multiply-accumulate __SMLADX. Pack order flips with endianness. */
      acc0 = __SMLAD(x1, y1, acc0);

      acc2 = __SMLAD(x2, y1, acc2);

#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x2, x1, 0);
#else
      x3 = __PKHBT(x1, x2, 0);
#endif

      acc1 = __SMLADX(x3, y1, acc1);

      x1 = _SIMD32_OFFSET(pScr);

      acc0 = __SMLAD(x2, y2, acc0);

      acc2 = __SMLAD(x1, y2, acc2);

#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x1, x2, 0);
#else
      x3 = __PKHBT(x2, x1, 0);
#endif

      acc3 = __SMLADX(x3, y1, acc3);

      acc1 = __SMLADX(x3, y2, acc1);

      x2 = _SIMD32_OFFSET(pScr + 2u);

#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x2, x1, 0);
#else
      x3 = __PKHBT(x1, x2, 0);
#endif

      acc3 = __SMLADX(x3, y2, acc3);
#else

      /* Unaligned access not supported: read halfwords one at a time and
       * pack them into 32-bit words manually with __PKHBT. */
      a = *pIn2;
      b = *(pIn2 + 1);

#ifndef ARM_MATH_BIG_ENDIAN
      y1 = __PKHBT(a, b, 16);
#else
      y1 = __PKHBT(b, a, 16);
#endif

      a = *(pIn2 + 2);
      b = *(pIn2 + 3);
#ifndef ARM_MATH_BIG_ENDIAN
      y2 = __PKHBT(a, b, 16);
#else
      y2 = __PKHBT(b, a, 16);
#endif

      acc0 = __SMLAD(x1, y1, acc0);

      acc2 = __SMLAD(x2, y1, acc2);

#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x2, x1, 0);
#else
      x3 = __PKHBT(x1, x2, 0);
#endif

      acc1 = __SMLADX(x3, y1, acc1);

      a = *pScr;
      b = *(pScr + 1);

#ifndef ARM_MATH_BIG_ENDIAN
      x1 = __PKHBT(a, b, 16);
#else
      x1 = __PKHBT(b, a, 16);
#endif

      acc0 = __SMLAD(x2, y2, acc0);

      acc2 = __SMLAD(x1, y2, acc2);

#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x1, x2, 0);
#else
      x3 = __PKHBT(x2, x1, 0);
#endif

      acc3 = __SMLADX(x3, y1, acc3);

      acc1 = __SMLADX(x3, y2, acc1);

      a = *(pScr + 2);
      b = *(pScr + 3);

#ifndef ARM_MATH_BIG_ENDIAN
      x2 = __PKHBT(a, b, 16);
#else
      x2 = __PKHBT(b, a, 16);
#endif

#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x2, x1, 0);
#else
      x3 = __PKHBT(x1, x2, 0);
#endif

      acc3 = __SMLADX(x3, y2, acc3);

#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */

      pIn2 += 4u;

      pScr += 4u;


      /* Decrement the loop counter */
      tapCnt--;
    }



    /* Rewind the scratch pointer: the SIMD loop pre-read 4 samples ahead
     * (x1/x2), so step back before the scalar tail loop. */
    pScr -= 4u;


    /* Apply the same computation for the remaining (srcBLen % 4) taps */
    tapCnt = (srcBLen) & 3u;

    while(tapCnt > 0u)
    {

      /* Accumulate the results: each accumulator reads one sample further
       * into the scratch window (then pScr is rewound by 3 so the window
       * advances by exactly one sample per tap). */
      acc0 += (*pScr++ * *pIn2);
      acc1 += (*pScr++ * *pIn2);
      acc2 += (*pScr++ * *pIn2);
      acc3 += (*pScr++ * *pIn2++);

      pScr -= 3u;

      /* Decrement the loop counter */
      tapCnt--;
    }

    blkCnt--;


    /* Store the results in the accumulators into the destination buffer:
     * shift 2.30 down by 15 and saturate to 1.15. */
    *pOut = (__SSAT(acc0 >> 15u, 16));
    pOut += inc;
    *pOut = (__SSAT(acc1 >> 15u, 16));
    pOut += inc;
    *pOut = (__SSAT(acc2 >> 15u, 16));
    pOut += inc;
    *pOut = (__SSAT(acc3 >> 15u, 16));
    pOut += inc;


    /* Re-initialize the inputB pointer for the next output block */
    pIn2 = py;

    /* Slide the scratch window forward by the 4 outputs just produced */
    pScratch += 4u;

  }


  /* Remaining 0-3 output samples of the longer sequence */
  blkCnt = (srcALen + srcBLen - 1u) & 0x3;

  /* Calculate correlation for the remaining output samples */
  while(blkCnt > 0)
  {
    /* Initialize temporary scratch pointer to the current window start */
    pScr = pScratch;

    /* Clear accumulator */
    acc0 = 0;

    tapCnt = (srcBLen) >> 1u;

    while(tapCnt > 0u)
    {

      /* Two multiply-accumulates per iteration */
      acc0 += (*pScr++ * *pIn2++);
      acc0 += (*pScr++ * *pIn2++);

      /* Decrement the loop counter */
      tapCnt--;
    }

    tapCnt = (srcBLen) & 1u;

    /* Apply the same computation for the remaining (odd) tap */
    while(tapCnt > 0u)
    {

      /* Accumulate the result */
      acc0 += (*pScr++ * *pIn2++);

      /* Decrement the loop counter */
      tapCnt--;
    }

    blkCnt--;

    /* Store the result in the accumulator into the destination buffer:
     * shift 2.30 down by 15 and saturate to 1.15. */
    *pOut = (q15_t) (__SSAT((acc0 >> 15), 16));

    pOut += inc;

    /* Re-initialize the inputB pointer for the next output sample */
    pIn2 = py;

    /* Slide the scratch window forward by one sample */
    pScratch += 1u;

  }
}

/**
 * @} end of Corr group
 */
Generated on Sun Jul 17 2022 08:25:19 by Doxygen 1.7.2