V4.0.1 of the ARM CMSIS DSP libraries. Note that arm_bitreversal2.s, arm_cfft_f32.c and arm_rfft_fast_f32.c had to be removed: arm_bitreversal2.s will not assemble with the online tools, so the fast f32 FFT functions are not yet available. All the other FFT functions are available.
Dependents: MPU9150_Example fir_f32 fir_f32 MPU9150_nucleo_noni2cdev ... more
arm_correlate_opt_q7.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 12. March 2014 00005 * $Revision: V1.4.3 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_correlate_opt_q7.c 00009 * 00010 * Description: Correlation of Q7 sequences. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupFilters 00045 */ 00046 00047 /** 00048 * @addtogroup Corr 00049 * @{ 00050 */ 00051 00052 /** 00053 * @brief Correlation of Q7 sequences. 00054 * @param[in] *pSrcA points to the first input sequence. 00055 * @param[in] srcALen length of the first input sequence. 00056 * @param[in] *pSrcB points to the second input sequence. 00057 * @param[in] srcBLen length of the second input sequence. 00058 * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1. 00059 * @param[in] *pScratch1 points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2. 00060 * @param[in] *pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen). 00061 * @return none. 00062 * 00063 * 00064 * \par Restrictions 00065 * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE 00066 * In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit 00067 * 00068 * @details 00069 * <b>Scaling and Overflow Behavior:</b> 00070 * 00071 * \par 00072 * The function is implemented using a 32-bit internal accumulator. 
00073 * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result. 00074 * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format. 00075 * This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>. 00076 * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and saturated to 1.7 format. 00077 * 00078 * 00079 */ 00080 00081 00082 00083 void arm_correlate_opt_q7( 00084 q7_t * pSrcA, 00085 uint32_t srcALen, 00086 q7_t * pSrcB, 00087 uint32_t srcBLen, 00088 q7_t * pDst, 00089 q15_t * pScratch1, 00090 q15_t * pScratch2) 00091 { 00092 q7_t *pOut = pDst; /* output pointer */ 00093 q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch */ 00094 q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch */ 00095 q7_t *pIn1; /* inputA pointer */ 00096 q7_t *pIn2; /* inputB pointer */ 00097 q15_t *py; /* Intermediate inputB pointer */ 00098 q31_t acc0, acc1, acc2, acc3; /* Accumulators */ 00099 uint32_t j, k = 0u, blkCnt; /* loop counter */ 00100 int32_t inc = 1; /* output pointer increment */ 00101 uint32_t outBlockSize; /* loop counter */ 00102 q15_t x4; /* Temporary input variable */ 00103 uint32_t tapCnt; /* loop counter */ 00104 q31_t x1, x2, x3, y1; /* Temporary input variables */ 00105 00106 /* The algorithm implementation is based on the lengths of the inputs. */ 00107 /* srcB is always made to slide across srcA. 
*/ 00108 /* So srcBLen is always considered as shorter or equal to srcALen */ 00109 /* But CORR(x, y) is reverse of CORR(y, x) */ 00110 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ 00111 /* and the destination pointer modifier, inc is set to -1 */ 00112 /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ 00113 /* But to improve the performance, 00114 * we include zeroes in the output instead of zero padding either of the the inputs*/ 00115 /* If srcALen > srcBLen, 00116 * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */ 00117 /* If srcALen < srcBLen, 00118 * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */ 00119 if(srcALen >= srcBLen) 00120 { 00121 /* Initialization of inputA pointer */ 00122 pIn1 = (pSrcA); 00123 00124 /* Initialization of inputB pointer */ 00125 pIn2 = (pSrcB); 00126 00127 /* Number of output samples is calculated */ 00128 outBlockSize = (2u * srcALen) - 1u; 00129 00130 /* When srcALen > srcBLen, zero padding is done to srcB 00131 * to make their lengths equal. 
00132 * Instead, (outBlockSize - (srcALen + srcBLen - 1)) 00133 * number of output samples are made zero */ 00134 j = outBlockSize - (srcALen + (srcBLen - 1u)); 00135 00136 /* Updating the pointer position to non zero value */ 00137 pOut += j; 00138 00139 } 00140 else 00141 { 00142 /* Initialization of inputA pointer */ 00143 pIn1 = (pSrcB); 00144 00145 /* Initialization of inputB pointer */ 00146 pIn2 = (pSrcA); 00147 00148 /* srcBLen is always considered as shorter or equal to srcALen */ 00149 j = srcBLen; 00150 srcBLen = srcALen; 00151 srcALen = j; 00152 00153 /* CORR(x, y) = Reverse order(CORR(y, x)) */ 00154 /* Hence set the destination pointer to point to the last output sample */ 00155 pOut = pDst + ((srcALen + srcBLen) - 2u); 00156 00157 /* Destination address modifier is set to -1 */ 00158 inc = -1; 00159 00160 } 00161 00162 00163 /* Copy (srcBLen) samples in scratch buffer */ 00164 k = srcBLen >> 2u; 00165 00166 /* First part of the processing with loop unrolling copies 4 data points at a time. 00167 ** a second loop below copies for the remaining 1 to 3 samples. */ 00168 while(k > 0u) 00169 { 00170 /* copy second buffer in reversal manner */ 00171 x4 = (q15_t) * pIn2++; 00172 *pScr2++ = x4; 00173 x4 = (q15_t) * pIn2++; 00174 *pScr2++ = x4; 00175 x4 = (q15_t) * pIn2++; 00176 *pScr2++ = x4; 00177 x4 = (q15_t) * pIn2++; 00178 *pScr2++ = x4; 00179 00180 /* Decrement the loop counter */ 00181 k--; 00182 } 00183 00184 /* If the count is not a multiple of 4, copy remaining samples here. 00185 ** No loop unrolling is used. 
*/ 00186 k = srcBLen % 0x4u; 00187 00188 while(k > 0u) 00189 { 00190 /* copy second buffer in reversal manner for remaining samples */ 00191 x4 = (q15_t) * pIn2++; 00192 *pScr2++ = x4; 00193 00194 /* Decrement the loop counter */ 00195 k--; 00196 } 00197 00198 /* Fill (srcBLen - 1u) zeros in scratch buffer */ 00199 arm_fill_q15(0, pScr1, (srcBLen - 1u)); 00200 00201 /* Update temporary scratch pointer */ 00202 pScr1 += (srcBLen - 1u); 00203 00204 /* Copy (srcALen) samples in scratch buffer */ 00205 k = srcALen >> 2u; 00206 00207 /* First part of the processing with loop unrolling copies 4 data points at a time. 00208 ** a second loop below copies for the remaining 1 to 3 samples. */ 00209 while(k > 0u) 00210 { 00211 /* copy second buffer in reversal manner */ 00212 x4 = (q15_t) * pIn1++; 00213 *pScr1++ = x4; 00214 x4 = (q15_t) * pIn1++; 00215 *pScr1++ = x4; 00216 x4 = (q15_t) * pIn1++; 00217 *pScr1++ = x4; 00218 x4 = (q15_t) * pIn1++; 00219 *pScr1++ = x4; 00220 00221 /* Decrement the loop counter */ 00222 k--; 00223 } 00224 00225 /* If the count is not a multiple of 4, copy remaining samples here. 00226 ** No loop unrolling is used. */ 00227 k = srcALen % 0x4u; 00228 00229 while(k > 0u) 00230 { 00231 /* copy second buffer in reversal manner for remaining samples */ 00232 x4 = (q15_t) * pIn1++; 00233 *pScr1++ = x4; 00234 00235 /* Decrement the loop counter */ 00236 k--; 00237 } 00238 00239 #ifndef UNALIGNED_SUPPORT_DISABLE 00240 00241 /* Fill (srcBLen - 1u) zeros at end of scratch buffer */ 00242 arm_fill_q15(0, pScr1, (srcBLen - 1u)); 00243 00244 /* Update pointer */ 00245 pScr1 += (srcBLen - 1u); 00246 00247 #else 00248 00249 /* Apply loop unrolling and do 4 Copies simultaneously. */ 00250 k = (srcBLen - 1u) >> 2u; 00251 00252 /* First part of the processing with loop unrolling copies 4 data points at a time. 00253 ** a second loop below copies for the remaining 1 to 3 samples. 
*/ 00254 while(k > 0u) 00255 { 00256 /* copy second buffer in reversal manner */ 00257 *pScr1++ = 0; 00258 *pScr1++ = 0; 00259 *pScr1++ = 0; 00260 *pScr1++ = 0; 00261 00262 /* Decrement the loop counter */ 00263 k--; 00264 } 00265 00266 /* If the count is not a multiple of 4, copy remaining samples here. 00267 ** No loop unrolling is used. */ 00268 k = (srcBLen - 1u) % 0x4u; 00269 00270 while(k > 0u) 00271 { 00272 /* copy second buffer in reversal manner for remaining samples */ 00273 *pScr1++ = 0; 00274 00275 /* Decrement the loop counter */ 00276 k--; 00277 } 00278 00279 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00280 00281 /* Temporary pointer for second sequence */ 00282 py = pScratch2; 00283 00284 /* Initialization of pScr2 pointer */ 00285 pScr2 = pScratch2; 00286 00287 /* Actual correlation process starts here */ 00288 blkCnt = (srcALen + srcBLen - 1u) >> 2; 00289 00290 while(blkCnt > 0) 00291 { 00292 /* Initialze temporary scratch pointer as scratch1 */ 00293 pScr1 = pScratch1; 00294 00295 /* Clear Accumlators */ 00296 acc0 = 0; 00297 acc1 = 0; 00298 acc2 = 0; 00299 acc3 = 0; 00300 00301 /* Read two samples from scratch1 buffer */ 00302 x1 = *__SIMD32(pScr1)++; 00303 00304 /* Read next two samples from scratch1 buffer */ 00305 x2 = *__SIMD32(pScr1)++; 00306 00307 tapCnt = (srcBLen) >> 2u; 00308 00309 while(tapCnt > 0u) 00310 { 00311 00312 /* Read four samples from smaller buffer */ 00313 y1 = _SIMD32_OFFSET(pScr2); 00314 00315 /* multiply and accumlate */ 00316 acc0 = __SMLAD(x1, y1, acc0); 00317 acc2 = __SMLAD(x2, y1, acc2); 00318 00319 /* pack input data */ 00320 #ifndef ARM_MATH_BIG_ENDIAN 00321 x3 = __PKHBT(x2, x1, 0); 00322 #else 00323 x3 = __PKHBT(x1, x2, 0); 00324 #endif 00325 00326 /* multiply and accumlate */ 00327 acc1 = __SMLADX(x3, y1, acc1); 00328 00329 /* Read next two samples from scratch1 buffer */ 00330 x1 = *__SIMD32(pScr1)++; 00331 00332 /* pack input data */ 00333 #ifndef ARM_MATH_BIG_ENDIAN 00334 x3 = __PKHBT(x1, x2, 0); 00335 
#else 00336 x3 = __PKHBT(x2, x1, 0); 00337 #endif 00338 00339 acc3 = __SMLADX(x3, y1, acc3); 00340 00341 /* Read four samples from smaller buffer */ 00342 y1 = _SIMD32_OFFSET(pScr2 + 2u); 00343 00344 acc0 = __SMLAD(x2, y1, acc0); 00345 00346 acc2 = __SMLAD(x1, y1, acc2); 00347 00348 acc1 = __SMLADX(x3, y1, acc1); 00349 00350 x2 = *__SIMD32(pScr1)++; 00351 00352 #ifndef ARM_MATH_BIG_ENDIAN 00353 x3 = __PKHBT(x2, x1, 0); 00354 #else 00355 x3 = __PKHBT(x1, x2, 0); 00356 #endif 00357 00358 acc3 = __SMLADX(x3, y1, acc3); 00359 00360 pScr2 += 4u; 00361 00362 00363 /* Decrement the loop counter */ 00364 tapCnt--; 00365 } 00366 00367 00368 00369 /* Update scratch pointer for remaining samples of smaller length sequence */ 00370 pScr1 -= 4u; 00371 00372 00373 /* apply same above for remaining samples of smaller length sequence */ 00374 tapCnt = (srcBLen) & 3u; 00375 00376 while(tapCnt > 0u) 00377 { 00378 00379 /* accumlate the results */ 00380 acc0 += (*pScr1++ * *pScr2); 00381 acc1 += (*pScr1++ * *pScr2); 00382 acc2 += (*pScr1++ * *pScr2); 00383 acc3 += (*pScr1++ * *pScr2++); 00384 00385 pScr1 -= 3u; 00386 00387 /* Decrement the loop counter */ 00388 tapCnt--; 00389 } 00390 00391 blkCnt--; 00392 00393 /* Store the result in the accumulator in the destination buffer. 
*/ 00394 *pOut = (q7_t) (__SSAT(acc0 >> 7u, 8)); 00395 pOut += inc; 00396 *pOut = (q7_t) (__SSAT(acc1 >> 7u, 8)); 00397 pOut += inc; 00398 *pOut = (q7_t) (__SSAT(acc2 >> 7u, 8)); 00399 pOut += inc; 00400 *pOut = (q7_t) (__SSAT(acc3 >> 7u, 8)); 00401 pOut += inc; 00402 00403 /* Initialization of inputB pointer */ 00404 pScr2 = py; 00405 00406 pScratch1 += 4u; 00407 00408 } 00409 00410 00411 blkCnt = (srcALen + srcBLen - 1u) & 0x3; 00412 00413 /* Calculate correlation for remaining samples of Bigger length sequence */ 00414 while(blkCnt > 0) 00415 { 00416 /* Initialze temporary scratch pointer as scratch1 */ 00417 pScr1 = pScratch1; 00418 00419 /* Clear Accumlators */ 00420 acc0 = 0; 00421 00422 tapCnt = (srcBLen) >> 1u; 00423 00424 while(tapCnt > 0u) 00425 { 00426 acc0 += (*pScr1++ * *pScr2++); 00427 acc0 += (*pScr1++ * *pScr2++); 00428 00429 /* Decrement the loop counter */ 00430 tapCnt--; 00431 } 00432 00433 tapCnt = (srcBLen) & 1u; 00434 00435 /* apply same above for remaining samples of smaller length sequence */ 00436 while(tapCnt > 0u) 00437 { 00438 00439 /* accumlate the results */ 00440 acc0 += (*pScr1++ * *pScr2++); 00441 00442 /* Decrement the loop counter */ 00443 tapCnt--; 00444 } 00445 00446 blkCnt--; 00447 00448 /* Store the result in the accumulator in the destination buffer. */ 00449 *pOut = (q7_t) (__SSAT(acc0 >> 7u, 8)); 00450 00451 pOut += inc; 00452 00453 /* Initialization of inputB pointer */ 00454 pScr2 = py; 00455 00456 pScratch1 += 1u; 00457 00458 } 00459 00460 } 00461 00462 /** 00463 * @} end of Corr group 00464 */
Generated on Tue Jul 12 2022 19:48:43 by 1.7.2