V4.0.1 of the ARM CMSIS DSP libraries. Note that arm_bitreversal2.s, arm_cfft_f32.c and arm_rfft_fast_f32.c had to be removed. arm_bitreversal2.s will not assemble with the online tools. So, the fast f32 FFT functions are not yet available. All the other FFT functions are available.

Dependents:   MPU9150_Example fir_f32 fir_f32 MPU9150_nucleo_noni2cdev ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_correlate_fast_opt_q15.c Source File

arm_correlate_fast_opt_q15.c

00001 /* ----------------------------------------------------------------------    
00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
00003 *    
00004 * $Date:        12. March 2014
00005 * $Revision:    V1.4.3
00006 *    
00007 * Project:      CMSIS DSP Library    
00008 * Title:        arm_correlate_fast_opt_q15.c    
00009 *    
00010 * Description:  Fast Q15 Correlation.    
00011 *    
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Redistribution and use in source and binary forms, with or without 
00015 * modification, are permitted provided that the following conditions
00016 * are met:
00017 *   - Redistributions of source code must retain the above copyright
00018 *     notice, this list of conditions and the following disclaimer.
00019 *   - Redistributions in binary form must reproduce the above copyright
00020 *     notice, this list of conditions and the following disclaimer in
00021 *     the documentation and/or other materials provided with the 
00022 *     distribution.
00023 *   - Neither the name of ARM LIMITED nor the names of its contributors
00024 *     may be used to endorse or promote products derived from this
00025 *     software without specific prior written permission.
00026 *
00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00038 * POSSIBILITY OF SUCH DAMAGE.    
00039 * -------------------------------------------------------------------- */
00040 
00041 #include "arm_math.h"
00042 
00043 /**    
00044  * @ingroup groupFilters    
00045  */
00046 
00047 /**    
00048  * @addtogroup Corr    
00049  * @{    
00050  */
00051 
00052 /**    
00053  * @brief Correlation of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.    
00054  * @param[in] *pSrcA points to the first input sequence.    
00055  * @param[in] srcALen length of the first input sequence.    
00056  * @param[in] *pSrcB points to the second input sequence.    
00057  * @param[in] srcBLen length of the second input sequence.    
00058  * @param[out] *pDst points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.    
00059  * @param[in]  *pScratch points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.   
00060  * @return none.    
00061  *    
00062  *    
00063  * \par Restrictions    
00064  *  If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE    
00065  *  In this case input, output, scratch buffers should be aligned by 32-bit    
00066  *    
00067  *     
00068  * <b>Scaling and Overflow Behavior:</b>    
00069  *    
00070  * \par    
00071  * This fast version uses a 32-bit accumulator with 2.30 format.    
00072  * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.    
00073  * There is no saturation on intermediate additions.    
00074  * Thus, if the accumulator overflows it wraps around and distorts the result.    
00075  * The input signals should be scaled down to avoid intermediate overflows.    
00076  * Scale down one of the inputs by 1/min(srcALen, srcBLen) to avoid overflow since a    
00077  * maximum of min(srcALen, srcBLen) number of additions is carried internally.    
00078  * The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.    
00079  *    
00080  * \par    
00081  * See <code>arm_correlate_q15()</code> for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.    
00082  */
00083 
void arm_correlate_fast_opt_q15(
  q15_t * pSrcA,
  uint32_t srcALen,
  q15_t * pSrcB,
  uint32_t srcBLen,
  q15_t * pDst,
  q15_t * pScratch)
{
  q15_t *pIn1;                                   /* inputA pointer (the longer sequence after the swap below)  */
  q15_t *pIn2;                                   /* inputB pointer (the shorter sequence after the swap below) */
  q31_t acc0, acc1, acc2, acc3;                  /* Accumulators for the 4 output samples computed per block   */
  q15_t *py;                                     /* Saved copy of pIn2, restored at the start of each block    */
  q31_t x1, x2, x3;                              /* Temporaries holding packed pairs of scratch-buffer samples */
  uint32_t j, blkCnt, outBlockSize;              /* Loop counters / output length                              */
  int32_t inc = 1;                               /* Destination address modifier: +1 forward, -1 reversed      */
  uint32_t tapCnt;                               /* Inner (tap) loop counter                                   */
  q31_t y1, y2;                                  /* Packed pairs of samples from the shorter sequence          */
  q15_t *pScr;                                   /* Intermediate scratch pointer                               */
  q15_t *pOut = pDst;                            /* Output pointer                                             */
#ifdef UNALIGNED_SUPPORT_DISABLE

  q15_t a, b;                                    /* Scalars used to assemble packed words without unaligned loads */

#endif  /*  #ifdef UNALIGNED_SUPPORT_DISABLE   */

  /* The algorithm implementation is based on the lengths of the inputs. */
  /* srcB is always made to slide across srcA. */
  /* So srcBLen is always considered as shorter or equal to srcALen */
  /* But CORR(x, y) is reverse of CORR(y, x) */
  /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
  /* and the destination pointer modifier, inc is set to -1 */
  /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
  /* But to improve the performance,
   * we include zeroes in the output instead of zero padding either of the inputs */
  /* If srcALen > srcBLen,
   * (srcALen - srcBLen) zeroes have to be included at the start of the output buffer */
  /* If srcALen < srcBLen,
   * (srcALen - srcBLen) zeroes have to be included at the end of the output buffer */
  if(srcALen >= srcBLen)
  {
    /* Initialization of inputA pointer */
    pIn1 = (pSrcA);

    /* Initialization of inputB pointer */
    pIn2 = (pSrcB);

    /* Number of output samples is calculated */
    outBlockSize = (2u * srcALen) - 1u;

    /* When srcALen > srcBLen, zero padding is done to srcB
     * to make their lengths equal.
     * Instead, (outBlockSize - (srcALen + srcBLen - 1))
     * number of output samples are made zero */
    j = outBlockSize - (srcALen + (srcBLen - 1u));

    /* Updating the pointer position to non zero value */
    pOut += j;

  }
  else
  {
    /* Initialization of inputA pointer */
    pIn1 = (pSrcB);

    /* Initialization of inputB pointer */
    pIn2 = (pSrcA);

    /* srcBLen is always considered as shorter or equal to srcALen */
    j = srcBLen;
    srcBLen = srcALen;
    srcALen = j;

    /* CORR(x, y) = Reverse order(CORR(y, x)) */
    /* Hence set the destination pointer to point to the last output sample */
    pOut = pDst + ((srcALen + srcBLen) - 2u);

    /* Destination address modifier is set to -1 so results are written in reverse */
    inc = -1;

  }

  /* Build the scratch buffer as: [srcBLen-1 zeros | srcALen samples of pIn1 | srcBLen-1 zeros].
   * The leading/trailing zeros let every output be computed with the same srcBLen-tap MAC loop,
   * with no edge-case handling for the partial-overlap regions of the correlation. */
  pScr = pScratch;

  /* Fill (srcBLen - 1u) zeros in scratch buffer */
  arm_fill_q15(0, pScr, (srcBLen - 1u));

  /* Update temporary scratch pointer */
  pScr += (srcBLen - 1u);

#ifndef UNALIGNED_SUPPORT_DISABLE

  /* Copy (srcALen) samples in scratch buffer */
  arm_copy_q15(pIn1, pScr, srcALen);

  /* Update pointers */
  pScr += srcALen;

#else

  /* Apply loop unrolling and do 4 Copies simultaneously. */
  j = srcALen >> 2u;

  /* First part of the processing with loop unrolling copies 4 data points at a time.
   ** a second loop below copies for the remaining 1 to 3 samples. */
  while(j > 0u)
  {
    /* Copy longer input into the scratch buffer, 4 samples per iteration */
    *pScr++ = *pIn1++;
    *pScr++ = *pIn1++;
    *pScr++ = *pIn1++;
    *pScr++ = *pIn1++;

    /* Decrement the loop counter */
    j--;
  }

  /* If the count is not a multiple of 4, copy remaining samples here.
   ** No loop unrolling is used. */
  j = srcALen % 0x4u;

  while(j > 0u)
  {
    /* Copy the remaining 1 to 3 samples */
    *pScr++ = *pIn1++;

    /* Decrement the loop counter */
    j--;
  }

#endif  /*  #ifndef UNALIGNED_SUPPORT_DISABLE   */

#ifndef UNALIGNED_SUPPORT_DISABLE

  /* Fill (srcBLen - 1u) zeros at end of scratch buffer */
  arm_fill_q15(0, pScr, (srcBLen - 1u));

  /* Update pointer */
  pScr += (srcBLen - 1u);

#else

/* Apply loop unrolling and do 4 Copies simultaneously. */
  j = (srcBLen - 1u) >> 2u;

  /* First part of the processing with loop unrolling copies 4 data points at a time.
   ** a second loop below copies for the remaining 1 to 3 samples. */
  while(j > 0u)
  {
    /* Write trailing zero padding, 4 samples per iteration */
    *pScr++ = 0;
    *pScr++ = 0;
    *pScr++ = 0;
    *pScr++ = 0;

    /* Decrement the loop counter */
    j--;
  }

  /* If the count is not a multiple of 4, copy remaining samples here.
   ** No loop unrolling is used. */
  j = (srcBLen - 1u) % 0x4u;

  while(j > 0u)
  {
    /* Write the remaining 1 to 3 zeros */
    *pScr++ = 0;

    /* Decrement the loop counter */
    j--;
  }

#endif  /*  #ifndef UNALIGNED_SUPPORT_DISABLE   */

  /* Save the inputB (shorter sequence) pointer so it can be restored
   * at the start of every output block below */
  py = pIn2;


  /* Actual correlation process starts here.
   * Four output samples are computed per iteration; each output uses the same
   * srcBLen taps of pIn2 against a scratch window shifted by one sample. */
  blkCnt = (srcALen + srcBLen - 1u) >> 2;

  while(blkCnt > 0)
  {
    /* Initialize temporary scratch pointer as scratch1 */
    pScr = pScratch;

    /* Clear accumulators */
    acc0 = 0;
    acc1 = 0;
    acc2 = 0;
    acc3 = 0;

    /* Read four samples from scratch1 buffer (two packed q15 pairs) */
    x1 = *__SIMD32(pScr)++;

    /* Read next four samples from scratch1 buffer */
    x2 = *__SIMD32(pScr)++;

    /* Process srcBLen taps, 4 at a time */
    tapCnt = (srcBLen) >> 2u;

    while(tapCnt > 0u)
    {

#ifndef UNALIGNED_SUPPORT_DISABLE

      /* Read four samples from smaller buffer (may be an unaligned 32-bit access) */
      y1 = _SIMD32_OFFSET(pIn2);
      y2 = _SIMD32_OFFSET(pIn2 + 2u);

      /* acc0/acc2 use aligned pairs; acc1/acc3 need pairs offset by one sample,
       * which are built with __PKHBT and multiplied cross-wise via __SMLADX */
      acc0 = __SMLAD(x1, y1, acc0);

      acc2 = __SMLAD(x2, y1, acc2);

      /* Pack the two middle samples (offset-by-one pair); operand order is endian dependent */
#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x2, x1, 0);
#else
      x3 = __PKHBT(x1, x2, 0);
#endif

      acc1 = __SMLADX(x3, y1, acc1);

      /* Reload the next packed pair of the scratch window */
      x1 = _SIMD32_OFFSET(pScr);

      acc0 = __SMLAD(x2, y2, acc0);

      acc2 = __SMLAD(x1, y2, acc2);

#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x1, x2, 0);
#else
      x3 = __PKHBT(x2, x1, 0);
#endif

      acc3 = __SMLADX(x3, y1, acc3);

      acc1 = __SMLADX(x3, y2, acc1);

      x2 = _SIMD32_OFFSET(pScr + 2u);

#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x2, x1, 0);
#else
      x3 = __PKHBT(x1, x2, 0);
#endif

      acc3 = __SMLADX(x3, y2, acc3);
#else

      /* Read four samples from smaller buffer as scalars and pack them,
       * avoiding unaligned 32-bit accesses */
      a = *pIn2;
      b = *(pIn2 + 1);

#ifndef ARM_MATH_BIG_ENDIAN
      y1 = __PKHBT(a, b, 16);
#else
      y1 = __PKHBT(b, a, 16);
#endif

      a = *(pIn2 + 2);
      b = *(pIn2 + 3);
#ifndef ARM_MATH_BIG_ENDIAN
      y2 = __PKHBT(a, b, 16);
#else
      y2 = __PKHBT(b, a, 16);
#endif

      acc0 = __SMLAD(x1, y1, acc0);

      acc2 = __SMLAD(x2, y1, acc2);

      /* Offset-by-one pair for the odd accumulators; order is endian dependent */
#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x2, x1, 0);
#else
      x3 = __PKHBT(x1, x2, 0);
#endif

      acc1 = __SMLADX(x3, y1, acc1);

      /* Reload the next packed pair of the scratch window from scalars */
      a = *pScr;
      b = *(pScr + 1);

#ifndef ARM_MATH_BIG_ENDIAN
      x1 = __PKHBT(a, b, 16);
#else
      x1 = __PKHBT(b, a, 16);
#endif

      acc0 = __SMLAD(x2, y2, acc0);

      acc2 = __SMLAD(x1, y2, acc2);

#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x1, x2, 0);
#else
      x3 = __PKHBT(x2, x1, 0);
#endif

      acc3 = __SMLADX(x3, y1, acc3);

      acc1 = __SMLADX(x3, y2, acc1);

      a = *(pScr + 2);
      b = *(pScr + 3);

#ifndef ARM_MATH_BIG_ENDIAN
      x2 = __PKHBT(a, b, 16);
#else
      x2 = __PKHBT(b, a, 16);
#endif

#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x2, x1, 0);
#else
      x3 = __PKHBT(x1, x2, 0);
#endif

      acc3 = __SMLADX(x3, y2, acc3);

#endif  /*  #ifndef UNALIGNED_SUPPORT_DISABLE   */

      /* Advance to the next group of 4 taps */
      pIn2 += 4u;

      pScr += 4u;


      /* Decrement the loop counter */
      tapCnt--;
    }



    /* Rewind scratch pointer: the unrolled loop pre-reads 4 samples ahead,
     * so step back before processing the leftover taps */
    pScr -= 4u;


    /* Process the remaining 1 to 3 taps of the shorter length sequence */
    tapCnt = (srcBLen) & 3u;

    while(tapCnt > 0u)
    {

      /* One tap feeds all 4 accumulators, each from a window shifted by one sample */
      acc0 += (*pScr++ * *pIn2);
      acc1 += (*pScr++ * *pIn2);
      acc2 += (*pScr++ * *pIn2);
      acc3 += (*pScr++ * *pIn2++);

      /* Slide the 4-sample window forward by one */
      pScr -= 3u;

      /* Decrement the loop counter */
      tapCnt--;
    }

    blkCnt--;


    /* Store the results: shift the 2.30 accumulators down by 15 and
     * saturate to 1.15 format, writing in the direction given by inc. */
    *pOut = (__SSAT(acc0 >> 15u, 16));
    pOut += inc;
    *pOut = (__SSAT(acc1 >> 15u, 16));
    pOut += inc;
    *pOut = (__SSAT(acc2 >> 15u, 16));
    pOut += inc;
    *pOut = (__SSAT(acc3 >> 15u, 16));
    pOut += inc;


    /* Restore inputB pointer for the next output block */
    pIn2 = py;

    /* Advance the scratch window by the 4 outputs just produced */
    pScratch += 4u;

  }


  /* Number of leftover outputs (0 to 3) after the 4-at-a-time loop */
  blkCnt = (srcALen + srcBLen - 1u) & 0x3;

  /* Calculate correlation for remaining samples of bigger length sequence,
   * one output per iteration with a 2-way unrolled tap loop */
  while(blkCnt > 0)
  {
    /* Initialize temporary scratch pointer as scratch1 */
    pScr = pScratch;

    /* Clear accumulator */
    acc0 = 0;

    tapCnt = (srcBLen) >> 1u;

    while(tapCnt > 0u)
    {

      /* Accumulate two taps per iteration */
      acc0 += (*pScr++ * *pIn2++);
      acc0 += (*pScr++ * *pIn2++);

      /* Decrement the loop counter */
      tapCnt--;
    }

    tapCnt = (srcBLen) & 1u;

    /* Handle the odd tap, if any */
    while(tapCnt > 0u)
    {

      /* Accumulate the result */
      acc0 += (*pScr++ * *pIn2++);

      /* Decrement the loop counter */
      tapCnt--;
    }

    blkCnt--;

    /* Store the result: shift down by 15 and saturate to 1.15 format */

    *pOut = (q15_t) (__SSAT((acc0 >> 15), 16));

    pOut += inc;

    /* Restore inputB pointer for the next output */
    pIn2 = py;

    /* Advance the scratch window by one output */
    pScratch += 1u;

  }
}
00509 
00510 /**    
00511  * @} end of Corr group    
00512  */