mbed-dsp | Mbed

mbed official » Code » Documentation
mbed official / mbed-dsp
CMSIS DSP library
Dependents: KL25Z_FFT_Demo Hat_Board_v5_1 KL25Z_FFT_Demo_tony KL25Z_FFT_Demo_tony ... more
Fork of mbed-dsp by mbed official
Embed: (wiki syntax)
« Back to documentation index
Show/hide line numbers arm_correlate_opt_q15.c Source File
00001 /* ----------------------------------------------------------------------    
00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.    
00003 *    
00004 * $Date:        17. January 2013
00005 * $Revision:    V1.4.1
00006 *    
00007 * Project:      CMSIS DSP Library    
00008 * Title:        arm_correlate_opt_q15.c    
00009 *    
00010 * Description:  Correlation of Q15 sequences.  
00011 *    
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Redistribution and use in source and binary forms, with or without 
00015 * modification, are permitted provided that the following conditions
00016 * are met:
00017 *   - Redistributions of source code must retain the above copyright
00018 *     notice, this list of conditions and the following disclaimer.
00019 *   - Redistributions in binary form must reproduce the above copyright
00020 *     notice, this list of conditions and the following disclaimer in
00021 *     the documentation and/or other materials provided with the 
00022 *     distribution.
00023 *   - Neither the name of ARM LIMITED nor the names of its contributors
00024 *     may be used to endorse or promote products derived from this
00025 *     software without specific prior written permission.
00026 *
00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00038 * POSSIBILITY OF SUCH DAMAGE.    
00039 * -------------------------------------------------------------------- */
00040 
00041 #include "arm_math.h"
00042 
00043 /**    
00044  * @ingroup groupFilters    
00045  */
00046 
00047 /**    
00048  * @addtogroup Corr    
00049  * @{    
00050  */
00051 
00052 /**    
00053  * @brief Correlation of Q15 sequences using a caller-supplied scratch buffer.  
00054  * @param[in] *pSrcA points to the first input sequence.    
00055  * @param[in] srcALen length of the first input sequence, in samples.    
00056  * @param[in] *pSrcB points to the second input sequence.    
00057  * @param[in] srcBLen length of the second input sequence, in samples.    
00058  * @param[out] *pDst points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.    
00059  * @param[in]  *pScratch points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.  Its contents are overwritten.    
00060  * @return none.    
00061  *    
00062  * \par Restrictions    
00063  *  If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE    
00064  *  In this case input, output, scratch buffers should be aligned by 32-bit    
00065  *     
00066  * @details    
00067  * <b>Scaling and Overflow Behavior:</b>    
00068  *    
00069  * \par    
00070  * The function is implemented using a 64-bit internal accumulator.    
00071  * Both inputs are in 1.15 format and multiplications yield a 2.30 result.    
00072  * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.    
00073  * This approach provides 33 guard bits and there is no risk of overflow.    
00074  * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.    
00075  *    
00076  * \par    
00077  * Refer to <code>arm_correlate_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.   
00078  * \par  
00079  * Only (srcALen + srcBLen - 1) output samples are written. The |srcALen - srcBLen| samples that correspond to implicit zero padding (at the start of pDst when srcALen > srcBLen, at the end when srcALen < srcBLen) are skipped and left untouched. Lengths of zero are not handled: the padding count (srcBLen - 1u) is computed in unsigned arithmetic. 
00080  */
00081 
00082 
00083 void arm_correlate_opt_q15(
00084   q15_t * pSrcA,
00085   uint32_t srcALen,
00086   q15_t * pSrcB,
00087   uint32_t srcBLen,
00088   q15_t * pDst,
00089   q15_t * pScratch)
00090 {
00091   q15_t *pIn1;                                   /* longer sequence (pSrcA, or pSrcB when the inputs are swapped) */
00092   q15_t *pIn2;                                   /* shorter sequence, walked tap by tap */
00093   q63_t acc0, acc1, acc2, acc3;                  /* 64-bit accumulators, one per output sample of a block of four */
00094   q15_t *py;                                     /* saved start of the shorter sequence, used to rewind pIn2 */
00095   q31_t x1, x2, x3;                              /* 32-bit words holding scratch-buffer samples (x3 is assembled from x1/x2 with __PKHBT) */
00096   uint32_t j, blkCnt, outBlockSize;              /* general counter, output block counter, total output length */
00097   int32_t inc = 1;                               /* output pointer step: +1, or -1 when the inputs are swapped */
00098   uint32_t tapCnt;
00099   q31_t y1, y2;
00100   q15_t *pScr;                                   /* moving pointer into the scratch buffer */
00101   q15_t *pOut = pDst;                            /* output pointer               */
00102 #ifdef UNALIGNED_SUPPORT_DISABLE
00103 
00104   q15_t a, b;
00105 
00106 #endif  /*  #ifdef UNALIGNED_SUPPORT_DISABLE   */
00107 
00108   /* The algorithm implementation is based on the lengths of the inputs. */
00109   /* srcB is always made to slide across srcA. */
00110   /* So srcBLen is always treated as shorter than or equal to srcALen */
00111   /* But CORR(x, y) is the reverse of CORR(y, x) */
00112   /* So, when srcBLen > srcALen, the output pointer is made to point to the last computed output sample */
00113   /* and the destination pointer modifier, inc, is set to -1 */
00114   /* If srcALen > srcBLen, srcB would have to be zero padded to make the two inputs the same length */
00115   /* But to improve the performance,        
00116    * output samples are skipped instead of zero padding either of the inputs */
00117   /* If srcALen > srcBLen,        
00118    * (srcALen - srcBLen) zeroes belong at the start of the output buffer */
00119   /* If srcALen < srcBLen,        
00120    * (srcBLen - srcALen) zeroes belong at the end of the output buffer */
00121   if(srcALen >= srcBLen)
00122   {
00123     /* Initialization of inputA pointer */
00124     pIn1 = (pSrcA);
00125 
00126     /* Initialization of inputB pointer */
00127     pIn2 = (pSrcB);
00128 
00129     /* Number of output samples is calculated */
00130     outBlockSize = (2u * srcALen) - 1u;
00131 
00132     /* Rather than zero padding srcB up to srcALen,        
00133      * the first (outBlockSize - (srcALen + srcBLen - 1))        
00134      * output samples are skipped.        
00135      * This function does not write to those samples. */
00136     j = outBlockSize - (srcALen + (srcBLen - 1u));
00137 
00138     /* Advance the output pointer past the skipped samples */
00139     pOut += j;
00140 
00141   }
00142   else
00143   {
00144     /* The longer sequence (srcB) takes the role of inputA */
00145     pIn1 = (pSrcB);
00146 
00147     /* The shorter sequence (srcA) takes the role of inputB */
00148     pIn2 = (pSrcA);
00149 
00150     /* Swap the lengths so that srcBLen is the shorter or equal one */
00151     j = srcBLen;
00152     srcBLen = srcALen;
00153     srcALen = j;
00154 
00155     /* CORR(x, y) = Reverse order(CORR(y, x)) */
00156     /* Hence set the destination pointer to point to the last computed output sample */
00157     pOut = pDst + ((srcALen + srcBLen) - 2u);
00158 
00159     /* Destination address modifier is set to -1 */
00160     inc = -1;
00161 
00162   }
00163 
00164   pScr = pScratch;
00165 
00166   /* Leading padding: fill (srcBLen - 1u) zeros at the start of the scratch buffer */
00167   arm_fill_q15(0, pScr, (srcBLen - 1u));
00168 
00169   /* Update temporary scratch pointer */
00170   pScr += (srcBLen - 1u);
00171 
00172 #ifndef UNALIGNED_SUPPORT_DISABLE
00173 
00174   /* Copy the (srcALen) samples of the longer sequence into the scratch buffer */
00175   arm_copy_q15(pIn1, pScr, srcALen);
00176 
00177   /* Update pointers */
00178   // pIn1 is not read again below, so only the scratch pointer is advanced
00179   pScr += srcALen;
00180 
00181 #else
00182 
00183   /* Copy the longer sequence into the scratch buffer sample by sample, unrolled by 4. */
00184   j = srcALen >> 2u;
00185 
00186   /* First part of the processing with loop unrolling copies 4 data points at a time.       
00187    ** a second loop below copies the remaining 1 to 3 samples. */
00188   while(j > 0u)
00189   {
00190     /* copy four samples of the longer sequence, in forward order */
00191     *pScr++ = *pIn1++;
00192     *pScr++ = *pIn1++;
00193     *pScr++ = *pIn1++;
00194     *pScr++ = *pIn1++;
00195 
00196     /* Decrement the loop counter */
00197     j--;
00198   }
00199 
00200   /* If the count is not a multiple of 4, copy remaining samples here.       
00201    ** No loop unrolling is used. */
00202   j = srcALen % 0x4u;
00203 
00204   while(j > 0u)
00205   {
00206     /* copy one remaining sample of the longer sequence */
00207     *pScr++ = *pIn1++;
00208 
00209     /* Decrement the loop counter */
00210     j--;
00211   }
00212 
00213 #endif  /*  #ifndef UNALIGNED_SUPPORT_DISABLE   */
00214 
00215 #ifndef UNALIGNED_SUPPORT_DISABLE
00216 
00217   /* Trailing padding: fill (srcBLen - 1u) zeros at the end of the scratch buffer */
00218   arm_fill_q15(0, pScr, (srcBLen - 1u));
00219 
00220   /* Update pointer */
00221   pScr += (srcBLen - 1u);
00222 
00223 #else
00224 
00225 /* Write the (srcBLen - 1u) trailing zeros sample by sample, unrolled by 4. */
00226   j = (srcBLen - 1u) >> 2u;
00227 
00228   /* First part of the processing with loop unrolling writes 4 zeros at a time.       
00229    ** a second loop below writes the remaining 1 to 3 zeros. */
00230   while(j > 0u)
00231   {
00232     /* write four zeros of trailing padding */
00233     *pScr++ = 0;
00234     *pScr++ = 0;
00235     *pScr++ = 0;
00236     *pScr++ = 0;
00237 
00238     /* Decrement the loop counter */
00239     j--;
00240   }
00241 
00242   /* If the count is not a multiple of 4, write the remaining zeros here.       
00243    ** No loop unrolling is used. */
00244   j = (srcBLen - 1u) % 0x4u;
00245 
00246   while(j > 0u)
00247   {
00248     /* write one remaining zero of trailing padding */
00249     *pScr++ = 0;
00250 
00251     /* Decrement the loop counter */
00252     j--;
00253   }
00254 
00255 #endif  /*  #ifndef UNALIGNED_SUPPORT_DISABLE   */
00256 
00257   /* Remember the start of the shorter sequence so pIn2 can be rewound after each output block */
00258   py = pIn2;
00259 
00260 
00261   /* Actual correlation process starts here: blocks of four output samples */
00262   blkCnt = (srcALen + srcBLen - 1u) >> 2;
00263 
00264   while(blkCnt > 0)
00265   {
00266     /* Initialize the temporary scratch pointer to the current window start */
00267     pScr = pScratch;
00268 
00269     /* Clear Accumulators */
00270     acc0 = 0;
00271     acc1 = 0;
00272     acc2 = 0;
00273     acc3 = 0;
00274 
00275     /* Preload the first 32-bit word from the scratch buffer (presumably two packed q15 samples; __SIMD32 is defined outside this file - confirm) */
00276     x1 = *__SIMD32(pScr)++;
00277 
00278     /* Preload the next 32-bit word from the scratch buffer */
00279     x2 = *__SIMD32(pScr)++;
00280 
00281     tapCnt = (srcBLen) >> 2u;
00282 
00283     while(tapCnt > 0u)
00284     {
00285 
00286 #ifndef UNALIGNED_SUPPORT_DISABLE
00287 
00288       /* Read four samples from the shorter sequence as two 32-bit words */
00289       y1 = _SIMD32_OFFSET(pIn2);
00290       y2 = _SIMD32_OFFSET(pIn2 + 2u);
00291 
00292       acc0 = __SMLALD(x1, y1, acc0);
00293 
00294       acc2 = __SMLALD(x2, y1, acc2);
00295 
00296 #ifndef ARM_MATH_BIG_ENDIAN
00297       x3 = __PKHBT(x2, x1, 0);
00298 #else
00299       x3 = __PKHBT(x1, x2, 0);
00300 #endif
00301 
00302       acc1 = __SMLALDX(x3, y1, acc1);
00303 
00304       x1 = _SIMD32_OFFSET(pScr);
00305 
00306       acc0 = __SMLALD(x2, y2, acc0);
00307 
00308       acc2 = __SMLALD(x1, y2, acc2);
00309 
00310 #ifndef ARM_MATH_BIG_ENDIAN
00311       x3 = __PKHBT(x1, x2, 0);
00312 #else
00313       x3 = __PKHBT(x2, x1, 0);
00314 #endif
00315 
00316       acc3 = __SMLALDX(x3, y1, acc3);
00317 
00318       acc1 = __SMLALDX(x3, y2, acc1);
00319 
00320       x2 = _SIMD32_OFFSET(pScr + 2u);
00321 
00322 #ifndef ARM_MATH_BIG_ENDIAN
00323       x3 = __PKHBT(x2, x1, 0);
00324 #else
00325       x3 = __PKHBT(x1, x2, 0);
00326 #endif
00327 
00328       acc3 = __SMLALDX(x3, y2, acc3);
00329 
00330 #else    
00331 
00332       /* Read four samples from the shorter sequence, packing each pair of q15 values with __PKHBT (no 32-bit loads) */
00333       a = *pIn2;
00334       b = *(pIn2 + 1);
00335 
00336 #ifndef ARM_MATH_BIG_ENDIAN
00337       y1 = __PKHBT(a, b, 16);
00338 #else
00339       y1 = __PKHBT(b, a, 16);
00340 #endif
00341       
00342       a = *(pIn2 + 2);
00343       b = *(pIn2 + 3);
00344 #ifndef ARM_MATH_BIG_ENDIAN
00345       y2 = __PKHBT(a, b, 16);
00346 #else
00347       y2 = __PKHBT(b, a, 16);
00348 #endif              
00349 
00350       acc0 = __SMLALD(x1, y1, acc0);
00351 
00352       acc2 = __SMLALD(x2, y1, acc2);
00353 
00354 #ifndef ARM_MATH_BIG_ENDIAN
00355       x3 = __PKHBT(x2, x1, 0);
00356 #else
00357       x3 = __PKHBT(x1, x2, 0);
00358 #endif
00359 
00360       acc1 = __SMLALDX(x3, y1, acc1);
00361 
00362       a = *pScr;
00363       b = *(pScr + 1);
00364 
00365 #ifndef ARM_MATH_BIG_ENDIAN
00366       x1 = __PKHBT(a, b, 16);
00367 #else
00368       x1 = __PKHBT(b, a, 16);
00369 #endif
00370 
00371       acc0 = __SMLALD(x2, y2, acc0);
00372 
00373       acc2 = __SMLALD(x1, y2, acc2);
00374 
00375 #ifndef ARM_MATH_BIG_ENDIAN
00376       x3 = __PKHBT(x1, x2, 0);
00377 #else
00378       x3 = __PKHBT(x2, x1, 0);
00379 #endif
00380 
00381       acc3 = __SMLALDX(x3, y1, acc3);
00382 
00383       acc1 = __SMLALDX(x3, y2, acc1);
00384 
00385       a = *(pScr + 2);
00386       b = *(pScr + 3);
00387 
00388 #ifndef ARM_MATH_BIG_ENDIAN
00389       x2 = __PKHBT(a, b, 16);
00390 #else
00391       x2 = __PKHBT(b, a, 16);
00392 #endif
00393 
00394 #ifndef ARM_MATH_BIG_ENDIAN
00395       x3 = __PKHBT(x2, x1, 0);
00396 #else
00397       x3 = __PKHBT(x1, x2, 0);
00398 #endif
00399 
00400       acc3 = __SMLALDX(x3, y2, acc3);
00401 
00402 #endif  /*  #ifndef UNALIGNED_SUPPORT_DISABLE   */
00403 
00404       pIn2 += 4u;
00405 
00406       pScr += 4u;
00407 
00408 
00409       /* Decrement the loop counter */
00410       tapCnt--;
00411     }
00412 
00413     /* NOTE(review): every tap iteration reloads x1/x2 from pScr..pScr+3 for the next iteration. When srcBLen and (srcALen + srcBLen - 1) are both multiples of 4, the last x2 load of the last block appears to reach one q15 past the documented scratch size - TODO confirm. */
00414 
00415     /* Step back over the four samples that were loaded ahead, so the scalar tail starts at the first unconsumed tap */
00416     pScr -= 4u;
00417 
00418 
00419     /* Handle the remaining (srcBLen mod 4) taps of the shorter sequence one at a time */
00420     tapCnt = (srcBLen) & 3u;
00421 
00422     while(tapCnt > 0u)
00423     {
00424 
00425       /* accumulate: the same tap *pIn2 multiplies four consecutive scratch samples, one per accumulator */
00426       acc0 += (*pScr++ * *pIn2);
00427       acc1 += (*pScr++ * *pIn2);
00428       acc2 += (*pScr++ * *pIn2);
00429       acc3 += (*pScr++ * *pIn2++);
00430       /* Net advance of one scratch sample per tap */
00431       pScr -= 3u;
00432 
00433       /* Decrement the loop counter */
00434       tapCnt--;
00435     }
00436 
00437     blkCnt--;
00438 
00439 
00440     /* Discard the low 15 bits, saturate to 16 bits and store the four results, stepping pOut by inc. */
00441     *pOut = (__SSAT(acc0 >> 15u, 16));
00442     pOut += inc;
00443     *pOut = (__SSAT(acc1 >> 15u, 16));
00444     pOut += inc;
00445     *pOut = (__SSAT(acc2 >> 15u, 16));
00446     pOut += inc;
00447     *pOut = (__SSAT(acc3 >> 15u, 16));
00448     pOut += inc;
00449 
00450     /* Rewind the shorter-sequence pointer to its start for the next block */
00451     pIn2 = py;
00452     /* Slide the scratch window forward by the four outputs just produced */
00453     pScratch += 4u;
00454 
00455   }
00456 
00457 
00458   blkCnt = (srcALen + srcBLen - 1u) & 0x3;
00459 
00460   /* Compute the remaining 0 to 3 output samples one at a time */
00461   while(blkCnt > 0)
00462   {
00463     /* Initialize the temporary scratch pointer to the current window start */
00464     pScr = pScratch;
00465 
00466     /* Clear Accumulator */
00467     acc0 = 0;
00468 
00469     tapCnt = (srcBLen) >> 1u;
00470 
00471     while(tapCnt > 0u)
00472     {
00473       /* Two taps per iteration */
00474       acc0 += (*pScr++ * *pIn2++);
00475       acc0 += (*pScr++ * *pIn2++);
00476 
00477       /* Decrement the loop counter */
00478       tapCnt--;
00479     }
00480 
00481     tapCnt = (srcBLen) & 1u;
00482 
00483     /* Handle the last tap when srcBLen is odd */
00484     while(tapCnt > 0u)
00485     {
00486 
00487       /* accumulate the results */
00488       acc0 += (*pScr++ * *pIn2++);
00489 
00490       /* Decrement the loop counter */
00491       tapCnt--;
00492     }
00493 
00494     blkCnt--;
00495 
00496     /* Discard the low 15 bits, saturate to 16 bits and store the result. */
00497     *pOut = (q15_t) (__SSAT((acc0 >> 15), 16));
00498 
00499     pOut += inc;
00500 
00501     /* Rewind the shorter-sequence pointer to its start for the next output sample */
00502     pIn2 = py;
00503     /* Slide the scratch window forward by one sample */
00504     pScratch += 1u;
00505 
00506   }
00507 
00508 
00509 }
00510 
00511 /**    
00512  * @} end of Corr group    
00513  */
Repository toolbox

Export to desktop IDE
Repository details

Type:	Library
Created:	11 Feb 2014
Imports:	270
Forks:	0
Commits:	4
Dependents:	55
Dependencies:	0
Followers:	25
The code in this repository is MIT licensed.
arm_correlate_opt_q15.c

Repository toolbox

Repository details

Important Information for this Arm website

Access Warning