CMSIS DSP library

arm_conv_opt_q15.c

/* ----------------------------------------------------------------------
* Copyright (C) 2010-2014 ARM Limited. All rights reserved.
*
* $Date:        19. March 2015
* $Revision:    V.1.4.5
*
* Project:      CMSIS DSP Library
* Title:        arm_conv_opt_q15.c
*
* Description:  Convolution of Q15 sequences.
*
* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*   - Redistributions of source code must retain the above copyright
*     notice, this list of conditions and the following disclaimer.
*   - Redistributions in binary form must reproduce the above copyright
*     notice, this list of conditions and the following disclaimer in
*     the documentation and/or other materials provided with the
*     distribution.
*   - Neither the name of ARM LIMITED nor the names of its contributors
*     may be used to endorse or promote products derived from this
*     software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
* -------------------------------------------------------------------- */

#include "arm_math.h"

/**
 * @ingroup groupFilters
 */

/**
 * @addtogroup Conv
 * @{
 */

/**
 * @brief Convolution of Q15 sequences.
 * @param[in] *pSrcA points to the first input sequence.
 * @param[in] srcALen length of the first input sequence.
 * @param[in] *pSrcB points to the second input sequence.
 * @param[in] srcBLen length of the second input sequence.
 * @param[out] *pDst points to the location where the output result is written.  Length srcALen+srcBLen-1.
 * @param[in]  *pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
 * @param[in]  *pScratch2 points to scratch buffer of size min(srcALen, srcBLen).
 * @return none.
 *
 * \par Restrictions
 *  If the silicon does not support unaligned memory access, enable the macro UNALIGNED_SUPPORT_DISABLE.
 *  In this case the input, output, scratch1 and scratch2 buffers should be 32-bit aligned.
 *
 * @details
 * <b>Scaling and Overflow Behavior:</b>
 *
 * \par
 * The function is implemented using a 64-bit internal accumulator.
 * Both inputs are in 1.15 format and multiplications yield a 2.30 result.
 * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
 * This approach provides 33 guard bits and there is no risk of overflow.
 * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
 *
 * \par
 * Refer to <code>arm_conv_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
 */

void arm_conv_opt_q15(
  q15_t * pSrcA,
  uint32_t srcALen,
  q15_t * pSrcB,
  uint32_t srcBLen,
  q15_t * pDst,
  q15_t * pScratch1,
  q15_t * pScratch2)
{
  q63_t acc0, acc1, acc2, acc3;                  /* Accumulators */
  q31_t x1, x2, x3;                              /* Temporary variables to hold state and coefficient values */
  q31_t y1, y2;                                  /* State variables */
  q15_t *pOut = pDst;                            /* Output pointer */
  q15_t *pScr1 = pScratch1;                      /* Temporary pointer for scratch1 */
  q15_t *pScr2 = pScratch2;                      /* Temporary pointer for scratch2 */
  q15_t *pIn1;                                   /* InputA pointer */
  q15_t *pIn2;                                   /* InputB pointer */
  q15_t *px;                                     /* Intermediate inputA pointer */
  q15_t *py;                                     /* Intermediate inputB pointer */
  uint32_t j, k, blkCnt;                         /* Loop counters */
  uint32_t tapCnt;                               /* Loop count */
#ifdef UNALIGNED_SUPPORT_DISABLE

  q15_t a, b;

#endif  /*  #ifdef UNALIGNED_SUPPORT_DISABLE   */

  /* The algorithm implementation is based on the lengths of the inputs. */
  /* srcB is always made to slide across srcA. */
  /* So srcBLen is always considered shorter than or equal to srcALen. */
  if(srcALen >= srcBLen)
  {
    /* Initialization of inputA pointer */
    pIn1 = pSrcA;

    /* Initialization of inputB pointer */
    pIn2 = pSrcB;

  }
  else
  {
    /* Initialization of inputA pointer */
    pIn1 = pSrcB;

    /* Initialization of inputB pointer */
    pIn2 = pSrcA;

    /* srcBLen is always considered shorter than or equal to srcALen */
    j = srcBLen;
    srcBLen = srcALen;
    srcALen = j;
  }

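  /* Scratch buffer layout used below:
   *   scratch2: the shorter sequence copied in reverse order (srcBLen samples).
   *   scratch1: (srcBLen - 1) zeros, then the longer sequence (srcALen samples),
   *             then (srcBLen - 1) more zeros.
   * Each output sample is then the dot product of srcBLen consecutive scratch1
   * samples with the reversed sequence held in scratch2. */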
  /* Pointer to the end of the scratch2 buffer */
  pScr2 = pScratch2 + srcBLen - 1;

  /* px points to the shorter length sequence */
  px = pIn2;

  /* Apply loop unrolling and do 4 copies simultaneously. */
  k = srcBLen >> 2u;

  /* First part of the processing with loop unrolling copies 4 data points at a time.
   ** A second loop below copies the remaining 1 to 3 samples. */
  /* Copy the shorter input sequence in reverse order into the second scratch buffer */
  while(k > 0u)
  {
    /* Copy the second buffer in reverse order */
    *pScr2-- = *px++;
    *pScr2-- = *px++;
    *pScr2-- = *px++;
    *pScr2-- = *px++;

    /* Decrement the loop counter */
    k--;
  }

  /* If the count is not a multiple of 4, copy the remaining samples here.
   ** No loop unrolling is used. */
  k = srcBLen % 0x4u;

  while(k > 0u)
  {
    /* Copy the second buffer in reverse order for the remaining samples */
    *pScr2-- = *px++;

    /* Decrement the loop counter */
    k--;
  }

  /* Initialize temporary scratch pointer */
  pScr1 = pScratch1;

  /* Assuming the scratch1 buffer is 32-bit aligned */
  /* Fill (srcBLen - 1u) zeros in the scratch buffer */
  arm_fill_q15(0, pScr1, (srcBLen - 1u));

  /* Update temporary scratch pointer */
  pScr1 += (srcBLen - 1u);

  /* Copy the longer sequence (srcALen samples) into the scratch1 buffer */

#ifndef UNALIGNED_SUPPORT_DISABLE

  /* Copy (srcALen) samples into the scratch buffer */
  arm_copy_q15(pIn1, pScr1, srcALen);

  /* Update pointers */
  pScr1 += srcALen;

#else

  /* Apply loop unrolling and do 4 copies simultaneously. */
  k = srcALen >> 2u;

  /* First part of the processing with loop unrolling copies 4 data points at a time.
   ** A second loop below copies the remaining 1 to 3 samples. */
  while(k > 0u)
  {
    /* Copy the longer sequence into scratch1 */
    *pScr1++ = *pIn1++;
    *pScr1++ = *pIn1++;
    *pScr1++ = *pIn1++;
    *pScr1++ = *pIn1++;

    /* Decrement the loop counter */
    k--;
  }

  /* If the count is not a multiple of 4, copy the remaining samples here.
   ** No loop unrolling is used. */
  k = srcALen % 0x4u;

  while(k > 0u)
  {
    /* Copy the longer sequence into scratch1 for the remaining samples */
    *pScr1++ = *pIn1++;

    /* Decrement the loop counter */
    k--;
  }

#endif


#ifndef UNALIGNED_SUPPORT_DISABLE

  /* Fill (srcBLen - 1u) zeros at the end of the scratch buffer */
  arm_fill_q15(0, pScr1, (srcBLen - 1u));

  /* Update pointer */
  pScr1 += (srcBLen - 1u);

#else

  /* Apply loop unrolling and do 4 zero fills simultaneously. */
  k = (srcBLen - 1u) >> 2u;

  /* First part of the processing with loop unrolling fills 4 data points at a time.
   ** A second loop below fills the remaining 1 to 3 samples. */
  while(k > 0u)
  {
    /* Fill zeros at the end of scratch1 */
    *pScr1++ = 0;
    *pScr1++ = 0;
    *pScr1++ = 0;
    *pScr1++ = 0;

    /* Decrement the loop counter */
    k--;
  }

  /* If the count is not a multiple of 4, fill the remaining samples here.
   ** No loop unrolling is used. */
  k = (srcBLen - 1u) % 0x4u;

  while(k > 0u)
  {
    /* Fill zeros at the end of scratch1 for the remaining samples */
    *pScr1++ = 0;

    /* Decrement the loop counter */
    k--;
  }

#endif

  /* Temporary pointer for scratch2 */
  py = pScratch2;


  /* Initialization of pIn2 pointer */
  pIn2 = py;

  /* First part of the processing with loop unrolling processes 4 data points at a time.
   ** A second loop below processes the remaining 1 to 3 samples. */

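  /* The complete convolution has (srcALen + srcBLen - 1) output samples.
   * The unrolled loop below computes four output samples per pass; a second
   * loop afterwards handles the remaining 1 to 3 outputs one at a time. */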
  /* Actual convolution process starts here */
  blkCnt = (srcALen + srcBLen - 1u) >> 2;

  while(blkCnt > 0)
  {
    /* Initialize temporary scratch pointer as scratch1 */
    pScr1 = pScratch1;

    /* Clear accumulators */
    acc0 = 0;
    acc1 = 0;
    acc2 = 0;
    acc3 = 0;

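    /* acc0..acc3 accumulate four consecutive output samples.  x1/x2 hold packed
     * pairs of q15 samples from scratch1 and y1/y2 hold packed pairs from the
     * reversed sequence; *__SIMD32(p)++ reads two q15 samples as one 32-bit word
     * and advances the pointer by two samples. */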
    /* Read two samples from scratch1 buffer */
    x1 = *__SIMD32(pScr1)++;

    /* Read next two samples from scratch1 buffer */
    x2 = *__SIMD32(pScr1)++;

    tapCnt = (srcBLen) >> 2u;

    while(tapCnt > 0u)
    {

#ifndef UNALIGNED_SUPPORT_DISABLE

      /* Read four samples from smaller buffer */
      y1 = _SIMD32_OFFSET(pIn2);
      y2 = _SIMD32_OFFSET(pIn2 + 2u);

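      /* __SMLALD(x, y, acc) adds both 16x16 products of the packed halfwords of
       * x and y to the 64-bit accumulator acc; __SMLALDX does the same with the
       * halfwords of the second operand exchanged.  __PKHBT re-packs halfwords so
       * that each accumulator sees the sample pair at its own output offset. */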
      /* Multiply and accumulate */
      acc0 = __SMLALD(x1, y1, acc0);
      acc2 = __SMLALD(x2, y1, acc2);

      /* Pack input data */
#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x2, x1, 0);
#else
      x3 = __PKHBT(x1, x2, 0);
#endif

      /* Multiply and accumulate */
      acc1 = __SMLALDX(x3, y1, acc1);

      /* Read next two samples from scratch1 buffer */
      x1 = _SIMD32_OFFSET(pScr1);

      /* Multiply and accumulate */
      acc0 = __SMLALD(x2, y2, acc0);
      acc2 = __SMLALD(x1, y2, acc2);

      /* Pack input data */
#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x1, x2, 0);
#else
      x3 = __PKHBT(x2, x1, 0);
#endif

      acc3 = __SMLALDX(x3, y1, acc3);
      acc1 = __SMLALDX(x3, y2, acc1);

      x2 = _SIMD32_OFFSET(pScr1 + 2u);

#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x2, x1, 0);
#else
      x3 = __PKHBT(x1, x2, 0);
#endif

      acc3 = __SMLALDX(x3, y2, acc3);

#else
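      /* UNALIGNED_SUPPORT_DISABLE path: the packed 32-bit operands are assembled
       * from individual q15 loads with __PKHBT instead of direct word reads, so
       * no unaligned 32-bit accesses are performed on the scratch buffers. */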

      /* Read four samples from smaller buffer */
      a = *pIn2;
      b = *(pIn2 + 1);

#ifndef ARM_MATH_BIG_ENDIAN
      y1 = __PKHBT(a, b, 16);
#else
      y1 = __PKHBT(b, a, 16);
#endif

      a = *(pIn2 + 2);
      b = *(pIn2 + 3);
#ifndef ARM_MATH_BIG_ENDIAN
      y2 = __PKHBT(a, b, 16);
#else
      y2 = __PKHBT(b, a, 16);
#endif

      acc0 = __SMLALD(x1, y1, acc0);

      acc2 = __SMLALD(x2, y1, acc2);

#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x2, x1, 0);
#else
      x3 = __PKHBT(x1, x2, 0);
#endif

      acc1 = __SMLALDX(x3, y1, acc1);

      a = *pScr1;
      b = *(pScr1 + 1);

#ifndef ARM_MATH_BIG_ENDIAN
      x1 = __PKHBT(a, b, 16);
#else
      x1 = __PKHBT(b, a, 16);
#endif

      acc0 = __SMLALD(x2, y2, acc0);

      acc2 = __SMLALD(x1, y2, acc2);

#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x1, x2, 0);
#else
      x3 = __PKHBT(x2, x1, 0);
#endif

      acc3 = __SMLALDX(x3, y1, acc3);

      acc1 = __SMLALDX(x3, y2, acc1);

      a = *(pScr1 + 2);
      b = *(pScr1 + 3);

#ifndef ARM_MATH_BIG_ENDIAN
      x2 = __PKHBT(a, b, 16);
#else
      x2 = __PKHBT(b, a, 16);
#endif

#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x2, x1, 0);
#else
      x3 = __PKHBT(x1, x2, 0);
#endif

      acc3 = __SMLALDX(x3, y2, acc3);

#endif  /*  #ifndef UNALIGNED_SUPPORT_DISABLE   */

      pIn2 += 4u;
      pScr1 += 4u;


      /* Decrement the loop counter */
      tapCnt--;
    }

    /* Update scratch pointer for remaining samples of smaller length sequence */
    pScr1 -= 4u;
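    /* x1 and x2 were pre-loaded (four samples) before the inner loop, so after
     * the loop pScr1 points four samples past the next unprocessed position;
     * the rewind above realigns it for the scalar clean-up loop below. */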

    /* Process the remaining 1 to 3 taps of the shorter sequence without unrolling */
    tapCnt = (srcBLen) & 3u;

    while(tapCnt > 0u)
    {

      /* Accumulate the results */
      acc0 += (*pScr1++ * *pIn2);
      acc1 += (*pScr1++ * *pIn2);
      acc2 += (*pScr1++ * *pIn2);
      acc3 += (*pScr1++ * *pIn2++);

      pScr1 -= 3u;

      /* Decrement the loop counter */
      tapCnt--;
    }

    blkCnt--;


    /* Store the results from the accumulators into the destination buffer. */

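    /* Each accumulator is in 34.30 format; shifting right by 15 and saturating
     * with __SSAT gives a 1.15 result, and __PKHBT packs two q15 outputs into a
     * single 32-bit store. */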
#ifndef ARM_MATH_BIG_ENDIAN

    *__SIMD32(pOut)++ =
      __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);

    *__SIMD32(pOut)++ =
      __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);

#else

    *__SIMD32(pOut)++ =
      __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);

    *__SIMD32(pOut)++ =
      __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);


#endif /*      #ifndef ARM_MATH_BIG_ENDIAN       */

    /* Initialization of inputB pointer */
    pIn2 = py;

    pScratch1 += 4u;

  }


  blkCnt = (srcALen + srcBLen - 1u) & 0x3;

  /* Calculate convolution for the remaining 1 to 3 output samples */
  while(blkCnt > 0)
  {
    /* Initialize temporary scratch pointer as scratch1 */
    pScr1 = pScratch1;

    /* Clear the accumulator */
    acc0 = 0;

    tapCnt = (srcBLen) >> 1u;

    while(tapCnt > 0u)
    {

      /* Read the next two samples from the scratch1 buffer */
      acc0 += (*pScr1++ * *pIn2++);
      acc0 += (*pScr1++ * *pIn2++);

      /* Decrement the loop counter */
      tapCnt--;
    }

    tapCnt = (srcBLen) & 1u;

    /* Process the remaining tap of the shorter sequence, if any */
    while(tapCnt > 0u)
    {

      /* Accumulate the result */
      acc0 += (*pScr1++ * *pIn2++);

      /* Decrement the loop counter */
      tapCnt--;
    }

    blkCnt--;

    /* The accumulated result is in 34.30 format.  Convert to 1.15 with right shift and saturation,
     ** then store the output in the destination buffer. */
    *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));


    /* Initialization of inputB pointer */
    pIn2 = py;

    pScratch1 += 1u;

  }

}


/**
 * @} end of Conv group
 */
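Usage sketch (illustrative only, not part of the library source): the destination buffer must hold srcALen + srcBLen - 1 samples, pScratch1 must hold max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2 samples, and pScratch2 must hold min(srcALen, srcBLen) samples, as stated in the function description above. The sequence lengths and buffer names below are assumptions made for the example.

#include "arm_math.h"

#define SRCA_LEN  64u                            /* illustrative lengths */
#define SRCB_LEN  16u

static q15_t srcA[SRCA_LEN];                     /* first input sequence  */
static q15_t srcB[SRCB_LEN];                     /* second input sequence */
static q15_t dst[SRCA_LEN + SRCB_LEN - 1u];      /* srcALen + srcBLen - 1 outputs */

/* Scratch sizes taken from the function description:
 * scratch1: max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2
 * scratch2: min(srcALen, srcBLen) */
static q15_t scratch1[SRCA_LEN + 2u * SRCB_LEN - 2u];
static q15_t scratch2[SRCB_LEN];

void conv_example(void)
{
  /* srcA and srcB are assumed to be filled with Q15 data elsewhere. */
  arm_conv_opt_q15(srcA, SRCA_LEN, srcB, SRCB_LEN, dst, scratch1, scratch2);
}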