CMSIS DSP library

Dependents:   performance_timer Surfboard_ gps2rtty Capstone ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_opt_q7.c Source File

arm_conv_opt_q7.c

00001 /* ----------------------------------------------------------------------    
00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
00003 *    
00004 * $Date:        19. March 2015
00005 * $Revision:    V.1.4.5
00006 *    
00007 * Project:      CMSIS DSP Library    
00008 * Title:        arm_conv_opt_q7.c    
00009 *    
00010 * Description:  Convolution of Q7 sequences.  
00011 *    
00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
00013 *  
00014 * Redistribution and use in source and binary forms, with or without 
00015 * modification, are permitted provided that the following conditions
00016 * are met:
00017 *   - Redistributions of source code must retain the above copyright
00018 *     notice, this list of conditions and the following disclaimer.
00019 *   - Redistributions in binary form must reproduce the above copyright
00020 *     notice, this list of conditions and the following disclaimer in
00021 *     the documentation and/or other materials provided with the 
00022 *     distribution.
00023 *   - Neither the name of ARM LIMITED nor the names of its contributors
00024 *     may be used to endorse or promote products derived from this
00025 *     software without specific prior written permission.
00026 *
00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00038 * POSSIBILITY OF SUCH DAMAGE.  
00039 * -------------------------------------------------------------------- */
00040 
00041 #include "arm_math.h"
00042 
00043 /**    
00044  * @ingroup groupFilters    
00045  */
00046 
00047 /**    
00048  * @addtogroup Conv    
00049  * @{    
00050  */
00051 
00052 /**    
00053  * @brief Convolution of Q7 sequences.    
00054  * @param[in] *pSrcA points to the first input sequence.    
00055  * @param[in] srcALen length of the first input sequence.    
00056  * @param[in] *pSrcB points to the second input sequence.    
00057  * @param[in] srcBLen length of the second input sequence.    
00058  * @param[out] *pDst points to the location where the output result is written.  Length srcALen+srcBLen-1.    
00059  * @param[in]  *pScratch1 points to a scratch buffer (of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.   
00060  * @param[in]  *pScratch2 points to a scratch buffer (of type q15_t) of size min(srcALen, srcBLen).   
00061  * @return none.    
00062  *    
00063  * \par Restrictions    
00064  *  If the silicon does not support unaligned memory access, enable the macro UNALIGNED_SUPPORT_DISABLE.    
00065  *  In this case the input, output, scratch1 and scratch2 buffers should be aligned to 32 bits.     
00066  *       
00067  * @details    
00068  * <b>Scaling and Overflow Behavior:</b>    
00069  *    
00070  * \par    
00071  * The function is implemented using a 32-bit internal accumulator.    
00072  * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.    
00073  * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.    
00074  * This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>.    
00075  * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format.    
00076  *
00077  */
00078 
00079 void arm_conv_opt_q7(
00080   q7_t * pSrcA,
00081   uint32_t srcALen,
00082   q7_t * pSrcB,
00083   uint32_t srcBLen,
00084   q7_t * pDst,
00085   q15_t * pScratch1,
00086   q15_t * pScratch2)
00087 {
00088   /* Overview: reverse the shorter input into pScratch2, copy the longer input zero-padded on both sides into pScratch1, then form every output as a dot product of the two. */
00089   q15_t *pScr2, *pScr1;                          /* Working pointers into the scratch2 / scratch1 buffers */
00090   q15_t x4;                                      /* One input sample widened to q15 while it is copied */
00091   q7_t *pIn1, *pIn2;                             /* Longer (inputA) and shorter (inputB) sequence pointers */
00092   uint32_t j, k, blkCnt, tapCnt;                 /* Swap temporary (j) and loop counters */
00093   q7_t *px;                                      /* Read pointer that walks the shorter sequence backwards */
00094   q15_t *py;                                     /* Start of the reversed shorter sequence in scratch2 */
00095   q31_t acc0, acc1, acc2, acc3;                  /* Accumulators for four consecutive outputs */
00096   q31_t x1, x2, x3, y1;                          /* 32-bit words holding scratch1 samples (x1-x3) and scratch2 samples (y1) */
00097   q7_t *pOut = pDst;                             /* Output write pointer */
00098   q7_t out0, out1, out2, out3;                   /* Saturated results waiting to be packed */
00099 
00100   /* The algorithm implementation is based on the lengths of the inputs. */
00101   /* srcB is always made to slide across srcA. */
00102   /* So after this if/else srcBLen is the shorter (or equal) length and srcALen the longer one. */
00103   if(srcALen >= srcBLen)
00104   {
00105     /* pSrcA is already the longer sequence: use it as inputA */
00106     pIn1 = pSrcA;
00107 
00108     /* pSrcB is the shorter sequence: use it as inputB */
00109     pIn2 = pSrcB;
00110   }
00111   else
00112   {
00113     /* pSrcB is the longer sequence here, so it takes the inputA role */
00114     pIn1 = pSrcB;
00115 
00116     /* pSrcA is the shorter sequence here, so it takes the inputB role */
00117     pIn2 = pSrcA;
00118 
00119     /* Swap the two lengths so that srcBLen is again the shorter one */
00120     j = srcBLen;
00121     srcBLen = srcALen;
00122     srcALen = j;
00123   }
00124 
00125   /* Write pointer into scratch2, starting at the beginning of the buffer */
00126   pScr2 = pScratch2;
00127 
00128   /* px starts at the last sample of the shorter sequence */
00129   px = pIn2 + srcBLen - 1;
00130 
00131   /* Apply loop unrolling and do 4 copies per iteration. */
00132   k = srcBLen >> 2u;
00133 
00134   /* First part of the processing with loop unrolling copies 4 data points at a time.
00135    ** A second loop below copies the remaining 1 to 3 samples. */
00136   while(k > 0u)
00137   {
00138     /* Copy the shorter sequence into scratch2 in reversed order, widening q7 to q15 */
00139     x4 = (q15_t) * px--;
00140     *pScr2++ = x4;
00141     x4 = (q15_t) * px--;
00142     *pScr2++ = x4;
00143     x4 = (q15_t) * px--;
00144     *pScr2++ = x4;
00145     x4 = (q15_t) * px--;
00146     *pScr2++ = x4;
00147 
00148     /* Decrement the loop counter */
00149     k--;
00150   }
00151 
00152   /* If the count is not a multiple of 4, copy remaining samples here.
00153    ** No loop unrolling is used. */
00154   k = srcBLen % 0x4u;
00155 
00156   while(k > 0u)
00157   {
00158     /* Reversed, widening copy of the remaining samples of the shorter sequence */
00159     x4 = (q15_t) * px--;
00160     *pScr2++ = x4;
00161 
00162     /* Decrement the loop counter */
00163     k--;
00164   }
00165 
00166   /* Initialize the working pointer to the start of scratch1 */
00167   pScr1 = pScratch1;
00168 
00169   /* Fill (srcBLen - 1u) zeros at the start of scratch1 (leading padding) */
00170   arm_fill_q15(0, pScr1, (srcBLen - 1u));
00171 
00172   /* Move past the leading padding */
00173   pScr1 += (srcBLen - 1u);
00174 
00175   /* Copy the srcALen samples of the longer sequence into scratch1 */
00176   /* Apply loop unrolling and do 4 copies per iteration. */
00177   k = srcALen >> 2u;
00178 
00179   /* First part of the processing with loop unrolling copies 4 data points at a time.
00180    ** A second loop below copies the remaining 1 to 3 samples. */
00181   while(k > 0u)
00182   {
00183     /* Copy the longer sequence in forward order, widening q7 to q15 */
00184     x4 = (q15_t) * pIn1++;
00185     *pScr1++ = x4;
00186     x4 = (q15_t) * pIn1++;
00187     *pScr1++ = x4;
00188     x4 = (q15_t) * pIn1++;
00189     *pScr1++ = x4;
00190     x4 = (q15_t) * pIn1++;
00191     *pScr1++ = x4;
00192 
00193     /* Decrement the loop counter */
00194     k--;
00195   }
00196 
00197   /* If the count is not a multiple of 4, copy remaining samples here.
00198    ** No loop unrolling is used. */
00199   k = srcALen % 0x4u;
00200 
00201   while(k > 0u)
00202   {
00203     /* Forward, widening copy of the remaining samples of the longer sequence */
00204     x4 = (q15_t) * pIn1++;
00205     *pScr1++ = x4;
00206 
00207     /* Decrement the loop counter */
00208     k--;
00209   }
00210 
00211 #ifndef UNALIGNED_SUPPORT_DISABLE
00212 
00213   /* Fill (srcBLen - 1u) zeros at the end of scratch1 (trailing padding) */
00214   arm_fill_q15(0, pScr1, (srcBLen - 1u));
00215 
00216   /* Move past the trailing padding */
00217   pScr1 += (srcBLen - 1u);
00218 
00219 #else
00220 
00221   /* Same trailing padding, written element by element; unrolled by 4. */
00222   k = (srcBLen - 1u) >> 2u;
00223 
00224   /* First part of the processing with loop unrolling writes 4 zeros at a time.
00225    ** A second loop below writes the remaining 1 to 3 zeros. */
00226   while(k > 0u)
00227   {
00228     /* Write four trailing zeros into scratch1 */
00229     *pScr1++ = 0;
00230     *pScr1++ = 0;
00231     *pScr1++ = 0;
00232     *pScr1++ = 0;
00233 
00234     /* Decrement the loop counter */
00235     k--;
00236   }
00237 
00238   /* If the count is not a multiple of 4, write the remaining zeros here.
00239    ** No loop unrolling is used. */
00240   k = (srcBLen - 1u) % 0x4u;
00241 
00242   while(k > 0u)
00243   {
00244     /* Write one trailing zero into scratch1 */
00245     *pScr1++ = 0;
00246 
00247     /* Decrement the loop counter */
00248     k--;
00249   }
00250 
00251 #endif
00252 
00253   /* py remembers the start of the reversed shorter sequence in scratch2 */
00254   py = pScratch2;
00255 
00256   /* NOTE(review): pIn2 is not read again after this assignment in this function */
00257   pIn2 = (q7_t *) py;
00258 
00259   pScr2 = py;
00260 
00261   /* Actual convolution starts here: number of groups of four outputs */
00262   blkCnt = (srcALen + srcBLen - 1u) >> 2;
00263 
00264   while(blkCnt > 0)
00265   {
00266     /* Initialize the working pointer to the current scratch1 window */
00267     pScr1 = pScratch1;
00268 
00269     /* Clear accumulators; acc0..acc3 belong to scratch1 window offsets 0..3 */
00270     acc0 = 0;
00271     acc1 = 0;
00272     acc2 = 0;
00273     acc3 = 0;
00274 
00275     /* Read two samples from scratch1 buffer */
00276     x1 = *__SIMD32(pScr1)++;
00277 
00278     /* Read next two samples from scratch1 buffer */
00279     x2 = *__SIMD32(pScr1)++;
00280 
00281     tapCnt = (srcBLen) >> 2u;
00282 
00283     while(tapCnt > 0u)
00284     {
00285 
00286       /* Read the next pair of samples from the reversed shorter sequence (pScr2 advances by 4 after two such reads) */
00287       y1 = _SIMD32_OFFSET(pScr2);
00288 
00289       /* multiply and accumulate */
00290       acc0 = __SMLAD(x1, y1, acc0);
00291       acc2 = __SMLAD(x2, y1, acc2);
00292 
00293       /* Pack halves of x1 and x2 for acc1; argument order depends on endianness */
00294 #ifndef ARM_MATH_BIG_ENDIAN
00295       x3 = __PKHBT(x2, x1, 0);
00296 #else
00297       x3 = __PKHBT(x1, x2, 0);
00298 #endif
00299 
00300       /* multiply and accumulate */
00301       acc1 = __SMLADX(x3, y1, acc1);
00302 
00303       /* Read next two samples from scratch1 buffer */
00304       x1 = *__SIMD32(pScr1)++;
00305 
00306       /* Pack halves of x2 and the new x1 for acc3; argument order depends on endianness */
00307 #ifndef ARM_MATH_BIG_ENDIAN
00308       x3 = __PKHBT(x1, x2, 0);
00309 #else
00310       x3 = __PKHBT(x2, x1, 0);
00311 #endif
00312 
00313       acc3 = __SMLADX(x3, y1, acc3);
00314 
00315       /* Read the following pair of samples from the reversed shorter sequence */
00316       y1 = _SIMD32_OFFSET(pScr2 + 2u);
00317 
00318       acc0 = __SMLAD(x2, y1, acc0);
00319 
00320       acc2 = __SMLAD(x1, y1, acc2);
00321 
00322       acc1 = __SMLADX(x3, y1, acc1);
00323       /* NOTE(review): when srcBLen and (srcALen + srcBLen - 1u) are both multiples of 4, the last execution of the load below appears to reach one q15_t past the documented pScratch1 size - TODO confirm */
00324       x2 = *__SIMD32(pScr1)++;
00325 
00326 #ifndef ARM_MATH_BIG_ENDIAN
00327       x3 = __PKHBT(x2, x1, 0);
00328 #else
00329       x3 = __PKHBT(x1, x2, 0);
00330 #endif
00331 
00332       acc3 = __SMLADX(x3, y1, acc3);
00333 
00334       pScr2 += 4u;
00335 
00336 
00337       /* Decrement the loop counter */
00338       tapCnt--;
00339     }
00340 
00341 
00342 
00343     /* Step back over the four samples that were loaded ahead into x1/x2, so the scalar loop below re-reads them */
00344     pScr1 -= 4u;
00345 
00346 
00347     /* Handle the remaining (srcBLen & 3) taps of the shorter sequence one at a time */
00348     tapCnt = (srcBLen) & 3u;
00349 
00350     while(tapCnt > 0u)
00351     {
00352 
00353       /* One tap applied to four consecutive scratch1 samples, one per accumulator */
00354       acc0 += (*pScr1++ * *pScr2);
00355       acc1 += (*pScr1++ * *pScr2);
00356       acc2 += (*pScr1++ * *pScr2);
00357       acc3 += (*pScr1++ * *pScr2++);
00358       /* Net advance of scratch1 is one sample per tap */
00359       pScr1 -= 3u;
00360 
00361       /* Decrement the loop counter */
00362       tapCnt--;
00363     }
00364 
00365     blkCnt--;
00366 
00367     /* Shift each accumulator right by 7 and saturate to 8 bits before storing. */
00368     out0 = (q7_t) (__SSAT(acc0 >> 7u, 8));
00369     out1 = (q7_t) (__SSAT(acc1 >> 7u, 8));
00370     out2 = (q7_t) (__SSAT(acc2 >> 7u, 8));
00371     out3 = (q7_t) (__SSAT(acc3 >> 7u, 8));
00372     /* Pack the four q7 results and store them together */
00373     *__SIMD32(pOut)++ = __PACKq7(out0, out1, out2, out3);
00374 
00375     /* Rewind pScr2 to the start of the reversed shorter sequence */
00376     pScr2 = py;
00377     /* Slide the scratch1 window past the four outputs just produced */
00378     pScratch1 += 4u;
00379 
00380   }
00381 
00382 
00383   blkCnt = (srcALen + srcBLen - 1u) & 0x3;
00384 
00385   /* Compute the remaining 0 to 3 outputs one at a time */
00386   while(blkCnt > 0)
00387   {
00388     /* Initialize the working pointer to the current scratch1 window */
00389     pScr1 = pScratch1;
00390 
00391     /* Clear accumulator */
00392     acc0 = 0;
00393 
00394     tapCnt = (srcBLen) >> 1u;
00395 
00396     while(tapCnt > 0u)
00397     {
00398       acc0 += (*pScr1++ * *pScr2++);
00399       acc0 += (*pScr1++ * *pScr2++);
00400 
00401       /* Decrement the loop counter */
00402       tapCnt--;
00403     }
00404 
00405     tapCnt = (srcBLen) & 1u;
00406 
00407     /* Handle the last tap when srcBLen is odd */
00408     while(tapCnt > 0u)
00409     {
00410 
00411       /* accumulate the result */
00412       acc0 += (*pScr1++ * *pScr2++);
00413 
00414       /* Decrement the loop counter */
00415       tapCnt--;
00416     }
00417 
00418     blkCnt--;
00419 
00420     /* Shift the accumulator right by 7, saturate to 8 bits and store it in the destination buffer. */
00421     *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8));
00422 
00423     /* Rewind pScr2 to the start of the reversed shorter sequence */
00424     pScr2 = py;
00425     /* Slide the scratch1 window by one output */
00426     pScratch1 += 1u;
00427 
00428   }
00429 
00430 }
00431 
00432 
00433 /**    
00434  * @} end of Conv group    
00435  */