Robert Lopez / CMSIS5
Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_opt_q7.c Source File

arm_conv_opt_q7.c

00001 /* ----------------------------------------------------------------------
00002  * Project:      CMSIS DSP Library
00003  * Title:        arm_conv_opt_q7.c
00004  * Description:  Convolution of Q7 sequences
00005  *
00006  * $Date:        27. January 2017
00007  * $Revision:    V.1.5.1
00008  *
00009  * Target Processor: Cortex-M cores
00010  * -------------------------------------------------------------------- */
00011 /*
00012  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
00013  *
00014  * SPDX-License-Identifier: Apache-2.0
00015  *
00016  * Licensed under the Apache License, Version 2.0 (the License); you may
00017  * not use this file except in compliance with the License.
00018  * You may obtain a copy of the License at
00019  *
00020  * www.apache.org/licenses/LICENSE-2.0
00021  *
00022  * Unless required by applicable law or agreed to in writing, software
00023  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
00024  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00025  * See the License for the specific language governing permissions and
00026  * limitations under the License.
00027  */
00028 
00029 #include "arm_math.h"
00030 
00031 /**
00032  * @ingroup groupFilters
00033  */
00034 
00035 /**
00036  * @addtogroup Conv
00037  * @{
00038  */
00039 
00040 /**
00041  * @brief Convolution of Q7 sequences.
00042  * @param[in] *pSrcA points to the first input sequence.
00043  * @param[in] srcALen length of the first input sequence.
00044  * @param[in] *pSrcB points to the second input sequence.
00045  * @param[in] srcBLen length of the second input sequence.
00046  * @param[out] *pDst points to the location where the output result is written.  Length srcALen+srcBLen-1.
00047  * @param[in]  *pScratch1 points to scratch buffer (of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
00048  * @param[in]  *pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
00049  * @return none.
00050  *
00051  * \par Restrictions
00052  *  If the silicon does not support unaligned memory access, enable the macro UNALIGNED_SUPPORT_DISABLE.
00053  *  In this case the input, output, scratch1 and scratch2 buffers should be 32-bit aligned.
00054  *
00055  * @details
00056  * <b>Scaling and Overflow Behavior:</b>
00057  *
00058  * \par
00059  * The function is implemented using a 32-bit internal accumulator.
00060  * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
00061  * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
00062  * This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>.
00063  * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format.
00064  *
00065  */
00066 
00067 void arm_conv_opt_q7(
00068   q7_t * pSrcA,
00069   uint32_t srcALen,
00070   q7_t * pSrcB,
00071   uint32_t srcBLen,
00072   q7_t * pDst,
00073   q15_t * pScratch1,
00074   q15_t * pScratch2)
00075 {
00076   /* Overview: both q7 inputs are widened to q15 in the scratch buffers, then multiply-accumulated there. */
00077   q15_t *pScr2, *pScr1;                          /* Working pointers into scratch2 and scratch1 */
00078   q15_t x4;                                      /* One input sample widened to q15_t during the copies */
00079   q7_t *pIn1, *pIn2;                             /* Longer (pIn1) and shorter (pIn2) input sequence after the swap below */
00080   uint32_t j, k, blkCnt, tapCnt;                 /* Loop counters (j is only used as the swap temporary) */
00081   q7_t *px;                                      /* Read pointer walking the shorter sequence backwards */
00082   q15_t *py;                                     /* Start of scratch2, used to rewind pScr2 */
00083   q31_t acc0, acc1, acc2, acc3;                  /* Accumulators for four consecutive output samples */
00084   q31_t x1, x2, x3, y1;                          /* Temporary 32-bit words read from the scratch buffers */
00085   q7_t *pOut = pDst;                             /* Output write pointer */
00086   q7_t out0, out1, out2, out3;                   /* Saturated results before packing */
00087 
00088   /* The algorithm implementation is based on the lengths of the inputs. */
00089   /* srcB is always made to slide across srcA. */
00090   /* So the inputs are swapped if needed to make srcBLen shorter than or equal to srcALen */
00091   if (srcALen >= srcBLen)
00092   {
00093     /* Initialization of inputA pointer */
00094     pIn1 = pSrcA;
00095 
00096     /* Initialization of inputB pointer */
00097     pIn2 = pSrcB;
00098   }
00099   else
00100   {
00101     /* Inputs are swapped: srcB is the longer sequence and becomes input A */
00102     pIn1 = pSrcB;
00103 
00104     /* srcA is the shorter sequence and becomes input B */
00105     pIn2 = pSrcA;
00106 
00107     /* Swap the lengths as well, so that srcBLen <= srcALen from here on */
00108     j = srcBLen;
00109     srcBLen = srcALen;
00110     srcALen = j;
00111   }
00112 
00113   /* Write pointer, starting at the beginning of the scratch2 buffer */
00114   pScr2 = pScratch2;
00115 
00116   /* Point to the last sample of the shorter sequence */
00117   px = pIn2 + srcBLen - 1;
00118 
00119   /* Apply loop unrolling and do 4 Copies simultaneously. */
00120   k = srcBLen >> 2U;
00121 
00122   /* First part of the processing with loop unrolling copies 4 data points at a time.
00123    ** a second loop below copies for the remaining 1 to 3 samples. */
00124   while (k > 0U)
00125   {
00126     /* Copy the shorter sequence into scratch2 in reverse order, widening q7 to q15 */
00127     x4 = (q15_t) * px--;
00128     *pScr2++ = x4;
00129     x4 = (q15_t) * px--;
00130     *pScr2++ = x4;
00131     x4 = (q15_t) * px--;
00132     *pScr2++ = x4;
00133     x4 = (q15_t) * px--;
00134     *pScr2++ = x4;
00135 
00136     /* Decrement the loop counter */
00137     k--;
00138   }
00139 
00140   /* If the count is not a multiple of 4, copy remaining samples here.
00141    ** No loop unrolling is used. */
00142   k = srcBLen % 0x4U;
00143 
00144   while (k > 0U)
00145   {
00146     /* Copy the remaining samples of the shorter sequence in reverse order */
00147     x4 = (q15_t) * px--;
00148     *pScr2++ = x4;
00149 
00150     /* Decrement the loop counter */
00151     k--;
00152   }
00153 
00154   /* Initialize temporary scratch pointer */
00155   pScr1 = pScratch1;
00156 
00157   /* Fill (srcBLen - 1U) zeros at the start of the scratch1 buffer */
00158   arm_fill_q15(0, pScr1, (srcBLen - 1U));
00159 
00160   /* Update temporary scratch pointer */
00161   pScr1 += (srcBLen - 1U);
00162 
00163   /* Copy (srcALen) samples in scratch buffer */
00164   /* Apply loop unrolling and do 4 Copies simultaneously. */
00165   k = srcALen >> 2U;
00166 
00167   /* First part of the processing with loop unrolling copies 4 data points at a time.
00168    ** a second loop below copies for the remaining 1 to 3 samples. */
00169   while (k > 0U)
00170   {
00171     /* Copy the longer sequence into scratch1 in forward order, widening q7 to q15 */
00172     x4 = (q15_t) * pIn1++;
00173     *pScr1++ = x4;
00174     x4 = (q15_t) * pIn1++;
00175     *pScr1++ = x4;
00176     x4 = (q15_t) * pIn1++;
00177     *pScr1++ = x4;
00178     x4 = (q15_t) * pIn1++;
00179     *pScr1++ = x4;
00180 
00181     /* Decrement the loop counter */
00182     k--;
00183   }
00184 
00185   /* If the count is not a multiple of 4, copy remaining samples here.
00186    ** No loop unrolling is used. */
00187   k = srcALen % 0x4U;
00188 
00189   while (k > 0U)
00190   {
00191     /* Copy the remaining samples of the longer sequence in forward order */
00192     x4 = (q15_t) * pIn1++;
00193     *pScr1++ = x4;
00194 
00195     /* Decrement the loop counter */
00196     k--;
00197   }
00198 
00199 #ifndef UNALIGNED_SUPPORT_DISABLE
00200 
00201   /* Fill (srcBLen - 1U) zeros at end of scratch buffer */
00202   arm_fill_q15(0, pScr1, (srcBLen - 1U));
00203 
00204   /* Update pointer */
00205   pScr1 += (srcBLen - 1U);
00206 
00207 #else
00208 
00209   /* Same trailing zero fill as above, written out as explicit loops: 4 zeros per iteration. */
00210   k = (srcBLen - 1U) >> 2U;
00211 
00212   /* First part of the processing with loop unrolling writes 4 zeros at a time.
00213    ** a second loop below writes the remaining 1 to 3 zeros. */
00214   while (k > 0U)
00215   {
00216     /* Write four zeros at the end of the scratch buffer */
00217     *pScr1++ = 0;
00218     *pScr1++ = 0;
00219     *pScr1++ = 0;
00220     *pScr1++ = 0;
00221 
00222     /* Decrement the loop counter */
00223     k--;
00224   }
00225 
00226   /* If the count is not a multiple of 4, write the remaining zeros here.
00227    ** No loop unrolling is used. */
00228   k = (srcBLen - 1U) % 0x4U;
00229 
00230   while (k > 0U)
00231   {
00232     /* Write one of the remaining zeros */
00233     *pScr1++ = 0;
00234 
00235     /* Decrement the loop counter */
00236     k--;
00237   }
00238 
00239 #endif
00240 
00241   /* Temporary pointer for scratch2 */
00242   py = pScratch2;
00243 
00244   /* Re-point pIn2 at scratch2 (pIn2 is not read again in this function) */
00245   pIn2 = (q7_t *) py;
00246 
00247   pScr2 = py;
00248 
00249   /* Actual convolution process starts here: four output samples per iteration */
00250   blkCnt = (srcALen + srcBLen - 1U) >> 2;
00251 
00252   while (blkCnt > 0)
00253   {
00254     /* Initialize temporary scratch pointer as scratch1 */
00255     pScr1 = pScratch1;
00256 
00257     /* Clear Accumulators */
00258     acc0 = 0;
00259     acc1 = 0;
00260     acc2 = 0;
00261     acc3 = 0;
00262 
00263     /* Read two samples from scratch1 buffer */
00264     x1 = *__SIMD32(pScr1)++;
00265 
00266     /* Read next two samples from scratch1 buffer */
00267     x2 = *__SIMD32(pScr1)++;
00268 
00269     tapCnt = (srcBLen) >> 2U;
00270 
00271     while (tapCnt > 0U)
00272     {
00273 
00274       /* Read a 32-bit word from smaller buffer (two q15 samples, assuming _SIMD32_OFFSET is a 32-bit load) */
00275       y1 = _SIMD32_OFFSET(pScr2);
00276 
00277       /* multiply and accumulate */
00278       acc0 = __SMLAD(x1, y1, acc0);
00279       acc2 = __SMLAD(x2, y1, acc2);
00280 
00281       /* pack input data */
00282 #ifndef ARM_MATH_BIG_ENDIAN
00283       x3 = __PKHBT(x2, x1, 0);
00284 #else
00285       x3 = __PKHBT(x1, x2, 0);
00286 #endif
00287 
00288       /* multiply and accumulate */
00289       acc1 = __SMLADX(x3, y1, acc1);
00290 
00291       /* Read next two samples from scratch1 buffer */
00292       x1 = *__SIMD32(pScr1)++;
00293 
00294       /* pack input data */
00295 #ifndef ARM_MATH_BIG_ENDIAN
00296       x3 = __PKHBT(x1, x2, 0);
00297 #else
00298       x3 = __PKHBT(x2, x1, 0);
00299 #endif
00300 
00301       acc3 = __SMLADX(x3, y1, acc3);
00302 
00303       /* Read the following 32-bit word from smaller buffer (two q15 elements further on) */
00304       y1 = _SIMD32_OFFSET(pScr2 + 2U);
00305 
00306       acc0 = __SMLAD(x2, y1, acc0);
00307 
00308       acc2 = __SMLAD(x1, y1, acc2);
00309 
00310       acc1 = __SMLADX(x3, y1, acc1);
00311 
00312       x2 = *__SIMD32(pScr1)++;
00313 
00314 #ifndef ARM_MATH_BIG_ENDIAN
00315       x3 = __PKHBT(x2, x1, 0);
00316 #else
00317       x3 = __PKHBT(x1, x2, 0);
00318 #endif
00319 
00320       acc3 = __SMLADX(x3, y1, acc3);
00321 
00322       pScr2 += 4U;
00323 
00324 
00325       /* Decrement the loop counter */
00326       tapCnt--;
00327     }
00328 
00329 
00330 
00331     /* Step the scratch1 pointer back by four samples before handling the remaining taps */
00332     pScr1 -= 4U;
00333 
00334 
00335     /* Handle the remaining (srcBLen & 3) taps of the smaller length sequence one at a time */
00336     tapCnt = (srcBLen) & 3U;
00337 
00338     while (tapCnt > 0U)
00339     {
00340 
00341       /* accumulate the results: one tap applied to four consecutive scratch1 samples */
00342       acc0 += (*pScr1++ * *pScr2);
00343       acc1 += (*pScr1++ * *pScr2);
00344       acc2 += (*pScr1++ * *pScr2);
00345       acc3 += (*pScr1++ * *pScr2++);
00346 
00347       pScr1 -= 3U;
00348 
00349       /* Decrement the loop counter */
00350       tapCnt--;
00351     }
00352 
00353     blkCnt--;
00354 
00355     /* Shift each accumulator right by 7 bits, saturate to 8 bits and store all four results in the destination buffer. */
00356     out0 = (q7_t) (__SSAT(acc0 >> 7U, 8));
00357     out1 = (q7_t) (__SSAT(acc1 >> 7U, 8));
00358     out2 = (q7_t) (__SSAT(acc2 >> 7U, 8));
00359     out3 = (q7_t) (__SSAT(acc3 >> 7U, 8));
00360 
00361     *__SIMD32(pOut)++ = __PACKq7(out0, out1, out2, out3);
00362 
00363     /* Rewind pScr2 to the start of scratch2 for the next group of outputs */
00364     pScr2 = py;
00365 
00366     pScratch1 += 4U;
00367 
00368   }
00369 
00370 
00371   blkCnt = (srcALen + srcBLen - 1U) & 0x3;
00372 
00373   /* Calculate the remaining (up to 3) output samples one at a time */
00374   while (blkCnt > 0)
00375   {
00376     /* Initialize temporary scratch pointer as scratch1 */
00377     pScr1 = pScratch1;
00378 
00379     /* Clear Accumulator */
00380     acc0 = 0;
00381 
00382     tapCnt = (srcBLen) >> 1U;
00383 
00384     while (tapCnt > 0U)
00385     {
00386       acc0 += (*pScr1++ * *pScr2++);
00387       acc0 += (*pScr1++ * *pScr2++);
00388 
00389       /* Decrement the loop counter */
00390       tapCnt--;
00391     }
00392 
00393     tapCnt = (srcBLen) & 1U;
00394 
00395     /* Handle the last tap when srcBLen is odd */
00396     while (tapCnt > 0U)
00397     {
00398 
00399       /* accumulate the results */
00400       acc0 += (*pScr1++ * *pScr2++);
00401 
00402       /* Decrement the loop counter */
00403       tapCnt--;
00404     }
00405 
00406     blkCnt--;
00407 
00408     /* Shift the accumulator right by 7 bits, saturate to 8 bits and store it in the destination buffer. */
00409     *pOut++ = (q7_t) (__SSAT(acc0 >> 7U, 8));
00410 
00411     /* Rewind pScr2 to the start of scratch2 for the next output */
00412     pScr2 = py;
00413 
00414     pScratch1 += 1U;
00415 
00416   }
00417 
00418 }
00419 
00420 
00421 /**
00422  * @} end of Conv group
00423  */
00424