Robert Lopez / CMSIS5
Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_opt_q15.c Source File

arm_conv_opt_q15.c

00001 /* ----------------------------------------------------------------------
00002  * Project:      CMSIS DSP Library
00003  * Title:        arm_conv_opt_q15.c
00004  * Description:  Convolution of Q15 sequences
00005  *
00006  * $Date:        27. January 2017
00007  * $Revision:    V.1.5.1
00008  *
00009  * Target Processor: Cortex-M cores
00010  * -------------------------------------------------------------------- */
00011 /*
00012  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
00013  *
00014  * SPDX-License-Identifier: Apache-2.0
00015  *
00016  * Licensed under the Apache License, Version 2.0 (the License); you may
00017  * not use this file except in compliance with the License.
00018  * You may obtain a copy of the License at
00019  *
00020  * www.apache.org/licenses/LICENSE-2.0
00021  *
00022  * Unless required by applicable law or agreed to in writing, software
00023  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
00024  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00025  * See the License for the specific language governing permissions and
00026  * limitations under the License.
00027  */
00028 
00029 #include "arm_math.h"
00030 
00031 /**
00032  * @ingroup groupFilters
00033  */
00034 
00035 /**
00036  * @addtogroup Conv
00037  * @{
00038  */
00039 
00040 /**
00041  * @brief Convolution of Q15 sequences.
00042  * @param[in] *pSrcA points to the first input sequence.
00043  * @param[in] srcALen length of the first input sequence.
00044  * @param[in] *pSrcB points to the second input sequence.
00045  * @param[in] srcBLen length of the second input sequence.
00046  * @param[out] *pDst points to the location where the output result is written.  Length srcALen+srcBLen-1.
00047  * @param[in,out] *pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2; its contents are overwritten.
00048  * @param[in,out] *pScratch2 points to scratch buffer of size min(srcALen, srcBLen); its contents are overwritten.
00049  * @return none.
00050  *
00051  * \par Restrictions
00052  *  If the silicon does not support unaligned memory access, enable the macro UNALIGNED_SUPPORT_DISABLE.
00053  *  In this case the input, output, scratch1 and scratch2 buffers should be 32-bit aligned.
00054  *
00055  *
00056  * @details
00057  * <b>Scaling and Overflow Behavior:</b>
00058  *
00059  * \par
00060  * The function is implemented using a 64-bit internal accumulator.
00061  * Both inputs are in 1.15 format and multiplications yield a 2.30 result.
00062  * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
00063  * This approach provides 33 guard bits and there is no risk of overflow.
00064  * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
00065  *
00066  *
00067  * \par
00068  * Refer to <code>arm_conv_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
00069  *
00070  *
00071  */
00072 
00073 void arm_conv_opt_q15(
00074   q15_t * pSrcA,
00075   uint32_t srcALen,
00076   q15_t * pSrcB,
00077   uint32_t srcBLen,
00078   q15_t * pDst,
00079   q15_t * pScratch1,
00080   q15_t * pScratch2)
00081 {
00082   q63_t acc0, acc1, acc2, acc3;                  /* 64-bit accumulators, one per output sample of a 4-output block */
00083   q31_t x1, x2, x3;                              /* Pairs of samples taken from scratch1 (x3 is repacked from x1 and x2) */
00084   q31_t y1, y2;                                  /* Pairs of samples taken from scratch2 */
00085   q15_t *pOut = pDst;                            /* Write cursor into the destination buffer */
00086   q15_t *pScr1 = pScratch1;                      /* Cursor into scratch1 (reassigned before its first use) */
00087   q15_t *pScr2 = pScratch2;                      /* Cursor into scratch2 (reassigned before its first use) */
00088   q15_t *pIn1;                                   /* Start of the longer input sequence */
00089   q15_t *pIn2;                                   /* Start of the shorter input sequence; later re-pointed at scratch2 */
00090   q15_t *px;                                     /* Read cursor over the shorter input sequence */
00091   q15_t *py;                                     /* Start of scratch2, used to rewind pIn2 */
00092   uint32_t j, k, blkCnt;                         /* j: swap temporary, k: copy-loop counter, blkCnt: output-loop counter */
00093   uint32_t tapCnt;                               /* Inner (tap) loop counter */
00094 #ifdef UNALIGNED_SUPPORT_DISABLE
00095 
00096   q15_t a, b;                                    /* Single samples read before being packed into a pair with __PKHBT */
00097 
00098 #endif  /*  #ifdef UNALIGNED_SUPPORT_DISABLE   */
00099 
00100   /* The algorithm implementation is based on the lengths of the inputs. */
00101   /* srcB is always made to slide across srcA. */
00102   /* The inputs are swapped below when necessary so that srcBLen <= srcALen from here on. */
00103   if (srcALen >= srcBLen)
00104   {
00105     /* pIn1 takes the longer sequence (srcA) */
00106     pIn1 = pSrcA;
00107 
00108     /* pIn2 takes the shorter sequence (srcB) */
00109     pIn2 = pSrcB;
00110 
00111   }
00112   else
00113   {
00114     /* pIn1 takes the longer sequence (srcB) */
00115     pIn1 = pSrcB;
00116 
00117     /* pIn2 takes the shorter sequence (srcA) */
00118     pIn2 = pSrcA;
00119 
00120     /* Swap the lengths so that srcBLen is the shorter (or equal) one */
00121     j = srcBLen;
00122     srcBLen = srcALen;
00123     srcALen = j;
00124   }
00125 
00126   /* Point at the last of the srcBLen slots of scratch2; the copy below fills them back to front */
00127   pScr2 = pScratch2 + srcBLen - 1;
00128 
00129   /* Read the shorter sequence from its start */
00130   px = pIn2;
00131 
00132   /* Number of unrolled iterations, each copying 4 samples. */
00133   k = srcBLen >> 2U;
00134 
00135   /* First part of the processing with loop unrolling copies 4 data points at a time.
00136    ** a second loop below copies for the remaining 1 to 3 samples. */
00137   /* Copy smaller length input sequence in reverse order into second scratch buffer */
00138   while (k > 0U)
00139   {
00140     /* Copy four samples, reversing their order */
00141     *pScr2-- = *px++;
00142     *pScr2-- = *px++;
00143     *pScr2-- = *px++;
00144     *pScr2-- = *px++;
00145 
00146     /* Decrement the loop counter */
00147     k--;
00148   }
00149 
00150   /* If the count is not a multiple of 4, copy remaining samples here.
00151    ** No loop unrolling is used. */
00152   k = srcBLen % 0x4U;
00153 
00154   while (k > 0U)
00155   {
00156     /* Copy one remaining sample, continuing the reversed order */
00157     *pScr2-- = *px++;
00158 
00159     /* Decrement the loop counter */
00160     k--;
00161   }
00162 
00163   /* Initialize temporary scratch pointer */
00164   pScr1 = pScratch1;
00165 
00166   /* Assuming scratch1 buffer is aligned by 32-bit */
00167   /* Fill (srcBLen - 1U) zeros in scratch buffer. NOTE(review): srcBLen - 1U wraps around if the shorter length is 0; presumably callers must pass non-zero lengths - confirm */
00168   arm_fill_q15(0, pScr1, (srcBLen - 1U));
00169 
00170   /* Update temporary scratch pointer */
00171   pScr1 += (srcBLen - 1U);
00172 
00173   /* Copy the longer sequence (srcALen samples) into scratch1 after the leading zeros */
00174 
00175 #ifndef UNALIGNED_SUPPORT_DISABLE
00176 
00177   /* Copy (srcALen) samples in scratch buffer */
00178   arm_copy_q15(pIn1, pScr1, srcALen);
00179 
00180   /* Update pointers */
00181   pScr1 += srcALen;
00182 
00183 #else
00184 
00185   /* Number of unrolled iterations, each copying 4 samples. */
00186   k = srcALen >> 2U;
00187 
00188   /* First part of the processing with loop unrolling copies 4 data points at a time.
00189    ** a second loop below copies for the remaining 1 to 3 samples. */
00190   while (k > 0U)
00191   {
00192     /* Copy four samples of the longer sequence in forward order */
00193     *pScr1++ = *pIn1++;
00194     *pScr1++ = *pIn1++;
00195     *pScr1++ = *pIn1++;
00196     *pScr1++ = *pIn1++;
00197 
00198     /* Decrement the loop counter */
00199     k--;
00200   }
00201 
00202   /* If the count is not a multiple of 4, copy remaining samples here.
00203    ** No loop unrolling is used. */
00204   k = srcALen % 0x4U;
00205 
00206   while (k > 0U)
00207   {
00208     /* Copy one remaining sample of the longer sequence in forward order */
00209     *pScr1++ = *pIn1++;
00210 
00211     /* Decrement the loop counter */
00212     k--;
00213   }
00214 
00215 #endif
00216 
00217 
00218 #ifndef UNALIGNED_SUPPORT_DISABLE
00219 
00220   /* Fill (srcBLen - 1U) zeros at end of scratch buffer */
00221   arm_fill_q15(0, pScr1, (srcBLen - 1U));
00222 
00223   /* Update pointer */
00224   pScr1 += (srcBLen - 1U);
00225 
00226 #else
00227 
00228   /* Number of unrolled iterations, each writing 4 zeros. */
00229   k = (srcBLen - 1U) >> 2U;
00230 
00231   /* First part of the processing with loop unrolling writes 4 zeros at a time.
00232    ** a second loop below writes the remaining 1 to 3 zeros. */
00233   while (k > 0U)
00234   {
00235     /* Write four trailing zeros */
00236     *pScr1++ = 0;
00237     *pScr1++ = 0;
00238     *pScr1++ = 0;
00239     *pScr1++ = 0;
00240 
00241     /* Decrement the loop counter */
00242     k--;
00243   }
00244 
00245   /* If the count is not a multiple of 4, write the remaining zeros here.
00246    ** No loop unrolling is used. */
00247   k = (srcBLen - 1U) % 0x4U;
00248 
00249   while (k > 0U)
00250   {
00251     /* Write one remaining trailing zero */
00252     *pScr1++ = 0;
00253 
00254     /* Decrement the loop counter */
00255     k--;
00256   }
00257 
00258 #endif
00259 
00260   /* Remember the start of scratch2, which now holds the shorter sequence in reverse order */
00261   py = pScratch2;
00262 
00263 
00264   /* From here on pIn2 walks the reversed shorter sequence in scratch2 */
00265   pIn2 = py;
00266 
00267   /* First part of the processing with loop unrolling process 4 data points at a time.
00268    ** a second loop below process for the remaining 1 to 3 samples. */
00269 
00270   /* Actual convolution process starts here: number of 4-output blocks out of (srcALen + srcBLen - 1) outputs */
00271   blkCnt = (srcALen + srcBLen - 1U) >> 2;
00272 
00273   while (blkCnt > 0)
00274   {
00275     /* Initialize temporary scratch pointer at the current scratch1 window (pScratch1 is advanced at the end of each block) */
00276     pScr1 = pScratch1;
00277 
00278     /* Clear Accumulators */
00279     acc0 = 0;
00280     acc1 = 0;
00281     acc2 = 0;
00282     acc3 = 0;
00283 
00284     /* Read two samples from scratch1 buffer */
00285     x1 = *__SIMD32(pScr1)++;
00286 
00287     /* Read next two samples from scratch1 buffer */
00288     x2 = *__SIMD32(pScr1)++;
00289 
00290     tapCnt = (srcBLen) >> 2U;                    /* Number of 4-tap groups */
00291 
00292     while (tapCnt > 0U)
00293     {
00294 
00295 #ifndef UNALIGNED_SUPPORT_DISABLE
00296 
00297       /* Read four samples from smaller buffer */
00298       y1 = _SIMD32_OFFSET(pIn2);
00299       y2 = _SIMD32_OFFSET(pIn2 + 2U);
00300 
00301       /* multiply and accumulate */
00302       acc0 = __SMLALD(x1, y1, acc0);
00303       acc2 = __SMLALD(x2, y1, acc2);
00304 
00305       /* pack input data: combine halves of x1 and x2 into x3 for the odd-offset accumulators */
00306 #ifndef ARM_MATH_BIG_ENDIAN
00307       x3 = __PKHBT(x2, x1, 0);
00308 #else
00309       x3 = __PKHBT(x1, x2, 0);
00310 #endif
00311 
00312       /* multiply and accumulate */
00313       acc1 = __SMLALDX(x3, y1, acc1);
00314 
00315       /* Read next two samples from scratch1 buffer */
00316       x1 = _SIMD32_OFFSET(pScr1);
00317 
00318       /* multiply and accumulate */
00319       acc0 = __SMLALD(x2, y2, acc0);
00320       acc2 = __SMLALD(x1, y2, acc2);
00321 
00322       /* pack input data: combine halves of the new x1 and x2 */
00323 #ifndef ARM_MATH_BIG_ENDIAN
00324       x3 = __PKHBT(x1, x2, 0);
00325 #else
00326       x3 = __PKHBT(x2, x1, 0);
00327 #endif
00328 
00329       acc3 = __SMLALDX(x3, y1, acc3);
00330       acc1 = __SMLALDX(x3, y2, acc1);
00331 
00332       x2 = _SIMD32_OFFSET(pScr1 + 2U);
00333 
00334 #ifndef ARM_MATH_BIG_ENDIAN
00335       x3 = __PKHBT(x2, x1, 0);
00336 #else
00337       x3 = __PKHBT(x1, x2, 0);
00338 #endif
00339 
00340       acc3 = __SMLALDX(x3, y2, acc3);
00341 
00342 #else
00343 
00344       /* Read four samples from smaller buffer, one at a time, and pack them into y1 and y2 */
00345       a = *pIn2;
00346       b = *(pIn2 + 1);
00347 
00348 #ifndef ARM_MATH_BIG_ENDIAN
00349       y1 = __PKHBT(a, b, 16);
00350 #else
00351       y1 = __PKHBT(b, a, 16);
00352 #endif
00353 
00354       a = *(pIn2 + 2);
00355       b = *(pIn2 + 3);
00356 #ifndef ARM_MATH_BIG_ENDIAN
00357       y2 = __PKHBT(a, b, 16);
00358 #else
00359       y2 = __PKHBT(b, a, 16);
00360 #endif
00361 
00362       acc0 = __SMLALD(x1, y1, acc0);
00363 
00364       acc2 = __SMLALD(x2, y1, acc2);
00365 
00366 #ifndef ARM_MATH_BIG_ENDIAN
00367       x3 = __PKHBT(x2, x1, 0);
00368 #else
00369       x3 = __PKHBT(x1, x2, 0);
00370 #endif
00371 
00372       acc1 = __SMLALDX(x3, y1, acc1);
00373 
00374       a = *pScr1;
00375       b = *(pScr1 + 1);
00376 
00377 #ifndef ARM_MATH_BIG_ENDIAN
00378       x1 = __PKHBT(a, b, 16);
00379 #else
00380       x1 = __PKHBT(b, a, 16);
00381 #endif
00382 
00383       acc0 = __SMLALD(x2, y2, acc0);
00384 
00385       acc2 = __SMLALD(x1, y2, acc2);
00386 
00387 #ifndef ARM_MATH_BIG_ENDIAN
00388       x3 = __PKHBT(x1, x2, 0);
00389 #else
00390       x3 = __PKHBT(x2, x1, 0);
00391 #endif
00392 
00393       acc3 = __SMLALDX(x3, y1, acc3);
00394 
00395       acc1 = __SMLALDX(x3, y2, acc1);
00396 
00397       a = *(pScr1 + 2);
00398       b = *(pScr1 + 3);
00399 
00400 #ifndef ARM_MATH_BIG_ENDIAN
00401       x2 = __PKHBT(a, b, 16);
00402 #else
00403       x2 = __PKHBT(b, a, 16);
00404 #endif
00405 
00406 #ifndef ARM_MATH_BIG_ENDIAN
00407       x3 = __PKHBT(x2, x1, 0);
00408 #else
00409       x3 = __PKHBT(x1, x2, 0);
00410 #endif
00411 
00412       acc3 = __SMLALDX(x3, y2, acc3);
00413 
00414 #endif  /*  #ifndef UNALIGNED_SUPPORT_DISABLE   */
00415 
00416       pIn2 += 4U;
00417       pScr1 += 4U;
00418 
00419 
00420       /* Decrement the loop counter */
00421       tapCnt--;
00422     }
00423 
00424     /* Step the scratch pointer back by four samples for the scalar tail loop; presumably this undoes the lead created by the x1/x2 pre-read before the tap loop (depends on __SIMD32 post-increment semantics - confirm) */
00425     pScr1 -= 4U;
00426 
00427     /* Remaining (srcBLen & 3) taps of the shorter sequence, handled one tap at a time */
00428     tapCnt = (srcBLen) & 3U;
00429 
00430     while (tapCnt > 0U)
00431     {
00432 
00433       /* accumulate the results: one tap multiplied with four consecutive scratch1 samples, one product per accumulator */
00434       acc0 += (*pScr1++ * *pIn2);
00435       acc1 += (*pScr1++ * *pIn2);
00436       acc2 += (*pScr1++ * *pIn2);
00437       acc3 += (*pScr1++ * *pIn2++);
00438 
00439       pScr1 -= 3U;                               /* Net advance of one scratch1 sample per tap */
00440 
00441       /* Decrement the loop counter */
00442       tapCnt--;
00443     }
00444 
00445     blkCnt--;
00446 
00447 
00448     /* Store the results in the accumulators in the destination buffer: each is shifted right by 15, saturated with __SSAT(.., 16), and two results are packed per 32-bit store. */
00449 
00450 #ifndef ARM_MATH_BIG_ENDIAN
00451 
00452     *__SIMD32(pOut)++ =
00453       __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
00454 
00455     *__SIMD32(pOut)++ =
00456       __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
00457 
00458 #else
00459 
00460     *__SIMD32(pOut)++ =
00461       __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
00462 
00463     *__SIMD32(pOut)++ =
00464       __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
00465 
00466 
00467 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN       */
00468 
00469     /* Rewind pIn2 to the start of scratch2 for the next block */
00470     pIn2 = py;
00471 
00472     pScratch1 += 4U;                             /* Slide the scratch1 window past the four outputs just produced */
00473 
00474   }
00475 
00476 
00477   blkCnt = (srcALen + srcBLen - 1U) & 0x3;
00478 
00479   /* Compute the remaining ((srcALen + srcBLen - 1) & 3) output samples one at a time */
00480   while (blkCnt > 0)
00481   {
00482     /* Initialize temporary scratch pointer at the current scratch1 window */
00483     pScr1 = pScratch1;
00484 
00485     /* Clear Accumulators */
00486     acc0 = 0;
00487 
00488     tapCnt = (srcBLen) >> 1U;
00489 
00490     while (tapCnt > 0U)
00491     {
00492 
00493       /* Multiply-accumulate two taps per iteration */
00494       acc0 += (*pScr1++ * *pIn2++);
00495       acc0 += (*pScr1++ * *pIn2++);
00496 
00497       /* Decrement the loop counter */
00498       tapCnt--;
00499     }
00500 
00501     tapCnt = (srcBLen) & 1U;
00502 
00503     /* Last tap when srcBLen is odd */
00504     while (tapCnt > 0U)
00505     {
00506 
00507       /* accumulate the results */
00508       acc0 += (*pScr1++ * *pIn2++);
00509 
00510       /* Decrement the loop counter */
00511       tapCnt--;
00512     }
00513 
00514     blkCnt--;
00515 
00516     /* The accumulated sum is shifted right by 15 and saturated to 16 bits (1.15 format).
00517      ** Then store the output in the destination buffer. */
00518     *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));
00519 
00520 
00521     /* Rewind pIn2 to the start of scratch2 for the next output sample */
00522     pIn2 = py;
00523 
00524     pScratch1 += 1U;                             /* Slide the scratch1 window by one sample */
00525 
00526   }
00527 
00528 }
00529 
00530 
00531 /**
00532  * @} end of Conv group
00533  */
00534