Added CMSIS5 DSP and NN folder. Needs some work.

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_correlate_opt_q15.c Source File

arm_correlate_opt_q15.c

00001 /* ----------------------------------------------------------------------
00002  * Project:      CMSIS DSP Library
00003  * Title:        arm_correlate_opt_q15.c
00004  * Description:  Correlation of Q15 sequences
00005  *
00006  * $Date:        27. January 2017
00007  * $Revision:    V.1.5.1
00008  *
00009  * Target Processor: Cortex-M cores
00010  * -------------------------------------------------------------------- */
00011 /*
00012  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
00013  *
00014  * SPDX-License-Identifier: Apache-2.0
00015  *
00016  * Licensed under the Apache License, Version 2.0 (the License); you may
00017  * not use this file except in compliance with the License.
00018  * You may obtain a copy of the License at
00019  *
00020  * www.apache.org/licenses/LICENSE-2.0
00021  *
00022  * Unless required by applicable law or agreed to in writing, software
00023  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
00024  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00025  * See the License for the specific language governing permissions and
00026  * limitations under the License.
00027  */
00028 
00029 #include "arm_math.h"
00030 
00031 /**
00032  * @ingroup groupFilters
00033  */
00034 
00035 /**
00036  * @addtogroup Corr
00037  * @{
00038  */
00039 
00040 /**
00041  * @brief Correlation of Q15 sequences (variant that works on a caller-supplied scratch buffer).
00042  * @param[in] *pSrcA points to the first input sequence.
00043  * @param[in] srcALen length of the first input sequence.
00044  * @param[in] *pSrcB points to the second input sequence.
00045  * @param[in] srcBLen length of the second input sequence.
00046  * @param[out] *pDst points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.
00047  * @param[in]  *pScratch points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
00048  * @return none.
00049  *
00050  * \par Restrictions
00051  *  If the silicon does not support unaligned memory access, enable the macro UNALIGNED_SUPPORT_DISABLE.
00052  *  In this case the input, output and scratch buffers should be 32-bit aligned.
00053  *
00054  * @details
00055  * <b>Scaling and Overflow Behavior:</b>
00056  *
00057  * \par
00058  * The function is implemented using a 64-bit internal accumulator.
00059  * Both inputs are in 1.15 format and multiplications yield a 2.30 result.
00060  * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
00061  * This approach provides 33 guard bits and there is no risk of overflow.
00062  * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
00063  *
00064  * \par
00065  * Refer to <code>arm_correlate_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
00066  * \par Note
00067  * Only (srcALen + srcBLen - 1) entries of pDst are stored; the other |srcALen - srcBLen| entries are skipped, not zeroed (presumably the caller must clear pDst beforehand - confirm against the library documentation).
00068  */
00069 
00070 
00071 void arm_correlate_opt_q15(
00072   q15_t * pSrcA,
00073   uint32_t srcALen,
00074   q15_t * pSrcB,
00075   uint32_t srcBLen,
00076   q15_t * pDst,
00077   q15_t * pScratch)
00078 {
00079   q15_t *pIn1;                                   /* inputA pointer (the longer sequence after the swap below) */
00080   q15_t *pIn2;                                   /* inputB pointer (the shorter sequence after the swap below) */
00081   q63_t acc0, acc1, acc2, acc3;                  /* Accumulators, one per output sample of a block of four */
00082   q15_t *py;                                     /* Saved start of inputB, used to rewind pIn2 */
00083   q31_t x1, x2, x3;                              /* packed pairs of scratch-buffer samples */
00084   uint32_t j, blkCnt, outBlockSize;              /* loop counters / output length */
00085   int32_t inc = 1;                               /* output pointer increment (+1, or -1 when the inputs are swapped) */
00086   uint32_t tapCnt;                               /* loop counter over inputB samples */
00087   q31_t y1, y2;                                  /* packed pairs of inputB samples */
00088   q15_t *pScr;                                   /* working pointer into the scratch buffer */
00089   q15_t *pOut = pDst;                            /* output pointer               */
00090 #ifdef UNALIGNED_SUPPORT_DISABLE
00091 
00092   q15_t a, b;                                    /* single samples that are packed manually into a 32-bit word */
00093 
00094 #endif  /*  #ifdef UNALIGNED_SUPPORT_DISABLE   */
00095 
00096   /* The algorithm implementation is based on the lengths of the inputs. */
00097   /* srcB is always made to slide across srcA. */
00098   /* So srcBLen is always considered as shorter or equal to srcALen */
00099   /* But CORR(x, y) is reverse of CORR(y, x) */
00100   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
00101   /* and the destination pointer modifier, inc is set to -1 */
00102   /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
00103   /* But to improve the performance,
00104    * zeroes are accounted for in the output instead of zero padding either of the inputs */
00105   /* If srcALen > srcBLen,
00106    * (srcALen - srcBLen) zeroes belong at the start of the output buffer */
00107   /* If srcALen < srcBLen,
00108    * (srcBLen - srcALen) zeroes belong at the end of the output buffer.
             * NOTE: the code below only skips those positions; it never stores to them. */
00109   if (srcALen >= srcBLen)
00110   {
00111     /* Initialization of inputA pointer */
00112     pIn1 = (pSrcA);
00113 
00114     /* Initialization of inputB pointer */
00115     pIn2 = (pSrcB);
00116 
00117     /* Number of output samples is calculated */
00118     outBlockSize = (2U * srcALen) - 1U;
00119 
00120     /* When srcALen > srcBLen, zero padding would be needed on srcB
00121      * to make their lengths equal.
00122      * Instead, (outBlockSize - (srcALen + srcBLen - 1))
00123      * leading output samples are treated as zero and skipped */
00124     j = outBlockSize - (srcALen + (srcBLen - 1U));
00125 
00126     /* Advance the output pointer past the j skipped (zero) positions */
00127     pOut += j;
00128 
00129   }
00130   else
00131   {
00132     /* Initialization of inputA pointer: the longer sequence (srcB) takes the role of inputA */
00133     pIn1 = (pSrcB);
00134 
00135     /* Initialization of inputB pointer: the shorter sequence (srcA) takes the role of inputB */
00136     pIn2 = (pSrcA);
00137 
00138     /* Swap the lengths so that srcBLen holds the shorter length from here on */
00139     j = srcBLen;
00140     srcBLen = srcALen;
00141     srcALen = j;
00142 
00143     /* CORR(x, y) = Reverse order(CORR(y, x)) */
00144     /* Hence set the destination pointer to point to the last output sample */
00145     pOut = pDst + ((srcALen + srcBLen) - 2U);
00146 
00147     /* Destination address modifier is set to -1 */
00148     inc = -1;
00149 
00150   }
00151 
00152   pScr = pScratch;
00153 
00154   /* Fill (srcBLen - 1U) zeros at the start of the scratch buffer */
00155   arm_fill_q15(0, pScr, (srcBLen - 1U));
00156 
00157   /* Update temporary scratch pointer */
00158   pScr += (srcBLen - 1U);
00159 
00160 #ifndef UNALIGNED_SUPPORT_DISABLE
00161 
00162   /* Copy (srcALen) samples of inputA into the scratch buffer */
00163   arm_copy_q15(pIn1, pScr, srcALen);
00164 
00165   /* Update pointers */
00166   /* pIn1 is left as is: it is not read again after this copy */
00167   pScr += srcALen;
00168 
00169 #else
00170 
00171   /* Apply loop unrolling and do 4 Copies simultaneously. */
00172   j = srcALen >> 2U;
00173 
00174   /* First part of the processing with loop unrolling copies 4 data points at a time.
00175    ** a second loop below copies the remaining 1 to 3 samples. */
00176   while (j > 0U)
00177   {
00178     /* copy four inputA samples into the scratch buffer, in forward order */
00179     *pScr++ = *pIn1++;
00180     *pScr++ = *pIn1++;
00181     *pScr++ = *pIn1++;
00182     *pScr++ = *pIn1++;
00183 
00184     /* Decrement the loop counter */
00185     j--;
00186   }
00187 
00188   /* If the count is not a multiple of 4, copy remaining samples here.
00189    ** No loop unrolling is used. */
00190   j = srcALen % 0x4U;
00191 
00192   while (j > 0U)
00193   {
00194     /* copy one remaining inputA sample into the scratch buffer */
00195     *pScr++ = *pIn1++;
00196 
00197     /* Decrement the loop counter */
00198     j--;
00199   }
00200 
00201 #endif  /*  #ifndef UNALIGNED_SUPPORT_DISABLE   */
00202 
00203 #ifndef UNALIGNED_SUPPORT_DISABLE
00204 
00205   /* Fill (srcBLen - 1U) zeros at end of scratch buffer */
00206   arm_fill_q15(0, pScr, (srcBLen - 1U));
00207 
00208   /* Update pointer */
00209   pScr += (srcBLen - 1U);
00210 
00211 #else
00212 
00213 /* Apply loop unrolling and write 4 zeros per iteration. */
00214   j = (srcBLen - 1U) >> 2U;
00215 
00216   /* First part of the processing with loop unrolling writes 4 zeros at a time.
00217    ** a second loop below writes the remaining 1 to 3 zeros. */
00218   while (j > 0U)
00219   {
00220     /* write four trailing zeros into the scratch buffer */
00221     *pScr++ = 0;
00222     *pScr++ = 0;
00223     *pScr++ = 0;
00224     *pScr++ = 0;
00225 
00226     /* Decrement the loop counter */
00227     j--;
00228   }
00229 
00230   /* If the count is not a multiple of 4, write the remaining zeros here.
00231    ** No loop unrolling is used. */
00232   j = (srcBLen - 1U) % 0x4U;
00233 
00234   while (j > 0U)
00235   {
00236     /* write one remaining trailing zero into the scratch buffer */
00237     *pScr++ = 0;
00238 
00239     /* Decrement the loop counter */
00240     j--;
00241   }
00242 
00243 #endif  /*  #ifndef UNALIGNED_SUPPORT_DISABLE   */
00244 
00245   /* Remember the start of inputB so that pIn2 can be rewound after every output block */
00246   py = pIn2;
00247 
00248 
00249   /* Actual correlation process starts here; blkCnt counts groups of four output samples */
00250   blkCnt = (srcALen + srcBLen - 1U) >> 2;
00251 
00252   while (blkCnt > 0)
00253   {
00254     /* Initialize temporary scratch pointer to the current scratch window */
00255     pScr = pScratch;
00256 
00257     /* Clear Accumulators */
00258     acc0 = 0;
00259     acc1 = 0;
00260     acc2 = 0;
00261     acc3 = 0;
00262 
00263     /* Pre-load the first packed pair of samples from the scratch buffer */
00264     x1 = *__SIMD32(pScr)++;
00265 
00266     /* Pre-load the next packed pair of samples from the scratch buffer */
00267     x2 = *__SIMD32(pScr)++;
00268 
00269     tapCnt = (srcBLen) >> 2U;
00270 
00271     while (tapCnt > 0U)
00272     {
00273 
00274 #ifndef UNALIGNED_SUPPORT_DISABLE
00275 
00276       /* Read four inputB samples as two packed pairs */
00277       y1 = _SIMD32_OFFSET(pIn2);
00278       y2 = _SIMD32_OFFSET(pIn2 + 2U);
00279 
00280       acc0 = __SMLALD(x1, y1, acc0);
00281 
00282       acc2 = __SMLALD(x2, y1, acc2);
00283 
00284 #ifndef ARM_MATH_BIG_ENDIAN
00285       x3 = __PKHBT(x2, x1, 0);
00286 #else
00287       x3 = __PKHBT(x1, x2, 0);
00288 #endif
00289 
00290       acc1 = __SMLALDX(x3, y1, acc1);
00291 
00292       x1 = _SIMD32_OFFSET(pScr);
00293 
00294       acc0 = __SMLALD(x2, y2, acc0);
00295 
00296       acc2 = __SMLALD(x1, y2, acc2);
00297 
00298 #ifndef ARM_MATH_BIG_ENDIAN
00299       x3 = __PKHBT(x1, x2, 0);
00300 #else
00301       x3 = __PKHBT(x2, x1, 0);
00302 #endif
00303 
00304       acc3 = __SMLALDX(x3, y1, acc3);
00305 
00306       acc1 = __SMLALDX(x3, y2, acc1);
00307 
00308       x2 = _SIMD32_OFFSET(pScr + 2U);
00309 
00310 #ifndef ARM_MATH_BIG_ENDIAN
00311       x3 = __PKHBT(x2, x1, 0);
00312 #else
00313       x3 = __PKHBT(x1, x2, 0);
00314 #endif
00315 
00316       acc3 = __SMLALDX(x3, y2, acc3);
00317 
00318 #else
00319 
00320       /* Read four inputB samples one at a time and pack them into two pairs */
00321       a = *pIn2;
00322       b = *(pIn2 + 1);
00323 
00324 #ifndef ARM_MATH_BIG_ENDIAN
00325       y1 = __PKHBT(a, b, 16);
00326 #else
00327       y1 = __PKHBT(b, a, 16);
00328 #endif
00329 
00330       a = *(pIn2 + 2);
00331       b = *(pIn2 + 3);
00332 #ifndef ARM_MATH_BIG_ENDIAN
00333       y2 = __PKHBT(a, b, 16);
00334 #else
00335       y2 = __PKHBT(b, a, 16);
00336 #endif
00337 
00338       acc0 = __SMLALD(x1, y1, acc0);
00339 
00340       acc2 = __SMLALD(x2, y1, acc2);
00341 
00342 #ifndef ARM_MATH_BIG_ENDIAN
00343       x3 = __PKHBT(x2, x1, 0);
00344 #else
00345       x3 = __PKHBT(x1, x2, 0);
00346 #endif
00347 
00348       acc1 = __SMLALDX(x3, y1, acc1);
00349 
00350       a = *pScr;
00351       b = *(pScr + 1);
00352 
00353 #ifndef ARM_MATH_BIG_ENDIAN
00354       x1 = __PKHBT(a, b, 16);
00355 #else
00356       x1 = __PKHBT(b, a, 16);
00357 #endif
00358 
00359       acc0 = __SMLALD(x2, y2, acc0);
00360 
00361       acc2 = __SMLALD(x1, y2, acc2);
00362 
00363 #ifndef ARM_MATH_BIG_ENDIAN
00364       x3 = __PKHBT(x1, x2, 0);
00365 #else
00366       x3 = __PKHBT(x2, x1, 0);
00367 #endif
00368 
00369       acc3 = __SMLALDX(x3, y1, acc3);
00370 
00371       acc1 = __SMLALDX(x3, y2, acc1);
00372 
00373       a = *(pScr + 2);
00374       b = *(pScr + 3);
00375 
00376 #ifndef ARM_MATH_BIG_ENDIAN
00377       x2 = __PKHBT(a, b, 16);
00378 #else
00379       x2 = __PKHBT(b, a, 16);
00380 #endif
00381 
00382 #ifndef ARM_MATH_BIG_ENDIAN
00383       x3 = __PKHBT(x2, x1, 0);
00384 #else
00385       x3 = __PKHBT(x1, x2, 0);
00386 #endif
00387 
00388       acc3 = __SMLALDX(x3, y2, acc3);
00389 
00390 #endif  /*  #ifndef UNALIGNED_SUPPORT_DISABLE   */
00391 
00392       pIn2 += 4U;
00393 
00394       pScr += 4U;
00395 
00396 
00397       /* Decrement the loop counter */
00398       tapCnt--;
00399     }
00400 
00401 
00402 
00403     /* pScr runs four samples ahead because of the pre-loaded pairs; step it back for the sample-by-sample loop below */
00404     pScr -= 4U;
00405 
00406 
00407     /* Handle the remaining (srcBLen mod 4) samples of the shorter sequence one at a time */
00408     tapCnt = (srcBLen) & 3U;
00409 
00410     while (tapCnt > 0U)
00411     {
00412 
00413       /* accumulate the results: one inputB sample against four consecutive scratch samples */
00414       acc0 += (*pScr++ * *pIn2);
00415       acc1 += (*pScr++ * *pIn2);
00416       acc2 += (*pScr++ * *pIn2);
00417       acc3 += (*pScr++ * *pIn2++);
00418 
00419       pScr -= 3U;                                /* net advance of one scratch sample per iteration */
00420 
00421       /* Decrement the loop counter */
00422       tapCnt--;
00423     }
00424 
00425     blkCnt--;
00426 
00427 
00428     /* Store the four results: drop the low 15 bits, saturate to 16 bits, step the output pointer by inc. */
00429     *pOut = (__SSAT(acc0 >> 15U, 16));
00430     pOut += inc;
00431     *pOut = (__SSAT(acc1 >> 15U, 16));
00432     pOut += inc;
00433     *pOut = (__SSAT(acc2 >> 15U, 16));
00434     pOut += inc;
00435     *pOut = (__SSAT(acc3 >> 15U, 16));
00436     pOut += inc;
00437 
00438     /* Rewind the inputB pointer to its start */
00439     pIn2 = py;
00440 
00441     pScratch += 4U;                              /* slide the scratch window past the four outputs just produced */
00442 
00443   }
00444 
00445 
00446   blkCnt = (srcALen + srcBLen - 1U) & 0x3;
00447 
00448   /* Calculate correlation for the remaining (up to 3) output samples, one at a time */
00449   while (blkCnt > 0)
00450   {
00451     /* Initialize temporary scratch pointer to the current scratch window */
00452     pScr = pScratch;
00453 
00454     /* Clear Accumulator */
00455     acc0 = 0;
00456 
00457     tapCnt = (srcBLen) >> 1U;
00458 
00459     while (tapCnt > 0U)
00460     {
00461 
00462       acc0 += (*pScr++ * *pIn2++);
00463       acc0 += (*pScr++ * *pIn2++);
00464 
00465       /* Decrement the loop counter */
00466       tapCnt--;
00467     }
00468 
00469     tapCnt = (srcBLen) & 1U;
00470 
00471     /* apply same above for the last sample of an odd-length shorter sequence */
00472     while (tapCnt > 0U)
00473     {
00474 
00475       /* accumulate the results */
00476       acc0 += (*pScr++ * *pIn2++);
00477 
00478       /* Decrement the loop counter */
00479       tapCnt--;
00480     }
00481 
00482     blkCnt--;
00483 
00484     /* Store the result: drop the low 15 bits and saturate to 16 bits. */
00485     *pOut = (q15_t) (__SSAT((acc0 >> 15), 16));
00486 
00487     pOut += inc;
00488 
00489     /* Rewind the inputB pointer to its start */
00490     pIn2 = py;
00491 
00492     pScratch += 1U;                              /* slide the scratch window by one output sample */
00493 
00494   }
00495 
00496 
00497 }
00498 
00499 /**
00500  * @} end of Corr group
00501  */
00502