Added CMSIS5 DSP and NN folder. Needs some work.

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_correlate_opt_q7.c Source File

arm_correlate_opt_q7.c

00001 /* ----------------------------------------------------------------------
00002  * Project:      CMSIS DSP Library
00003  * Title:        arm_correlate_opt_q7.c
00004  * Description:  Correlation of Q7 sequences
00005  *
00006  * $Date:        27. January 2017
00007  * $Revision:    V.1.5.1
00008  *
00009  * Target Processor: Cortex-M cores
00010  * -------------------------------------------------------------------- */
00011 /*
00012  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
00013  *
00014  * SPDX-License-Identifier: Apache-2.0
00015  *
00016  * Licensed under the Apache License, Version 2.0 (the License); you may
00017  * not use this file except in compliance with the License.
00018  * You may obtain a copy of the License at
00019  *
00020  * www.apache.org/licenses/LICENSE-2.0
00021  *
00022  * Unless required by applicable law or agreed to in writing, software
00023  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
00024  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00025  * See the License for the specific language governing permissions and
00026  * limitations under the License.
00027  */
00028 
00029 #include "arm_math.h"
00030 
00031 /**
00032  * @ingroup groupFilters
00033  */
00034 
00035 /**
00036  * @addtogroup Corr
00037  * @{
00038  */
00039 
00040 /**
00041  * @brief Correlation of Q7 sequences.
00042  * @param[in] *pSrcA points to the first input sequence.
00043  * @param[in] srcALen length of the first input sequence.
00044  * @param[in] *pSrcB points to the second input sequence.
00045  * @param[in] srcBLen length of the second input sequence.
00046  * @param[out] *pDst points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.
00047  * @param[in]  *pScratch1 points to scratch buffer (of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
00048  * @param[in]  *pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
00049  * @return none.
00050  *
00051  *
00052  * \par Restrictions
00053  *  If the silicon does not support unaligned memory access, enable the macro UNALIGNED_SUPPORT_DISABLE.
00054  *  In this case the input, output, scratch1 and scratch2 buffers should be 32-bit aligned.
00055  *
00056  * @details
00057  * <b>Scaling and Overflow Behavior:</b>
00058  *
00059  * \par
00060  * The function is implemented using a 32-bit internal accumulator.
00061  * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
00062  * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
00063  * This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>.
00064  * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and saturated to 1.7 format.
00065  *
00066  *
00067  */
00068 
00069 
00070 
00071 void arm_correlate_opt_q7(
00072   q7_t * pSrcA,                                  /* first input sequence (only read)                          */
00073   uint32_t srcALen,                              /* number of samples in pSrcA                                */
00074   q7_t * pSrcB,                                  /* second input sequence (only read)                         */
00075   uint32_t srcBLen,                              /* number of samples in pSrcB                                */
00076   q7_t * pDst,                                   /* output buffer, written through pOut                       */
00077   q15_t * pScratch1,                             /* scratch: receives zero-padded q15 copy of longer input    */
00078   q15_t * pScratch2)                             /* scratch: receives q15 copy of shorter input               */
00079 {
00080   q7_t *pOut = pDst;                             /* output pointer                */
00081   q15_t *pScr1 = pScratch1;                      /* Running pointer into scratch1 */
00082   q15_t *pScr2 = pScratch2;                      /* Running pointer into scratch2 */
00083   q7_t *pIn1;                                    /* longer input (A, or B if the inputs are swapped)  */
00084   q7_t *pIn2;                                    /* shorter input (B, or A if the inputs are swapped) */
00085   q15_t *py;                                     /* Start of scratch2, used to rewind pScr2 */
00086   q31_t acc0, acc1, acc2, acc3;                  /* Accumulators, one per output of a group of four */
00087   uint32_t j, k = 0U, blkCnt;                    /* j: zero-region count / swap temporary; k, blkCnt: loop counters */
00088   int32_t inc = 1;                               /* output pointer increment (+1, or -1 when inputs are swapped) */
00089   uint32_t outBlockSize;                         /* Number of output samples (only set when srcALen >= srcBLen) */
00090   q15_t x4;                                      /* Temporary holding one sample widened to q15 */
00091   uint32_t tapCnt;                               /* loop counter over the shorter sequence */
00092   q31_t x1, x2, x3, y1;                          /* Temporary packed input variables */
00093 
00094   /* The algorithm implementation is based on the lengths of the inputs. */
00095   /* srcB is always made to slide across srcA. */
00096   /* So srcBLen is always treated as shorter than or equal to srcALen */
00097   /* But CORR(x, y) is the reverse of CORR(y, x) */
00098   /* So, when srcBLen > srcALen, the inputs are swapped, the output pointer is set to the last computed sample */
00099   /* and the destination pointer modifier, inc, is set to -1 */
00100   /* If srcALen > srcBLen, srcB would have to be zero padded to make the two inputs the same length */
00101   /* But to improve the performance,
00102    * the zeroes are accounted for in the output instead of zero padding either of the inputs */
00103   /* If srcALen > srcBLen,
00104    * the first (srcALen - srcBLen) output samples form the zero region */
00105   /* If srcALen < srcBLen,
00106    * the last (srcBLen - srcALen) output samples form the zero region */
00107   if (srcALen >= srcBLen)
00108   {
00109     /* Initialization of inputA pointer */
00110     pIn1 = (pSrcA);
00111 
00112     /* Initialization of inputB pointer */
00113     pIn2 = (pSrcB);
00114 
00115     /* Number of output samples is calculated */
00116     outBlockSize = (2U * srcALen) - 1U;
00117 
00118     /* When srcALen > srcBLen, zero padding would be needed on srcB
00119      * to make their lengths equal.
00120      * Instead, (outBlockSize - (srcALen + srcBLen - 1)) = (srcALen - srcBLen)
00121      * leading output samples are treated as zero */
00122     j = outBlockSize - (srcALen + (srcBLen - 1U));
00123 
00124     /* Skip the output pointer past the j zero-region samples */
00125     pOut += j;
00126     /* NOTE(review): the skipped samples are not written anywhere in this function - presumably the caller pre-clears pDst; confirm */
00127   }
00128   else
00129   {
00130     /* The longer sequence is srcB, so it takes the inputA role */
00131     pIn1 = (pSrcB);
00132 
00133     /* The shorter sequence is srcA, so it takes the inputB role */
00134     pIn2 = (pSrcA);
00135 
00136     /* Swap the lengths so that srcBLen is the shorter or equal one */
00137     j = srcBLen;
00138     srcBLen = srcALen;
00139     srcALen = j;
00140 
00141     /* CORR(x, y) = Reverse order(CORR(y, x)) */
00142     /* Hence point at output index (srcALen + srcBLen - 2), the last sample computed below */
00143     pOut = pDst + ((srcALen + srcBLen) - 2U);
00144 
00145     /* Destination address modifier is set to -1 so the output is written backwards */
00146     inc = -1;
00147 
00148   }
00149 
00150 
00151   /* Copy (srcBLen) samples of the shorter sequence into scratch2, widened to q15 */
00152   k = srcBLen >> 2U;
00153 
00154   /* First part of the processing with loop unrolling copies 4 data points at a time.
00155    ** a second loop below copies for the remaining 1 to 3 samples. */
00156   while (k > 0U)
00157   {
00158     /* widen four q7 samples to q15 and store them in the same (forward) order */
00159     x4 = (q15_t) * pIn2++;
00160     *pScr2++ = x4;
00161     x4 = (q15_t) * pIn2++;
00162     *pScr2++ = x4;
00163     x4 = (q15_t) * pIn2++;
00164     *pScr2++ = x4;
00165     x4 = (q15_t) * pIn2++;
00166     *pScr2++ = x4;
00167 
00168     /* Decrement the loop counter */
00169     k--;
00170   }
00171 
00172   /* If the count is not a multiple of 4, copy remaining samples here.
00173    ** No loop unrolling is used. */
00174   k = srcBLen % 0x4U;
00175 
00176   while (k > 0U)
00177   {
00178     /* widen one remaining q7 sample to q15 and store it */
00179     x4 = (q15_t) * pIn2++;
00180     *pScr2++ = x4;
00181 
00182     /* Decrement the loop counter */
00183     k--;
00184   }
00185   /* NOTE(review): if the shorter length is 0, (srcBLen - 1U) below wraps around (unsigned) - presumably callers pass non-zero lengths; confirm */
00186   /* Fill (srcBLen - 1U) leading zeros in scratch1 */
00187   arm_fill_q15(0, pScr1, (srcBLen - 1U));
00188 
00189   /* Update temporary scratch pointer */
00190   pScr1 += (srcBLen - 1U);
00191 
00192   /* Copy (srcALen) samples of the longer sequence into scratch1, widened to q15 */
00193   k = srcALen >> 2U;
00194 
00195   /* First part of the processing with loop unrolling copies 4 data points at a time.
00196    ** a second loop below copies for the remaining 1 to 3 samples. */
00197   while (k > 0U)
00198   {
00199     /* widen four q7 samples to q15 and store them in the same (forward) order */
00200     x4 = (q15_t) * pIn1++;
00201     *pScr1++ = x4;
00202     x4 = (q15_t) * pIn1++;
00203     *pScr1++ = x4;
00204     x4 = (q15_t) * pIn1++;
00205     *pScr1++ = x4;
00206     x4 = (q15_t) * pIn1++;
00207     *pScr1++ = x4;
00208 
00209     /* Decrement the loop counter */
00210     k--;
00211   }
00212 
00213   /* If the count is not a multiple of 4, copy remaining samples here.
00214    ** No loop unrolling is used. */
00215   k = srcALen % 0x4U;
00216 
00217   while (k > 0U)
00218   {
00219     /* widen one remaining q7 sample to q15 and store it */
00220     x4 = (q15_t) * pIn1++;
00221     *pScr1++ = x4;
00222 
00223     /* Decrement the loop counter */
00224     k--;
00225   }
00226 
00227 #ifndef UNALIGNED_SUPPORT_DISABLE
00228 
00229   /* Fill (srcBLen - 1U) zeros at end of scratch buffer */
00230   arm_fill_q15(0, pScr1, (srcBLen - 1U));
00231 
00232   /* Update pointer */
00233   pScr1 += (srcBLen - 1U);
00234 
00235 #else
00236 
00237 /* Same trailing zero fill, written out by hand: 4 stores per iteration. */
00238   k = (srcBLen - 1U) >> 2U;
00239 
00240   /* First part of the processing with loop unrolling writes 4 zeros at a time.
00241    ** a second loop below writes the remaining 1 to 3 zeros. */
00242   while (k > 0U)
00243   {
00244     /* write four trailing zero samples */
00245     *pScr1++ = 0;
00246     *pScr1++ = 0;
00247     *pScr1++ = 0;
00248     *pScr1++ = 0;
00249 
00250     /* Decrement the loop counter */
00251     k--;
00252   }
00253 
00254   /* If the count is not a multiple of 4, write the remaining zeros here.
00255    ** No loop unrolling is used. */
00256   k = (srcBLen - 1U) % 0x4U;
00257 
00258   while (k > 0U)
00259   {
00260     /* write one remaining trailing zero sample */
00261     *pScr1++ = 0;
00262 
00263     /* Decrement the loop counter */
00264     k--;
00265   }
00266 
00267 #endif  /*  #ifndef UNALIGNED_SUPPORT_DISABLE   */
00268 
00269   /* Remember the start of the shorter sequence so pScr2 can be rewound after each output */
00270   py = pScratch2;
00271 
00272   /* Rewind pScr2 to the start of scratch2 */
00273   pScr2 = pScratch2;
00274 
00275   /* Actual correlation process starts here: number of groups of four output samples */
00276   blkCnt = (srcALen + srcBLen - 1U) >> 2;
00277 
00278   while (blkCnt > 0)
00279   {
00280     /* Initialize temporary scratch pointer to the current window start in scratch1 */
00281     pScr1 = pScratch1;
00282 
00283     /* Clear Accumulators */
00284     acc0 = 0;
00285     acc1 = 0;
00286     acc2 = 0;
00287     acc3 = 0;
00288 
00289     /* Read two samples from scratch1 buffer */
00290     x1 = *__SIMD32(pScr1)++;
00291 
00292     /* Read next two samples from scratch1 buffer */
00293     x2 = *__SIMD32(pScr1)++;
00294 
00295     tapCnt = (srcBLen) >> 2U;
00296     /* Each pass of this loop consumes four samples of the shorter sequence for all four accumulators */
00297     while (tapCnt > 0U)
00298     {
00299 
00300       /* Read two samples (one 32-bit word) from smaller buffer */
00301       y1 = _SIMD32_OFFSET(pScr2);
00302 
00303       /* multiply and accumulate */
00304       acc0 = __SMLAD(x1, y1, acc0);
00305       acc2 = __SMLAD(x2, y1, acc2);
00306 
00307       /* pack input data: combine halves of x1 and x2 into x3, used only by acc1/acc3 */
00308 #ifndef ARM_MATH_BIG_ENDIAN
00309       x3 = __PKHBT(x2, x1, 0);
00310 #else
00311       x3 = __PKHBT(x1, x2, 0);
00312 #endif
00313 
00314       /* multiply and accumulate */
00315       acc1 = __SMLADX(x3, y1, acc1);
00316 
00317       /* Read next two samples from scratch1 buffer */
00318       x1 = *__SIMD32(pScr1)++;
00319 
00320       /* pack input data */
00321 #ifndef ARM_MATH_BIG_ENDIAN
00322       x3 = __PKHBT(x1, x2, 0);
00323 #else
00324       x3 = __PKHBT(x2, x1, 0);
00325 #endif
00326 
00327       acc3 = __SMLADX(x3, y1, acc3);
00328 
00329       /* Read the following two samples from smaller buffer */
00330       y1 = _SIMD32_OFFSET(pScr2 + 2U);
00331 
00332       acc0 = __SMLAD(x2, y1, acc0);
00333 
00334       acc2 = __SMLAD(x1, y1, acc2);
00335 
00336       acc1 = __SMLADX(x3, y1, acc1);
00337 
00338       x2 = *__SIMD32(pScr1)++;
00339 
00340 #ifndef ARM_MATH_BIG_ENDIAN
00341       x3 = __PKHBT(x2, x1, 0);
00342 #else
00343       x3 = __PKHBT(x1, x2, 0);
00344 #endif
00345 
00346       acc3 = __SMLADX(x3, y1, acc3);
00347 
00348       pScr2 += 4U;
00349 
00350 
00351       /* Decrement the loop counter */
00352       tapCnt--;
00353     }
00354 
00355 
00356     /* x1/x2 were loaded ahead of use, so pScr1 is four samples past the next unprocessed tap position */
00357     /* Step the scratch pointer back for the remaining samples of the smaller length sequence */
00358     pScr1 -= 4U;
00359 
00360 
00361     /* handle the remaining (srcBLen & 3) samples of the smaller length sequence one at a time */
00362     tapCnt = (srcBLen) & 3U;
00363 
00364     while (tapCnt > 0U)
00365     {
00366 
00367       /* accumulate the results: four consecutive scratch1 samples against the same scratch2 sample */
00368       acc0 += (*pScr1++ * *pScr2);
00369       acc1 += (*pScr1++ * *pScr2);
00370       acc2 += (*pScr1++ * *pScr2);
00371       acc3 += (*pScr1++ * *pScr2++);
00372       /* four increments above minus three here: net advance of one sample per tap */
00373       pScr1 -= 3U;
00374 
00375       /* Decrement the loop counter */
00376       tapCnt--;
00377     }
00378 
00379     blkCnt--;
00380 
00381     /* Shift each accumulator right by 7, saturate to 8 bits and store it, stepping pOut by inc. */
00382     *pOut = (q7_t) (__SSAT(acc0 >> 7U, 8));
00383     pOut += inc;
00384     *pOut = (q7_t) (__SSAT(acc1 >> 7U, 8));
00385     pOut += inc;
00386     *pOut = (q7_t) (__SSAT(acc2 >> 7U, 8));
00387     pOut += inc;
00388     *pOut = (q7_t) (__SSAT(acc3 >> 7U, 8));
00389     pOut += inc;
00390 
00391     /* Rewind to the start of the shorter sequence in scratch2 */
00392     pScr2 = py;
00393     /* Advance the window start by the four outputs just produced (modifies the pScratch1 parameter itself) */
00394     pScratch1 += 4U;
00395 
00396   }
00397 
00398   /* Remaining 0 to 3 output samples that did not fill a group of four */
00399   blkCnt = (srcALen + srcBLen - 1U) & 0x3;
00400 
00401   /* Calculate the remaining output samples one at a time */
00402   while (blkCnt > 0)
00403   {
00404     /* Initialize temporary scratch pointer to the current window start in scratch1 */
00405     pScr1 = pScratch1;
00406 
00407     /* Clear Accumulator */
00408     acc0 = 0;
00409 
00410     tapCnt = (srcBLen) >> 1U;
00411     /* Two multiply-accumulates per iteration */
00412     while (tapCnt > 0U)
00413     {
00414       acc0 += (*pScr1++ * *pScr2++);
00415       acc0 += (*pScr1++ * *pScr2++);
00416 
00417       /* Decrement the loop counter */
00418       tapCnt--;
00419     }
00420 
00421     tapCnt = (srcBLen) & 1U;
00422 
00423     /* handle the last sample of the smaller length sequence when srcBLen is odd */
00424     while (tapCnt > 0U)
00425     {
00426 
00427       /* accumulate the results */
00428       acc0 += (*pScr1++ * *pScr2++);
00429 
00430       /* Decrement the loop counter */
00431       tapCnt--;
00432     }
00433 
00434     blkCnt--;
00435 
00436     /* Shift the accumulator right by 7, saturate to 8 bits and store it in the destination buffer. */
00437     *pOut = (q7_t) (__SSAT(acc0 >> 7U, 8));
00438 
00439     pOut += inc;
00440 
00441     /* Rewind to the start of the shorter sequence in scratch2 */
00442     pScr2 = py;
00443     /* Advance the window start by one output sample */
00444     pScratch1 += 1U;
00445 
00446   }
00447 
00448 }
00449 
00450 /**
00451  * @} end of Corr group
00452  */
00453