Added CMSIS5 DSP and NN folder. Needs some work

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_partial_q31.c Source File

arm_conv_partial_q31.c

00001 /* ----------------------------------------------------------------------
00002  * Project:      CMSIS DSP Library
00003  * Title:        arm_conv_partial_q31.c
00004  * Description:  Partial convolution of Q31 sequences
00005  *
00006  * $Date:        27. January 2017
00007  * $Revision:    V.1.5.1
00008  *
00009  * Target Processor: Cortex-M cores
00010  * -------------------------------------------------------------------- */
00011 /*
00012  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
00013  *
00014  * SPDX-License-Identifier: Apache-2.0
00015  *
00016  * Licensed under the Apache License, Version 2.0 (the License); you may
00017  * not use this file except in compliance with the License.
00018  * You may obtain a copy of the License at
00019  *
00020  * www.apache.org/licenses/LICENSE-2.0
00021  *
00022  * Unless required by applicable law or agreed to in writing, software
00023  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
00024  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00025  * See the License for the specific language governing permissions and
00026  * limitations under the License.
00027  */
00028 
00029 #include "arm_math.h"
00030 
00031 /**
00032  * @ingroup groupFilters
00033  */
00034 
00035 /**
00036  * @addtogroup PartialConv
00037  * @{
00038  */
00039 
00040 /**
00041  * @brief Partial convolution of Q31 sequences.
00042  * @param[in]       *pSrcA points to the first input sequence.
00043  * @param[in]       srcALen length of the first input sequence.
00044  * @param[in]       *pSrcB points to the second input sequence.
00045  * @param[in]       srcBLen length of the second input sequence.
00046  * @param[out]      *pDst points to the location where the output result is written.
00047  * @param[in]       firstIndex is the first output sample to start with.
00048  * @param[in]       numPoints is the number of output points to be computed.
00049  * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
00050  *
00051  * See <code>arm_conv_partial_fast_q31()</code> for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4.
00052  */
00053 
00054 arm_status arm_conv_partial_q31(
00055   q31_t * pSrcA,
00056   uint32_t srcALen,
00057   q31_t * pSrcB,
00058   uint32_t srcBLen,
00059   q31_t * pDst,
00060   uint32_t firstIndex,
00061   uint32_t numPoints)
00062 {
00063 
00064 
00065 #if defined (ARM_MATH_DSP)
00066 
00067   /* Run the below code for Cortex-M4 and Cortex-M3 */
00068 
00069   q31_t *pIn1;                                   /* inputA pointer               */
00070   q31_t *pIn2;                                   /* inputB pointer               */
00071   q31_t *pOut = pDst;                            /* output pointer               */
00072   q31_t *px;                                     /* Intermediate inputA pointer  */
00073   q31_t *py;                                     /* Intermediate inputB pointer  */
00074   q31_t *pSrc1, *pSrc2;                          /* Intermediate pointers        */
00075   q63_t sum, acc0, acc1, acc2;                   /* Accumulator                  */
00076   q31_t x0, x1, x2, c0;
00077   uint32_t j, k, count, check, blkCnt;
00078   int32_t blockSize1, blockSize2, blockSize3;    /* loop counter                 */
00079   arm_status status;                             /* status of Partial convolution */
00080 
00081 
00082   /* Check for range of output samples to be calculated */
00083   if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
00084   {
00085     /* Set status as ARM_MATH_ARGUMENT_ERROR */
00086     status = ARM_MATH_ARGUMENT_ERROR;
00087   }
00088   else
00089   {
00090 
00091     /* The algorithm implementation is based on the lengths of the inputs. */
00092     /* srcB is always made to slide across srcA. */
00093     /* So srcBLen is always considered as shorter or equal to srcALen */
00094     if (srcALen >= srcBLen)
00095     {
00096       /* Initialization of inputA pointer */
00097       pIn1 = pSrcA;
00098 
00099       /* Initialization of inputB pointer */
00100       pIn2 = pSrcB;
00101     }
00102     else
00103     {
00104       /* Initialization of inputA pointer */
00105       pIn1 = pSrcB;
00106 
00107       /* Initialization of inputB pointer */
00108       pIn2 = pSrcA;
00109 
00110       /* srcBLen is always considered as shorter or equal to srcALen */
00111       j = srcBLen;
00112       srcBLen = srcALen;
00113       srcALen = j;
00114     }
00115 
00116     /* Conditions to check which loopCounter holds
00117      * the first and last indices of the output samples to be calculated. */
00118     check = firstIndex + numPoints;
00119     blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
00120     blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
00121     blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
00122     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 :
00123                                      (int32_t) numPoints) : 0;
00124     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
00125                                     (int32_t) firstIndex);
00126     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
00127 
00128     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
00129     /* The function is internally
00130      * divided into three stages according to the number of multiplications that has to be
00131      * taken place between inputA samples and inputB samples. In the first stage of the
00132      * algorithm, the multiplications increase by one for every iteration.
00133      * In the second stage of the algorithm, srcBLen number of multiplications are done.
00134      * In the third stage of the algorithm, the multiplications decrease by one
00135      * for every iteration. */
00136 
00137     /* Set the output pointer to point to the firstIndex
00138      * of the output sample to be calculated. */
00139     pOut = pDst + firstIndex;
00140 
00141     /* --------------------------
00142      * Initializations of stage1
00143      * -------------------------*/
00144 
00145     /* sum = x[0] * y[0]
00146      * sum = x[0] * y[1] + x[1] * y[0]
00147      * ....
00148      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
00149      */
00150 
00151     /* In this stage the MAC operations are increased by 1 for every iteration.
00152        The count variable holds the number of MAC operations performed.
00153        Since the partial convolution starts from firstIndex
00154        Number of Macs to be performed is firstIndex + 1 */
00155     count = 1U + firstIndex;
00156 
00157     /* Working pointer of inputA */
00158     px = pIn1;
00159 
00160     /* Working pointer of inputB */
00161     pSrc2 = pIn2 + firstIndex;
00162     py = pSrc2;
00163 
00164     /* ------------------------
00165      * Stage1 process
00166      * ----------------------*/
00167 
00168     /* The first loop starts here */
00169     while (blockSize1 > 0)
00170     {
00171       /* Accumulator is made zero for every iteration */
00172       sum = 0;
00173 
00174       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00175       k = count >> 2U;
00176 
00177       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00178        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00179       while (k > 0U)
00180       {
00181         /* x[0] * y[srcBLen - 1] */
00182         sum += (q63_t) * px++ * (*py--);
00183         /* x[1] * y[srcBLen - 2] */
00184         sum += (q63_t) * px++ * (*py--);
00185         /* x[2] * y[srcBLen - 3] */
00186         sum += (q63_t) * px++ * (*py--);
00187         /* x[3] * y[srcBLen - 4] */
00188         sum += (q63_t) * px++ * (*py--);
00189 
00190         /* Decrement the loop counter */
00191         k--;
00192       }
00193 
00194       /* If the count is not a multiple of 4, compute any remaining MACs here.
00195        ** No loop unrolling is used. */
00196       k = count % 0x4U;
00197 
00198       while (k > 0U)
00199       {
00200         /* Perform the multiply-accumulate */
00201         sum += (q63_t) * px++ * (*py--);
00202 
00203         /* Decrement the loop counter */
00204         k--;
00205       }
00206 
00207       /* Store the result in the accumulator in the destination buffer. */
00208       *pOut++ = (q31_t) (sum >> 31);
00209 
00210       /* Update the inputA and inputB pointers for next MAC calculation */
00211       py = ++pSrc2;
00212       px = pIn1;
00213 
00214       /* Increment the MAC count */
00215       count++;
00216 
00217       /* Decrement the loop counter */
00218       blockSize1--;
00219     }
00220 
00221     /* --------------------------
00222      * Initializations of stage2
00223      * ------------------------*/
00224 
00225     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
00226      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
00227      * ....
00228      * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
00229      */
00230 
00231     /* Working pointer of inputA */
00232     if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
00233     {
00234       px = pIn1 + firstIndex - srcBLen + 1;
00235     }
00236     else
00237     {
00238       px = pIn1;
00239     }
00240 
00241     /* Working pointer of inputB */
00242     pSrc2 = pIn2 + (srcBLen - 1U);
00243     py = pSrc2;
00244 
00245     /* count is index by which the pointer pIn1 to be incremented */
00246     count = 0U;
00247 
00248     /* -------------------
00249      * Stage2 process
00250      * ------------------*/
00251 
00252     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
00253      * So, to loop unroll over blockSize2,
00254      * srcBLen should be greater than or equal to 4 */
00255     if (srcBLen >= 4U)
00256     {
00257       /* Loop unroll over blkCnt */
00258 
00259       blkCnt = blockSize2 / 3;
00260       while (blkCnt > 0U)
00261       {
00262         /* Set all accumulators to zero */
00263         acc0 = 0;
00264         acc1 = 0;
00265         acc2 = 0;
00266 
00267         /* read x[0], x[1] samples */
00268         x0 = *(px++);
00269         x1 = *(px++);
00270 
00271         /* Apply loop unrolling and compute 3 MACs simultaneously. */
00272         k = srcBLen / 3;
00273 
00274         /* First part of the processing with loop unrolling.  Compute 3 MACs at a time.
00275          ** a second loop below computes MACs for the remaining 1 to 2 samples. */
00276         do
00277         {
00278           /* Read y[srcBLen - 1] sample */
00279           c0 = *(py);
00280 
00281           /* Read x[2] sample */
00282           x2 = *(px);
00283 
00284           /* Perform the multiply-accumulates */
00285           /* acc0 +=  x[0] * y[srcBLen - 1] */
00286           acc0 += (q63_t) x0 *c0;
00287           /* acc1 +=  x[1] * y[srcBLen - 1] */
00288           acc1 += (q63_t) x1 *c0;
00289           /* acc2 +=  x[2] * y[srcBLen - 1] */
00290           acc2 += (q63_t) x2 *c0;
00291 
00292           /* Read y[srcBLen - 2] sample */
00293           c0 = *(py - 1U);
00294 
00295           /* Read x[3] sample */
00296           x0 = *(px + 1U);
00297 
00298           /* Perform the multiply-accumulate */
00299           /* acc0 +=  x[1] * y[srcBLen - 2] */
00300           acc0 += (q63_t) x1 *c0;
00301           /* acc1 +=  x[2] * y[srcBLen - 2] */
00302           acc1 += (q63_t) x2 *c0;
00303           /* acc2 +=  x[3] * y[srcBLen - 2] */
00304           acc2 += (q63_t) x0 *c0;
00305 
00306           /* Read y[srcBLen - 3] sample */
00307           c0 = *(py - 2U);
00308 
00309           /* Read x[4] sample */
00310           x1 = *(px + 2U);
00311 
00312           /* Perform the multiply-accumulates */
00313           /* acc0 +=  x[2] * y[srcBLen - 3] */
00314           acc0 += (q63_t) x2 *c0;
00315           /* acc1 +=  x[3] * y[srcBLen - 2] */
00316           acc1 += (q63_t) x0 *c0;
00317           /* acc2 +=  x[4] * y[srcBLen - 2] */
00318           acc2 += (q63_t) x1 *c0;
00319 
00320 
00321           px += 3U;
00322 
00323           py -= 3U;
00324 
00325         } while (--k);
00326 
00327         /* If the srcBLen is not a multiple of 3, compute any remaining MACs here.
00328          ** No loop unrolling is used. */
00329         k = srcBLen - (3 * (srcBLen / 3));
00330 
00331         while (k > 0U)
00332         {
00333           /* Read y[srcBLen - 5] sample */
00334           c0 = *(py--);
00335 
00336           /* Read x[7] sample */
00337           x2 = *(px++);
00338 
00339           /* Perform the multiply-accumulates */
00340           /* acc0 +=  x[4] * y[srcBLen - 5] */
00341           acc0 += (q63_t) x0 *c0;
00342           /* acc1 +=  x[5] * y[srcBLen - 5] */
00343           acc1 += (q63_t) x1 *c0;
00344           /* acc2 +=  x[6] * y[srcBLen - 5] */
00345           acc2 += (q63_t) x2 *c0;
00346 
00347           /* Reuse the present samples for the next MAC */
00348           x0 = x1;
00349           x1 = x2;
00350 
00351           /* Decrement the loop counter */
00352           k--;
00353         }
00354 
00355         /* Store the result in the accumulator in the destination buffer. */
00356         *pOut++ = (q31_t) (acc0 >> 31);
00357         *pOut++ = (q31_t) (acc1 >> 31);
00358         *pOut++ = (q31_t) (acc2 >> 31);
00359 
00360         /* Increment the pointer pIn1 index, count by 3 */
00361         count += 3U;
00362 
00363         /* Update the inputA and inputB pointers for next MAC calculation */
00364         if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
00365         {
00366           px = pIn1 + firstIndex - srcBLen + 1 + count;
00367         }
00368         else
00369         {
00370           px = pIn1 + count;
00371         }
00372         py = pSrc2;
00373 
00374         /* Decrement the loop counter */
00375         blkCnt--;
00376       }
00377 
00378       /* If the blockSize2 is not a multiple of 3, compute any remaining output samples here.
00379        ** No loop unrolling is used. */
00380       blkCnt = blockSize2 - 3 * (blockSize2 / 3);
00381 
00382       while (blkCnt > 0U)
00383       {
00384         /* Accumulator is made zero for every iteration */
00385         sum = 0;
00386 
00387         /* Apply loop unrolling and compute 4 MACs simultaneously. */
00388         k = srcBLen >> 2U;
00389 
00390         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00391          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00392         while (k > 0U)
00393         {
00394           /* Perform the multiply-accumulates */
00395           sum += (q63_t) * px++ * (*py--);
00396           sum += (q63_t) * px++ * (*py--);
00397           sum += (q63_t) * px++ * (*py--);
00398           sum += (q63_t) * px++ * (*py--);
00399 
00400           /* Decrement the loop counter */
00401           k--;
00402         }
00403 
00404         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
00405          ** No loop unrolling is used. */
00406         k = srcBLen % 0x4U;
00407 
00408         while (k > 0U)
00409         {
00410           /* Perform the multiply-accumulate */
00411           sum += (q63_t) * px++ * (*py--);
00412 
00413           /* Decrement the loop counter */
00414           k--;
00415         }
00416 
00417         /* Store the result in the accumulator in the destination buffer. */
00418         *pOut++ = (q31_t) (sum >> 31);
00419 
00420         /* Increment the MAC count */
00421         count++;
00422 
00423         /* Update the inputA and inputB pointers for next MAC calculation */
00424         if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
00425         {
00426           px = pIn1 + firstIndex - srcBLen + 1 + count;
00427         }
00428         else
00429         {
00430           px = pIn1 + count;
00431         }
00432         py = pSrc2;
00433 
00434         /* Decrement the loop counter */
00435         blkCnt--;
00436       }
00437     }
00438     else
00439     {
00440       /* If the srcBLen is not a multiple of 4,
00441        * the blockSize2 loop cannot be unrolled by 4 */
00442       blkCnt = (uint32_t) blockSize2;
00443 
00444       while (blkCnt > 0U)
00445       {
00446         /* Accumulator is made zero for every iteration */
00447         sum = 0;
00448 
00449         /* srcBLen number of MACS should be performed */
00450         k = srcBLen;
00451 
00452         while (k > 0U)
00453         {
00454           /* Perform the multiply-accumulate */
00455           sum += (q63_t) * px++ * (*py--);
00456 
00457           /* Decrement the loop counter */
00458           k--;
00459         }
00460 
00461         /* Store the result in the accumulator in the destination buffer. */
00462         *pOut++ = (q31_t) (sum >> 31);
00463 
00464         /* Increment the MAC count */
00465         count++;
00466 
00467         /* Update the inputA and inputB pointers for next MAC calculation */
00468         if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
00469         {
00470           px = pIn1 + firstIndex - srcBLen + 1 + count;
00471         }
00472         else
00473         {
00474           px = pIn1 + count;
00475         }
00476         py = pSrc2;
00477 
00478         /* Decrement the loop counter */
00479         blkCnt--;
00480       }
00481     }
00482 
00483 
00484     /* --------------------------
00485      * Initializations of stage3
00486      * -------------------------*/
00487 
00488     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
00489      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
00490      * ....
00491      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
00492      * sum +=  x[srcALen-1] * y[srcBLen-1]
00493      */
00494 
00495     /* In this stage the MAC operations are decreased by 1 for every iteration.
00496        The blockSize3 variable holds the number of MAC operations performed */
00497     count = srcBLen - 1U;
00498 
00499     /* Working pointer of inputA */
00500     pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
00501     px = pSrc1;
00502 
00503     /* Working pointer of inputB */
00504     pSrc2 = pIn2 + (srcBLen - 1U);
00505     py = pSrc2;
00506 
00507     /* -------------------
00508      * Stage3 process
00509      * ------------------*/
00510 
00511     while (blockSize3 > 0)
00512     {
00513       /* Accumulator is made zero for every iteration */
00514       sum = 0;
00515 
00516       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00517       k = count >> 2U;
00518 
00519       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00520        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00521       while (k > 0U)
00522       {
00523         sum += (q63_t) * px++ * (*py--);
00524         sum += (q63_t) * px++ * (*py--);
00525         sum += (q63_t) * px++ * (*py--);
00526         sum += (q63_t) * px++ * (*py--);
00527 
00528         /* Decrement the loop counter */
00529         k--;
00530       }
00531 
00532       /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
00533        ** No loop unrolling is used. */
00534       k = count % 0x4U;
00535 
00536       while (k > 0U)
00537       {
00538         /* Perform the multiply-accumulate */
00539         sum += (q63_t) * px++ * (*py--);
00540 
00541         /* Decrement the loop counter */
00542         k--;
00543       }
00544 
00545       /* Store the result in the accumulator in the destination buffer. */
00546       *pOut++ = (q31_t) (sum >> 31);
00547 
00548       /* Update the inputA and inputB pointers for next MAC calculation */
00549       px = ++pSrc1;
00550       py = pSrc2;
00551 
00552       /* Decrement the MAC count */
00553       count--;
00554 
00555       /* Decrement the loop counter */
00556       blockSize3--;
00557 
00558     }
00559 
00560     /* set status as ARM_MATH_SUCCESS */
00561     status = ARM_MATH_SUCCESS;
00562   }
00563 
00564   /* Return to application */
00565   return (status);
00566 
00567 #else
00568 
00569   /* Run the below code for Cortex-M0 */
00570 
00571   q31_t *pIn1 = pSrcA;                           /* inputA pointer */
00572   q31_t *pIn2 = pSrcB;                           /* inputB pointer */
00573   q63_t sum;                                     /* Accumulator */
00574   uint32_t i, j;                                 /* loop counters */
00575   arm_status status;                             /* status of Partial convolution */
00576 
00577   /* Check for range of output samples to be calculated */
00578   if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
00579   {
00580     /* Set status as ARM_ARGUMENT_ERROR */
00581     status = ARM_MATH_ARGUMENT_ERROR;
00582   }
00583   else
00584   {
00585     /* Loop to calculate convolution for output length number of values */
00586     for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
00587     {
00588       /* Initialize sum with zero to carry on MAC operations */
00589       sum = 0;
00590 
00591       /* Loop to perform MAC operations according to convolution equation */
00592       for (j = 0; j <= i; j++)
00593       {
00594         /* Check the array limitations */
00595         if (((i - j) < srcBLen) && (j < srcALen))
00596         {
00597           /* z[i] += x[i-j] * y[j] */
00598           sum += ((q63_t) pIn1[j] * (pIn2[i - j]));
00599         }
00600       }
00601 
00602       /* Store the output in the destination buffer */
00603       pDst[i] = (q31_t) (sum >> 31U);
00604     }
00605     /* set status as ARM_SUCCESS as there are no argument errors */
00606     status = ARM_MATH_SUCCESS;
00607   }
00608   return (status);
00609 
00610 #endif /*    #if defined (ARM_MATH_DSP)      */
00611 
00612 }
00613 
00614 /**
00615  * @} end of PartialConv group
00616  */
00617