Robert Lopez / CMSIS5
Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_q31.c Source File

arm_conv_q31.c

00001 /* ----------------------------------------------------------------------
00002  * Project:      CMSIS DSP Library
00003  * Title:        arm_conv_q31.c
00004  * Description:  Convolution of Q31 sequences
00005  *
00006  * $Date:        27. January 2017
00007  * $Revision:    V.1.5.1
00008  *
00009  * Target Processor: Cortex-M cores
00010  * -------------------------------------------------------------------- */
00011 /*
00012  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
00013  *
00014  * SPDX-License-Identifier: Apache-2.0
00015  *
00016  * Licensed under the Apache License, Version 2.0 (the License); you may
00017  * not use this file except in compliance with the License.
00018  * You may obtain a copy of the License at
00019  *
00020  * www.apache.org/licenses/LICENSE-2.0
00021  *
00022  * Unless required by applicable law or agreed to in writing, software
00023  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
00024  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00025  * See the License for the specific language governing permissions and
00026  * limitations under the License.
00027  */
00028 
00029 #include "arm_math.h"
00030 
00031 /**
00032  * @ingroup groupFilters
00033  */
00034 
00035 /**
00036  * @addtogroup Conv
00037  * @{
00038  */
00039 
00040 /**
00041  * @brief Convolution of Q31 sequences.
00042  * @param[in] *pSrcA points to the first input sequence.
00043  * @param[in] srcALen length of the first input sequence.
00044  * @param[in] *pSrcB points to the second input sequence.
00045  * @param[in] srcBLen length of the second input sequence.
00046  * @param[out] *pDst points to the location where the output result is written.  Length srcALen+srcBLen-1.
00047  * @return none.
00048  *
00049  * @details
00050  * <b>Scaling and Overflow Behavior:</b>
00051  *
00052  * \par
00053  * The function is implemented using an internal 64-bit accumulator.
00054  * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
00055  * There is no saturation on intermediate additions.
00056  * Thus, if the accumulator overflows it wraps around and distorts the result.
00057  * The input signals should be scaled down to avoid intermediate overflows.
00058  * Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows,
00059  * as maximum of min(srcALen, srcBLen) number of additions are carried internally.
00060  * The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
00061  *
00062  * \par
00063  * See <code>arm_conv_fast_q31()</code> for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4.
00064  */
00065 
00066 void arm_conv_q31(
00067   q31_t * pSrcA,
00068   uint32_t srcALen,
00069   q31_t * pSrcB,
00070   uint32_t srcBLen,
00071   q31_t * pDst)
00072 {
00073 
00074 
00075 #if defined (ARM_MATH_DSP)
00076 
00077   /* Run the below code for Cortex-M4 and Cortex-M3 */
00078 
00079   q31_t *pIn1;                                   /* inputA pointer */
00080   q31_t *pIn2;                                   /* inputB pointer */
00081   q31_t *pOut = pDst;                            /* output pointer */
00082   q31_t *px;                                     /* Intermediate inputA pointer  */
00083   q31_t *py;                                     /* Intermediate inputB pointer  */
00084   q31_t *pSrc1, *pSrc2;                          /* Intermediate pointers */
00085   q63_t sum;                                     /* Accumulator */
00086   q63_t acc0, acc1, acc2;                        /* Accumulator */
00087   q31_t x0, x1, x2, c0;                          /* Temporary variables to hold state and coefficient values */
00088   uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3;     /* loop counter */
00089 
00090   /* The algorithm implementation is based on the lengths of the inputs. */
00091   /* srcB is always made to slide across srcA. */
00092   /* So srcBLen is always considered as shorter or equal to srcALen */
00093   if (srcALen >= srcBLen)
00094   {
00095     /* Initialization of inputA pointer */
00096     pIn1 = pSrcA;
00097 
00098     /* Initialization of inputB pointer */
00099     pIn2 = pSrcB;
00100   }
00101   else
00102   {
00103     /* Initialization of inputA pointer */
00104     pIn1 = (q31_t *) pSrcB;
00105 
00106     /* Initialization of inputB pointer */
00107     pIn2 = (q31_t *) pSrcA;
00108 
00109     /* srcBLen is always considered as shorter or equal to srcALen */
00110     j = srcBLen;
00111     srcBLen = srcALen;
00112     srcALen = j;
00113   }
00114 
00115   /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
00116   /* The function is internally
00117    * divided into three stages according to the number of multiplications that has to be
00118    * taken place between inputA samples and inputB samples. In the first stage of the
00119    * algorithm, the multiplications increase by one for every iteration.
00120    * In the second stage of the algorithm, srcBLen number of multiplications are done.
00121    * In the third stage of the algorithm, the multiplications decrease by one
00122    * for every iteration. */
00123 
00124   /* The algorithm is implemented in three stages.
00125      The loop counters of each stage is initiated here. */
00126   blockSize1 = srcBLen - 1U;
00127   blockSize2 = srcALen - (srcBLen - 1U);
00128   blockSize3 = blockSize1;
00129 
00130   /* --------------------------
00131    * Initializations of stage1
00132    * -------------------------*/
00133 
00134   /* sum = x[0] * y[0]
00135    * sum = x[0] * y[1] + x[1] * y[0]
00136    * ....
00137    * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
00138    */
00139 
00140   /* In this stage the MAC operations are increased by 1 for every iteration.
00141      The count variable holds the number of MAC operations performed */
00142   count = 1U;
00143 
00144   /* Working pointer of inputA */
00145   px = pIn1;
00146 
00147   /* Working pointer of inputB */
00148   py = pIn2;
00149 
00150 
00151   /* ------------------------
00152    * Stage1 process
00153    * ----------------------*/
00154 
00155   /* The first stage starts here */
00156   while (blockSize1 > 0U)
00157   {
00158     /* Accumulator is made zero for every iteration */
00159     sum = 0;
00160 
00161     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00162     k = count >> 2U;
00163 
00164     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00165      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00166     while (k > 0U)
00167     {
00168       /* x[0] * y[srcBLen - 1] */
00169       sum += (q63_t) * px++ * (*py--);
00170       /* x[1] * y[srcBLen - 2] */
00171       sum += (q63_t) * px++ * (*py--);
00172       /* x[2] * y[srcBLen - 3] */
00173       sum += (q63_t) * px++ * (*py--);
00174       /* x[3] * y[srcBLen - 4] */
00175       sum += (q63_t) * px++ * (*py--);
00176 
00177       /* Decrement the loop counter */
00178       k--;
00179     }
00180 
00181     /* If the count is not a multiple of 4, compute any remaining MACs here.
00182      ** No loop unrolling is used. */
00183     k = count % 0x4U;
00184 
00185     while (k > 0U)
00186     {
00187       /* Perform the multiply-accumulate */
00188       sum += (q63_t) * px++ * (*py--);
00189 
00190       /* Decrement the loop counter */
00191       k--;
00192     }
00193 
00194     /* Store the result in the accumulator in the destination buffer. */
00195     *pOut++ = (q31_t) (sum >> 31);
00196 
00197     /* Update the inputA and inputB pointers for next MAC calculation */
00198     py = pIn2 + count;
00199     px = pIn1;
00200 
00201     /* Increment the MAC count */
00202     count++;
00203 
00204     /* Decrement the loop counter */
00205     blockSize1--;
00206   }
00207 
00208   /* --------------------------
00209    * Initializations of stage2
00210    * ------------------------*/
00211 
00212   /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
00213    * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
00214    * ....
00215    * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
00216    */
00217 
00218   /* Working pointer of inputA */
00219   px = pIn1;
00220 
00221   /* Working pointer of inputB */
00222   pSrc2 = pIn2 + (srcBLen - 1U);
00223   py = pSrc2;
00224 
00225   /* count is index by which the pointer pIn1 to be incremented */
00226   count = 0U;
00227 
00228   /* -------------------
00229    * Stage2 process
00230    * ------------------*/
00231 
00232   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
00233    * So, to loop unroll over blockSize2,
00234    * srcBLen should be greater than or equal to 4 */
00235   if (srcBLen >= 4U)
00236   {
00237     /* Loop unroll by 3 */
00238     blkCnt = blockSize2 / 3;
00239 
00240     while (blkCnt > 0U)
00241     {
00242       /* Set all accumulators to zero */
00243       acc0 = 0;
00244       acc1 = 0;
00245       acc2 = 0;
00246 
00247       /* read x[0], x[1], x[2] samples */
00248       x0 = *(px++);
00249       x1 = *(px++);
00250 
00251       /* Apply loop unrolling and compute 3 MACs simultaneously. */
00252       k = srcBLen / 3;
00253 
00254       /* First part of the processing with loop unrolling.  Compute 3 MACs at a time.
00255        ** a second loop below computes MACs for the remaining 1 to 2 samples. */
00256       do
00257       {
00258         /* Read y[srcBLen - 1] sample */
00259         c0 = *(py);
00260 
00261         /* Read x[3] sample */
00262         x2 = *(px);
00263 
00264         /* Perform the multiply-accumulates */
00265         /* acc0 +=  x[0] * y[srcBLen - 1] */
00266         acc0 += ((q63_t) x0 * c0);
00267         /* acc1 +=  x[1] * y[srcBLen - 1] */
00268         acc1 += ((q63_t) x1 * c0);
00269         /* acc2 +=  x[2] * y[srcBLen - 1] */
00270         acc2 += ((q63_t) x2 * c0);
00271 
00272         /* Read y[srcBLen - 2] sample */
00273         c0 = *(py - 1U);
00274 
00275         /* Read x[4] sample */
00276         x0 = *(px + 1U);
00277 
00278         /* Perform the multiply-accumulate */
00279         /* acc0 +=  x[1] * y[srcBLen - 2] */
00280         acc0 += ((q63_t) x1 * c0);
00281         /* acc1 +=  x[2] * y[srcBLen - 2] */
00282         acc1 += ((q63_t) x2 * c0);
00283         /* acc2 +=  x[3] * y[srcBLen - 2] */
00284         acc2 += ((q63_t) x0 * c0);
00285 
00286         /* Read y[srcBLen - 3] sample */
00287         c0 = *(py - 2U);
00288 
00289         /* Read x[5] sample */
00290         x1 = *(px + 2U);
00291 
00292         /* Perform the multiply-accumulates */
00293         /* acc0 +=  x[2] * y[srcBLen - 3] */
00294         acc0 += ((q63_t) x2 * c0);
00295         /* acc1 +=  x[3] * y[srcBLen - 2] */
00296         acc1 += ((q63_t) x0 * c0);
00297         /* acc2 +=  x[4] * y[srcBLen - 2] */
00298         acc2 += ((q63_t) x1 * c0);
00299 
00300         /* update scratch pointers */
00301         px += 3U;
00302         py -= 3U;
00303 
00304       } while (--k);
00305 
00306       /* If the srcBLen is not a multiple of 3, compute any remaining MACs here.
00307        ** No loop unrolling is used. */
00308       k = srcBLen - (3 * (srcBLen / 3));
00309 
00310       while (k > 0U)
00311       {
00312         /* Read y[srcBLen - 5] sample */
00313         c0 = *(py--);
00314 
00315         /* Read x[7] sample */
00316         x2 = *(px++);
00317 
00318         /* Perform the multiply-accumulates */
00319         /* acc0 +=  x[4] * y[srcBLen - 5] */
00320         acc0 += ((q63_t) x0 * c0);
00321         /* acc1 +=  x[5] * y[srcBLen - 5] */
00322         acc1 += ((q63_t) x1 * c0);
00323         /* acc2 +=  x[6] * y[srcBLen - 5] */
00324         acc2 += ((q63_t) x2 * c0);
00325 
00326         /* Reuse the present samples for the next MAC */
00327         x0 = x1;
00328         x1 = x2;
00329 
00330         /* Decrement the loop counter */
00331         k--;
00332       }
00333 
00334       /* Store the results in the accumulators in the destination buffer. */
00335       *pOut++ = (q31_t) (acc0 >> 31);
00336       *pOut++ = (q31_t) (acc1 >> 31);
00337       *pOut++ = (q31_t) (acc2 >> 31);
00338 
00339       /* Increment the pointer pIn1 index, count by 3 */
00340       count += 3U;
00341 
00342       /* Update the inputA and inputB pointers for next MAC calculation */
00343       px = pIn1 + count;
00344       py = pSrc2;
00345 
00346       /* Decrement the loop counter */
00347       blkCnt--;
00348     }
00349 
00350     /* If the blockSize2 is not a multiple of 3, compute any remaining output samples here.
00351      ** No loop unrolling is used. */
00352     blkCnt = blockSize2 - 3 * (blockSize2 / 3);
00353 
00354     while (blkCnt > 0U)
00355     {
00356       /* Accumulator is made zero for every iteration */
00357       sum = 0;
00358 
00359       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00360       k = srcBLen >> 2U;
00361 
00362       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00363        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00364       while (k > 0U)
00365       {
00366         /* Perform the multiply-accumulates */
00367         sum += (q63_t) * px++ * (*py--);
00368         sum += (q63_t) * px++ * (*py--);
00369         sum += (q63_t) * px++ * (*py--);
00370         sum += (q63_t) * px++ * (*py--);
00371 
00372         /* Decrement the loop counter */
00373         k--;
00374       }
00375 
00376       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
00377        ** No loop unrolling is used. */
00378       k = srcBLen % 0x4U;
00379 
00380       while (k > 0U)
00381       {
00382         /* Perform the multiply-accumulate */
00383         sum += (q63_t) * px++ * (*py--);
00384 
00385         /* Decrement the loop counter */
00386         k--;
00387       }
00388 
00389       /* Store the result in the accumulator in the destination buffer. */
00390       *pOut++ = (q31_t) (sum >> 31);
00391 
00392       /* Increment the MAC count */
00393       count++;
00394 
00395       /* Update the inputA and inputB pointers for next MAC calculation */
00396       px = pIn1 + count;
00397       py = pSrc2;
00398 
00399       /* Decrement the loop counter */
00400       blkCnt--;
00401     }
00402   }
00403   else
00404   {
00405     /* If the srcBLen is not a multiple of 4,
00406      * the blockSize2 loop cannot be unrolled by 4 */
00407     blkCnt = blockSize2;
00408 
00409     while (blkCnt > 0U)
00410     {
00411       /* Accumulator is made zero for every iteration */
00412       sum = 0;
00413 
00414       /* srcBLen number of MACS should be performed */
00415       k = srcBLen;
00416 
00417       while (k > 0U)
00418       {
00419         /* Perform the multiply-accumulate */
00420         sum += (q63_t) * px++ * (*py--);
00421 
00422         /* Decrement the loop counter */
00423         k--;
00424       }
00425 
00426       /* Store the result in the accumulator in the destination buffer. */
00427       *pOut++ = (q31_t) (sum >> 31);
00428 
00429       /* Increment the MAC count */
00430       count++;
00431 
00432       /* Update the inputA and inputB pointers for next MAC calculation */
00433       px = pIn1 + count;
00434       py = pSrc2;
00435 
00436       /* Decrement the loop counter */
00437       blkCnt--;
00438     }
00439   }
00440 
00441 
00442   /* --------------------------
00443    * Initializations of stage3
00444    * -------------------------*/
00445 
00446   /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
00447    * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
00448    * ....
00449    * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
00450    * sum +=  x[srcALen-1] * y[srcBLen-1]
00451    */
00452 
00453   /* In this stage the MAC operations are decreased by 1 for every iteration.
00454      The blockSize3 variable holds the number of MAC operations performed */
00455 
00456   /* Working pointer of inputA */
00457   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
00458   px = pSrc1;
00459 
00460   /* Working pointer of inputB */
00461   pSrc2 = pIn2 + (srcBLen - 1U);
00462   py = pSrc2;
00463 
00464   /* -------------------
00465    * Stage3 process
00466    * ------------------*/
00467 
00468   while (blockSize3 > 0U)
00469   {
00470     /* Accumulator is made zero for every iteration */
00471     sum = 0;
00472 
00473     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00474     k = blockSize3 >> 2U;
00475 
00476     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00477      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00478     while (k > 0U)
00479     {
00480       /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
00481       sum += (q63_t) * px++ * (*py--);
00482       /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
00483       sum += (q63_t) * px++ * (*py--);
00484       /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
00485       sum += (q63_t) * px++ * (*py--);
00486       /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
00487       sum += (q63_t) * px++ * (*py--);
00488 
00489       /* Decrement the loop counter */
00490       k--;
00491     }
00492 
00493     /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
00494      ** No loop unrolling is used. */
00495     k = blockSize3 % 0x4U;
00496 
00497     while (k > 0U)
00498     {
00499       /* Perform the multiply-accumulate */
00500       sum += (q63_t) * px++ * (*py--);
00501 
00502       /* Decrement the loop counter */
00503       k--;
00504     }
00505 
00506     /* Store the result in the accumulator in the destination buffer. */
00507     *pOut++ = (q31_t) (sum >> 31);
00508 
00509     /* Update the inputA and inputB pointers for next MAC calculation */
00510     px = ++pSrc1;
00511     py = pSrc2;
00512 
00513     /* Decrement the loop counter */
00514     blockSize3--;
00515   }
00516 
00517 #else
00518 
00519   /* Run the below code for Cortex-M0 */
00520 
00521   q31_t *pIn1 = pSrcA;                           /* input pointer */
00522   q31_t *pIn2 = pSrcB;                           /* coefficient pointer */
00523   q63_t sum;                                     /* Accumulator */
00524   uint32_t i, j;                                 /* loop counter */
00525 
00526   /* Loop to calculate output of convolution for output length number of times */
00527   for (i = 0; i < (srcALen + srcBLen - 1); i++)
00528   {
00529     /* Initialize sum with zero to carry on MAC operations */
00530     sum = 0;
00531 
00532     /* Loop to perform MAC operations according to convolution equation */
00533     for (j = 0; j <= i; j++)
00534     {
00535       /* Check the array limitations */
00536       if (((i - j) < srcBLen) && (j < srcALen))
00537       {
00538         /* z[i] += x[i-j] * y[j] */
00539         sum += ((q63_t) pIn1[j] * (pIn2[i - j]));
00540       }
00541     }
00542 
00543     /* Store the output in the destination buffer */
00544     pDst[i] = (q31_t) (sum >> 31U);
00545   }
00546 
00547 #endif /*     #if defined (ARM_MATH_DSP) */
00548 
00549 }
00550 
00551 /**
00552  * @} end of Conv group
00553  */
00554