CMSIS DSP library

Dependents:   performance_timer Surfboard_ gps2rtty Capstone ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_partial_q31.c Source File

arm_conv_partial_q31.c

00001 /* ----------------------------------------------------------------------    
00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
00003 *    
00004 * $Date:        19. March 2015
00005 * $Revision:    V.1.4.5
00006 *    
00007 * Project:      CMSIS DSP Library    
00008 * Title:        arm_conv_partial_q31.c    
00009 *    
00010 * Description:  Partial convolution of Q31 sequences.    
00011 *    
00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
00013 *  
00014 * Redistribution and use in source and binary forms, with or without 
00015 * modification, are permitted provided that the following conditions
00016 * are met:
00017 *   - Redistributions of source code must retain the above copyright
00018 *     notice, this list of conditions and the following disclaimer.
00019 *   - Redistributions in binary form must reproduce the above copyright
00020 *     notice, this list of conditions and the following disclaimer in
00021 *     the documentation and/or other materials provided with the 
00022 *     distribution.
00023 *   - Neither the name of ARM LIMITED nor the names of its contributors
00024 *     may be used to endorse or promote products derived from this
00025 *     software without specific prior written permission.
00026 *
00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00038 * POSSIBILITY OF SUCH DAMAGE.  
00039 * -------------------------------------------------------------------- */
00040 
00041 #include "arm_math.h"
00042 
00043 /**    
00044  * @ingroup groupFilters    
00045  */
00046 
00047 /**    
00048  * @addtogroup PartialConv    
00049  * @{    
00050  */
00051 
00052 /**    
00053  * @brief Partial convolution of Q31 sequences.    
00054  * @param[in]       *pSrcA points to the first input sequence.    
00055  * @param[in]       srcALen length of the first input sequence.    
00056  * @param[in]       *pSrcB points to the second input sequence.    
00057  * @param[in]       srcBLen length of the second input sequence.    
00058  * @param[out]      *pDst points to the location where the output result is written.    
00059  * @param[in]       firstIndex is the first output sample to start with.    
00060  * @param[in]       numPoints is the number of output points to be computed.    
00061  * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].    
00062  *    
00063  * See <code>arm_conv_partial_fast_q31()</code> for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4.    
00064  */
00065 
00066 arm_status arm_conv_partial_q31(
00067   q31_t * pSrcA,
00068   uint32_t srcALen,
00069   q31_t * pSrcB,
00070   uint32_t srcBLen,
00071   q31_t * pDst,
00072   uint32_t firstIndex,
00073   uint32_t numPoints)
00074 {
00075 
00076 
00077 #ifndef ARM_MATH_CM0_FAMILY
00078 
00079   /* Run the below code for Cortex-M4 and Cortex-M3 */
00080 
00081   q31_t *pIn1;                                   /* inputA pointer               */
00082   q31_t *pIn2;                                   /* inputB pointer               */
00083   q31_t *pOut = pDst;                            /* output pointer               */
00084   q31_t *px;                                     /* Intermediate inputA pointer  */
00085   q31_t *py;                                     /* Intermediate inputB pointer  */
00086   q31_t *pSrc1, *pSrc2;                          /* Intermediate pointers        */
00087   q63_t sum, acc0, acc1, acc2;                   /* Accumulator                  */
00088   q31_t x0, x1, x2, c0;
00089   uint32_t j, k, count, check, blkCnt;
00090   int32_t blockSize1, blockSize2, blockSize3;    /* loop counter                 */
00091   arm_status status;                             /* status of Partial convolution */
00092 
00093 
00094   /* Check for range of output samples to be calculated */
00095   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
00096   {
00097     /* Set status as ARM_MATH_ARGUMENT_ERROR */
00098     status = ARM_MATH_ARGUMENT_ERROR;
00099   }
00100   else
00101   {
00102 
00103     /* The algorithm implementation is based on the lengths of the inputs. */
00104     /* srcB is always made to slide across srcA. */
00105     /* So srcBLen is always considered as shorter or equal to srcALen */
00106     if(srcALen >= srcBLen)
00107     {
00108       /* Initialization of inputA pointer */
00109       pIn1 = pSrcA;
00110 
00111       /* Initialization of inputB pointer */
00112       pIn2 = pSrcB;
00113     }
00114     else
00115     {
00116       /* Initialization of inputA pointer */
00117       pIn1 = pSrcB;
00118 
00119       /* Initialization of inputB pointer */
00120       pIn2 = pSrcA;
00121 
00122       /* srcBLen is always considered as shorter or equal to srcALen */
00123       j = srcBLen;
00124       srcBLen = srcALen;
00125       srcALen = j;
00126     }
00127 
00128     /* Conditions to check which loopCounter holds    
00129      * the first and last indices of the output samples to be calculated. */
00130     check = firstIndex + numPoints;
00131     blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
00132     blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
00133     blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
00134     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
00135                                      (int32_t) numPoints) : 0;
00136     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
00137                                     (int32_t) firstIndex);
00138     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
00139 
00140     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
00141     /* The function is internally    
00142      * divided into three stages according to the number of multiplications that has to be    
00143      * taken place between inputA samples and inputB samples. In the first stage of the    
00144      * algorithm, the multiplications increase by one for every iteration.    
00145      * In the second stage of the algorithm, srcBLen number of multiplications are done.    
00146      * In the third stage of the algorithm, the multiplications decrease by one    
00147      * for every iteration. */
00148 
00149     /* Set the output pointer to point to the firstIndex    
00150      * of the output sample to be calculated. */
00151     pOut = pDst + firstIndex;
00152 
00153     /* --------------------------    
00154      * Initializations of stage1    
00155      * -------------------------*/
00156 
00157     /* sum = x[0] * y[0]    
00158      * sum = x[0] * y[1] + x[1] * y[0]    
00159      * ....    
00160      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]    
00161      */
00162 
00163     /* In this stage the MAC operations are increased by 1 for every iteration.    
00164        The count variable holds the number of MAC operations performed.    
00165        Since the partial convolution starts from firstIndex    
00166        Number of Macs to be performed is firstIndex + 1 */
00167     count = 1u + firstIndex;
00168 
00169     /* Working pointer of inputA */
00170     px = pIn1;
00171 
00172     /* Working pointer of inputB */
00173     pSrc2 = pIn2 + firstIndex;
00174     py = pSrc2;
00175 
00176     /* ------------------------    
00177      * Stage1 process    
00178      * ----------------------*/
00179 
00180     /* The first loop starts here */
00181     while(blockSize1 > 0)
00182     {
00183       /* Accumulator is made zero for every iteration */
00184       sum = 0;
00185 
00186       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00187       k = count >> 2u;
00188 
00189       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.    
00190        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00191       while(k > 0u)
00192       {
00193         /* x[0] * y[srcBLen - 1] */
00194         sum += (q63_t) * px++ * (*py--);
00195         /* x[1] * y[srcBLen - 2] */
00196         sum += (q63_t) * px++ * (*py--);
00197         /* x[2] * y[srcBLen - 3] */
00198         sum += (q63_t) * px++ * (*py--);
00199         /* x[3] * y[srcBLen - 4] */
00200         sum += (q63_t) * px++ * (*py--);
00201 
00202         /* Decrement the loop counter */
00203         k--;
00204       }
00205 
00206       /* If the count is not a multiple of 4, compute any remaining MACs here.    
00207        ** No loop unrolling is used. */
00208       k = count % 0x4u;
00209 
00210       while(k > 0u)
00211       {
00212         /* Perform the multiply-accumulate */
00213         sum += (q63_t) * px++ * (*py--);
00214 
00215         /* Decrement the loop counter */
00216         k--;
00217       }
00218 
00219       /* Store the result in the accumulator in the destination buffer. */
00220       *pOut++ = (q31_t) (sum >> 31);
00221 
00222       /* Update the inputA and inputB pointers for next MAC calculation */
00223       py = ++pSrc2;
00224       px = pIn1;
00225 
00226       /* Increment the MAC count */
00227       count++;
00228 
00229       /* Decrement the loop counter */
00230       blockSize1--;
00231     }
00232 
00233     /* --------------------------    
00234      * Initializations of stage2    
00235      * ------------------------*/
00236 
00237     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]    
00238      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]    
00239      * ....    
00240      * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]    
00241      */
00242 
00243     /* Working pointer of inputA */
00244     if((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
00245     {
00246       px = pIn1 + firstIndex - srcBLen + 1;
00247     }
00248     else
00249     {
00250       px = pIn1;
00251     }
00252 
00253     /* Working pointer of inputB */
00254     pSrc2 = pIn2 + (srcBLen - 1u);
00255     py = pSrc2;
00256 
00257     /* count is index by which the pointer pIn1 to be incremented */
00258     count = 0u;
00259 
00260     /* -------------------    
00261      * Stage2 process    
00262      * ------------------*/
00263 
00264     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.    
00265      * So, to loop unroll over blockSize2,    
00266      * srcBLen should be greater than or equal to 4 */
00267     if(srcBLen >= 4u)
00268     {
00269       /* Loop unroll over blkCnt */
00270 
00271       blkCnt = blockSize2 / 3;
00272       while(blkCnt > 0u)
00273       {
00274         /* Set all accumulators to zero */
00275         acc0 = 0;
00276         acc1 = 0;
00277         acc2 = 0;
00278 
00279         /* read x[0], x[1] samples */
00280         x0 = *(px++);
00281         x1 = *(px++);
00282 
00283         /* Apply loop unrolling and compute 3 MACs simultaneously. */
00284         k = srcBLen / 3;
00285 
00286         /* First part of the processing with loop unrolling.  Compute 3 MACs at a time.        
00287          ** a second loop below computes MACs for the remaining 1 to 2 samples. */
00288         do
00289         {
00290           /* Read y[srcBLen - 1] sample */
00291           c0 = *(py);
00292 
00293           /* Read x[2] sample */
00294           x2 = *(px);
00295 
00296           /* Perform the multiply-accumulates */
00297           /* acc0 +=  x[0] * y[srcBLen - 1] */
00298           acc0 += (q63_t) x0 *c0;
00299           /* acc1 +=  x[1] * y[srcBLen - 1] */
00300           acc1 += (q63_t) x1 *c0;
00301           /* acc2 +=  x[2] * y[srcBLen - 1] */
00302           acc2 += (q63_t) x2 *c0;
00303 
00304           /* Read y[srcBLen - 2] sample */
00305           c0 = *(py - 1u);
00306 
00307           /* Read x[3] sample */
00308           x0 = *(px + 1u);
00309 
00310           /* Perform the multiply-accumulate */
00311           /* acc0 +=  x[1] * y[srcBLen - 2] */
00312           acc0 += (q63_t) x1 *c0;
00313           /* acc1 +=  x[2] * y[srcBLen - 2] */
00314           acc1 += (q63_t) x2 *c0;
00315           /* acc2 +=  x[3] * y[srcBLen - 2] */
00316           acc2 += (q63_t) x0 *c0;
00317 
00318           /* Read y[srcBLen - 3] sample */
00319           c0 = *(py - 2u);
00320 
00321           /* Read x[4] sample */
00322           x1 = *(px + 2u);
00323 
00324           /* Perform the multiply-accumulates */
00325           /* acc0 +=  x[2] * y[srcBLen - 3] */
00326           acc0 += (q63_t) x2 *c0;
00327           /* acc1 +=  x[3] * y[srcBLen - 2] */
00328           acc1 += (q63_t) x0 *c0;
00329           /* acc2 +=  x[4] * y[srcBLen - 2] */
00330           acc2 += (q63_t) x1 *c0;
00331 
00332 
00333           px += 3u;
00334 
00335           py -= 3u;
00336 
00337         } while(--k);
00338 
00339         /* If the srcBLen is not a multiple of 3, compute any remaining MACs here.        
00340          ** No loop unrolling is used. */
00341         k = srcBLen - (3 * (srcBLen / 3));
00342 
00343         while(k > 0u)
00344         {
00345           /* Read y[srcBLen - 5] sample */
00346           c0 = *(py--);
00347 
00348           /* Read x[7] sample */
00349           x2 = *(px++);
00350 
00351           /* Perform the multiply-accumulates */
00352           /* acc0 +=  x[4] * y[srcBLen - 5] */
00353           acc0 += (q63_t) x0 *c0;
00354           /* acc1 +=  x[5] * y[srcBLen - 5] */
00355           acc1 += (q63_t) x1 *c0;
00356           /* acc2 +=  x[6] * y[srcBLen - 5] */
00357           acc2 += (q63_t) x2 *c0;
00358 
00359           /* Reuse the present samples for the next MAC */
00360           x0 = x1;
00361           x1 = x2;
00362 
00363           /* Decrement the loop counter */
00364           k--;
00365         }
00366 
00367         /* Store the result in the accumulator in the destination buffer. */
00368         *pOut++ = (q31_t) (acc0 >> 31);
00369         *pOut++ = (q31_t) (acc1 >> 31);
00370         *pOut++ = (q31_t) (acc2 >> 31);
00371 
00372         /* Increment the pointer pIn1 index, count by 3 */
00373         count += 3u;
00374 
00375         /* Update the inputA and inputB pointers for next MAC calculation */
00376         px = pIn1 + count;
00377         py = pSrc2;
00378 
00379         /* Decrement the loop counter */
00380         blkCnt--;
00381       }
00382 
00383       /* If the blockSize2 is not a multiple of 3, compute any remaining output samples here.        
00384        ** No loop unrolling is used. */
00385       blkCnt = blockSize2 - 3 * (blockSize2 / 3);
00386 
00387       while(blkCnt > 0u)
00388       {
00389         /* Accumulator is made zero for every iteration */
00390         sum = 0;
00391 
00392         /* Apply loop unrolling and compute 4 MACs simultaneously. */
00393         k = srcBLen >> 2u;
00394 
00395         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.    
00396          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00397         while(k > 0u)
00398         {
00399           /* Perform the multiply-accumulates */
00400           sum += (q63_t) * px++ * (*py--);
00401           sum += (q63_t) * px++ * (*py--);
00402           sum += (q63_t) * px++ * (*py--);
00403           sum += (q63_t) * px++ * (*py--);
00404 
00405           /* Decrement the loop counter */
00406           k--;
00407         }
00408 
00409         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.    
00410          ** No loop unrolling is used. */
00411         k = srcBLen % 0x4u;
00412 
00413         while(k > 0u)
00414         {
00415           /* Perform the multiply-accumulate */
00416           sum += (q63_t) * px++ * (*py--);
00417 
00418           /* Decrement the loop counter */
00419           k--;
00420         }
00421 
00422         /* Store the result in the accumulator in the destination buffer. */
00423         *pOut++ = (q31_t) (sum >> 31);
00424 
00425         /* Increment the MAC count */
00426         count++;
00427 
00428         /* Update the inputA and inputB pointers for next MAC calculation */
00429         px = pIn1 + count;
00430         py = pSrc2;
00431 
00432         /* Decrement the loop counter */
00433         blkCnt--;
00434       }
00435     }
00436     else
00437     {
00438       /* If the srcBLen is not a multiple of 4,    
00439        * the blockSize2 loop cannot be unrolled by 4 */
00440       blkCnt = (uint32_t) blockSize2;
00441 
00442       while(blkCnt > 0u)
00443       {
00444         /* Accumulator is made zero for every iteration */
00445         sum = 0;
00446 
00447         /* srcBLen number of MACS should be performed */
00448         k = srcBLen;
00449 
00450         while(k > 0u)
00451         {
00452           /* Perform the multiply-accumulate */
00453           sum += (q63_t) * px++ * (*py--);
00454 
00455           /* Decrement the loop counter */
00456           k--;
00457         }
00458 
00459         /* Store the result in the accumulator in the destination buffer. */
00460         *pOut++ = (q31_t) (sum >> 31);
00461 
00462         /* Increment the MAC count */
00463         count++;
00464 
00465         /* Update the inputA and inputB pointers for next MAC calculation */
00466         px = pIn1 + count;
00467         py = pSrc2;
00468 
00469         /* Decrement the loop counter */
00470         blkCnt--;
00471       }
00472     }
00473 
00474 
00475     /* --------------------------    
00476      * Initializations of stage3    
00477      * -------------------------*/
00478 
00479     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]    
00480      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]    
00481      * ....    
00482      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]    
00483      * sum +=  x[srcALen-1] * y[srcBLen-1]    
00484      */
00485 
00486     /* In this stage the MAC operations are decreased by 1 for every iteration.    
00487        The blockSize3 variable holds the number of MAC operations performed */
00488     count = srcBLen - 1u;
00489 
00490     /* Working pointer of inputA */
00491     pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
00492     px = pSrc1;
00493 
00494     /* Working pointer of inputB */
00495     pSrc2 = pIn2 + (srcBLen - 1u);
00496     py = pSrc2;
00497 
00498     /* -------------------    
00499      * Stage3 process    
00500      * ------------------*/
00501 
00502     while(blockSize3 > 0)
00503     {
00504       /* Accumulator is made zero for every iteration */
00505       sum = 0;
00506 
00507       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00508       k = count >> 2u;
00509 
00510       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.    
00511        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00512       while(k > 0u)
00513       {
00514         sum += (q63_t) * px++ * (*py--);
00515         sum += (q63_t) * px++ * (*py--);
00516         sum += (q63_t) * px++ * (*py--);
00517         sum += (q63_t) * px++ * (*py--);
00518 
00519         /* Decrement the loop counter */
00520         k--;
00521       }
00522 
00523       /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.    
00524        ** No loop unrolling is used. */
00525       k = count % 0x4u;
00526 
00527       while(k > 0u)
00528       {
00529         /* Perform the multiply-accumulate */
00530         sum += (q63_t) * px++ * (*py--);
00531 
00532         /* Decrement the loop counter */
00533         k--;
00534       }
00535 
00536       /* Store the result in the accumulator in the destination buffer. */
00537       *pOut++ = (q31_t) (sum >> 31);
00538 
00539       /* Update the inputA and inputB pointers for next MAC calculation */
00540       px = ++pSrc1;
00541       py = pSrc2;
00542 
00543       /* Decrement the MAC count */
00544       count--;
00545 
00546       /* Decrement the loop counter */
00547       blockSize3--;
00548 
00549     }
00550 
00551     /* set status as ARM_MATH_SUCCESS */
00552     status = ARM_MATH_SUCCESS;
00553   }
00554 
00555   /* Return to application */
00556   return (status);
00557 
00558 #else
00559 
00560   /* Run the below code for Cortex-M0 */
00561 
00562   q31_t *pIn1 = pSrcA;                           /* inputA pointer */
00563   q31_t *pIn2 = pSrcB;                           /* inputB pointer */
00564   q63_t sum;                                     /* Accumulator */
00565   uint32_t i, j;                                 /* loop counters */
00566   arm_status status;                             /* status of Partial convolution */
00567 
00568   /* Check for range of output samples to be calculated */
00569   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
00570   {
00571     /* Set status as ARM_ARGUMENT_ERROR */
00572     status = ARM_MATH_ARGUMENT_ERROR;
00573   }
00574   else
00575   {
00576     /* Loop to calculate convolution for output length number of values */
00577     for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
00578     {
00579       /* Initialize sum with zero to carry on MAC operations */
00580       sum = 0;
00581 
00582       /* Loop to perform MAC operations according to convolution equation */
00583       for (j = 0; j <= i; j++)
00584       {
00585         /* Check the array limitations */
00586         if(((i - j) < srcBLen) && (j < srcALen))
00587         {
00588           /* z[i] += x[i-j] * y[j] */
00589           sum += ((q63_t) pIn1[j] * (pIn2[i - j]));
00590         }
00591       }
00592 
00593       /* Store the output in the destination buffer */
00594       pDst[i] = (q31_t) (sum >> 31u);
00595     }
00596     /* set status as ARM_SUCCESS as there are no argument errors */
00597     status = ARM_MATH_SUCCESS;
00598   }
00599   return (status);
00600 
00601 #endif /*    #ifndef ARM_MATH_CM0_FAMILY      */
00602 
00603 }
00604 
00605 /**    
00606  * @} end of PartialConv group    
00607  */