Laxmi Kant Tiwari / mbed-dsp

Fork of mbed-dsp by mbed official

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_partial_q31.c Source File

arm_conv_partial_q31.c

00001 /* ----------------------------------------------------------------------    
00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.    
00003 *    
00004 * $Date:        17. January 2013
00005 * $Revision:    V1.4.1
00006 *    
00007 * Project:      CMSIS DSP Library    
00008 * Title:        arm_conv_partial_q31.c    
00009 *    
00010 * Description:  Partial convolution of Q31 sequences.    
00011 *    
00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
00013 *  
00014 * Redistribution and use in source and binary forms, with or without 
00015 * modification, are permitted provided that the following conditions
00016 * are met:
00017 *   - Redistributions of source code must retain the above copyright
00018 *     notice, this list of conditions and the following disclaimer.
00019 *   - Redistributions in binary form must reproduce the above copyright
00020 *     notice, this list of conditions and the following disclaimer in
00021 *     the documentation and/or other materials provided with the 
00022 *     distribution.
00023 *   - Neither the name of ARM LIMITED nor the names of its contributors
00024 *     may be used to endorse or promote products derived from this
00025 *     software without specific prior written permission.
00026 *
00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00038 * POSSIBILITY OF SUCH DAMAGE.  
00039 * -------------------------------------------------------------------- */
00040 
00041 #include "arm_math.h"
00042 
00043 /**    
00044  * @ingroup groupFilters    
00045  */
00046 
00047 /**    
00048  * @addtogroup PartialConv    
00049  * @{    
00050  */
00051 
00052 /**    
00053  * @brief Partial convolution of Q31 sequences.    
00054  * @param[in]       *pSrcA points to the first input sequence.    
00055  * @param[in]       srcALen length of the first input sequence.    
00056  * @param[in]       *pSrcB points to the second input sequence.    
00057  * @param[in]       srcBLen length of the second input sequence.    
00058  * @param[out]      *pDst points to the location where the output result is written.    
00059  * @param[in]       firstIndex is the first output sample to start with.    
00060  * @param[in]       numPoints is the number of output points to be computed.    
00061  * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].    
00062  *    
00063  * See <code>arm_conv_partial_fast_q31()</code> for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4.    
00064  */
00065 
00066 arm_status arm_conv_partial_q31(
00067   q31_t * pSrcA,
00068   uint32_t srcALen,
00069   q31_t * pSrcB,
00070   uint32_t srcBLen,
00071   q31_t * pDst,
00072   uint32_t firstIndex,
00073   uint32_t numPoints)
00074 {
00075 
00076 
00077 #ifndef ARM_MATH_CM0_FAMILY
00078 
00079   /* Run the below code for Cortex-M4 and Cortex-M3 */
00080 
00081   q31_t *pIn1;                                   /* inputA pointer               */
00082   q31_t *pIn2;                                   /* inputB pointer               */
00083   q31_t *pOut = pDst;                            /* output pointer               */
00084   q31_t *px;                                     /* Intermediate inputA pointer  */
00085   q31_t *py;                                     /* Intermediate inputB pointer  */
00086   q31_t *pSrc1, *pSrc2;                          /* Intermediate pointers        */
00087   q63_t sum, acc0, acc1, acc2;                   /* Accumulator                  */
00088   q31_t x0, x1, x2, c0;
00089   uint32_t j, k, count, check, blkCnt;
00090   int32_t blockSize1, blockSize2, blockSize3;    /* loop counter                 */
00091   arm_status status;                             /* status of Partial convolution */
00092 
00093 
00094   /* Check for range of output samples to be calculated */
00095   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
00096   {
00097     /* Set status as ARM_MATH_ARGUMENT_ERROR */
00098     status = ARM_MATH_ARGUMENT_ERROR;
00099   }
00100   else
00101   {
00102 
00103     /* The algorithm implementation is based on the lengths of the inputs. */
00104     /* srcB is always made to slide across srcA. */
00105     /* So srcBLen is always considered as shorter or equal to srcALen */
00106     if(srcALen >= srcBLen)
00107     {
00108       /* Initialization of inputA pointer */
00109       pIn1 = pSrcA;
00110 
00111       /* Initialization of inputB pointer */
00112       pIn2 = pSrcB;
00113     }
00114     else
00115     {
00116       /* Initialization of inputA pointer */
00117       pIn1 = pSrcB;
00118 
00119       /* Initialization of inputB pointer */
00120       pIn2 = pSrcA;
00121 
00122       /* srcBLen is always considered as shorter or equal to srcALen */
00123       j = srcBLen;
00124       srcBLen = srcALen;
00125       srcALen = j;
00126     }
00127 
00128     /* Conditions to check which loopCounter holds    
00129      * the first and last indices of the output samples to be calculated. */
00130     check = firstIndex + numPoints;
00131     blockSize3 = ((int32_t) check - (int32_t) srcALen);
00132     blockSize3 = (blockSize3 > 0) ? blockSize3 : 0;
00133     blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
00134     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
00135                                      (int32_t) numPoints) : 0;
00136     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
00137                                     (int32_t) firstIndex);
00138     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
00139 
00140     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
00141     /* The function is internally    
00142      * divided into three stages according to the number of multiplications that has to be    
00143      * taken place between inputA samples and inputB samples. In the first stage of the    
00144      * algorithm, the multiplications increase by one for every iteration.    
00145      * In the second stage of the algorithm, srcBLen number of multiplications are done.    
00146      * In the third stage of the algorithm, the multiplications decrease by one    
00147      * for every iteration. */
00148 
00149     /* Set the output pointer to point to the firstIndex    
00150      * of the output sample to be calculated. */
00151     pOut = pDst + firstIndex;
00152 
00153     /* --------------------------    
00154      * Initializations of stage1    
00155      * -------------------------*/
00156 
00157     /* sum = x[0] * y[0]    
00158      * sum = x[0] * y[1] + x[1] * y[0]    
00159      * ....    
00160      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]    
00161      */
00162 
00163     /* In this stage the MAC operations are increased by 1 for every iteration.    
00164        The count variable holds the number of MAC operations performed.    
00165        Since the partial convolution starts from firstIndex    
00166        Number of Macs to be performed is firstIndex + 1 */
00167     count = 1u + firstIndex;
00168 
00169     /* Working pointer of inputA */
00170     px = pIn1;
00171 
00172     /* Working pointer of inputB */
00173     pSrc2 = pIn2 + firstIndex;
00174     py = pSrc2;
00175 
00176     /* ------------------------    
00177      * Stage1 process    
00178      * ----------------------*/
00179 
00180     /* The first loop starts here */
00181     while(blockSize1 > 0)
00182     {
00183       /* Accumulator is made zero for every iteration */
00184       sum = 0;
00185 
00186       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00187       k = count >> 2u;
00188 
00189       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.    
00190        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00191       while(k > 0u)
00192       {
00193         /* x[0] * y[srcBLen - 1] */
00194         sum += (q63_t) * px++ * (*py--);
00195         /* x[1] * y[srcBLen - 2] */
00196         sum += (q63_t) * px++ * (*py--);
00197         /* x[2] * y[srcBLen - 3] */
00198         sum += (q63_t) * px++ * (*py--);
00199         /* x[3] * y[srcBLen - 4] */
00200         sum += (q63_t) * px++ * (*py--);
00201 
00202         /* Decrement the loop counter */
00203         k--;
00204       }
00205 
00206       /* If the count is not a multiple of 4, compute any remaining MACs here.    
00207        ** No loop unrolling is used. */
00208       k = count % 0x4u;
00209 
00210       while(k > 0u)
00211       {
00212         /* Perform the multiply-accumulate */
00213         sum += (q63_t) * px++ * (*py--);
00214 
00215         /* Decrement the loop counter */
00216         k--;
00217       }
00218 
00219       /* Store the result in the accumulator in the destination buffer. */
00220       *pOut++ = (q31_t) (sum >> 31);
00221 
00222       /* Update the inputA and inputB pointers for next MAC calculation */
00223       py = ++pSrc2;
00224       px = pIn1;
00225 
00226       /* Increment the MAC count */
00227       count++;
00228 
00229       /* Decrement the loop counter */
00230       blockSize1--;
00231     }
00232 
00233     /* --------------------------    
00234      * Initializations of stage2    
00235      * ------------------------*/
00236 
00237     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]    
00238      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]    
00239      * ....    
00240      * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]    
00241      */
00242 
00243     /* Working pointer of inputA */
00244     px = pIn1;
00245 
00246     /* Working pointer of inputB */
00247     pSrc2 = pIn2 + (srcBLen - 1u);
00248     py = pSrc2;
00249 
00250     /* count is index by which the pointer pIn1 to be incremented */
00251     count = 0u;
00252 
00253     /* -------------------    
00254      * Stage2 process    
00255      * ------------------*/
00256 
00257     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.    
00258      * So, to loop unroll over blockSize2,    
00259      * srcBLen should be greater than or equal to 4 */
00260     if(srcBLen >= 4u)
00261     {
00262       /* Loop unroll over blkCnt */
00263 
00264       blkCnt = blockSize2 / 3;
00265       while(blkCnt > 0u)
00266       {
00267         /* Set all accumulators to zero */
00268         acc0 = 0;
00269         acc1 = 0;
00270         acc2 = 0;
00271 
00272         /* read x[0], x[1] samples */
00273         x0 = *(px++);
00274         x1 = *(px++);
00275 
00276         /* Apply loop unrolling and compute 3 MACs simultaneously. */
00277         k = srcBLen / 3;
00278 
00279         /* First part of the processing with loop unrolling.  Compute 3 MACs at a time.        
00280          ** a second loop below computes MACs for the remaining 1 to 2 samples. */
00281         do
00282         {
00283           /* Read y[srcBLen - 1] sample */
00284           c0 = *(py);
00285 
00286           /* Read x[2] sample */
00287           x2 = *(px);
00288 
00289           /* Perform the multiply-accumulates */
00290           /* acc0 +=  x[0] * y[srcBLen - 1] */
00291           acc0 += (q63_t) x0 *c0;
00292           /* acc1 +=  x[1] * y[srcBLen - 1] */
00293           acc1 += (q63_t) x1 *c0;
00294           /* acc2 +=  x[2] * y[srcBLen - 1] */
00295           acc2 += (q63_t) x2 *c0;
00296 
00297           /* Read y[srcBLen - 2] sample */
00298           c0 = *(py - 1u);
00299 
00300           /* Read x[3] sample */
00301           x0 = *(px + 1u);
00302 
00303           /* Perform the multiply-accumulate */
00304           /* acc0 +=  x[1] * y[srcBLen - 2] */
00305           acc0 += (q63_t) x1 *c0;
00306           /* acc1 +=  x[2] * y[srcBLen - 2] */
00307           acc1 += (q63_t) x2 *c0;
00308           /* acc2 +=  x[3] * y[srcBLen - 2] */
00309           acc2 += (q63_t) x0 *c0;
00310 
00311           /* Read y[srcBLen - 3] sample */
00312           c0 = *(py - 2u);
00313 
00314           /* Read x[4] sample */
00315           x1 = *(px + 2u);
00316 
00317           /* Perform the multiply-accumulates */
00318           /* acc0 +=  x[2] * y[srcBLen - 3] */
00319           acc0 += (q63_t) x2 *c0;
00320           /* acc1 +=  x[3] * y[srcBLen - 2] */
00321           acc1 += (q63_t) x0 *c0;
00322           /* acc2 +=  x[4] * y[srcBLen - 2] */
00323           acc2 += (q63_t) x1 *c0;
00324 
00325 
00326           px += 3u;
00327 
00328           py -= 3u;
00329 
00330         } while(--k);
00331 
00332         /* If the srcBLen is not a multiple of 3, compute any remaining MACs here.        
00333          ** No loop unrolling is used. */
00334         k = srcBLen - (3 * (srcBLen / 3));
00335 
00336         while(k > 0u)
00337         {
00338           /* Read y[srcBLen - 5] sample */
00339           c0 = *(py--);
00340 
00341           /* Read x[7] sample */
00342           x2 = *(px++);
00343 
00344           /* Perform the multiply-accumulates */
00345           /* acc0 +=  x[4] * y[srcBLen - 5] */
00346           acc0 += (q63_t) x0 *c0;
00347           /* acc1 +=  x[5] * y[srcBLen - 5] */
00348           acc1 += (q63_t) x1 *c0;
00349           /* acc2 +=  x[6] * y[srcBLen - 5] */
00350           acc2 += (q63_t) x2 *c0;
00351 
00352           /* Reuse the present samples for the next MAC */
00353           x0 = x1;
00354           x1 = x2;
00355 
00356           /* Decrement the loop counter */
00357           k--;
00358         }
00359 
00360         /* Store the result in the accumulator in the destination buffer. */
00361         *pOut++ = (q31_t) (acc0 >> 31);
00362         *pOut++ = (q31_t) (acc1 >> 31);
00363         *pOut++ = (q31_t) (acc2 >> 31);
00364 
00365         /* Increment the pointer pIn1 index, count by 3 */
00366         count += 3u;
00367 
00368         /* Update the inputA and inputB pointers for next MAC calculation */
00369         px = pIn1 + count;
00370         py = pSrc2;
00371 
00372         /* Decrement the loop counter */
00373         blkCnt--;
00374       }
00375 
00376       /* If the blockSize2 is not a multiple of 3, compute any remaining output samples here.        
00377        ** No loop unrolling is used. */
00378       blkCnt = blockSize2 - 3 * (blockSize2 / 3);
00379 
00380       while(blkCnt > 0u)
00381       {
00382         /* Accumulator is made zero for every iteration */
00383         sum = 0;
00384 
00385         /* Apply loop unrolling and compute 4 MACs simultaneously. */
00386         k = srcBLen >> 2u;
00387 
00388         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.    
00389          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00390         while(k > 0u)
00391         {
00392           /* Perform the multiply-accumulates */
00393           sum += (q63_t) * px++ * (*py--);
00394           sum += (q63_t) * px++ * (*py--);
00395           sum += (q63_t) * px++ * (*py--);
00396           sum += (q63_t) * px++ * (*py--);
00397 
00398           /* Decrement the loop counter */
00399           k--;
00400         }
00401 
00402         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.    
00403          ** No loop unrolling is used. */
00404         k = srcBLen % 0x4u;
00405 
00406         while(k > 0u)
00407         {
00408           /* Perform the multiply-accumulate */
00409           sum += (q63_t) * px++ * (*py--);
00410 
00411           /* Decrement the loop counter */
00412           k--;
00413         }
00414 
00415         /* Store the result in the accumulator in the destination buffer. */
00416         *pOut++ = (q31_t) (sum >> 31);
00417 
00418         /* Increment the MAC count */
00419         count++;
00420 
00421         /* Update the inputA and inputB pointers for next MAC calculation */
00422         px = pIn1 + count;
00423         py = pSrc2;
00424 
00425         /* Decrement the loop counter */
00426         blkCnt--;
00427       }
00428     }
00429     else
00430     {
00431       /* If the srcBLen is not a multiple of 4,    
00432        * the blockSize2 loop cannot be unrolled by 4 */
00433       blkCnt = (uint32_t) blockSize2;
00434 
00435       while(blkCnt > 0u)
00436       {
00437         /* Accumulator is made zero for every iteration */
00438         sum = 0;
00439 
00440         /* srcBLen number of MACS should be performed */
00441         k = srcBLen;
00442 
00443         while(k > 0u)
00444         {
00445           /* Perform the multiply-accumulate */
00446           sum += (q63_t) * px++ * (*py--);
00447 
00448           /* Decrement the loop counter */
00449           k--;
00450         }
00451 
00452         /* Store the result in the accumulator in the destination buffer. */
00453         *pOut++ = (q31_t) (sum >> 31);
00454 
00455         /* Increment the MAC count */
00456         count++;
00457 
00458         /* Update the inputA and inputB pointers for next MAC calculation */
00459         px = pIn1 + count;
00460         py = pSrc2;
00461 
00462         /* Decrement the loop counter */
00463         blkCnt--;
00464       }
00465     }
00466 
00467 
00468     /* --------------------------    
00469      * Initializations of stage3    
00470      * -------------------------*/
00471 
00472     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]    
00473      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]    
00474      * ....    
00475      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]    
00476      * sum +=  x[srcALen-1] * y[srcBLen-1]    
00477      */
00478 
00479     /* In this stage the MAC operations are decreased by 1 for every iteration.    
00480        The blockSize3 variable holds the number of MAC operations performed */
00481     count = srcBLen - 1u;
00482 
00483     /* Working pointer of inputA */
00484     pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
00485     px = pSrc1;
00486 
00487     /* Working pointer of inputB */
00488     pSrc2 = pIn2 + (srcBLen - 1u);
00489     py = pSrc2;
00490 
00491     /* -------------------    
00492      * Stage3 process    
00493      * ------------------*/
00494 
00495     while(blockSize3 > 0)
00496     {
00497       /* Accumulator is made zero for every iteration */
00498       sum = 0;
00499 
00500       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00501       k = count >> 2u;
00502 
00503       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.    
00504        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00505       while(k > 0u)
00506       {
00507         sum += (q63_t) * px++ * (*py--);
00508         sum += (q63_t) * px++ * (*py--);
00509         sum += (q63_t) * px++ * (*py--);
00510         sum += (q63_t) * px++ * (*py--);
00511 
00512         /* Decrement the loop counter */
00513         k--;
00514       }
00515 
00516       /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.    
00517        ** No loop unrolling is used. */
00518       k = count % 0x4u;
00519 
00520       while(k > 0u)
00521       {
00522         /* Perform the multiply-accumulate */
00523         sum += (q63_t) * px++ * (*py--);
00524 
00525         /* Decrement the loop counter */
00526         k--;
00527       }
00528 
00529       /* Store the result in the accumulator in the destination buffer. */
00530       *pOut++ = (q31_t) (sum >> 31);
00531 
00532       /* Update the inputA and inputB pointers for next MAC calculation */
00533       px = ++pSrc1;
00534       py = pSrc2;
00535 
00536       /* Decrement the MAC count */
00537       count--;
00538 
00539       /* Decrement the loop counter */
00540       blockSize3--;
00541 
00542     }
00543 
00544     /* set status as ARM_MATH_SUCCESS */
00545     status = ARM_MATH_SUCCESS;
00546   }
00547 
00548   /* Return to application */
00549   return (status);
00550 
00551 #else
00552 
00553   /* Run the below code for Cortex-M0 */
00554 
00555   q31_t *pIn1 = pSrcA;                           /* inputA pointer */
00556   q31_t *pIn2 = pSrcB;                           /* inputB pointer */
00557   q63_t sum;                                     /* Accumulator */
00558   uint32_t i, j;                                 /* loop counters */
00559   arm_status status;                             /* status of Partial convolution */
00560 
00561   /* Check for range of output samples to be calculated */
00562   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
00563   {
00564     /* Set status as ARM_ARGUMENT_ERROR */
00565     status = ARM_MATH_ARGUMENT_ERROR;
00566   }
00567   else
00568   {
00569     /* Loop to calculate convolution for output length number of values */
00570     for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
00571     {
00572       /* Initialize sum with zero to carry on MAC operations */
00573       sum = 0;
00574 
00575       /* Loop to perform MAC operations according to convolution equation */
00576       for (j = 0; j <= i; j++)
00577       {
00578         /* Check the array limitations */
00579         if(((i - j) < srcBLen) && (j < srcALen))
00580         {
00581           /* z[i] += x[i-j] * y[j] */
00582           sum += ((q63_t) pIn1[j] * (pIn2[i - j]));
00583         }
00584       }
00585 
00586       /* Store the output in the destination buffer */
00587       pDst[i] = (q31_t) (sum >> 31u);
00588     }
00589     /* set status as ARM_SUCCESS as there are no argument errors */
00590     status = ARM_MATH_SUCCESS;
00591   }
00592   return (status);
00593 
00594 #endif /*    #ifndef ARM_MATH_CM0_FAMILY      */
00595 
00596 }
00597 
00598 /**    
00599  * @} end of PartialConv group    
00600  */