CMSIS DSP library

Dependents:   performance_timer Surfboard_ gps2rtty Capstone ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_mat_mult_fast_q15.c Source File

arm_mat_mult_fast_q15.c

00001 /* ----------------------------------------------------------------------    
00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
00003 *    
00004 * $Date:        19. March 2015
00005 * $Revision:    V.1.4.5
00006 *    
00007 * Project:      CMSIS DSP Library    
00008 * Title:        arm_mat_mult_fast_q15.c    
00009 *    
00010 * Description:   Q15 matrix multiplication (fast variant)    
00011 *    
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Redistribution and use in source and binary forms, with or without 
00015 * modification, are permitted provided that the following conditions
00016 * are met:
00017 *   - Redistributions of source code must retain the above copyright
00018 *     notice, this list of conditions and the following disclaimer.
00019 *   - Redistributions in binary form must reproduce the above copyright
00020 *     notice, this list of conditions and the following disclaimer in
00021 *     the documentation and/or other materials provided with the 
00022 *     distribution.
00023 *   - Neither the name of ARM LIMITED nor the names of its contributors
00024 *     may be used to endorse or promote products derived from this
00025 *     software without specific prior written permission.
00026 *
00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00038 * POSSIBILITY OF SUCH DAMAGE.    
00039 * -------------------------------------------------------------------- */
00040 
00041 #include "arm_math.h"
00042 
00043 /**    
00044  * @ingroup groupMatrix    
00045  */
00046 
00047 /**    
00048  * @addtogroup MatrixMult    
00049  * @{    
00050  */
00051 
00052 
00053 /**    
00054  * @brief Q15 matrix multiplication (fast variant) for Cortex-M3 and Cortex-M4    
00055  * @param[in]       *pSrcA points to the first input matrix structure    
00056  * @param[in]       *pSrcB points to the second input matrix structure    
00057  * @param[out]      *pDst points to output matrix structure    
00058  * @param[in]       *pState points to the array for storing intermediate results    
00059  * @return          The function returns either    
00060  * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.    
00061  *    
00062  * @details    
00063  * <b>Scaling and Overflow Behavior:</b>    
00064  *    
00065  * \par    
00066  * The difference between the function arm_mat_mult_q15() and this fast variant is that    
00067  * the fast variant use a 32-bit rather than a 64-bit accumulator.    
00068  * The result of each 1.15 x 1.15 multiplication is truncated to        
00069  * 2.30 format. These intermediate results are accumulated in a 32-bit register in 2.30        
00070  * format. Finally, the accumulator is saturated and converted to a 1.15 result.        
00071  *        
00072  * \par        
00073  * The fast version has the same overflow behavior as the standard version but provides        
00074  * less precision since it discards the low 16 bits of each multiplication result.        
00075  * In order to avoid overflows completely the input signals must be scaled down.        
00076  * Scale down one of the input matrices by log2(numColsA) bits to        
00077  * avoid overflows, as a total of numColsA additions are computed internally for each        
00078  * output element.        
00079  *        
00080  * \par    
00081  * See <code>arm_mat_mult_q15()</code> for a slower implementation of this function    
00082  * which uses 64-bit accumulation to provide higher precision.    
00083  */
00084 
00085 arm_status arm_mat_mult_fast_q15(
00086   const arm_matrix_instance_q15 * pSrcA,
00087   const arm_matrix_instance_q15 * pSrcB,
00088   arm_matrix_instance_q15 * pDst,
00089   q15_t * pState)
00090 {
00091   q31_t sum;                                     /* accumulator */
00092   q15_t *pSrcBT = pState;                        /* input data matrix pointer for transpose */
00093   q15_t *pInA = pSrcA->pData;                    /* input data matrix pointer A of Q15 type */
00094   q15_t *pInB = pSrcB->pData;                    /* input data matrix pointer B of Q15 type */
00095   q15_t *px;                                     /* Temporary output data matrix pointer */
00096   uint16_t numRowsA = pSrcA->numRows;            /* number of rows of input matrix A    */
00097   uint16_t numColsB = pSrcB->numCols;            /* number of columns of input matrix B */
00098   uint16_t numColsA = pSrcA->numCols;            /* number of columns of input matrix A */
00099   uint16_t numRowsB = pSrcB->numRows;            /* number of rows of input matrix A    */
00100   uint16_t col, i = 0u, row = numRowsB, colCnt;  /* loop counters */
00101   arm_status status;                             /* status of matrix multiplication */
00102 
00103 #ifndef UNALIGNED_SUPPORT_DISABLE
00104 
00105   q31_t in;                                      /* Temporary variable to hold the input value */
00106   q31_t inA1, inA2, inB1, inB2;
00107 
00108 #else
00109 
00110   q15_t in;                                      /* Temporary variable to hold the input value */
00111   q15_t inA1, inA2, inB1, inB2;
00112 
00113 #endif  /*  #ifndef UNALIGNED_SUPPORT_DISABLE   */
00114 
00115 #ifdef ARM_MATH_MATRIX_CHECK
00116   /* Check for matrix mismatch condition */
00117   if((pSrcA->numCols != pSrcB->numRows) ||
00118      (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
00119   {
00120     /* Set status as ARM_MATH_SIZE_MISMATCH */
00121     status = ARM_MATH_SIZE_MISMATCH;
00122   }
00123   else
00124 #endif
00125   {
00126     /* Matrix transpose */
00127     do
00128     {
00129       /* Apply loop unrolling and exchange the columns with row elements */
00130       col = numColsB >> 2;
00131 
00132       /* The pointer px is set to starting address of the column being processed */
00133       px = pSrcBT + i;
00134 
00135       /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.        
00136        ** a second loop below computes the remaining 1 to 3 samples. */
00137       while(col > 0u)
00138       {
00139 #ifndef UNALIGNED_SUPPORT_DISABLE
00140         /* Read two elements from the row */
00141         in = *__SIMD32(pInB)++;
00142 
00143         /* Unpack and store one element in the destination */
00144 #ifndef ARM_MATH_BIG_ENDIAN
00145 
00146         *px = (q15_t) in;
00147 
00148 #else
00149 
00150         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
00151 
00152 #endif /*    #ifndef ARM_MATH_BIG_ENDIAN    */
00153 
00154         /* Update the pointer px to point to the next row of the transposed matrix */
00155         px += numRowsB;
00156 
00157         /* Unpack and store the second element in the destination */
00158 #ifndef ARM_MATH_BIG_ENDIAN
00159 
00160         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
00161 
00162 #else
00163 
00164         *px = (q15_t) in;
00165 
00166 #endif /*    #ifndef ARM_MATH_BIG_ENDIAN    */
00167 
00168         /* Update the pointer px to point to the next row of the transposed matrix */
00169         px += numRowsB;
00170 
00171         /* Read two elements from the row */
00172         in = *__SIMD32(pInB)++;
00173 
00174         /* Unpack and store one element in the destination */
00175 #ifndef ARM_MATH_BIG_ENDIAN
00176 
00177         *px = (q15_t) in;
00178 
00179 #else
00180 
00181         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
00182 
00183 #endif /*    #ifndef ARM_MATH_BIG_ENDIAN    */
00184 
00185         /* Update the pointer px to point to the next row of the transposed matrix */
00186         px += numRowsB;
00187 
00188         /* Unpack and store the second element in the destination */
00189 
00190 #ifndef ARM_MATH_BIG_ENDIAN
00191 
00192         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
00193 
00194 #else
00195 
00196         *px = (q15_t) in;
00197 
00198 #endif /*    #ifndef ARM_MATH_BIG_ENDIAN    */
00199 
00200 #else
00201 
00202         /* Read one element from the row */
00203         in = *pInB++;
00204 
00205         /* Store one element in the destination */
00206         *px = in;
00207  
00208         /* Update the pointer px to point to the next row of the transposed matrix */
00209         px += numRowsB;
00210 
00211         /* Read one element from the row */
00212         in = *pInB++;
00213 
00214         /* Store one element in the destination */
00215         *px = in;
00216  
00217         /* Update the pointer px to point to the next row of the transposed matrix */
00218         px += numRowsB;
00219 
00220         /* Read one element from the row */
00221         in = *pInB++;
00222 
00223         /* Store one element in the destination */
00224         *px = in;
00225  
00226         /* Update the pointer px to point to the next row of the transposed matrix */
00227         px += numRowsB;
00228 
00229         /* Read one element from the row */
00230         in = *pInB++;
00231 
00232         /* Store one element in the destination */
00233         *px = in;
00234 
00235 #endif  /*  #ifndef UNALIGNED_SUPPORT_DISABLE   */
00236         
00237         /* Update the pointer px to point to the next row of the transposed matrix */
00238         px += numRowsB;
00239 
00240         /* Decrement the column loop counter */
00241         col--;
00242       }
00243 
00244       /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.        
00245        ** No loop unrolling is used. */
00246       col = numColsB % 0x4u;
00247 
00248       while(col > 0u)
00249       {
00250         /* Read and store the input element in the destination */
00251         *px = *pInB++;
00252 
00253         /* Update the pointer px to point to the next row of the transposed matrix */
00254         px += numRowsB;
00255 
00256         /* Decrement the column loop counter */
00257         col--;
00258       }
00259 
00260       i++;
00261 
00262       /* Decrement the row loop counter */
00263       row--;
00264 
00265     } while(row > 0u);
00266 
00267     /* Reset the variables for the usage in the following multiplication process */
00268     row = numRowsA;
00269     i = 0u;
00270     px = pDst->pData;
00271 
00272     /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
00273     /* row loop */
00274     do
00275     {
00276       /* For every row wise process, the column loop counter is to be initiated */
00277       col = numColsB;
00278 
00279       /* For every row wise process, the pIn2 pointer is set        
00280        ** to the starting address of the transposed pSrcB data */
00281       pInB = pSrcBT;
00282 
00283       /* column loop */
00284       do
00285       {
00286         /* Set the variable sum, that acts as accumulator, to zero */
00287         sum = 0;
00288 
00289         /* Apply loop unrolling and compute 2 MACs simultaneously. */
00290         colCnt = numColsA >> 2;
00291 
00292         /* Initiate the pointer pIn1 to point to the starting address of the column being processed */
00293         pInA = pSrcA->pData + i;
00294 
00295         /* matrix multiplication */
00296         while(colCnt > 0u)
00297         {
00298           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
00299 #ifndef UNALIGNED_SUPPORT_DISABLE
00300 
00301           inA1 = *__SIMD32(pInA)++;
00302           inB1 = *__SIMD32(pInB)++;
00303           inA2 = *__SIMD32(pInA)++;
00304           inB2 = *__SIMD32(pInB)++;
00305 
00306           sum = __SMLAD(inA1, inB1, sum);
00307           sum = __SMLAD(inA2, inB2, sum);
00308 
00309 #else
00310 
00311           inA1 = *pInA++;
00312           inB1 = *pInB++;
00313           inA2 = *pInA++;
00314           sum += inA1 * inB1;
00315           inB2 = *pInB++;
00316 
00317           inA1 = *pInA++;
00318           inB1 = *pInB++;
00319           sum += inA2 * inB2;
00320           inA2 = *pInA++;
00321           inB2 = *pInB++;
00322 
00323           sum += inA1 * inB1;
00324           sum += inA2 * inB2;
00325 
00326 #endif  /*  #ifndef UNALIGNED_SUPPORT_DISABLE   */
00327 
00328           /* Decrement the loop counter */
00329           colCnt--;
00330         }
00331 
00332         /* process odd column samples */
00333         colCnt = numColsA % 0x4u;
00334 
00335         while(colCnt > 0u)
00336         {
00337           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
00338           sum += (q31_t) (*pInA++) * (*pInB++);
00339 
00340           colCnt--;
00341         }
00342 
00343         /* Saturate and store the result in the destination buffer */
00344         *px = (q15_t) (sum >> 15);
00345         px++;
00346 
00347         /* Decrement the column loop counter */
00348         col--;
00349 
00350       } while(col > 0u);
00351 
00352       i = i + numColsA;
00353 
00354       /* Decrement the row loop counter */
00355       row--;
00356 
00357     } while(row > 0u);
00358 
00359     /* set status as ARM_MATH_SUCCESS */
00360     status = ARM_MATH_SUCCESS;
00361   }
00362 
00363   /* Return to application */
00364   return (status);
00365 }
00366 
00367 /**        
00368  * @} end of MatrixMult group        
00369  */