CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details

Dependents:   K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_mat_mult_fast_q15.c Source File

arm_mat_mult_fast_q15.c

00001 /* ----------------------------------------------------------------------  
00002 * Copyright (C) 2010 ARM Limited. All rights reserved.  
00003 *  
00004 * $Date:        29. November 2010  
00005 * $Revision:    V1.0.3  
00006 *  
00007 * Project:      CMSIS DSP Library  
00008 * Title:        arm_mat_mult_fast_q15.c  
00009 *  
00010 * Description:   Q15 matrix multiplication (fast variant)  
00011 *  
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Version 1.0.3 2010/11/29 
00015 *    Re-organized the CMSIS folders and updated documentation.  
00016 *   
00017 * Version 1.0.2 2010/11/11  
00018 *    Documentation updated.   
00019 *  
00020 * Version 1.0.1 2010/10/05   
00021 *    Production release and review comments incorporated.  
00022 *  
00023 * Version 1.0.0 2010/09/20   
00024 *    Production release and review comments incorporated.  
00025 * -------------------------------------------------------------------- */ 
00026  
00027 #include "arm_math.h" 
00028  
00029 /**  
00030  * @ingroup groupMatrix  
00031  */ 
00032  
00033 /**  
00034  * @addtogroup MatrixMult  
00035  * @{  
00036  */ 
00037  
00038  
00039 /**  
00040  * @brief Q15 matrix multiplication (fast variant)  
00041  * @param[in]       *pSrcA points to the first input matrix structure  
00042  * @param[in]       *pSrcB points to the second input matrix structure  
00043  * @param[out]      *pDst points to output matrix structure  
00044  * @param[in]       *pState points to the array for storing intermediate results  
00045  * @return          The function returns either  
00046  * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.  
00047  *  
00048  * @details  
00049  * <b>Scaling and Overflow Behavior:</b>  
00050  *  
00051  * \par  
00052  * The difference between the function arm_mat_mult_q15() and this fast variant is that  
00053  * the fast variant uses a 32-bit rather than a 64-bit accumulator.  
00054  * The result of each 1.15 x 1.15 multiplication is truncated to  
00055  * 2.30 format. These intermediate results are accumulated in a 32-bit register in 2.30  
00056  * format. Finally, the accumulator is saturated and converted to a 1.15 result.  
00057  *  
00058  * \par  
00059  * The fast version has the same overflow behavior as the standard version but provides  
00060  * less precision since it discards the low 16 bits of each multiplication result.  
00061  * In order to avoid overflows completely the input signals must be scaled down.  
00062  * Scale down one of the input matrices by log2(numColsA) bits to  
00063  * avoid overflows, as a total of numColsA additions are computed internally for each  
00064  * output element.  
00065  *  
00066  * \par  
00067  * See <code>arm_mat_mult_q15()</code> for a slower implementation of this function  
00068  * which uses 64-bit accumulation to provide higher precision.  
00069  */ 
00070  
00071 arm_status arm_mat_mult_fast_q15( 
00072   const arm_matrix_instance_q15 * pSrcA, 
00073   const arm_matrix_instance_q15 * pSrcB, 
00074   arm_matrix_instance_q15 * pDst, 
00075   q15_t * pState) 
00076 { 
00077   q31_t sum;                                     /* 32-bit accumulator for one output element */ 
00078   q31_t in;                                      /* holds two packed Q15 values read from pSrcB */ 
00079   q15_t *pSrcBT = pState;                        /* scratch buffer that receives the transpose of pSrcB */ 
00080   q15_t *pInA = pSrcA->pData;                    /* read pointer into matrix A */ 
00081   q15_t *pInB = pSrcB->pData;                    /* read pointer into matrix B, later into its transpose */ 
00082   uint32_t i = 0u;                               /* element offset; 32 bits wide so it cannot wrap when a matrix holds more than 65535 elements */ 
00083   q15_t *px;                                     /* write pointer: transpose buffer first, output matrix afterwards */ 
00084   uint16_t numRowsA = pSrcA->numRows;            /* number of rows of input matrix A    */ 
00085   uint16_t numColsB = pSrcB->numCols;            /* number of columns of input matrix B */ 
00086   uint16_t numColsA = pSrcA->numCols;            /* number of columns of input matrix A */ 
00087   uint16_t numRowsB = pSrcB->numRows;            /* number of rows of input matrix B    */ 
00088   uint16_t col, row = numRowsB, colCnt;          /* loop counters */ 
00089   arm_status status;                             /* status of matrix multiplication */ 
00090  
00091 #ifdef ARM_MATH_MATRIX_CHECK 
00092   /* Reject operands whose dimensions do not fit (A cols vs B rows, output shape vs A rows x B cols) */ 
00093   if((pSrcA->numCols != pSrcB->numRows) || 
00094      (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols)) 
00095   { 
00096     /* Set status as ARM_MATH_SIZE_MISMATCH */ 
00097     status = ARM_MATH_SIZE_MISMATCH; 
00098   } 
00099   else 
00100 #endif 
00101   { 
00102     /* Transpose pSrcB into pState; the pre-tested loop skips a matrix that has no rows */ 
00103     while(row > 0u) 
00104     { 
00105       /* Number of groups of four elements in the current row of pSrcB */ 
00106       col = numColsB >> 2; 
00107  
00108       /* Row i of pSrcB becomes column i of the transpose, so start writing at offset i */ 
00109       px = pSrcBT + i; 
00110  
00111       /* Unrolled part: move four elements of the row per iteration.  
00112        ** The loop further below moves the remaining 0 to 3 elements. */ 
00113       while(col > 0u) 
00114       { 
00115         /* Read two elements from the row with a single 32-bit access */ 
00116         in = *__SIMD32(pInB)++; 
00117  
00118         /* Store the low halfword. NOTE(review): assumes little-endian packing - confirm for big-endian builds */ 
00119         *px = (q15_t) in; 
00120  
00121         /* Advance px to the next row of the transposed matrix */ 
00122         px += numRowsB; 
00123  
00124         /* Store the high halfword as the second element */ 
00125         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16); 
00126  
00127         /* Advance px to the next row of the transposed matrix */ 
00128         px += numRowsB; 
00129  
00130         /* Read the next two elements from the row */ 
00131         in = *__SIMD32(pInB)++; 
00132  
00133         /* Store the low halfword (same packing assumption as above) */ 
00134         *px = (q15_t) in; 
00135  
00136         /* Advance px to the next row of the transposed matrix */ 
00137         px += numRowsB; 
00138  
00139         /* Store the high halfword as the fourth element */ 
00140         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16); 
00141  
00142         /* Advance px to the next row of the transposed matrix */ 
00143         px += numRowsB; 
00144  
00145         /* Decrement the column loop counter */ 
00146         col--; 
00147       } 
00148  
00149       /* If the number of columns of pSrcB is not a multiple of 4, move the leftover elements here.  
00150        ** No loop unrolling is used. */ 
00151       col = numColsB % 0x4u; 
00152  
00153       while(col > 0u) 
00154       { 
00155         /* Copy a single element into the transposed matrix */ 
00156         *px = *pInB++; 
00157  
00158         /* Advance px to the next row of the transposed matrix */ 
00159         px += numRowsB; 
00160  
00161         /* Decrement the column loop counter */ 
00162         col--; 
00163       } 
00164  
00165       i++; 
00166  
00167       /* Decrement the row loop counter */ 
00168       row--; 
00169  
00170     } 
00171  
00172     /* Reset the variables for the usage in the following multiplication process */ 
00173     row = numRowsA; 
00174     i = 0u; 
00175     px = pDst->pData; 
00176  
00177     /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */ 
00178     /* row loop; pre-tested so that an output without rows is left untouched */ 
00179     while(row > 0u) 
00180     { 
00181       /* For every row wise process, the column loop counter is to be initiated */ 
00182       col = numColsB; 
00183  
00184       /* For every row of pSrcA, pInB restarts at the first element  
00185        ** of the transposed pSrcB data */ 
00186       pInB = pSrcBT; 
00187  
00188       /* column loop; pre-tested so that an output without columns is left untouched */ 
00189       while(col > 0u) 
00190       { 
00191         /* Set the variable sum, that acts as accumulator, to zero */ 
00192         sum = 0; 
00193  
00194         /* Apply loop unrolling and compute 2 MACs simultaneously. */ 
00195         colCnt = numColsA >> 1; 
00196  
00197         /* Point pInA at the first element of the row of pSrcA being processed */ 
00198         pInA = pSrcA->pData + i; 
00199  
00200         /* matrix multiplication */ 
00201         while(colCnt > 0u) 
00202         { 
00203           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */ 
00204           sum = __SMLAD(*__SIMD32(pInA)++, *__SIMD32(pInB)++, sum); 
00205  
00206           /* Decrement the loop counter */ 
00207           colCnt--; 
00208         } 
00209  
00210         /* process the last sample when numColsA is odd */ 
00211         if((numColsA & 0x1u) > 0u) 
00212         { 
00213           /* pInB must still advance so that it lands on the next row of the transposed matrix */ 
00214           sum += (q31_t) * pInA * (*pInB++); 
00215         } 
00216  
00217         /* Saturate the shifted sum to 16 bits so that e.g. (-1.0 * -1.0) yields 0x7FFF instead of wrapping to 0x8000 */ 
00218         *px = (q15_t) (__SSAT((sum >> 15), 16)); 
00219         px++; 
00220  
00221         /* Decrement the column loop counter */ 
00222         col--; 
00223  
00224       } 
00225  
00226       i = i + numColsA; 
00227  
00228       /* Decrement the row loop counter */ 
00229       row--; 
00230  
00231     } 
00232  
00233     /* set status as ARM_MATH_SUCCESS */ 
00234     status = ARM_MATH_SUCCESS; 
00235   } 
00236  
00237   /* Return to application */ 
00238   return (status); 
00239 } 
00240  
00241 /**  
00242  * @} end of MatrixMult group  
00243  */