CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details
Dependents: K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more
arm_mat_mult_fast_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 29. November 2010 00005 * $Revision: V1.0.3 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_mat_mult_fast_q15.c 00009 * 00010 * Description: Q15 matrix multiplication (fast variant) 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Version 1.0.3 2010/11/29 00015 * Re-organized the CMSIS folders and updated documentation. 00016 * 00017 * Version 1.0.2 2010/11/11 00018 * Documentation updated. 00019 * 00020 * Version 1.0.1 2010/10/05 00021 * Production release and review comments incorporated. 00022 * 00023 * Version 1.0.0 2010/09/20 00024 * Production release and review comments incorporated. 00025 * -------------------------------------------------------------------- */ 00026 00027 #include "arm_math.h" 00028 00029 /** 00030 * @ingroup groupMatrix 00031 */ 00032 00033 /** 00034 * @addtogroup MatrixMult 00035 * @{ 00036 */ 00037 00038 00039 /** 00040 * @brief Q15 matrix multiplication (fast variant) 00041 * @param[in] *pSrcA points to the first input matrix structure 00042 * @param[in] *pSrcB points to the second input matrix structure 00043 * @param[out] *pDst points to output matrix structure 00044 * @param[in] *pState points to the array for storing intermediate results 00045 * @return The function returns either 00046 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. 00047 * 00048 * @details 00049 * <b>Scaling and Overflow Behavior:</b> 00050 * 00051 * \par 00052 * The difference between the function arm_mat_mult_q15() and this fast variant is that 00053 * the fast variant use a 32-bit rather than a 64-bit accumulator. 00054 * The result of each 1.15 x 1.15 multiplication is truncated to 00055 * 2.30 format. These intermediate results are accumulated in a 32-bit register in 2.30 00056 * format. Finally, the accumulator is saturated and converted to a 1.15 result. 00057 * 00058 * \par 00059 * The fast version has the same overflow behavior as the standard version but provides 00060 * less precision since it discards the low 16 bits of each multiplication result. 00061 * In order to avoid overflows completely the input signals must be scaled down. 00062 * Scale down one of the input matrices by log2(numColsA) bits to 00063 * avoid overflows, as a total of numColsA additions are computed internally for each 00064 * output element. 00065 * 00066 * \par 00067 * See <code>arm_mat_mult_q15()</code> for a slower implementation of this function 00068 * which uses 64-bit accumulation to provide higher precision. 00069 */ 00070 00071 arm_status arm_mat_mult_fast_q15( 00072 const arm_matrix_instance_q15 * pSrcA, 00073 const arm_matrix_instance_q15 * pSrcB, 00074 arm_matrix_instance_q15 * pDst, 00075 q15_t * pState) 00076 { 00077 q31_t sum; /* accumulator */ 00078 q31_t in; /* Temporary variable to hold the input value */ 00079 q15_t *pSrcBT = pState; /* input data matrix pointer for transpose */ 00080 q15_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q15 type */ 00081 q15_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q15 type */ 00082 // q15_t *pDst = pDst->pData; /* output data matrix pointer */ 00083 q15_t *px; /* Temporary output data matrix pointer */ 00084 uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */ 00085 uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */ 00086 uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */ 00087 uint16_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */ 00088 uint16_t col, i = 0u, row = numRowsB, colCnt; /* loop counters */ 00089 arm_status status; /* status of matrix multiplication */ 00090 00091 #ifdef ARM_MATH_MATRIX_CHECK 00092 /* Check for matrix mismatch condition */ 00093 if((pSrcA->numCols != pSrcB->numRows) || 00094 (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols)) 00095 { 00096 /* Set status as ARM_MATH_SIZE_MISMATCH */ 00097 status = ARM_MATH_SIZE_MISMATCH; 00098 } 00099 else 00100 #endif 00101 { 00102 /* Matrix transpose */ 00103 do 00104 { 00105 /* Apply loop unrolling and exchange the columns with row elements */ 00106 col = numColsB >> 2; 00107 00108 /* The pointer px is set to starting address of the column being processed */ 00109 px = pSrcBT + i; 00110 00111 /* First part of the processing with loop unrolling. Compute 4 outputs at a time. 00112 ** a second loop below computes the remaining 1 to 3 samples. */ 00113 while(col > 0u) 00114 { 00115 /* Read two elements from the row */ 00116 in = *__SIMD32(pInB)++; 00117 00118 /* Unpack and store one element in the destination */ 00119 *px = (q15_t) in; 00120 00121 /* Update the pointer px to point to the next row of the transposed matrix */ 00122 px += numRowsB; 00123 00124 /* Unpack and store the second element in the destination */ 00125 *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16); 00126 00127 /* Update the pointer px to point to the next row of the transposed matrix */ 00128 px += numRowsB; 00129 00130 /* Read two elements from the row */ 00131 in = *__SIMD32(pInB)++; 00132 00133 /* Unpack and store one element in the destination */ 00134 *px = (q15_t) in; 00135 00136 /* Update the pointer px to point to the next row of the transposed matrix */ 00137 px += numRowsB; 00138 00139 /* Unpack and store the second element in the destination */ 00140 *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16); 00141 00142 /* Update the pointer px to point to the next row of the transposed matrix */ 00143 px += numRowsB; 00144 00145 /* Decrement the column loop counter */ 00146 col--; 00147 } 00148 00149 /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here. 00150 ** No loop unrolling is used. */ 00151 col = numColsB % 0x4u; 00152 00153 while(col > 0u) 00154 { 00155 /* Read and store the input element in the destination */ 00156 *px = *pInB++; 00157 00158 /* Update the pointer px to point to the next row of the transposed matrix */ 00159 px += numRowsB; 00160 00161 /* Decrement the column loop counter */ 00162 col--; 00163 } 00164 00165 i++; 00166 00167 /* Decrement the row loop counter */ 00168 row--; 00169 00170 } while(row > 0u); 00171 00172 /* Reset the variables for the usage in the following multiplication process */ 00173 row = numRowsA; 00174 i = 0u; 00175 px = pDst->pData; 00176 00177 /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */ 00178 /* row loop */ 00179 do 00180 { 00181 /* For every row wise process, the column loop counter is to be initiated */ 00182 col = numColsB; 00183 00184 /* For every row wise process, the pIn2 pointer is set 00185 ** to the starting address of the transposed pSrcB data */ 00186 pInB = pSrcBT; 00187 00188 /* column loop */ 00189 do 00190 { 00191 /* Set the variable sum, that acts as accumulator, to zero */ 00192 sum = 0; 00193 00194 /* Apply loop unrolling and compute 2 MACs simultaneously. */ 00195 colCnt = numColsA >> 1; 00196 00197 /* Initiate the pointer pIn1 to point to the starting address of the column being processed */ 00198 pInA = pSrcA->pData + i; 00199 00200 /* matrix multiplication */ 00201 while(colCnt > 0u) 00202 { 00203 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */ 00204 sum = __SMLAD(*__SIMD32(pInA)++, *__SIMD32(pInB)++, sum); 00205 00206 /* Decrement the loop counter */ 00207 colCnt--; 00208 } 00209 00210 /* process odd column samples */ 00211 if((numColsA & 0x1u) > 0u) 00212 { 00213 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */ 00214 sum += (q31_t) * pInA * (*pInB++); 00215 } 00216 00217 /* Saturate and store the result in the destination buffer */ 00218 *px = (q15_t) (sum >> 15); 00219 px++; 00220 00221 /* Decrement the column loop counter */ 00222 col--; 00223 00224 } while(col > 0u); 00225 00226 i = i + numColsA; 00227 00228 /* Decrement the row loop counter */ 00229 row--; 00230 00231 } while(row > 0u); 00232 00233 /* set status as ARM_MATH_SUCCESS */ 00234 status = ARM_MATH_SUCCESS; 00235 } 00236 00237 /* Return to application */ 00238 return (status); 00239 } 00240 00241 /** 00242 * @} end of MatrixMult group 00243 */
Generated on Tue Jul 12 2022 14:13:53 by 1.7.2