Important changes to repositories hosted on mbed.com
Mbed hosted mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software, download the repository Zip archive or clone it locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
Fork of mbed-dsp by
arm_mat_mult_fast_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 17. January 2013 00005 * $Revision: V1.4.1 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_mat_mult_fast_q15.c 00009 * 00010 * Description: Q15 matrix multiplication (fast variant) 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupMatrix 00045 */ 00046 00047 /** 00048 * @addtogroup MatrixMult 00049 * @{ 00050 */ 00051 00052 00053 /** 00054 * @brief Q15 matrix multiplication (fast variant) for Cortex-M3 and Cortex-M4 00055 * @param[in] *pSrcA points to the first input matrix structure 00056 * @param[in] *pSrcB points to the second input matrix structure 00057 * @param[out] *pDst points to output matrix structure 00058 * @param[in] *pState points to the array for storing intermediate results 00059 * @return The function returns either 00060 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. 00061 * 00062 * @details 00063 * <b>Scaling and Overflow Behavior:</b> 00064 * 00065 * \par 00066 * The difference between the function arm_mat_mult_q15() and this fast variant is that 00067 * the fast variant use a 32-bit rather than a 64-bit accumulator. 00068 * The result of each 1.15 x 1.15 multiplication is truncated to 00069 * 2.30 format. These intermediate results are accumulated in a 32-bit register in 2.30 00070 * format. Finally, the accumulator is saturated and converted to a 1.15 result. 
00071 * 00072 * \par 00073 * The fast version has the same overflow behavior as the standard version but provides 00074 * less precision since it discards the low 16 bits of each multiplication result. 00075 * In order to avoid overflows completely the input signals must be scaled down. 00076 * Scale down one of the input matrices by log2(numColsA) bits to 00077 * avoid overflows, as a total of numColsA additions are computed internally for each 00078 * output element. 00079 * 00080 * \par 00081 * See <code>arm_mat_mult_q15()</code> for a slower implementation of this function 00082 * which uses 64-bit accumulation to provide higher precision. 00083 */ 00084 00085 arm_status arm_mat_mult_fast_q15( 00086 const arm_matrix_instance_q15 * pSrcA, 00087 const arm_matrix_instance_q15 * pSrcB, 00088 arm_matrix_instance_q15 * pDst, 00089 q15_t * pState) 00090 { 00091 q31_t sum; /* accumulator */ 00092 q15_t *pSrcBT = pState; /* input data matrix pointer for transpose */ 00093 q15_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q15 type */ 00094 q15_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q15 type */ 00095 q15_t *px; /* Temporary output data matrix pointer */ 00096 uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */ 00097 uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */ 00098 uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */ 00099 uint16_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */ 00100 uint16_t col, i = 0u, row = numRowsB, colCnt; /* loop counters */ 00101 arm_status status; /* status of matrix multiplication */ 00102 00103 #ifndef UNALIGNED_SUPPORT_DISABLE 00104 00105 q31_t in; /* Temporary variable to hold the input value */ 00106 q31_t inA1, inA2, inB1, inB2; 00107 00108 #else 00109 00110 q15_t in; /* Temporary variable to hold the input value */ 00111 q15_t inA1, inA2, inB1, inB2; 00112 00113 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 
00114 00115 #ifdef ARM_MATH_MATRIX_CHECK 00116 /* Check for matrix mismatch condition */ 00117 if((pSrcA->numCols != pSrcB->numRows) || 00118 (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols)) 00119 { 00120 /* Set status as ARM_MATH_SIZE_MISMATCH */ 00121 status = ARM_MATH_SIZE_MISMATCH; 00122 } 00123 else 00124 #endif 00125 { 00126 /* Matrix transpose */ 00127 do 00128 { 00129 /* Apply loop unrolling and exchange the columns with row elements */ 00130 col = numColsB >> 2; 00131 00132 /* The pointer px is set to starting address of the column being processed */ 00133 px = pSrcBT + i; 00134 00135 /* First part of the processing with loop unrolling. Compute 4 outputs at a time. 00136 ** a second loop below computes the remaining 1 to 3 samples. */ 00137 while(col > 0u) 00138 { 00139 #ifndef UNALIGNED_SUPPORT_DISABLE 00140 /* Read two elements from the row */ 00141 in = *__SIMD32(pInB)++; 00142 00143 /* Unpack and store one element in the destination */ 00144 #ifndef ARM_MATH_BIG_ENDIAN 00145 00146 *px = (q15_t) in; 00147 00148 #else 00149 00150 *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16); 00151 00152 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00153 00154 /* Update the pointer px to point to the next row of the transposed matrix */ 00155 px += numRowsB; 00156 00157 /* Unpack and store the second element in the destination */ 00158 #ifndef ARM_MATH_BIG_ENDIAN 00159 00160 *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16); 00161 00162 #else 00163 00164 *px = (q15_t) in; 00165 00166 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00167 00168 /* Update the pointer px to point to the next row of the transposed matrix */ 00169 px += numRowsB; 00170 00171 /* Read two elements from the row */ 00172 in = *__SIMD32(pInB)++; 00173 00174 /* Unpack and store one element in the destination */ 00175 #ifndef ARM_MATH_BIG_ENDIAN 00176 00177 *px = (q15_t) in; 00178 00179 #else 00180 00181 *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16); 00182 00183 #endif /* #ifndef 
ARM_MATH_BIG_ENDIAN */ 00184 00185 /* Update the pointer px to point to the next row of the transposed matrix */ 00186 px += numRowsB; 00187 00188 /* Unpack and store the second element in the destination */ 00189 00190 #ifndef ARM_MATH_BIG_ENDIAN 00191 00192 *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16); 00193 00194 #else 00195 00196 *px = (q15_t) in; 00197 00198 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00199 00200 #else 00201 00202 /* Read one element from the row */ 00203 in = *pInB++; 00204 00205 /* Store one element in the destination */ 00206 *px = in; 00207 00208 /* Update the pointer px to point to the next row of the transposed matrix */ 00209 px += numRowsB; 00210 00211 /* Read one element from the row */ 00212 in = *pInB++; 00213 00214 /* Store one element in the destination */ 00215 *px = in; 00216 00217 /* Update the pointer px to point to the next row of the transposed matrix */ 00218 px += numRowsB; 00219 00220 /* Read one element from the row */ 00221 in = *pInB++; 00222 00223 /* Store one element in the destination */ 00224 *px = in; 00225 00226 /* Update the pointer px to point to the next row of the transposed matrix */ 00227 px += numRowsB; 00228 00229 /* Read one element from the row */ 00230 in = *pInB++; 00231 00232 /* Store one element in the destination */ 00233 *px = in; 00234 00235 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00236 00237 /* Update the pointer px to point to the next row of the transposed matrix */ 00238 px += numRowsB; 00239 00240 /* Decrement the column loop counter */ 00241 col--; 00242 } 00243 00244 /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here. 00245 ** No loop unrolling is used. 
*/ 00246 col = numColsB % 0x4u; 00247 00248 while(col > 0u) 00249 { 00250 /* Read and store the input element in the destination */ 00251 *px = *pInB++; 00252 00253 /* Update the pointer px to point to the next row of the transposed matrix */ 00254 px += numRowsB; 00255 00256 /* Decrement the column loop counter */ 00257 col--; 00258 } 00259 00260 i++; 00261 00262 /* Decrement the row loop counter */ 00263 row--; 00264 00265 } while(row > 0u); 00266 00267 /* Reset the variables for the usage in the following multiplication process */ 00268 row = numRowsA; 00269 i = 0u; 00270 px = pDst->pData; 00271 00272 /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */ 00273 /* row loop */ 00274 do 00275 { 00276 /* For every row wise process, the column loop counter is to be initiated */ 00277 col = numColsB; 00278 00279 /* For every row wise process, the pIn2 pointer is set 00280 ** to the starting address of the transposed pSrcB data */ 00281 pInB = pSrcBT; 00282 00283 /* column loop */ 00284 do 00285 { 00286 /* Set the variable sum, that acts as accumulator, to zero */ 00287 sum = 0; 00288 00289 /* Apply loop unrolling and compute 2 MACs simultaneously. */ 00290 colCnt = numColsA >> 2; 00291 00292 /* Initiate the pointer pIn1 to point to the starting address of the column being processed */ 00293 pInA = pSrcA->pData + i; 00294 00295 /* matrix multiplication */ 00296 while(colCnt > 0u) 00297 { 00298 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... 
+ a(m,p)*b(p,n) */ 00299 #ifndef UNALIGNED_SUPPORT_DISABLE 00300 00301 inA1 = *__SIMD32(pInA)++; 00302 inB1 = *__SIMD32(pInB)++; 00303 inA2 = *__SIMD32(pInA)++; 00304 inB2 = *__SIMD32(pInB)++; 00305 00306 sum = __SMLAD(inA1, inB1, sum); 00307 sum = __SMLAD(inA2, inB2, sum); 00308 00309 #else 00310 00311 inA1 = *pInA++; 00312 inB1 = *pInB++; 00313 inA2 = *pInA++; 00314 sum += inA1 * inB1; 00315 inB2 = *pInB++; 00316 00317 inA1 = *pInA++; 00318 inB1 = *pInB++; 00319 sum += inA2 * inB2; 00320 inA2 = *pInA++; 00321 inB2 = *pInB++; 00322 00323 sum += inA1 * inB1; 00324 sum += inA2 * inB2; 00325 00326 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00327 00328 /* Decrement the loop counter */ 00329 colCnt--; 00330 } 00331 00332 /* process odd column samples */ 00333 colCnt = numColsA % 0x4u; 00334 00335 while(colCnt > 0u) 00336 { 00337 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */ 00338 sum += (q31_t) (*pInA++) * (*pInB++); 00339 00340 colCnt--; 00341 } 00342 00343 /* Saturate and store the result in the destination buffer */ 00344 *px = (q15_t) (sum >> 15); 00345 px++; 00346 00347 /* Decrement the column loop counter */ 00348 col--; 00349 00350 } while(col > 0u); 00351 00352 i = i + numColsA; 00353 00354 /* Decrement the row loop counter */ 00355 row--; 00356 00357 } while(row > 0u); 00358 00359 /* set status as ARM_MATH_SUCCESS */ 00360 status = ARM_MATH_SUCCESS; 00361 } 00362 00363 /* Return to application */ 00364 return (status); 00365 } 00366 00367 /** 00368 * @} end of MatrixMult group 00369 */
Generated on Tue Jul 12 2022 18:44:09 by
1.7.2
