Important changes to repositories hosted on mbed.com
Mbed-hosted Mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software, download the repository Zip archive or clone it locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
Fork of mbed-os by
arm_mat_mult_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 19. March 2015 00005 * $Revision: V.1.4.5 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_mat_mult_q15.c 00009 * 00010 * Description: Q15 matrix multiplication. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupMatrix 00045 */ 00046 00047 /** 00048 * @addtogroup MatrixMult 00049 * @{ 00050 */ 00051 00052 00053 /** 00054 * @brief Q15 matrix multiplication 00055 * @param[in] *pSrcA points to the first input matrix structure 00056 * @param[in] *pSrcB points to the second input matrix structure 00057 * @param[out] *pDst points to output matrix structure 00058 * @param[in] *pState points to the array for storing intermediate results (Unused) 00059 * @return The function returns either 00060 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. 00061 * 00062 * @details 00063 * <b>Scaling and Overflow Behavior:</b> 00064 * 00065 * \par 00066 * The function is implemented using a 64-bit internal accumulator. The inputs to the 00067 * multiplications are in 1.15 format and multiplications yield a 2.30 result. 00068 * The 2.30 intermediate 00069 * results are accumulated in a 64-bit accumulator in 34.30 format. This approach 00070 * provides 33 guard bits and there is no risk of overflow. The 34.30 result is then 00071 * truncated to 34.15 format by discarding the low 15 bits and then saturated to 00072 * 1.15 format. 
00073 * 00074 * \par 00075 * Refer to <code>arm_mat_mult_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4. 00076 * 00077 */ 00078 00079 arm_status arm_mat_mult_q15( 00080 const arm_matrix_instance_q15 * pSrcA, 00081 const arm_matrix_instance_q15 * pSrcB, 00082 arm_matrix_instance_q15 * pDst, 00083 q15_t * pState CMSIS_UNUSED) 00084 { 00085 q63_t sum; /* accumulator */ 00086 00087 #ifndef ARM_MATH_CM0_FAMILY 00088 00089 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00090 00091 q15_t *pSrcBT = pState; /* input data matrix pointer for transpose */ 00092 q15_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q15 type */ 00093 q15_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q15 type */ 00094 q15_t *px; /* Temporary output data matrix pointer */ 00095 uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */ 00096 uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */ 00097 uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */ 00098 uint16_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */ 00099 uint16_t col, i = 0u, row = numRowsB, colCnt; /* loop counters */ 00100 arm_status status; /* status of matrix multiplication */ 00101 00102 #ifndef UNALIGNED_SUPPORT_DISABLE 00103 00104 q31_t in; /* Temporary variable to hold the input value */ 00105 q31_t pSourceA1, pSourceB1, pSourceA2, pSourceB2; 00106 00107 #else 00108 00109 q15_t in; /* Temporary variable to hold the input value */ 00110 q15_t inA1, inB1, inA2, inB2; 00111 00112 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00113 00114 #ifdef ARM_MATH_MATRIX_CHECK 00115 /* Check for matrix mismatch condition */ 00116 if((pSrcA->numCols != pSrcB->numRows) || 00117 (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols)) 00118 { 00119 /* Set status as ARM_MATH_SIZE_MISMATCH */ 00120 status = ARM_MATH_SIZE_MISMATCH; 00121 } 00122 else 00123 #endif /* 
#ifdef ARM_MATH_MATRIX_CHECK */ 00124 { 00125 /* Matrix transpose */ 00126 do 00127 { 00128 /* Apply loop unrolling and exchange the columns with row elements */ 00129 col = numColsB >> 2; 00130 00131 /* The pointer px is set to starting address of the column being processed */ 00132 px = pSrcBT + i; 00133 00134 /* First part of the processing with loop unrolling. Compute 4 outputs at a time. 00135 ** a second loop below computes the remaining 1 to 3 samples. */ 00136 while(col > 0u) 00137 { 00138 #ifndef UNALIGNED_SUPPORT_DISABLE 00139 00140 /* Read two elements from the row */ 00141 in = *__SIMD32(pInB)++; 00142 00143 /* Unpack and store one element in the destination */ 00144 #ifndef ARM_MATH_BIG_ENDIAN 00145 00146 *px = (q15_t) in; 00147 00148 #else 00149 00150 *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16); 00151 00152 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00153 00154 /* Update the pointer px to point to the next row of the transposed matrix */ 00155 px += numRowsB; 00156 00157 /* Unpack and store the second element in the destination */ 00158 #ifndef ARM_MATH_BIG_ENDIAN 00159 00160 *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16); 00161 00162 #else 00163 00164 *px = (q15_t) in; 00165 00166 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00167 00168 /* Update the pointer px to point to the next row of the transposed matrix */ 00169 px += numRowsB; 00170 00171 /* Read two elements from the row */ 00172 in = *__SIMD32(pInB)++; 00173 00174 /* Unpack and store one element in the destination */ 00175 #ifndef ARM_MATH_BIG_ENDIAN 00176 00177 *px = (q15_t) in; 00178 00179 #else 00180 00181 *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16); 00182 00183 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00184 00185 /* Update the pointer px to point to the next row of the transposed matrix */ 00186 px += numRowsB; 00187 00188 /* Unpack and store the second element in the destination */ 00189 00190 #ifndef ARM_MATH_BIG_ENDIAN 00191 00192 *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16); 
00193 00194 #else 00195 00196 *px = (q15_t) in; 00197 00198 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00199 00200 /* Update the pointer px to point to the next row of the transposed matrix */ 00201 px += numRowsB; 00202 00203 #else 00204 00205 /* Read one element from the row */ 00206 in = *pInB++; 00207 00208 /* Store one element in the destination */ 00209 *px = in; 00210 00211 /* Update the pointer px to point to the next row of the transposed matrix */ 00212 px += numRowsB; 00213 00214 /* Read one element from the row */ 00215 in = *pInB++; 00216 00217 /* Store one element in the destination */ 00218 *px = in; 00219 00220 /* Update the pointer px to point to the next row of the transposed matrix */ 00221 px += numRowsB; 00222 00223 /* Read one element from the row */ 00224 in = *pInB++; 00225 00226 /* Store one element in the destination */ 00227 *px = in; 00228 00229 /* Update the pointer px to point to the next row of the transposed matrix */ 00230 px += numRowsB; 00231 00232 /* Read one element from the row */ 00233 in = *pInB++; 00234 00235 /* Store one element in the destination */ 00236 *px = in; 00237 00238 /* Update the pointer px to point to the next row of the transposed matrix */ 00239 px += numRowsB; 00240 00241 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00242 00243 /* Decrement the column loop counter */ 00244 col--; 00245 } 00246 00247 /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here. 00248 ** No loop unrolling is used. 
*/ 00249 col = numColsB % 0x4u; 00250 00251 while(col > 0u) 00252 { 00253 /* Read and store the input element in the destination */ 00254 *px = *pInB++; 00255 00256 /* Update the pointer px to point to the next row of the transposed matrix */ 00257 px += numRowsB; 00258 00259 /* Decrement the column loop counter */ 00260 col--; 00261 } 00262 00263 i++; 00264 00265 /* Decrement the row loop counter */ 00266 row--; 00267 00268 } while(row > 0u); 00269 00270 /* Reset the variables for the usage in the following multiplication process */ 00271 row = numRowsA; 00272 i = 0u; 00273 px = pDst->pData; 00274 00275 /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */ 00276 /* row loop */ 00277 do 00278 { 00279 /* For every row wise process, the column loop counter is to be initiated */ 00280 col = numColsB; 00281 00282 /* For every row wise process, the pIn2 pointer is set 00283 ** to the starting address of the transposed pSrcB data */ 00284 pInB = pSrcBT; 00285 00286 /* column loop */ 00287 do 00288 { 00289 /* Set the variable sum, that acts as accumulator, to zero */ 00290 sum = 0; 00291 00292 /* Apply loop unrolling and compute 2 MACs simultaneously. */ 00293 colCnt = numColsA >> 2; 00294 00295 /* Initiate the pointer pIn1 to point to the starting address of the column being processed */ 00296 pInA = pSrcA->pData + i; 00297 00298 00299 /* matrix multiplication */ 00300 while(colCnt > 0u) 00301 { 00302 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... 
+ a(m,p)*b(p,n) */ 00303 #ifndef UNALIGNED_SUPPORT_DISABLE 00304 00305 /* read real and imag values from pSrcA and pSrcB buffer */ 00306 pSourceA1 = *__SIMD32(pInA)++; 00307 pSourceB1 = *__SIMD32(pInB)++; 00308 00309 pSourceA2 = *__SIMD32(pInA)++; 00310 pSourceB2 = *__SIMD32(pInB)++; 00311 00312 /* Multiply and Accumlates */ 00313 sum = __SMLALD(pSourceA1, pSourceB1, sum); 00314 sum = __SMLALD(pSourceA2, pSourceB2, sum); 00315 00316 #else 00317 /* read real and imag values from pSrcA and pSrcB buffer */ 00318 inA1 = *pInA++; 00319 inB1 = *pInB++; 00320 inA2 = *pInA++; 00321 /* Multiply and Accumlates */ 00322 sum += inA1 * inB1; 00323 inB2 = *pInB++; 00324 00325 inA1 = *pInA++; 00326 inB1 = *pInB++; 00327 /* Multiply and Accumlates */ 00328 sum += inA2 * inB2; 00329 inA2 = *pInA++; 00330 inB2 = *pInB++; 00331 00332 /* Multiply and Accumlates */ 00333 sum += inA1 * inB1; 00334 sum += inA2 * inB2; 00335 00336 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00337 00338 /* Decrement the loop counter */ 00339 colCnt--; 00340 } 00341 00342 /* process remaining column samples */ 00343 colCnt = numColsA & 3u; 00344 00345 while(colCnt > 0u) 00346 { 00347 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... 
+ a(m,p)*b(p,n) */ 00348 sum += *pInA++ * *pInB++; 00349 00350 /* Decrement the loop counter */ 00351 colCnt--; 00352 } 00353 00354 /* Saturate and store the result in the destination buffer */ 00355 *px = (q15_t) (__SSAT((sum >> 15), 16)); 00356 px++; 00357 00358 /* Decrement the column loop counter */ 00359 col--; 00360 00361 } while(col > 0u); 00362 00363 i = i + numColsA; 00364 00365 /* Decrement the row loop counter */ 00366 row--; 00367 00368 } while(row > 0u); 00369 00370 #else 00371 00372 /* Run the below code for Cortex-M0 */ 00373 00374 q15_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */ 00375 q15_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */ 00376 q15_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q15 type */ 00377 q15_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q15 type */ 00378 q15_t *pOut = pDst->pData; /* output data matrix pointer */ 00379 q15_t *px; /* Temporary output data matrix pointer */ 00380 uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */ 00381 uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */ 00382 uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */ 00383 uint16_t col, i = 0u, row = numRowsA, colCnt; /* loop counters */ 00384 arm_status status; /* status of matrix multiplication */ 00385 00386 #ifdef ARM_MATH_MATRIX_CHECK 00387 00388 /* Check for matrix mismatch condition */ 00389 if((pSrcA->numCols != pSrcB->numRows) || 00390 (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols)) 00391 { 00392 /* Set status as ARM_MATH_SIZE_MISMATCH */ 00393 status = ARM_MATH_SIZE_MISMATCH; 00394 } 00395 else 00396 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */ 00397 00398 { 00399 /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */ 00400 /* row loop */ 00401 do 00402 { 00403 /* Output pointer is set to starting address of the row being processed */ 00404 px = pOut + i; 00405 
00406 /* For every row wise process, the column loop counter is to be initiated */ 00407 col = numColsB; 00408 00409 /* For every row wise process, the pIn2 pointer is set 00410 ** to the starting address of the pSrcB data */ 00411 pIn2 = pSrcB->pData; 00412 00413 /* column loop */ 00414 do 00415 { 00416 /* Set the variable sum, that acts as accumulator, to zero */ 00417 sum = 0; 00418 00419 /* Initiate the pointer pIn1 to point to the starting address of pSrcA */ 00420 pIn1 = pInA; 00421 00422 /* Matrix A columns number of MAC operations are to be performed */ 00423 colCnt = numColsA; 00424 00425 /* matrix multiplication */ 00426 while(colCnt > 0u) 00427 { 00428 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */ 00429 /* Perform the multiply-accumulates */ 00430 sum += (q31_t) * pIn1++ * *pIn2; 00431 pIn2 += numColsB; 00432 00433 /* Decrement the loop counter */ 00434 colCnt--; 00435 } 00436 00437 /* Convert the result from 34.30 to 1.15 format and store the saturated value in destination buffer */ 00438 /* Saturate and store the result in the destination buffer */ 00439 *px++ = (q15_t) __SSAT((sum >> 15), 16); 00440 00441 /* Decrement the column loop counter */ 00442 col--; 00443 00444 /* Update the pointer pIn2 to point to the starting address of the next column */ 00445 pIn2 = pInB + (numColsB - col); 00446 00447 } while(col > 0u); 00448 00449 /* Update the pointer pSrcA to point to the starting address of the next row */ 00450 i = i + numColsB; 00451 pInA = pInA + numColsA; 00452 00453 /* Decrement the row loop counter */ 00454 row--; 00455 00456 } while(row > 0u); 00457 00458 #endif /* #ifndef ARM_MATH_CM0_FAMILY */ 00459 /* set status as ARM_MATH_SUCCESS */ 00460 status = ARM_MATH_SUCCESS; 00461 } 00462 00463 /* Return to application */ 00464 return (status); 00465 } 00466 00467 /** 00468 * @} end of MatrixMult group 00469 */
Generated on Tue Jul 12 2022 13:15:25 by
