Important changes to repositories hosted on mbed.com
Mbed hosted mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software, download the repository Zip archive or clone it locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
arm_mat_mult_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Project: CMSIS DSP Library 00003 * Title: arm_mat_mult_q15.c 00004 * Description: Q15 matrix multiplication 00005 * 00006 * $Date: 27. January 2017 00007 * $Revision: V.1.5.1 00008 * 00009 * Target Processor: Cortex-M cores 00010 * -------------------------------------------------------------------- */ 00011 /* 00012 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. 00013 * 00014 * SPDX-License-Identifier: Apache-2.0 00015 * 00016 * Licensed under the Apache License, Version 2.0 (the License); you may 00017 * not use this file except in compliance with the License. 00018 * You may obtain a copy of the License at 00019 * 00020 * www.apache.org/licenses/LICENSE-2.0 00021 * 00022 * Unless required by applicable law or agreed to in writing, software 00023 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 00024 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00025 * See the License for the specific language governing permissions and 00026 * limitations under the License. 00027 */ 00028 00029 #include "arm_math.h" 00030 00031 /** 00032 * @ingroup groupMatrix 00033 */ 00034 00035 /** 00036 * @addtogroup MatrixMult 00037 * @{ 00038 */ 00039 00040 00041 /** 00042 * @brief Q15 matrix multiplication 00043 * @param[in] *pSrcA points to the first input matrix structure 00044 * @param[in] *pSrcB points to the second input matrix structure 00045 * @param[out] *pDst points to output matrix structure 00046 * @param[in] *pState points to the array for storing intermediate results (Unused) 00047 * @return The function returns either 00048 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. 00049 * 00050 * @details 00051 * <b>Scaling and Overflow Behavior:</b> 00052 * 00053 * \par 00054 * The function is implemented using a 64-bit internal accumulator. 
The inputs to the 00055 * multiplications are in 1.15 format and multiplications yield a 2.30 result. 00056 * The 2.30 intermediate 00057 * results are accumulated in a 64-bit accumulator in 34.30 format. This approach 00058 * provides 33 guard bits and there is no risk of overflow. The 34.30 result is then 00059 * truncated to 34.15 format by discarding the low 15 bits and then saturated to 00060 * 1.15 format. 00061 * 00062 * \par 00063 * Refer to <code>arm_mat_mult_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4. 00064 * 00065 */ 00066 00067 arm_status arm_mat_mult_q15( 00068 const arm_matrix_instance_q15 * pSrcA, 00069 const arm_matrix_instance_q15 * pSrcB, 00070 arm_matrix_instance_q15 * pDst, 00071 q15_t * pState) 00072 { 00073 q63_t sum; /* accumulator */ 00074 00075 #if defined (ARM_MATH_DSP) 00076 00077 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00078 00079 q15_t *pSrcBT = pState; /* input data matrix pointer for transpose */ 00080 q15_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q15 type */ 00081 q15_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q15 type */ 00082 q15_t *px; /* Temporary output data matrix pointer */ 00083 uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */ 00084 uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */ 00085 uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */ 00086 uint16_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */ 00087 uint16_t col, i = 0U, row = numRowsB, colCnt; /* loop counters */ 00088 arm_status status; /* status of matrix multiplication */ 00089 00090 #ifndef UNALIGNED_SUPPORT_DISABLE 00091 00092 q31_t in; /* Temporary variable to hold the input value */ 00093 q31_t pSourceA1, pSourceB1, pSourceA2, pSourceB2; 00094 00095 #else 00096 00097 q15_t in; /* Temporary variable to hold the input value */ 00098 q15_t inA1, inB1, inA2, inB2; 
00099 00100 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00101 00102 #ifdef ARM_MATH_MATRIX_CHECK 00103 /* Check for matrix mismatch condition */ 00104 if ((pSrcA->numCols != pSrcB->numRows) || 00105 (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols)) 00106 { 00107 /* Set status as ARM_MATH_SIZE_MISMATCH */ 00108 status = ARM_MATH_SIZE_MISMATCH; 00109 } 00110 else 00111 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */ 00112 { 00113 /* Matrix transpose */ 00114 do 00115 { 00116 /* Apply loop unrolling and exchange the columns with row elements */ 00117 col = numColsB >> 2; 00118 00119 /* The pointer px is set to starting address of the column being processed */ 00120 px = pSrcBT + i; 00121 00122 /* First part of the processing with loop unrolling. Compute 4 outputs at a time. 00123 ** a second loop below computes the remaining 1 to 3 samples. */ 00124 while (col > 0U) 00125 { 00126 #ifndef UNALIGNED_SUPPORT_DISABLE 00127 00128 /* Read two elements from the row */ 00129 in = *__SIMD32(pInB)++; 00130 00131 /* Unpack and store one element in the destination */ 00132 #ifndef ARM_MATH_BIG_ENDIAN 00133 00134 *px = (q15_t) in; 00135 00136 #else 00137 00138 *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16); 00139 00140 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00141 00142 /* Update the pointer px to point to the next row of the transposed matrix */ 00143 px += numRowsB; 00144 00145 /* Unpack and store the second element in the destination */ 00146 #ifndef ARM_MATH_BIG_ENDIAN 00147 00148 *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16); 00149 00150 #else 00151 00152 *px = (q15_t) in; 00153 00154 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00155 00156 /* Update the pointer px to point to the next row of the transposed matrix */ 00157 px += numRowsB; 00158 00159 /* Read two elements from the row */ 00160 in = *__SIMD32(pInB)++; 00161 00162 /* Unpack and store one element in the destination */ 00163 #ifndef ARM_MATH_BIG_ENDIAN 00164 00165 *px = (q15_t) in; 00166 00167 
#else 00168 00169 *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16); 00170 00171 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00172 00173 /* Update the pointer px to point to the next row of the transposed matrix */ 00174 px += numRowsB; 00175 00176 /* Unpack and store the second element in the destination */ 00177 00178 #ifndef ARM_MATH_BIG_ENDIAN 00179 00180 *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16); 00181 00182 #else 00183 00184 *px = (q15_t) in; 00185 00186 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00187 00188 /* Update the pointer px to point to the next row of the transposed matrix */ 00189 px += numRowsB; 00190 00191 #else 00192 00193 /* Read one element from the row */ 00194 in = *pInB++; 00195 00196 /* Store one element in the destination */ 00197 *px = in; 00198 00199 /* Update the pointer px to point to the next row of the transposed matrix */ 00200 px += numRowsB; 00201 00202 /* Read one element from the row */ 00203 in = *pInB++; 00204 00205 /* Store one element in the destination */ 00206 *px = in; 00207 00208 /* Update the pointer px to point to the next row of the transposed matrix */ 00209 px += numRowsB; 00210 00211 /* Read one element from the row */ 00212 in = *pInB++; 00213 00214 /* Store one element in the destination */ 00215 *px = in; 00216 00217 /* Update the pointer px to point to the next row of the transposed matrix */ 00218 px += numRowsB; 00219 00220 /* Read one element from the row */ 00221 in = *pInB++; 00222 00223 /* Store one element in the destination */ 00224 *px = in; 00225 00226 /* Update the pointer px to point to the next row of the transposed matrix */ 00227 px += numRowsB; 00228 00229 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00230 00231 /* Decrement the column loop counter */ 00232 col--; 00233 } 00234 00235 /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here. 00236 ** No loop unrolling is used. 
*/ 00237 col = numColsB % 0x4U; 00238 00239 while (col > 0U) 00240 { 00241 /* Read and store the input element in the destination */ 00242 *px = *pInB++; 00243 00244 /* Update the pointer px to point to the next row of the transposed matrix */ 00245 px += numRowsB; 00246 00247 /* Decrement the column loop counter */ 00248 col--; 00249 } 00250 00251 i++; 00252 00253 /* Decrement the row loop counter */ 00254 row--; 00255 00256 } while (row > 0U); 00257 00258 /* Reset the variables for the usage in the following multiplication process */ 00259 row = numRowsA; 00260 i = 0U; 00261 px = pDst->pData; 00262 00263 /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */ 00264 /* row loop */ 00265 do 00266 { 00267 /* For every row wise process, the column loop counter is to be initiated */ 00268 col = numColsB; 00269 00270 /* For every row wise process, the pIn2 pointer is set 00271 ** to the starting address of the transposed pSrcB data */ 00272 pInB = pSrcBT; 00273 00274 /* column loop */ 00275 do 00276 { 00277 /* Set the variable sum, that acts as accumulator, to zero */ 00278 sum = 0; 00279 00280 /* Apply loop unrolling and compute 2 MACs simultaneously. */ 00281 colCnt = numColsA >> 2; 00282 00283 /* Initiate the pointer pIn1 to point to the starting address of the column being processed */ 00284 pInA = pSrcA->pData + i; 00285 00286 00287 /* matrix multiplication */ 00288 while (colCnt > 0U) 00289 { 00290 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... 
+ a(m,p)*b(p,n) */ 00291 #ifndef UNALIGNED_SUPPORT_DISABLE 00292 00293 /* read real and imag values from pSrcA and pSrcB buffer */ 00294 pSourceA1 = *__SIMD32(pInA)++; 00295 pSourceB1 = *__SIMD32(pInB)++; 00296 00297 pSourceA2 = *__SIMD32(pInA)++; 00298 pSourceB2 = *__SIMD32(pInB)++; 00299 00300 /* Multiply and Accumlates */ 00301 sum = __SMLALD(pSourceA1, pSourceB1, sum); 00302 sum = __SMLALD(pSourceA2, pSourceB2, sum); 00303 00304 #else 00305 /* read real and imag values from pSrcA and pSrcB buffer */ 00306 inA1 = *pInA++; 00307 inB1 = *pInB++; 00308 inA2 = *pInA++; 00309 /* Multiply and Accumlates */ 00310 sum += inA1 * inB1; 00311 inB2 = *pInB++; 00312 00313 inA1 = *pInA++; 00314 inB1 = *pInB++; 00315 /* Multiply and Accumlates */ 00316 sum += inA2 * inB2; 00317 inA2 = *pInA++; 00318 inB2 = *pInB++; 00319 00320 /* Multiply and Accumlates */ 00321 sum += inA1 * inB1; 00322 sum += inA2 * inB2; 00323 00324 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00325 00326 /* Decrement the loop counter */ 00327 colCnt--; 00328 } 00329 00330 /* process remaining column samples */ 00331 colCnt = numColsA & 3U; 00332 00333 while (colCnt > 0U) 00334 { 00335 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... 
+ a(m,p)*b(p,n) */ 00336 sum += *pInA++ * *pInB++; 00337 00338 /* Decrement the loop counter */ 00339 colCnt--; 00340 } 00341 00342 /* Saturate and store the result in the destination buffer */ 00343 *px = (q15_t) (__SSAT((sum >> 15), 16)); 00344 px++; 00345 00346 /* Decrement the column loop counter */ 00347 col--; 00348 00349 } while (col > 0U); 00350 00351 i = i + numColsA; 00352 00353 /* Decrement the row loop counter */ 00354 row--; 00355 00356 } while (row > 0U); 00357 00358 #else 00359 00360 /* Run the below code for Cortex-M0 */ 00361 00362 q15_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */ 00363 q15_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */ 00364 q15_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q15 type */ 00365 q15_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q15 type */ 00366 q15_t *pOut = pDst->pData; /* output data matrix pointer */ 00367 q15_t *px; /* Temporary output data matrix pointer */ 00368 uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */ 00369 uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */ 00370 uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */ 00371 uint16_t col, i = 0U, row = numRowsA, colCnt; /* loop counters */ 00372 arm_status status; /* status of matrix multiplication */ 00373 00374 #ifdef ARM_MATH_MATRIX_CHECK 00375 00376 /* Check for matrix mismatch condition */ 00377 if ((pSrcA->numCols != pSrcB->numRows) || 00378 (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols)) 00379 { 00380 /* Set status as ARM_MATH_SIZE_MISMATCH */ 00381 status = ARM_MATH_SIZE_MISMATCH; 00382 } 00383 else 00384 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */ 00385 00386 { 00387 /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */ 00388 /* row loop */ 00389 do 00390 { 00391 /* Output pointer is set to starting address of the row being processed */ 00392 px = pOut + i; 
00393 00394 /* For every row wise process, the column loop counter is to be initiated */ 00395 col = numColsB; 00396 00397 /* For every row wise process, the pIn2 pointer is set 00398 ** to the starting address of the pSrcB data */ 00399 pIn2 = pSrcB->pData; 00400 00401 /* column loop */ 00402 do 00403 { 00404 /* Set the variable sum, that acts as accumulator, to zero */ 00405 sum = 0; 00406 00407 /* Initiate the pointer pIn1 to point to the starting address of pSrcA */ 00408 pIn1 = pInA; 00409 00410 /* Matrix A columns number of MAC operations are to be performed */ 00411 colCnt = numColsA; 00412 00413 /* matrix multiplication */ 00414 while (colCnt > 0U) 00415 { 00416 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */ 00417 /* Perform the multiply-accumulates */ 00418 sum += (q31_t) * pIn1++ * *pIn2; 00419 pIn2 += numColsB; 00420 00421 /* Decrement the loop counter */ 00422 colCnt--; 00423 } 00424 00425 /* Convert the result from 34.30 to 1.15 format and store the saturated value in destination buffer */ 00426 /* Saturate and store the result in the destination buffer */ 00427 *px++ = (q15_t) __SSAT((sum >> 15), 16); 00428 00429 /* Decrement the column loop counter */ 00430 col--; 00431 00432 /* Update the pointer pIn2 to point to the starting address of the next column */ 00433 pIn2 = pInB + (numColsB - col); 00434 00435 } while (col > 0U); 00436 00437 /* Update the pointer pSrcA to point to the starting address of the next row */ 00438 i = i + numColsB; 00439 pInA = pInA + numColsA; 00440 00441 /* Decrement the row loop counter */ 00442 row--; 00443 00444 } while (row > 0U); 00445 00446 #endif /* #if defined (ARM_MATH_DSP) */ 00447 /* set status as ARM_MATH_SUCCESS */ 00448 status = ARM_MATH_SUCCESS; 00449 } 00450 00451 /* Return to application */ 00452 return (status); 00453 } 00454 00455 /** 00456 * @} end of MatrixMult group 00457 */ 00458
Generated on Tue Jul 12 2022 16:47:27 by
