Added CMSIS5 DSP and NN folder. Needs some work.
Embed:
(wiki syntax)
Show/hide line numbers
arm_mat_mult_fast_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Project: CMSIS DSP Library 00003 * Title: arm_mat_mult_fast_q15.c 00004 * Description: Q15 matrix multiplication (fast variant) 00005 * 00006 * $Date: 27. January 2017 00007 * $Revision: V.1.5.1 00008 * 00009 * Target Processor: Cortex-M cores 00010 * -------------------------------------------------------------------- */ 00011 /* 00012 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. 00013 * 00014 * SPDX-License-Identifier: Apache-2.0 00015 * 00016 * Licensed under the Apache License, Version 2.0 (the License); you may 00017 * not use this file except in compliance with the License. 00018 * You may obtain a copy of the License at 00019 * 00020 * www.apache.org/licenses/LICENSE-2.0 00021 * 00022 * Unless required by applicable law or agreed to in writing, software 00023 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 00024 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00025 * See the License for the specific language governing permissions and 00026 * limitations under the License. 00027 */ 00028 00029 #include "arm_math.h" 00030 00031 /** 00032 * @ingroup groupMatrix 00033 */ 00034 00035 /** 00036 * @addtogroup MatrixMult 00037 * @{ 00038 */ 00039 00040 00041 /** 00042 * @brief Q15 matrix multiplication (fast variant) for Cortex-M3 and Cortex-M4 00043 * @param[in] *pSrcA points to the first input matrix structure 00044 * @param[in] *pSrcB points to the second input matrix structure 00045 * @param[out] *pDst points to output matrix structure 00046 * @param[in] *pState points to the array for storing intermediate results 00047 * @return The function returns either 00048 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. 
00049 * 00050 * @details 00051 * <b>Scaling and Overflow Behavior:</b> 00052 * 00053 * \par 00054 * The difference between the function arm_mat_mult_q15() and this fast variant is that 00055 * the fast variant use a 32-bit rather than a 64-bit accumulator. 00056 * The result of each 1.15 x 1.15 multiplication is truncated to 00057 * 2.30 format. These intermediate results are accumulated in a 32-bit register in 2.30 00058 * format. Finally, the accumulator is saturated and converted to a 1.15 result. 00059 * 00060 * \par 00061 * The fast version has the same overflow behavior as the standard version but provides 00062 * less precision since it discards the low 16 bits of each multiplication result. 00063 * In order to avoid overflows completely the input signals must be scaled down. 00064 * Scale down one of the input matrices by log2(numColsA) bits to 00065 * avoid overflows, as a total of numColsA additions are computed internally for each 00066 * output element. 00067 * 00068 * \par 00069 * See <code>arm_mat_mult_q15()</code> for a slower implementation of this function 00070 * which uses 64-bit accumulation to provide higher precision. 
00071 */ 00072 00073 arm_status arm_mat_mult_fast_q15( 00074 const arm_matrix_instance_q15 * pSrcA, 00075 const arm_matrix_instance_q15 * pSrcB, 00076 arm_matrix_instance_q15 * pDst, 00077 q15_t * pState) 00078 { 00079 q31_t sum; /* accumulator */ 00080 q15_t *pSrcBT = pState; /* input data matrix pointer for transpose */ 00081 q15_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q15 type */ 00082 q15_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q15 type */ 00083 q15_t *px; /* Temporary output data matrix pointer */ 00084 uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */ 00085 uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */ 00086 uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */ 00087 uint16_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */ 00088 uint32_t col, i = 0U, row = numRowsB, colCnt; /* loop counters */ 00089 arm_status status; /* status of matrix multiplication */ 00090 00091 #ifndef UNALIGNED_SUPPORT_DISABLE 00092 00093 q31_t in; /* Temporary variable to hold the input value */ 00094 q31_t inA1, inA2, inB1, inB2; 00095 q31_t sum2, sum3, sum4; 00096 q15_t *pInA2, *pInB2, *px2; 00097 uint32_t j = 0; 00098 00099 #else 00100 00101 q15_t in; /* Temporary variable to hold the input value */ 00102 q15_t inA1, inA2, inB1, inB2; 00103 00104 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00105 00106 #ifdef ARM_MATH_MATRIX_CHECK 00107 /* Check for matrix mismatch condition */ 00108 if ((pSrcA->numCols != pSrcB->numRows) || 00109 (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols)) 00110 { 00111 /* Set status as ARM_MATH_SIZE_MISMATCH */ 00112 status = ARM_MATH_SIZE_MISMATCH; 00113 } 00114 else 00115 #endif 00116 { 00117 /* Matrix transpose */ 00118 do 00119 { 00120 /* Apply loop unrolling and exchange the columns with row elements */ 00121 col = numColsB >> 2; 00122 00123 /* The pointer px is set to starting address of the 
column being processed */ 00124 px = pSrcBT + i; 00125 00126 /* First part of the processing with loop unrolling. Compute 4 outputs at a time. 00127 ** a second loop below computes the remaining 1 to 3 samples. */ 00128 while (col > 0U) 00129 { 00130 #ifndef UNALIGNED_SUPPORT_DISABLE 00131 /* Read two elements from the row */ 00132 in = *__SIMD32(pInB)++; 00133 00134 /* Unpack and store one element in the destination */ 00135 #ifndef ARM_MATH_BIG_ENDIAN 00136 00137 *px = (q15_t) in; 00138 00139 #else 00140 00141 *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16); 00142 00143 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00144 00145 /* Update the pointer px to point to the next row of the transposed matrix */ 00146 px += numRowsB; 00147 00148 /* Unpack and store the second element in the destination */ 00149 #ifndef ARM_MATH_BIG_ENDIAN 00150 00151 *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16); 00152 00153 #else 00154 00155 *px = (q15_t) in; 00156 00157 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00158 00159 /* Update the pointer px to point to the next row of the transposed matrix */ 00160 px += numRowsB; 00161 00162 /* Read two elements from the row */ 00163 in = *__SIMD32(pInB)++; 00164 00165 /* Unpack and store one element in the destination */ 00166 #ifndef ARM_MATH_BIG_ENDIAN 00167 00168 *px = (q15_t) in; 00169 00170 #else 00171 00172 *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16); 00173 00174 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00175 00176 /* Update the pointer px to point to the next row of the transposed matrix */ 00177 px += numRowsB; 00178 00179 /* Unpack and store the second element in the destination */ 00180 00181 #ifndef ARM_MATH_BIG_ENDIAN 00182 00183 *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16); 00184 00185 #else 00186 00187 *px = (q15_t) in; 00188 00189 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00190 00191 #else 00192 00193 /* Read one element from the row */ 00194 in = *pInB++; 00195 00196 /* Store one element in the destination */ 00197 *px = in; 
00198 00199 /* Update the pointer px to point to the next row of the transposed matrix */ 00200 px += numRowsB; 00201 00202 /* Read one element from the row */ 00203 in = *pInB++; 00204 00205 /* Store one element in the destination */ 00206 *px = in; 00207 00208 /* Update the pointer px to point to the next row of the transposed matrix */ 00209 px += numRowsB; 00210 00211 /* Read one element from the row */ 00212 in = *pInB++; 00213 00214 /* Store one element in the destination */ 00215 *px = in; 00216 00217 /* Update the pointer px to point to the next row of the transposed matrix */ 00218 px += numRowsB; 00219 00220 /* Read one element from the row */ 00221 in = *pInB++; 00222 00223 /* Store one element in the destination */ 00224 *px = in; 00225 00226 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00227 00228 /* Update the pointer px to point to the next row of the transposed matrix */ 00229 px += numRowsB; 00230 00231 /* Decrement the column loop counter */ 00232 col--; 00233 } 00234 00235 /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here. 00236 ** No loop unrolling is used. 
*/ 00237 col = numColsB % 0x4U; 00238 00239 while (col > 0U) 00240 { 00241 /* Read and store the input element in the destination */ 00242 *px = *pInB++; 00243 00244 /* Update the pointer px to point to the next row of the transposed matrix */ 00245 px += numRowsB; 00246 00247 /* Decrement the column loop counter */ 00248 col--; 00249 } 00250 00251 i++; 00252 00253 /* Decrement the row loop counter */ 00254 row--; 00255 00256 } while (row > 0U); 00257 00258 /* Reset the variables for the usage in the following multiplication process */ 00259 row = numRowsA; 00260 i = 0U; 00261 px = pDst->pData; 00262 00263 #ifndef UNALIGNED_SUPPORT_DISABLE 00264 /* Process two rows from matrix A at a time and output two rows at a time */ 00265 row = row >> 1; 00266 px2 = px + numColsB; 00267 #endif 00268 00269 /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */ 00270 /* row loop */ 00271 while (row > 0U) 00272 { 00273 /* For every row wise process, the column loop counter is to be initiated */ 00274 col = numColsB; 00275 00276 /* For every row wise process, the pIn2 pointer is set 00277 ** to the starting address of the transposed pSrcB data */ 00278 pInB = pSrcBT; 00279 00280 #ifndef UNALIGNED_SUPPORT_DISABLE 00281 /* Process two (transposed) columns from matrix B at a time */ 00282 col = col >> 1; 00283 j = 0; 00284 #endif 00285 00286 /* column loop */ 00287 while (col > 0U) 00288 { 00289 /* Set the variable sum, that acts as accumulator, to zero */ 00290 sum = 0; 00291 00292 /* Initiate the pointer pInA to point to the starting address of the column being processed */ 00293 pInA = pSrcA->pData + i; 00294 00295 #ifndef UNALIGNED_SUPPORT_DISABLE 00296 sum2 = 0; 00297 sum3 = 0; 00298 sum4 = 0; 00299 pInB = pSrcBT + j; 00300 pInA2 = pInA + numColsA; 00301 pInB2 = pInB + numRowsB; 00302 00303 /* Read in two elements at once - alows dual MAC instruction */ 00304 colCnt = numColsA >> 1; 00305 #else 00306 colCnt = numColsA >> 2; 00307 #endif 
00308 00309 /* matrix multiplication */ 00310 while (colCnt > 0U) 00311 { 00312 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */ 00313 #ifndef UNALIGNED_SUPPORT_DISABLE 00314 00315 inA1 = *__SIMD32(pInA)++; 00316 inB1 = *__SIMD32(pInB)++; 00317 inA2 = *__SIMD32(pInA2)++; 00318 inB2 = *__SIMD32(pInB2)++; 00319 00320 sum = __SMLAD(inA1, inB1, sum); 00321 sum2 = __SMLAD(inA1, inB2, sum2); 00322 sum3 = __SMLAD(inA2, inB1, sum3); 00323 sum4 = __SMLAD(inA2, inB2, sum4); 00324 00325 #else 00326 00327 inA1 = *pInA; 00328 inB1 = *pInB; 00329 sum += inA1 * inB1; 00330 00331 inA2 = pInA[1]; 00332 inB2 = pInB[1]; 00333 sum += inA2 * inB2; 00334 00335 inA1 = pInA[2]; 00336 inB1 = pInB[2]; 00337 sum += inA1 * inB1; 00338 00339 inA2 = pInA[3]; 00340 inB2 = pInB[3]; 00341 sum += inA2 * inB2; 00342 00343 pInA += 4; 00344 pInB += 4; 00345 00346 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00347 00348 /* Decrement the loop counter */ 00349 colCnt--; 00350 } 00351 00352 /* process odd column samples */ 00353 #ifndef UNALIGNED_SUPPORT_DISABLE 00354 if (numColsA & 1U) { 00355 inA1 = *pInA++; 00356 inB1 = *pInB++; 00357 inA2 = *pInA2++; 00358 inB2 = *pInB2++; 00359 sum += inA1 * inB1; 00360 sum2 += inA1 * inB2; 00361 sum3 += inA2 * inB1; 00362 sum4 += inA2 * inB2; 00363 } 00364 #else 00365 colCnt = numColsA % 0x4U; 00366 00367 while (colCnt > 0U) 00368 { 00369 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... 
+ a(m,p)*b(p,n) */ 00370 sum += (q31_t) (*pInA++) * (*pInB++); 00371 00372 colCnt--; 00373 } 00374 #endif 00375 00376 /* Saturate and store the result in the destination buffer */ 00377 *px++ = (q15_t) (sum >> 15); 00378 00379 #ifndef UNALIGNED_SUPPORT_DISABLE 00380 *px++ = (q15_t) (sum2 >> 15); 00381 *px2++ = (q15_t) (sum3 >> 15); 00382 *px2++ = (q15_t) (sum4 >> 15); 00383 j += numRowsB * 2; 00384 #endif 00385 00386 /* Decrement the column loop counter */ 00387 col--; 00388 00389 } 00390 00391 i = i + numColsA; 00392 00393 #ifndef UNALIGNED_SUPPORT_DISABLE 00394 i = i + numColsA; 00395 px = px2 + (numColsB & 1U); 00396 px2 = px + numColsB; 00397 #endif 00398 00399 /* Decrement the row loop counter */ 00400 row--; 00401 00402 } 00403 00404 /* Compute any remaining odd row/column below */ 00405 00406 #ifndef UNALIGNED_SUPPORT_DISABLE 00407 00408 /* Compute remaining output column */ 00409 if (numColsB & 1U) { 00410 00411 /* Avoid redundant computation of last element */ 00412 row = numRowsA & (~0x1); 00413 00414 /* Point to remaining unfilled column in output matrix */ 00415 px = pDst->pData+numColsB-1; 00416 pInA = pSrcA->pData; 00417 00418 /* row loop */ 00419 while (row > 0) 00420 { 00421 00422 /* point to last column in matrix B */ 00423 pInB = pSrcBT + numRowsB*(numColsB-1); 00424 00425 /* Set the variable sum, that acts as accumulator, to zero */ 00426 sum = 0; 00427 00428 /* Compute 4 columns at once */ 00429 colCnt = numColsA >> 2; 00430 00431 /* matrix multiplication */ 00432 while (colCnt > 0U) 00433 { 00434 inA1 = *__SIMD32(pInA)++; 00435 inA2 = *__SIMD32(pInA)++; 00436 inB1 = *__SIMD32(pInB)++; 00437 inB2 = *__SIMD32(pInB)++; 00438 00439 sum = __SMLAD(inA1, inB1, sum); 00440 sum = __SMLAD(inA2, inB2, sum); 00441 00442 /* Decrement the loop counter */ 00443 colCnt--; 00444 } 00445 00446 colCnt = numColsA & 3U; 00447 while (colCnt > 0U) { 00448 sum += (q31_t) (*pInA++) * (*pInB++); 00449 colCnt--; 00450 } 00451 00452 /* Store the result in the destination 
buffer */ 00453 *px = (q15_t) (sum >> 15); 00454 px += numColsB; 00455 00456 /* Decrement the row loop counter */ 00457 row--; 00458 } 00459 } 00460 00461 /* Compute remaining output row */ 00462 if (numRowsA & 1U) { 00463 00464 /* point to last row in output matrix */ 00465 px = pDst->pData+(numColsB)*(numRowsA-1); 00466 00467 pInB = pSrcBT; 00468 col = numColsB; 00469 i = 0U; 00470 00471 /* col loop */ 00472 while (col > 0) 00473 { 00474 00475 /* point to last row in matrix A */ 00476 pInA = pSrcA->pData + (numRowsA-1)*numColsA; 00477 00478 /* Set the variable sum, that acts as accumulator, to zero */ 00479 sum = 0; 00480 00481 /* Compute 4 columns at once */ 00482 colCnt = numColsA >> 2; 00483 00484 /* matrix multiplication */ 00485 while (colCnt > 0U) 00486 { 00487 inA1 = *__SIMD32(pInA)++; 00488 inA2 = *__SIMD32(pInA)++; 00489 inB1 = *__SIMD32(pInB)++; 00490 inB2 = *__SIMD32(pInB)++; 00491 00492 sum = __SMLAD(inA1, inB1, sum); 00493 sum = __SMLAD(inA2, inB2, sum); 00494 00495 /* Decrement the loop counter */ 00496 colCnt--; 00497 } 00498 00499 colCnt = numColsA & 3U; 00500 while (colCnt > 0U) { 00501 sum += (q31_t) (*pInA++) * (*pInB++); 00502 colCnt--; 00503 } 00504 00505 /* Store the result in the destination buffer */ 00506 *px++ = (q15_t) (sum >> 15); 00507 00508 /* Decrement the col loop counter */ 00509 col--; 00510 } 00511 } 00512 00513 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00514 00515 /* set status as ARM_MATH_SUCCESS */ 00516 status = ARM_MATH_SUCCESS; 00517 } 00518 00519 /* Return to application */ 00520 return (status); 00521 } 00522 00523 /** 00524 * @} end of MatrixMult group 00525 */ 00526
Generated on Tue Jul 12 2022 16:47:27 by 1.7.2