Important changes to repositories hosted on mbed.com
Mbed hosted mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software, download the repository ZIP archive or clone it locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
arm_mat_mult_q31.c
00001 /* ---------------------------------------------------------------------- 00002 * Project: CMSIS DSP Library 00003 * Title: arm_mat_mult_q31.c 00004 * Description: Q31 matrix multiplication 00005 * 00006 * $Date: 27. January 2017 00007 * $Revision: V.1.5.1 00008 * 00009 * Target Processor: Cortex-M cores 00010 * -------------------------------------------------------------------- */ 00011 /* 00012 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. 00013 * 00014 * SPDX-License-Identifier: Apache-2.0 00015 * 00016 * Licensed under the Apache License, Version 2.0 (the License); you may 00017 * not use this file except in compliance with the License. 00018 * You may obtain a copy of the License at 00019 * 00020 * www.apache.org/licenses/LICENSE-2.0 00021 * 00022 * Unless required by applicable law or agreed to in writing, software 00023 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 00024 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00025 * See the License for the specific language governing permissions and 00026 * limitations under the License. 00027 */ 00028 00029 #include "arm_math.h" 00030 00031 /** 00032 * @ingroup groupMatrix 00033 */ 00034 00035 /** 00036 * @addtogroup MatrixMult 00037 * @{ 00038 */ 00039 00040 /** 00041 * @brief Q31 matrix multiplication 00042 * @param[in] *pSrcA points to the first input matrix structure 00043 * @param[in] *pSrcB points to the second input matrix structure 00044 * @param[out] *pDst points to output matrix structure 00045 * @return The function returns either 00046 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. 00047 * 00048 * @details 00049 * <b>Scaling and Overflow Behavior:</b> 00050 * 00051 * \par 00052 * The function is implemented using an internal 64-bit accumulator. 
00053 * The accumulator has a 2.62 format and maintains full precision of the intermediate 00054 * multiplication results but provides only a single guard bit. There is no saturation 00055 * on intermediate additions. Thus, if the accumulator overflows it wraps around and 00056 * distorts the result. The input signals should be scaled down to avoid intermediate 00057 * overflows. The input is thus scaled down by log2(numColsA) bits 00058 * to avoid overflows, as a total of numColsA additions are performed internally. 00059 * The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result. 00060 * 00061 * \par 00062 * See <code>arm_mat_mult_fast_q31()</code> for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4. 00063 * 00064 */ 00065 00066 arm_status arm_mat_mult_q31( 00067 const arm_matrix_instance_q31 * pSrcA, 00068 const arm_matrix_instance_q31 * pSrcB, 00069 arm_matrix_instance_q31 * pDst) 00070 { 00071 q31_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */ 00072 q31_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */ 00073 q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */ 00074 q31_t *pOut = pDst->pData; /* output data matrix pointer */ 00075 q31_t *px; /* Temporary output data matrix pointer */ 00076 q63_t sum; /* Accumulator */ 00077 uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */ 00078 uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */ 00079 uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */ 00080 00081 #if defined (ARM_MATH_DSP) 00082 00083 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00084 00085 uint16_t col, i = 0U, j, row = numRowsA, colCnt; /* loop counters */ 00086 arm_status status; /* status of matrix multiplication */ 00087 q31_t a0, a1, a2, a3, b0, b1, b2, b3; 00088 00089 #ifdef ARM_MATH_MATRIX_CHECK 00090 00091 00092 /* Check for matrix mismatch condition */ 
00093 if ((pSrcA->numCols != pSrcB->numRows) || 00094 (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols)) 00095 { 00096 /* Set status as ARM_MATH_SIZE_MISMATCH */ 00097 status = ARM_MATH_SIZE_MISMATCH; 00098 } 00099 else 00100 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */ 00101 00102 { 00103 /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */ 00104 /* row loop */ 00105 do 00106 { 00107 /* Output pointer is set to starting address of the row being processed */ 00108 px = pOut + i; 00109 00110 /* For every row wise process, the column loop counter is to be initiated */ 00111 col = numColsB; 00112 00113 /* For every row wise process, the pIn2 pointer is set 00114 ** to the starting address of the pSrcB data */ 00115 pIn2 = pSrcB->pData; 00116 00117 j = 0U; 00118 00119 /* column loop */ 00120 do 00121 { 00122 /* Set the variable sum, that acts as accumulator, to zero */ 00123 sum = 0; 00124 00125 /* Initiate the pointer pIn1 to point to the starting address of pInA */ 00126 pIn1 = pInA; 00127 00128 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00129 colCnt = numColsA >> 2; 00130 00131 00132 /* matrix multiplication */ 00133 while (colCnt > 0U) 00134 { 00135 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */ 00136 /* Perform the multiply-accumulates */ 00137 b0 = *pIn2; 00138 pIn2 += numColsB; 00139 00140 a0 = *pIn1++; 00141 a1 = *pIn1++; 00142 00143 b1 = *pIn2; 00144 pIn2 += numColsB; 00145 b2 = *pIn2; 00146 pIn2 += numColsB; 00147 00148 sum += (q63_t) a0 *b0; 00149 sum += (q63_t) a1 *b1; 00150 00151 a2 = *pIn1++; 00152 a3 = *pIn1++; 00153 00154 b3 = *pIn2; 00155 pIn2 += numColsB; 00156 00157 sum += (q63_t) a2 *b2; 00158 sum += (q63_t) a3 *b3; 00159 00160 /* Decrement the loop counter */ 00161 colCnt--; 00162 } 00163 00164 /* If the columns of pSrcA is not a multiple of 4, compute any remaining output samples here. 00165 ** No loop unrolling is used. 
*/ 00166 colCnt = numColsA % 0x4U; 00167 00168 while (colCnt > 0U) 00169 { 00170 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */ 00171 /* Perform the multiply-accumulates */ 00172 sum += (q63_t) * pIn1++ * *pIn2; 00173 pIn2 += numColsB; 00174 00175 /* Decrement the loop counter */ 00176 colCnt--; 00177 } 00178 00179 /* Convert the result from 2.62 to 1.31 format and store in destination buffer */ 00180 *px++ = (q31_t) (sum >> 31); 00181 00182 /* Update the pointer pIn2 to point to the starting address of the next column */ 00183 j++; 00184 pIn2 = (pSrcB->pData) + j; 00185 00186 /* Decrement the column loop counter */ 00187 col--; 00188 00189 } while (col > 0U); 00190 00191 #else 00192 00193 /* Run the below code for Cortex-M0 */ 00194 00195 q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */ 00196 uint16_t col, i = 0U, row = numRowsA, colCnt; /* loop counters */ 00197 arm_status status; /* status of matrix multiplication */ 00198 00199 00200 #ifdef ARM_MATH_MATRIX_CHECK 00201 00202 /* Check for matrix mismatch condition */ 00203 if ((pSrcA->numCols != pSrcB->numRows) || 00204 (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols)) 00205 { 00206 /* Set status as ARM_MATH_SIZE_MISMATCH */ 00207 status = ARM_MATH_SIZE_MISMATCH; 00208 } 00209 else 00210 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */ 00211 00212 { 00213 /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */ 00214 /* row loop */ 00215 do 00216 { 00217 /* Output pointer is set to starting address of the row being processed */ 00218 px = pOut + i; 00219 00220 /* For every row wise process, the column loop counter is to be initiated */ 00221 col = numColsB; 00222 00223 /* For every row wise process, the pIn2 pointer is set 00224 ** to the starting address of the pSrcB data */ 00225 pIn2 = pSrcB->pData; 00226 00227 /* column loop */ 00228 do 00229 { 00230 /* Set the variable sum, that acts as accumulator, to zero */ 00231 
sum = 0; 00232 00233 /* Initiate the pointer pIn1 to point to the starting address of pInA */ 00234 pIn1 = pInA; 00235 00236 /* Matrix A columns number of MAC operations are to be performed */ 00237 colCnt = numColsA; 00238 00239 /* matrix multiplication */ 00240 while (colCnt > 0U) 00241 { 00242 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */ 00243 /* Perform the multiply-accumulates */ 00244 sum += (q63_t) * pIn1++ * *pIn2; 00245 pIn2 += numColsB; 00246 00247 /* Decrement the loop counter */ 00248 colCnt--; 00249 } 00250 00251 /* Convert the result from 2.62 to 1.31 format and store in destination buffer */ 00252 *px++ = (q31_t) clip_q63_to_q31(sum >> 31); 00253 00254 /* Decrement the column loop counter */ 00255 col--; 00256 00257 /* Update the pointer pIn2 to point to the starting address of the next column */ 00258 pIn2 = pInB + (numColsB - col); 00259 00260 } while (col > 0U); 00261 00262 #endif 00263 00264 /* Update the pointer pInA to point to the starting address of the next row */ 00265 i = i + numColsB; 00266 pInA = pInA + numColsA; 00267 00268 /* Decrement the row loop counter */ 00269 row--; 00270 00271 } while (row > 0U); 00272 00273 /* set status as ARM_MATH_SUCCESS */ 00274 status = ARM_MATH_SUCCESS; 00275 } 00276 /* Return to application */ 00277 return (status); 00278 } 00279 00280 /** 00281 * @} end of MatrixMult group 00282 */ 00283
Generated on Tue Jul 12 2022 16:47:27 by
