Important changes to repositories hosted on mbed.com
Mbed hosted mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software, download the repository Zip archive or clone it locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
Fork of mbed-os by
arm_mat_mult_q31.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 19. March 2015 00005 * $Revision: V.1.4.5 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_mat_mult_q31.c 00009 * 00010 * Description: Q31 matrix multiplication. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupMatrix 00045 */ 00046 00047 /** 00048 * @addtogroup MatrixMult 00049 * @{ 00050 */ 00051 00052 /** 00053 * @brief Q31 matrix multiplication 00054 * @param[in] *pSrcA points to the first input matrix structure 00055 * @param[in] *pSrcB points to the second input matrix structure 00056 * @param[out] *pDst points to output matrix structure 00057 * @return The function returns either 00058 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. 00059 * 00060 * @details 00061 * <b>Scaling and Overflow Behavior:</b> 00062 * 00063 * \par 00064 * The function is implemented using an internal 64-bit accumulator. 00065 * The accumulator has a 2.62 format and maintains full precision of the intermediate 00066 * multiplication results but provides only a single guard bit. There is no saturation 00067 * on intermediate additions. Thus, if the accumulator overflows it wraps around and 00068 * distorts the result. The input signals should be scaled down to avoid intermediate 00069 * overflows. The input is thus scaled down by log2(numColsA) bits 00070 * to avoid overflows, as a total of numColsA additions are performed internally. 
00071 * The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result. 00072 * 00073 * \par 00074 * See <code>arm_mat_mult_fast_q31()</code> for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4. 00075 * 00076 */ 00077 00078 arm_status arm_mat_mult_q31( 00079 const arm_matrix_instance_q31 * pSrcA, 00080 const arm_matrix_instance_q31 * pSrcB, 00081 arm_matrix_instance_q31 * pDst) 00082 { 00083 q31_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */ 00084 q31_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */ 00085 q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */ 00086 q31_t *pOut = pDst->pData; /* output data matrix pointer */ 00087 q31_t *px; /* Temporary output data matrix pointer */ 00088 q63_t sum; /* Accumulator */ 00089 uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */ 00090 uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */ 00091 uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */ 00092 00093 #ifndef ARM_MATH_CM0_FAMILY 00094 00095 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00096 00097 uint16_t col, i = 0u, j, row = numRowsA, colCnt; /* loop counters */ 00098 arm_status status; /* status of matrix multiplication */ 00099 q31_t a0, a1, a2, a3, b0, b1, b2, b3; 00100 00101 #ifdef ARM_MATH_MATRIX_CHECK 00102 00103 00104 /* Check for matrix mismatch condition */ 00105 if((pSrcA->numCols != pSrcB->numRows) || 00106 (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols)) 00107 { 00108 /* Set status as ARM_MATH_SIZE_MISMATCH */ 00109 status = ARM_MATH_SIZE_MISMATCH; 00110 } 00111 else 00112 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */ 00113 00114 { 00115 /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */ 00116 /* row loop */ 00117 do 00118 { 00119 /* Output pointer is set to starting address of the row being processed */ 
00120 px = pOut + i; 00121 00122 /* For every row wise process, the column loop counter is to be initiated */ 00123 col = numColsB; 00124 00125 /* For every row wise process, the pIn2 pointer is set 00126 ** to the starting address of the pSrcB data */ 00127 pIn2 = pSrcB->pData; 00128 00129 j = 0u; 00130 00131 /* column loop */ 00132 do 00133 { 00134 /* Set the variable sum, that acts as accumulator, to zero */ 00135 sum = 0; 00136 00137 /* Initiate the pointer pIn1 to point to the starting address of pInA */ 00138 pIn1 = pInA; 00139 00140 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00141 colCnt = numColsA >> 2; 00142 00143 00144 /* matrix multiplication */ 00145 while(colCnt > 0u) 00146 { 00147 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */ 00148 /* Perform the multiply-accumulates */ 00149 b0 = *pIn2; 00150 pIn2 += numColsB; 00151 00152 a0 = *pIn1++; 00153 a1 = *pIn1++; 00154 00155 b1 = *pIn2; 00156 pIn2 += numColsB; 00157 b2 = *pIn2; 00158 pIn2 += numColsB; 00159 00160 sum += (q63_t) a0 *b0; 00161 sum += (q63_t) a1 *b1; 00162 00163 a2 = *pIn1++; 00164 a3 = *pIn1++; 00165 00166 b3 = *pIn2; 00167 pIn2 += numColsB; 00168 00169 sum += (q63_t) a2 *b2; 00170 sum += (q63_t) a3 *b3; 00171 00172 /* Decrement the loop counter */ 00173 colCnt--; 00174 } 00175 00176 /* If the columns of pSrcA is not a multiple of 4, compute any remaining output samples here. 00177 ** No loop unrolling is used. */ 00178 colCnt = numColsA % 0x4u; 00179 00180 while(colCnt > 0u) 00181 { 00182 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... 
+ a(m,p)*b(p,n) */ 00183 /* Perform the multiply-accumulates */ 00184 sum += (q63_t) * pIn1++ * *pIn2; 00185 pIn2 += numColsB; 00186 00187 /* Decrement the loop counter */ 00188 colCnt--; 00189 } 00190 00191 /* Convert the result from 2.62 to 1.31 format and store in destination buffer */ 00192 *px++ = (q31_t) (sum >> 31); 00193 00194 /* Update the pointer pIn2 to point to the starting address of the next column */ 00195 j++; 00196 pIn2 = (pSrcB->pData) + j; 00197 00198 /* Decrement the column loop counter */ 00199 col--; 00200 00201 } while(col > 0u); 00202 00203 #else 00204 00205 /* Run the below code for Cortex-M0 */ 00206 00207 q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */ 00208 uint16_t col, i = 0u, row = numRowsA, colCnt; /* loop counters */ 00209 arm_status status; /* status of matrix multiplication */ 00210 00211 00212 #ifdef ARM_MATH_MATRIX_CHECK 00213 00214 /* Check for matrix mismatch condition */ 00215 if((pSrcA->numCols != pSrcB->numRows) || 00216 (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols)) 00217 { 00218 /* Set status as ARM_MATH_SIZE_MISMATCH */ 00219 status = ARM_MATH_SIZE_MISMATCH; 00220 } 00221 else 00222 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */ 00223 00224 { 00225 /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */ 00226 /* row loop */ 00227 do 00228 { 00229 /* Output pointer is set to starting address of the row being processed */ 00230 px = pOut + i; 00231 00232 /* For every row wise process, the column loop counter is to be initiated */ 00233 col = numColsB; 00234 00235 /* For every row wise process, the pIn2 pointer is set 00236 ** to the starting address of the pSrcB data */ 00237 pIn2 = pSrcB->pData; 00238 00239 /* column loop */ 00240 do 00241 { 00242 /* Set the variable sum, that acts as accumulator, to zero */ 00243 sum = 0; 00244 00245 /* Initiate the pointer pIn1 to point to the starting address of pInA */ 00246 pIn1 = pInA; 00247 00248 /* Matrix A 
columns number of MAC operations are to be performed */ 00249 colCnt = numColsA; 00250 00251 /* matrix multiplication */ 00252 while(colCnt > 0u) 00253 { 00254 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */ 00255 /* Perform the multiply-accumulates */ 00256 sum += (q63_t) * pIn1++ * *pIn2; 00257 pIn2 += numColsB; 00258 00259 /* Decrement the loop counter */ 00260 colCnt--; 00261 } 00262 00263 /* Convert the result from 2.62 to 1.31 format and store in destination buffer */ 00264 *px++ = (q31_t) clip_q63_to_q31(sum >> 31); 00265 00266 /* Decrement the column loop counter */ 00267 col--; 00268 00269 /* Update the pointer pIn2 to point to the starting address of the next column */ 00270 pIn2 = pInB + (numColsB - col); 00271 00272 } while(col > 0u); 00273 00274 #endif 00275 00276 /* Update the pointer pInA to point to the starting address of the next row */ 00277 i = i + numColsB; 00278 pInA = pInA + numColsA; 00279 00280 /* Decrement the row loop counter */ 00281 row--; 00282 00283 } while(row > 0u); 00284 00285 /* set status as ARM_MATH_SUCCESS */ 00286 status = ARM_MATH_SUCCESS; 00287 } 00288 /* Return to application */ 00289 return (status); 00290 } 00291 00292 /** 00293 * @} end of MatrixMult group 00294 */
Generated on Tue Jul 12 2022 13:15:25 by
