Important changes to repositories hosted on mbed.com
Mbed-hosted Mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software, download the repository Zip archive or clone it locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
Fork of mbed-dsp by
arm_mat_mult_fast_q31.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 17. January 2013 00005 * $Revision: V1.4.1 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_mat_mult_fast_q31.c 00009 * 00010 * Description: Q31 matrix multiplication (fast variant). 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupMatrix 00045 */ 00046 00047 /** 00048 * @addtogroup MatrixMult 00049 * @{ 00050 */ 00051 00052 /** 00053 * @brief Q31 matrix multiplication (fast variant) for Cortex-M3 and Cortex-M4 00054 * @param[in] *pSrcA points to the first input matrix structure 00055 * @param[in] *pSrcB points to the second input matrix structure 00056 * @param[out] *pDst points to output matrix structure 00057 * @return The function returns either 00058 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. 00059 * 00060 * @details 00061 * <b>Scaling and Overflow Behavior:</b> 00062 * 00063 * \par 00064 * The difference between the function arm_mat_mult_q31() and this fast variant is that 00065 * the fast variant use a 32-bit rather than a 64-bit accumulator. 00066 * The result of each 1.31 x 1.31 multiplication is truncated to 00067 * 2.30 format. These intermediate results are accumulated in a 32-bit register in 2.30 00068 * format. Finally, the accumulator is saturated and converted to a 1.31 result. 00069 * 00070 * \par 00071 * The fast version has the same overflow behavior as the standard version but provides 00072 * less precision since it discards the low 32 bits of each multiplication result. 
00073 * In order to avoid overflows completely the input signals must be scaled down. 00074 * Scale down one of the input matrices by log2(numColsA) bits to 00075 * avoid overflows, as a total of numColsA additions are computed internally for each 00076 * output element. 00077 * 00078 * \par 00079 * See <code>arm_mat_mult_q31()</code> for a slower implementation of this function 00080 * which uses 64-bit accumulation to provide higher precision. 00081 */ 00082 00083 arm_status arm_mat_mult_fast_q31( 00084 const arm_matrix_instance_q31 * pSrcA, 00085 const arm_matrix_instance_q31 * pSrcB, 00086 arm_matrix_instance_q31 * pDst) 00087 { 00088 q31_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */ 00089 q31_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */ 00090 q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */ 00091 // q31_t *pSrcB = pSrcB->pData; /* input data matrix pointer B */ 00092 q31_t *pOut = pDst->pData; /* output data matrix pointer */ 00093 q31_t *px; /* Temporary output data matrix pointer */ 00094 q31_t sum; /* Accumulator */ 00095 uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */ 00096 uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */ 00097 uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */ 00098 uint16_t col, i = 0u, j, row = numRowsA, colCnt; /* loop counters */ 00099 arm_status status; /* status of matrix multiplication */ 00100 q31_t inA1, inA2, inA3, inA4, inB1, inB2, inB3, inB4; 00101 00102 #ifdef ARM_MATH_MATRIX_CHECK 00103 00104 00105 /* Check for matrix mismatch condition */ 00106 if((pSrcA->numCols != pSrcB->numRows) || 00107 (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols)) 00108 { 00109 /* Set status as ARM_MATH_SIZE_MISMATCH */ 00110 status = ARM_MATH_SIZE_MISMATCH; 00111 } 00112 else 00113 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */ 00114 00115 { 00116 /* The following loop performs the dot-product of each row in 
pSrcA with each column in pSrcB */ 00117 /* row loop */ 00118 do 00119 { 00120 /* Output pointer is set to starting address of the row being processed */ 00121 px = pOut + i; 00122 00123 /* For every row wise process, the column loop counter is to be initiated */ 00124 col = numColsB; 00125 00126 /* For every row wise process, the pIn2 pointer is set 00127 ** to the starting address of the pSrcB data */ 00128 pIn2 = pSrcB->pData; 00129 00130 j = 0u; 00131 00132 /* column loop */ 00133 do 00134 { 00135 /* Set the variable sum, that acts as accumulator, to zero */ 00136 sum = 0; 00137 00138 /* Initiate the pointer pIn1 to point to the starting address of pInA */ 00139 pIn1 = pInA; 00140 00141 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00142 colCnt = numColsA >> 2; 00143 00144 00145 /* matrix multiplication */ 00146 while(colCnt > 0u) 00147 { 00148 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */ 00149 /* Perform the multiply-accumulates */ 00150 inB1 = *pIn2; 00151 pIn2 += numColsB; 00152 00153 inA1 = pIn1[0]; 00154 inA2 = pIn1[1]; 00155 00156 inB2 = *pIn2; 00157 pIn2 += numColsB; 00158 00159 inB3 = *pIn2; 00160 pIn2 += numColsB; 00161 00162 sum = (q31_t) ((((q63_t) sum << 32) + ((q63_t) inA1 * inB1)) >> 32); 00163 sum = (q31_t) ((((q63_t) sum << 32) + ((q63_t) inA2 * inB2)) >> 32); 00164 00165 inA3 = pIn1[2]; 00166 inA4 = pIn1[3]; 00167 00168 inB4 = *pIn2; 00169 pIn2 += numColsB; 00170 00171 sum = (q31_t) ((((q63_t) sum << 32) + ((q63_t) inA3 * inB3)) >> 32); 00172 sum = (q31_t) ((((q63_t) sum << 32) + ((q63_t) inA4 * inB4)) >> 32); 00173 00174 pIn1 += 4u; 00175 00176 /* Decrement the loop counter */ 00177 colCnt--; 00178 } 00179 00180 /* If the columns of pSrcA is not a multiple of 4, compute any remaining output samples here. 00181 ** No loop unrolling is used. */ 00182 colCnt = numColsA % 0x4u; 00183 00184 while(colCnt > 0u) 00185 { 00186 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... 
+ a(m,p)*b(p,n) */ 00187 /* Perform the multiply-accumulates */ 00188 sum = (q31_t) ((((q63_t) sum << 32) + 00189 ((q63_t) * pIn1++ * (*pIn2))) >> 32); 00190 pIn2 += numColsB; 00191 00192 /* Decrement the loop counter */ 00193 colCnt--; 00194 } 00195 00196 /* Convert the result from 2.30 to 1.31 format and store in destination buffer */ 00197 *px++ = sum << 1; 00198 00199 /* Update the pointer pIn2 to point to the starting address of the next column */ 00200 j++; 00201 pIn2 = pSrcB->pData + j; 00202 00203 /* Decrement the column loop counter */ 00204 col--; 00205 00206 } while(col > 0u); 00207 00208 /* Update the pointer pInA to point to the starting address of the next row */ 00209 i = i + numColsB; 00210 pInA = pInA + numColsA; 00211 00212 /* Decrement the row loop counter */ 00213 row--; 00214 00215 } while(row > 0u); 00216 00217 /* set status as ARM_MATH_SUCCESS */ 00218 status = ARM_MATH_SUCCESS; 00219 } 00220 /* Return to application */ 00221 return (status); 00222 } 00223 00224 /** 00225 * @} end of MatrixMult group 00226 */
Generated on Tue Jul 12 2022 18:44:09 by
1.7.2
