Added CMSIS5 DSP and NN folder. Needs some work
Embed:
(wiki syntax)
Show/hide line numbers
arm_mat_cmplx_mult_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Project: CMSIS DSP Library 00003 * Title: arm_cmplx_mat_mult_q15.c 00004 * Description: Q15 complex matrix multiplication 00005 * 00006 * $Date: 27. January 2017 00007 * $Revision: V.1.5.1 00008 * 00009 * Target Processor: Cortex-M cores 00010 * -------------------------------------------------------------------- */ 00011 /* 00012 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. 00013 * 00014 * SPDX-License-Identifier: Apache-2.0 00015 * 00016 * Licensed under the Apache License, Version 2.0 (the License); you may 00017 * not use this file except in compliance with the License. 00018 * You may obtain a copy of the License at 00019 * 00020 * www.apache.org/licenses/LICENSE-2.0 00021 * 00022 * Unless required by applicable law or agreed to in writing, software 00023 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 00024 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00025 * See the License for the specific language governing permissions and 00026 * limitations under the License. 00027 */ 00028 00029 #include "arm_math.h" 00030 00031 /** 00032 * @ingroup groupMatrix 00033 */ 00034 00035 /** 00036 * @addtogroup CmplxMatrixMult 00037 * @{ 00038 */ 00039 00040 00041 /** 00042 * @brief Q15 Complex matrix multiplication 00043 * @param[in] *pSrcA points to the first input complex matrix structure 00044 * @param[in] *pSrcB points to the second input complex matrix structure 00045 * @param[out] *pDst points to output complex matrix structure 00046 * @param[in] *pScratch points to the array for storing intermediate results 00047 * @return The function returns either 00048 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. 
00049 * 00050 * \par Conditions for optimum performance 00051 * Input, output and state buffers should be aligned by 32-bit 00052 * 00053 * \par Restrictions 00054 * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE 00055 * In this case input, output, scratch buffers should be aligned by 32-bit 00056 * 00057 * @details 00058 * <b>Scaling and Overflow Behavior:</b> 00059 * 00060 * \par 00061 * The function is implemented using a 64-bit internal accumulator. The inputs to the 00062 * multiplications are in 1.15 format and multiplications yield a 2.30 result. 00063 * The 2.30 intermediate 00064 * results are accumulated in a 64-bit accumulator in 34.30 format. This approach 00065 * provides 33 guard bits and there is no risk of overflow. The 34.30 result is then 00066 * truncated to 34.15 format by discarding the low 15 bits and then saturated to 00067 * 1.15 format. 00068 * 00069 * \par 00070 * Refer to <code>arm_mat_mult_fast_q15()</code> for a faster but less precise version of this function. 
00071 * 00072 */ 00073 00074 00075 00076 00077 arm_status arm_mat_cmplx_mult_q15( 00078 const arm_matrix_instance_q15 * pSrcA, 00079 const arm_matrix_instance_q15 * pSrcB, 00080 arm_matrix_instance_q15 * pDst, 00081 q15_t * pScratch) 00082 { 00083 /* accumulator */ 00084 q15_t *pSrcBT = pScratch; /* input data matrix pointer for transpose */ 00085 q15_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q15 type */ 00086 q15_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q15 type */ 00087 q15_t *px; /* Temporary output data matrix pointer */ 00088 uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */ 00089 uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */ 00090 uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */ 00091 uint16_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */ 00092 uint16_t col, i = 0U, row = numRowsB, colCnt; /* loop counters */ 00093 arm_status status; /* status of matrix multiplication */ 00094 q63_t sumReal, sumImag; 00095 00096 #ifdef UNALIGNED_SUPPORT_DISABLE 00097 q15_t in; /* Temporary variable to hold the input value */ 00098 q15_t a, b, c, d; 00099 #else 00100 q31_t in; /* Temporary variable to hold the input value */ 00101 q31_t prod1, prod2; 00102 q31_t pSourceA, pSourceB; 00103 #endif 00104 00105 #ifdef ARM_MATH_MATRIX_CHECK 00106 /* Check for matrix mismatch condition */ 00107 if ((pSrcA->numCols != pSrcB->numRows) || 00108 (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols)) 00109 { 00110 /* Set status as ARM_MATH_SIZE_MISMATCH */ 00111 status = ARM_MATH_SIZE_MISMATCH; 00112 } 00113 else 00114 #endif 00115 { 00116 /* Matrix transpose */ 00117 do 00118 { 00119 /* Apply loop unrolling and exchange the columns with row elements */ 00120 col = numColsB >> 2; 00121 00122 /* The pointer px is set to starting address of the column being processed */ 00123 px = pSrcBT + i; 00124 00125 /* First part of the processing 
with loop unrolling. Compute 4 outputs at a time. 00126 ** a second loop below computes the remaining 1 to 3 samples. */ 00127 while (col > 0U) 00128 { 00129 #ifdef UNALIGNED_SUPPORT_DISABLE 00130 /* Read two elements from the row */ 00131 in = *pInB++; 00132 *px = in; 00133 in = *pInB++; 00134 px[1] = in; 00135 00136 /* Update the pointer px to point to the next row of the transposed matrix */ 00137 px += numRowsB * 2; 00138 00139 /* Read two elements from the row */ 00140 in = *pInB++; 00141 *px = in; 00142 in = *pInB++; 00143 px[1] = in; 00144 00145 /* Update the pointer px to point to the next row of the transposed matrix */ 00146 px += numRowsB * 2; 00147 00148 /* Read two elements from the row */ 00149 in = *pInB++; 00150 *px = in; 00151 in = *pInB++; 00152 px[1] = in; 00153 00154 /* Update the pointer px to point to the next row of the transposed matrix */ 00155 px += numRowsB * 2; 00156 00157 /* Read two elements from the row */ 00158 in = *pInB++; 00159 *px = in; 00160 in = *pInB++; 00161 px[1] = in; 00162 00163 /* Update the pointer px to point to the next row of the transposed matrix */ 00164 px += numRowsB * 2; 00165 00166 /* Decrement the column loop counter */ 00167 col--; 00168 } 00169 00170 /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here. 00171 ** No loop unrolling is used. 
*/ 00172 col = numColsB % 0x4U; 00173 00174 while (col > 0U) 00175 { 00176 /* Read two elements from the row */ 00177 in = *pInB++; 00178 *px = in; 00179 in = *pInB++; 00180 px[1] = in; 00181 #else 00182 00183 /* Read two elements from the row */ 00184 in = *__SIMD32(pInB)++; 00185 00186 *__SIMD32(px) = in; 00187 00188 /* Update the pointer px to point to the next row of the transposed matrix */ 00189 px += numRowsB * 2; 00190 00191 00192 /* Read two elements from the row */ 00193 in = *__SIMD32(pInB)++; 00194 00195 *__SIMD32(px) = in; 00196 00197 /* Update the pointer px to point to the next row of the transposed matrix */ 00198 px += numRowsB * 2; 00199 00200 /* Read two elements from the row */ 00201 in = *__SIMD32(pInB)++; 00202 00203 *__SIMD32(px) = in; 00204 00205 /* Update the pointer px to point to the next row of the transposed matrix */ 00206 px += numRowsB * 2; 00207 00208 /* Read two elements from the row */ 00209 in = *__SIMD32(pInB)++; 00210 00211 *__SIMD32(px) = in; 00212 00213 /* Update the pointer px to point to the next row of the transposed matrix */ 00214 px += numRowsB * 2; 00215 00216 /* Decrement the column loop counter */ 00217 col--; 00218 } 00219 00220 /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here. 00221 ** No loop unrolling is used. 
*/ 00222 col = numColsB % 0x4U; 00223 00224 while (col > 0U) 00225 { 00226 /* Read two elements from the row */ 00227 in = *__SIMD32(pInB)++; 00228 00229 *__SIMD32(px) = in; 00230 #endif 00231 00232 /* Update the pointer px to point to the next row of the transposed matrix */ 00233 px += numRowsB * 2; 00234 00235 /* Decrement the column loop counter */ 00236 col--; 00237 } 00238 00239 i = i + 2U; 00240 00241 /* Decrement the row loop counter */ 00242 row--; 00243 00244 } while (row > 0U); 00245 00246 /* Reset the variables for the usage in the following multiplication process */ 00247 row = numRowsA; 00248 i = 0U; 00249 px = pDst->pData; 00250 00251 /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */ 00252 /* row loop */ 00253 do 00254 { 00255 /* For every row wise process, the column loop counter is to be initiated */ 00256 col = numColsB; 00257 00258 /* For every row wise process, the pIn2 pointer is set 00259 ** to the starting address of the transposed pSrcB data */ 00260 pInB = pSrcBT; 00261 00262 /* column loop */ 00263 do 00264 { 00265 /* Set the variable sum, that acts as accumulator, to zero */ 00266 sumReal = 0; 00267 sumImag = 0; 00268 00269 /* Apply loop unrolling and compute 2 MACs simultaneously. */ 00270 colCnt = numColsA >> 1; 00271 00272 /* Initiate the pointer pIn1 to point to the starting address of the column being processed */ 00273 pInA = pSrcA->pData + i * 2; 00274 00275 00276 /* matrix multiplication */ 00277 while (colCnt > 0U) 00278 { 00279 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... 
+ a(m,p)*b(p,n) */ 00280 00281 #ifdef UNALIGNED_SUPPORT_DISABLE 00282 00283 /* read real and imag values from pSrcA buffer */ 00284 a = *pInA; 00285 b = *(pInA + 1U); 00286 /* read real and imag values from pSrcB buffer */ 00287 c = *pInB; 00288 d = *(pInB + 1U); 00289 00290 /* Multiply and Accumlates */ 00291 sumReal += (q31_t) a *c; 00292 sumImag += (q31_t) a *d; 00293 sumReal -= (q31_t) b *d; 00294 sumImag += (q31_t) b *c; 00295 00296 /* read next real and imag values from pSrcA buffer */ 00297 a = *(pInA + 2U); 00298 b = *(pInA + 3U); 00299 /* read next real and imag values from pSrcB buffer */ 00300 c = *(pInB + 2U); 00301 d = *(pInB + 3U); 00302 00303 /* update pointer */ 00304 pInA += 4U; 00305 00306 /* Multiply and Accumlates */ 00307 sumReal += (q31_t) a *c; 00308 sumImag += (q31_t) a *d; 00309 sumReal -= (q31_t) b *d; 00310 sumImag += (q31_t) b *c; 00311 /* update pointer */ 00312 pInB += 4U; 00313 #else 00314 /* read real and imag values from pSrcA and pSrcB buffer */ 00315 pSourceA = *__SIMD32(pInA)++; 00316 pSourceB = *__SIMD32(pInB)++; 00317 00318 /* Multiply and Accumlates */ 00319 #ifdef ARM_MATH_BIG_ENDIAN 00320 prod1 = -__SMUSD(pSourceA, pSourceB); 00321 #else 00322 prod1 = __SMUSD(pSourceA, pSourceB); 00323 #endif 00324 prod2 = __SMUADX(pSourceA, pSourceB); 00325 sumReal += (q63_t) prod1; 00326 sumImag += (q63_t) prod2; 00327 00328 /* read real and imag values from pSrcA and pSrcB buffer */ 00329 pSourceA = *__SIMD32(pInA)++; 00330 pSourceB = *__SIMD32(pInB)++; 00331 00332 /* Multiply and Accumlates */ 00333 #ifdef ARM_MATH_BIG_ENDIAN 00334 prod1 = -__SMUSD(pSourceA, pSourceB); 00335 #else 00336 prod1 = __SMUSD(pSourceA, pSourceB); 00337 #endif 00338 prod2 = __SMUADX(pSourceA, pSourceB); 00339 sumReal += (q63_t) prod1; 00340 sumImag += (q63_t) prod2; 00341 00342 #endif /* #ifdef UNALIGNED_SUPPORT_DISABLE */ 00343 00344 /* Decrement the loop counter */ 00345 colCnt--; 00346 } 00347 00348 /* process odd column samples */ 00349 if ((numColsA & 0x1U) 
> 0U) 00350 { 00351 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */ 00352 00353 #ifdef UNALIGNED_SUPPORT_DISABLE 00354 00355 /* read real and imag values from pSrcA and pSrcB buffer */ 00356 a = *pInA++; 00357 b = *pInA++; 00358 c = *pInB++; 00359 d = *pInB++; 00360 00361 /* Multiply and Accumlates */ 00362 sumReal += (q31_t) a *c; 00363 sumImag += (q31_t) a *d; 00364 sumReal -= (q31_t) b *d; 00365 sumImag += (q31_t) b *c; 00366 00367 #else 00368 /* read real and imag values from pSrcA and pSrcB buffer */ 00369 pSourceA = *__SIMD32(pInA)++; 00370 pSourceB = *__SIMD32(pInB)++; 00371 00372 /* Multiply and Accumlates */ 00373 #ifdef ARM_MATH_BIG_ENDIAN 00374 prod1 = -__SMUSD(pSourceA, pSourceB); 00375 #else 00376 prod1 = __SMUSD(pSourceA, pSourceB); 00377 #endif 00378 prod2 = __SMUADX(pSourceA, pSourceB); 00379 sumReal += (q63_t) prod1; 00380 sumImag += (q63_t) prod2; 00381 00382 #endif /* #ifdef UNALIGNED_SUPPORT_DISABLE */ 00383 00384 } 00385 00386 /* Saturate and store the result in the destination buffer */ 00387 00388 *px++ = (q15_t) (__SSAT(sumReal >> 15, 16)); 00389 *px++ = (q15_t) (__SSAT(sumImag >> 15, 16)); 00390 00391 /* Decrement the column loop counter */ 00392 col--; 00393 00394 } while (col > 0U); 00395 00396 i = i + numColsA; 00397 00398 /* Decrement the row loop counter */ 00399 row--; 00400 00401 } while (row > 0U); 00402 00403 /* set status as ARM_MATH_SUCCESS */ 00404 status = ARM_MATH_SUCCESS; 00405 } 00406 00407 /* Return to application */ 00408 return (status); 00409 } 00410 00411 /** 00412 * @} end of MatrixMult group 00413 */ 00414
Generated on Tue Jul 12 2022 16:47:27 by 1.7.2