CMSIS DSP library
Dependents: performance_timer Surfboard_ gps2rtty Capstone ... more
arm_mat_cmplx_mult_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 19. March 2015 00005 * $Revision: V.1.4.5 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_cmplx_mat_mult_q15.c 00009 * 00010 * Description: Q15 complex matrix multiplication. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 #include "arm_math.h" 00041 00042 /** 00043 * @ingroup groupMatrix 00044 */ 00045 00046 /** 00047 * @addtogroup CmplxMatrixMult 00048 * @{ 00049 */ 00050 00051 00052 /** 00053 * @brief Q15 Complex matrix multiplication 00054 * @param[in] *pSrcA points to the first input complex matrix structure 00055 * @param[in] *pSrcB points to the second input complex matrix structure 00056 * @param[out] *pDst points to output complex matrix structure 00057 * @param[in] *pScratch points to the array for storing intermediate results 00058 * @return The function returns either 00059 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. 00060 * 00061 * \par Conditions for optimum performance 00062 * Input, output and state buffers should be aligned by 32-bit 00063 * 00064 * \par Restrictions 00065 * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE 00066 * In this case input, output, scratch buffers should be aligned by 32-bit 00067 * 00068 * @details 00069 * <b>Scaling and Overflow Behavior:</b> 00070 * 00071 * \par 00072 * The function is implemented using a 64-bit internal accumulator. The inputs to the 00073 * multiplications are in 1.15 format and multiplications yield a 2.30 result. 00074 * The 2.30 intermediate 00075 * results are accumulated in a 64-bit accumulator in 34.30 format. This approach 00076 * provides 33 guard bits and there is no risk of overflow. The 34.30 result is then 00077 * truncated to 34.15 format by discarding the low 15 bits and then saturated to 00078 * 1.15 format. 00079 * 00080 * \par 00081 * Refer to <code>arm_mat_mult_fast_q15()</code> for a faster but less precise version of this function. 00082 * 00083 */ 00084 00085 00086 00087 00088 arm_status arm_mat_cmplx_mult_q15( 00089 const arm_matrix_instance_q15 * pSrcA, 00090 const arm_matrix_instance_q15 * pSrcB, 00091 arm_matrix_instance_q15 * pDst, 00092 q15_t * pScratch) 00093 { 00094 /* accumulator */ 00095 q15_t *pSrcBT = pScratch; /* input data matrix pointer for transpose */ 00096 q15_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q15 type */ 00097 q15_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q15 type */ 00098 q15_t *px; /* Temporary output data matrix pointer */ 00099 uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */ 00100 uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */ 00101 uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */ 00102 uint16_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */ 00103 uint16_t col, i = 0u, row = numRowsB, colCnt; /* loop counters */ 00104 arm_status status; /* status of matrix multiplication */ 00105 q63_t sumReal, sumImag; 00106 00107 #ifdef UNALIGNED_SUPPORT_DISABLE 00108 q15_t in; /* Temporary variable to hold the input value */ 00109 q15_t a, b, c, d; 00110 #else 00111 q31_t in; /* Temporary variable to hold the input value */ 00112 q31_t prod1, prod2; 00113 q31_t pSourceA, pSourceB; 00114 #endif 00115 00116 #ifdef ARM_MATH_MATRIX_CHECK 00117 /* Check for matrix mismatch condition */ 00118 if((pSrcA->numCols != pSrcB->numRows) || 00119 (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols)) 00120 { 00121 /* Set status as ARM_MATH_SIZE_MISMATCH */ 00122 status = ARM_MATH_SIZE_MISMATCH; 00123 } 00124 else 00125 #endif 00126 { 00127 /* Matrix transpose */ 00128 do 00129 { 00130 /* Apply loop unrolling and exchange the columns with row elements */ 00131 col = numColsB >> 2; 00132 00133 /* The pointer px is set to starting address of the column being processed */ 00134 px = pSrcBT + i; 00135 00136 /* First part of the processing with loop unrolling. Compute 4 outputs at a time. 00137 ** a second loop below computes the remaining 1 to 3 samples. */ 00138 while(col > 0u) 00139 { 00140 #ifdef UNALIGNED_SUPPORT_DISABLE 00141 /* Read two elements from the row */ 00142 in = *pInB++; 00143 *px = in; 00144 in = *pInB++; 00145 px[1] = in; 00146 00147 /* Update the pointer px to point to the next row of the transposed matrix */ 00148 px += numRowsB * 2; 00149 00150 /* Read two elements from the row */ 00151 in = *pInB++; 00152 *px = in; 00153 in = *pInB++; 00154 px[1] = in; 00155 00156 /* Update the pointer px to point to the next row of the transposed matrix */ 00157 px += numRowsB * 2; 00158 00159 /* Read two elements from the row */ 00160 in = *pInB++; 00161 *px = in; 00162 in = *pInB++; 00163 px[1] = in; 00164 00165 /* Update the pointer px to point to the next row of the transposed matrix */ 00166 px += numRowsB * 2; 00167 00168 /* Read two elements from the row */ 00169 in = *pInB++; 00170 *px = in; 00171 in = *pInB++; 00172 px[1] = in; 00173 00174 /* Update the pointer px to point to the next row of the transposed matrix */ 00175 px += numRowsB * 2; 00176 00177 /* Decrement the column loop counter */ 00178 col--; 00179 } 00180 00181 /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here. 00182 ** No loop unrolling is used. */ 00183 col = numColsB % 0x4u; 00184 00185 while(col > 0u) 00186 { 00187 /* Read two elements from the row */ 00188 in = *pInB++; 00189 *px = in; 00190 in = *pInB++; 00191 px[1] = in; 00192 #else 00193 00194 /* Read two elements from the row */ 00195 in = *__SIMD32(pInB)++; 00196 00197 *__SIMD32(px) = in; 00198 00199 /* Update the pointer px to point to the next row of the transposed matrix */ 00200 px += numRowsB * 2; 00201 00202 00203 /* Read two elements from the row */ 00204 in = *__SIMD32(pInB)++; 00205 00206 *__SIMD32(px) = in; 00207 00208 /* Update the pointer px to point to the next row of the transposed matrix */ 00209 px += numRowsB * 2; 00210 00211 /* Read two elements from the row */ 00212 in = *__SIMD32(pInB)++; 00213 00214 *__SIMD32(px) = in; 00215 00216 /* Update the pointer px to point to the next row of the transposed matrix */ 00217 px += numRowsB * 2; 00218 00219 /* Read two elements from the row */ 00220 in = *__SIMD32(pInB)++; 00221 00222 *__SIMD32(px) = in; 00223 00224 /* Update the pointer px to point to the next row of the transposed matrix */ 00225 px += numRowsB * 2; 00226 00227 /* Decrement the column loop counter */ 00228 col--; 00229 } 00230 00231 /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here. 00232 ** No loop unrolling is used. */ 00233 col = numColsB % 0x4u; 00234 00235 while(col > 0u) 00236 { 00237 /* Read two elements from the row */ 00238 in = *__SIMD32(pInB)++; 00239 00240 *__SIMD32(px) = in; 00241 #endif 00242 00243 /* Update the pointer px to point to the next row of the transposed matrix */ 00244 px += numRowsB * 2; 00245 00246 /* Decrement the column loop counter */ 00247 col--; 00248 } 00249 00250 i = i + 2u; 00251 00252 /* Decrement the row loop counter */ 00253 row--; 00254 00255 } while(row > 0u); 00256 00257 /* Reset the variables for the usage in the following multiplication process */ 00258 row = numRowsA; 00259 i = 0u; 00260 px = pDst->pData; 00261 00262 /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */ 00263 /* row loop */ 00264 do 00265 { 00266 /* For every row wise process, the column loop counter is to be initiated */ 00267 col = numColsB; 00268 00269 /* For every row wise process, the pIn2 pointer is set 00270 ** to the starting address of the transposed pSrcB data */ 00271 pInB = pSrcBT; 00272 00273 /* column loop */ 00274 do 00275 { 00276 /* Set the variable sum, that acts as accumulator, to zero */ 00277 sumReal = 0; 00278 sumImag = 0; 00279 00280 /* Apply loop unrolling and compute 2 MACs simultaneously. */ 00281 colCnt = numColsA >> 1; 00282 00283 /* Initiate the pointer pIn1 to point to the starting address of the column being processed */ 00284 pInA = pSrcA->pData + i * 2; 00285 00286 00287 /* matrix multiplication */ 00288 while(colCnt > 0u) 00289 { 00290 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */ 00291 00292 #ifdef UNALIGNED_SUPPORT_DISABLE 00293 00294 /* read real and imag values from pSrcA buffer */ 00295 a = *pInA; 00296 b = *(pInA + 1u); 00297 /* read real and imag values from pSrcB buffer */ 00298 c = *pInB; 00299 d = *(pInB + 1u); 00300 00301 /* Multiply and Accumlates */ 00302 sumReal += (q31_t) a *c; 00303 sumImag += (q31_t) a *d; 00304 sumReal -= (q31_t) b *d; 00305 sumImag += (q31_t) b *c; 00306 00307 /* read next real and imag values from pSrcA buffer */ 00308 a = *(pInA + 2u); 00309 b = *(pInA + 3u); 00310 /* read next real and imag values from pSrcB buffer */ 00311 c = *(pInB + 2u); 00312 d = *(pInB + 3u); 00313 00314 /* update pointer */ 00315 pInA += 4u; 00316 00317 /* Multiply and Accumlates */ 00318 sumReal += (q31_t) a *c; 00319 sumImag += (q31_t) a *d; 00320 sumReal -= (q31_t) b *d; 00321 sumImag += (q31_t) b *c; 00322 /* update pointer */ 00323 pInB += 4u; 00324 #else 00325 /* read real and imag values from pSrcA and pSrcB buffer */ 00326 pSourceA = *__SIMD32(pInA)++; 00327 pSourceB = *__SIMD32(pInB)++; 00328 00329 /* Multiply and Accumlates */ 00330 #ifdef ARM_MATH_BIG_ENDIAN 00331 prod1 = -__SMUSD(pSourceA, pSourceB); 00332 #else 00333 prod1 = __SMUSD(pSourceA, pSourceB); 00334 #endif 00335 prod2 = __SMUADX(pSourceA, pSourceB); 00336 sumReal += (q63_t) prod1; 00337 sumImag += (q63_t) prod2; 00338 00339 /* read real and imag values from pSrcA and pSrcB buffer */ 00340 pSourceA = *__SIMD32(pInA)++; 00341 pSourceB = *__SIMD32(pInB)++; 00342 00343 /* Multiply and Accumlates */ 00344 #ifdef ARM_MATH_BIG_ENDIAN 00345 prod1 = -__SMUSD(pSourceA, pSourceB); 00346 #else 00347 prod1 = __SMUSD(pSourceA, pSourceB); 00348 #endif 00349 prod2 = __SMUADX(pSourceA, pSourceB); 00350 sumReal += (q63_t) prod1; 00351 sumImag += (q63_t) prod2; 00352 00353 #endif /* #ifdef UNALIGNED_SUPPORT_DISABLE */ 00354 00355 /* Decrement the loop counter */ 00356 colCnt--; 00357 } 00358 00359 /* process odd column samples */ 00360 if((numColsA & 0x1u) > 0u) 00361 { 00362 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */ 00363 00364 #ifdef UNALIGNED_SUPPORT_DISABLE 00365 00366 /* read real and imag values from pSrcA and pSrcB buffer */ 00367 a = *pInA++; 00368 b = *pInA++; 00369 c = *pInB++; 00370 d = *pInB++; 00371 00372 /* Multiply and Accumlates */ 00373 sumReal += (q31_t) a *c; 00374 sumImag += (q31_t) a *d; 00375 sumReal -= (q31_t) b *d; 00376 sumImag += (q31_t) b *c; 00377 00378 #else 00379 /* read real and imag values from pSrcA and pSrcB buffer */ 00380 pSourceA = *__SIMD32(pInA)++; 00381 pSourceB = *__SIMD32(pInB)++; 00382 00383 /* Multiply and Accumlates */ 00384 #ifdef ARM_MATH_BIG_ENDIAN 00385 prod1 = -__SMUSD(pSourceA, pSourceB); 00386 #else 00387 prod1 = __SMUSD(pSourceA, pSourceB); 00388 #endif 00389 prod2 = __SMUADX(pSourceA, pSourceB); 00390 sumReal += (q63_t) prod1; 00391 sumImag += (q63_t) prod2; 00392 00393 #endif /* #ifdef UNALIGNED_SUPPORT_DISABLE */ 00394 00395 } 00396 00397 /* Saturate and store the result in the destination buffer */ 00398 00399 *px++ = (q15_t) (__SSAT(sumReal >> 15, 16)); 00400 *px++ = (q15_t) (__SSAT(sumImag >> 15, 16)); 00401 00402 /* Decrement the column loop counter */ 00403 col--; 00404 00405 } while(col > 0u); 00406 00407 i = i + numColsA; 00408 00409 /* Decrement the row loop counter */ 00410 row--; 00411 00412 } while(row > 0u); 00413 00414 /* set status as ARM_MATH_SUCCESS */ 00415 status = ARM_MATH_SUCCESS; 00416 } 00417 00418 /* Return to application */ 00419 return (status); 00420 } 00421 00422 /** 00423 * @} end of MatrixMult group 00424 */
Generated on Tue Jul 12 2022 11:59:18 by 1.7.2