Important changes to repositories hosted on mbed.com
Mbed hosted mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software, download the repository Zip archive or clone it locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
Fork of mbed-os by
arm_mat_cmplx_mult_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 19. March 2015 00005 * $Revision: V.1.4.5 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_cmplx_mat_mult_q15.c 00009 * 00010 * Description: Q15 complex matrix multiplication. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 #include "arm_math.h" 00041 00042 /** 00043 * @ingroup groupMatrix 00044 */ 00045 00046 /** 00047 * @addtogroup CmplxMatrixMult 00048 * @{ 00049 */ 00050 00051 00052 /** 00053 * @brief Q15 Complex matrix multiplication 00054 * @param[in] *pSrcA points to the first input complex matrix structure 00055 * @param[in] *pSrcB points to the second input complex matrix structure 00056 * @param[out] *pDst points to output complex matrix structure 00057 * @param[in] *pScratch points to the array for storing intermediate results 00058 * @return The function returns either 00059 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. 00060 * 00061 * \par Conditions for optimum performance 00062 * Input, output and state buffers should be aligned by 32-bit 00063 * 00064 * \par Restrictions 00065 * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE 00066 * In this case input, output, scratch buffers should be aligned by 32-bit 00067 * 00068 * @details 00069 * <b>Scaling and Overflow Behavior:</b> 00070 * 00071 * \par 00072 * The function is implemented using a 64-bit internal accumulator. The inputs to the 00073 * multiplications are in 1.15 format and multiplications yield a 2.30 result. 
00074 * The 2.30 intermediate 00075 * results are accumulated in a 64-bit accumulator in 34.30 format. This approach 00076 * provides 33 guard bits and there is no risk of overflow. The 34.30 result is then 00077 * truncated to 34.15 format by discarding the low 15 bits and then saturated to 00078 * 1.15 format. 00079 * 00080 * \par 00081 * Refer to <code>arm_mat_mult_fast_q15()</code> for a faster but less precise version of this function. 00082 * 00083 */ 00084 00085 00086 00087 00088 arm_status arm_mat_cmplx_mult_q15( 00089 const arm_matrix_instance_q15 * pSrcA, 00090 const arm_matrix_instance_q15 * pSrcB, 00091 arm_matrix_instance_q15 * pDst, 00092 q15_t * pScratch) 00093 { 00094 /* accumulator */ 00095 q15_t *pSrcBT = pScratch; /* input data matrix pointer for transpose */ 00096 q15_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q15 type */ 00097 q15_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q15 type */ 00098 q15_t *px; /* Temporary output data matrix pointer */ 00099 uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */ 00100 uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */ 00101 uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */ 00102 uint16_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */ 00103 uint16_t col, i = 0u, row = numRowsB, colCnt; /* loop counters */ 00104 arm_status status; /* status of matrix multiplication */ 00105 q63_t sumReal, sumImag; 00106 00107 #ifdef UNALIGNED_SUPPORT_DISABLE 00108 q15_t in; /* Temporary variable to hold the input value */ 00109 q15_t a, b, c, d; 00110 #else 00111 q31_t in; /* Temporary variable to hold the input value */ 00112 q31_t prod1, prod2; 00113 q31_t pSourceA, pSourceB; 00114 #endif 00115 00116 #ifdef ARM_MATH_MATRIX_CHECK 00117 /* Check for matrix mismatch condition */ 00118 if((pSrcA->numCols != pSrcB->numRows) || 00119 (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols)) 
00120 { 00121 /* Set status as ARM_MATH_SIZE_MISMATCH */ 00122 status = ARM_MATH_SIZE_MISMATCH; 00123 } 00124 else 00125 #endif 00126 { 00127 /* Matrix transpose */ 00128 do 00129 { 00130 /* Apply loop unrolling and exchange the columns with row elements */ 00131 col = numColsB >> 2; 00132 00133 /* The pointer px is set to starting address of the column being processed */ 00134 px = pSrcBT + i; 00135 00136 /* First part of the processing with loop unrolling. Compute 4 outputs at a time. 00137 ** a second loop below computes the remaining 1 to 3 samples. */ 00138 while(col > 0u) 00139 { 00140 #ifdef UNALIGNED_SUPPORT_DISABLE 00141 /* Read two elements from the row */ 00142 in = *pInB++; 00143 *px = in; 00144 in = *pInB++; 00145 px[1] = in; 00146 00147 /* Update the pointer px to point to the next row of the transposed matrix */ 00148 px += numRowsB * 2; 00149 00150 /* Read two elements from the row */ 00151 in = *pInB++; 00152 *px = in; 00153 in = *pInB++; 00154 px[1] = in; 00155 00156 /* Update the pointer px to point to the next row of the transposed matrix */ 00157 px += numRowsB * 2; 00158 00159 /* Read two elements from the row */ 00160 in = *pInB++; 00161 *px = in; 00162 in = *pInB++; 00163 px[1] = in; 00164 00165 /* Update the pointer px to point to the next row of the transposed matrix */ 00166 px += numRowsB * 2; 00167 00168 /* Read two elements from the row */ 00169 in = *pInB++; 00170 *px = in; 00171 in = *pInB++; 00172 px[1] = in; 00173 00174 /* Update the pointer px to point to the next row of the transposed matrix */ 00175 px += numRowsB * 2; 00176 00177 /* Decrement the column loop counter */ 00178 col--; 00179 } 00180 00181 /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here. 00182 ** No loop unrolling is used. 
*/ 00183 col = numColsB % 0x4u; 00184 00185 while(col > 0u) 00186 { 00187 /* Read two elements from the row */ 00188 in = *pInB++; 00189 *px = in; 00190 in = *pInB++; 00191 px[1] = in; 00192 #else 00193 00194 /* Read two elements from the row */ 00195 in = *__SIMD32(pInB)++; 00196 00197 *__SIMD32(px) = in; 00198 00199 /* Update the pointer px to point to the next row of the transposed matrix */ 00200 px += numRowsB * 2; 00201 00202 00203 /* Read two elements from the row */ 00204 in = *__SIMD32(pInB)++; 00205 00206 *__SIMD32(px) = in; 00207 00208 /* Update the pointer px to point to the next row of the transposed matrix */ 00209 px += numRowsB * 2; 00210 00211 /* Read two elements from the row */ 00212 in = *__SIMD32(pInB)++; 00213 00214 *__SIMD32(px) = in; 00215 00216 /* Update the pointer px to point to the next row of the transposed matrix */ 00217 px += numRowsB * 2; 00218 00219 /* Read two elements from the row */ 00220 in = *__SIMD32(pInB)++; 00221 00222 *__SIMD32(px) = in; 00223 00224 /* Update the pointer px to point to the next row of the transposed matrix */ 00225 px += numRowsB * 2; 00226 00227 /* Decrement the column loop counter */ 00228 col--; 00229 } 00230 00231 /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here. 00232 ** No loop unrolling is used. 
*/ 00233 col = numColsB % 0x4u; 00234 00235 while(col > 0u) 00236 { 00237 /* Read two elements from the row */ 00238 in = *__SIMD32(pInB)++; 00239 00240 *__SIMD32(px) = in; 00241 #endif 00242 00243 /* Update the pointer px to point to the next row of the transposed matrix */ 00244 px += numRowsB * 2; 00245 00246 /* Decrement the column loop counter */ 00247 col--; 00248 } 00249 00250 i = i + 2u; 00251 00252 /* Decrement the row loop counter */ 00253 row--; 00254 00255 } while(row > 0u); 00256 00257 /* Reset the variables for the usage in the following multiplication process */ 00258 row = numRowsA; 00259 i = 0u; 00260 px = pDst->pData; 00261 00262 /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */ 00263 /* row loop */ 00264 do 00265 { 00266 /* For every row wise process, the column loop counter is to be initiated */ 00267 col = numColsB; 00268 00269 /* For every row wise process, the pIn2 pointer is set 00270 ** to the starting address of the transposed pSrcB data */ 00271 pInB = pSrcBT; 00272 00273 /* column loop */ 00274 do 00275 { 00276 /* Set the variable sum, that acts as accumulator, to zero */ 00277 sumReal = 0; 00278 sumImag = 0; 00279 00280 /* Apply loop unrolling and compute 2 MACs simultaneously. */ 00281 colCnt = numColsA >> 1; 00282 00283 /* Initiate the pointer pIn1 to point to the starting address of the column being processed */ 00284 pInA = pSrcA->pData + i * 2; 00285 00286 00287 /* matrix multiplication */ 00288 while(colCnt > 0u) 00289 { 00290 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... 
+ a(m,p)*b(p,n) */ 00291 00292 #ifdef UNALIGNED_SUPPORT_DISABLE 00293 00294 /* read real and imag values from pSrcA buffer */ 00295 a = *pInA; 00296 b = *(pInA + 1u); 00297 /* read real and imag values from pSrcB buffer */ 00298 c = *pInB; 00299 d = *(pInB + 1u); 00300 00301 /* Multiply and Accumlates */ 00302 sumReal += (q31_t) a *c; 00303 sumImag += (q31_t) a *d; 00304 sumReal -= (q31_t) b *d; 00305 sumImag += (q31_t) b *c; 00306 00307 /* read next real and imag values from pSrcA buffer */ 00308 a = *(pInA + 2u); 00309 b = *(pInA + 3u); 00310 /* read next real and imag values from pSrcB buffer */ 00311 c = *(pInB + 2u); 00312 d = *(pInB + 3u); 00313 00314 /* update pointer */ 00315 pInA += 4u; 00316 00317 /* Multiply and Accumlates */ 00318 sumReal += (q31_t) a *c; 00319 sumImag += (q31_t) a *d; 00320 sumReal -= (q31_t) b *d; 00321 sumImag += (q31_t) b *c; 00322 /* update pointer */ 00323 pInB += 4u; 00324 #else 00325 /* read real and imag values from pSrcA and pSrcB buffer */ 00326 pSourceA = *__SIMD32(pInA)++; 00327 pSourceB = *__SIMD32(pInB)++; 00328 00329 /* Multiply and Accumlates */ 00330 #ifdef ARM_MATH_BIG_ENDIAN 00331 prod1 = -__SMUSD(pSourceA, pSourceB); 00332 #else 00333 prod1 = __SMUSD(pSourceA, pSourceB); 00334 #endif 00335 prod2 = __SMUADX(pSourceA, pSourceB); 00336 sumReal += (q63_t) prod1; 00337 sumImag += (q63_t) prod2; 00338 00339 /* read real and imag values from pSrcA and pSrcB buffer */ 00340 pSourceA = *__SIMD32(pInA)++; 00341 pSourceB = *__SIMD32(pInB)++; 00342 00343 /* Multiply and Accumlates */ 00344 #ifdef ARM_MATH_BIG_ENDIAN 00345 prod1 = -__SMUSD(pSourceA, pSourceB); 00346 #else 00347 prod1 = __SMUSD(pSourceA, pSourceB); 00348 #endif 00349 prod2 = __SMUADX(pSourceA, pSourceB); 00350 sumReal += (q63_t) prod1; 00351 sumImag += (q63_t) prod2; 00352 00353 #endif /* #ifdef UNALIGNED_SUPPORT_DISABLE */ 00354 00355 /* Decrement the loop counter */ 00356 colCnt--; 00357 } 00358 00359 /* process odd column samples */ 00360 if((numColsA & 0x1u) 
> 0u) 00361 { 00362 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */ 00363 00364 #ifdef UNALIGNED_SUPPORT_DISABLE 00365 00366 /* read real and imag values from pSrcA and pSrcB buffer */ 00367 a = *pInA++; 00368 b = *pInA++; 00369 c = *pInB++; 00370 d = *pInB++; 00371 00372 /* Multiply and Accumlates */ 00373 sumReal += (q31_t) a *c; 00374 sumImag += (q31_t) a *d; 00375 sumReal -= (q31_t) b *d; 00376 sumImag += (q31_t) b *c; 00377 00378 #else 00379 /* read real and imag values from pSrcA and pSrcB buffer */ 00380 pSourceA = *__SIMD32(pInA)++; 00381 pSourceB = *__SIMD32(pInB)++; 00382 00383 /* Multiply and Accumlates */ 00384 #ifdef ARM_MATH_BIG_ENDIAN 00385 prod1 = -__SMUSD(pSourceA, pSourceB); 00386 #else 00387 prod1 = __SMUSD(pSourceA, pSourceB); 00388 #endif 00389 prod2 = __SMUADX(pSourceA, pSourceB); 00390 sumReal += (q63_t) prod1; 00391 sumImag += (q63_t) prod2; 00392 00393 #endif /* #ifdef UNALIGNED_SUPPORT_DISABLE */ 00394 00395 } 00396 00397 /* Saturate and store the result in the destination buffer */ 00398 00399 *px++ = (q15_t) (__SSAT(sumReal >> 15, 16)); 00400 *px++ = (q15_t) (__SSAT(sumImag >> 15, 16)); 00401 00402 /* Decrement the column loop counter */ 00403 col--; 00404 00405 } while(col > 0u); 00406 00407 i = i + numColsA; 00408 00409 /* Decrement the row loop counter */ 00410 row--; 00411 00412 } while(row > 0u); 00413 00414 /* set status as ARM_MATH_SUCCESS */ 00415 status = ARM_MATH_SUCCESS; 00416 } 00417 00418 /* Return to application */ 00419 return (status); 00420 } 00421 00422 /** 00423 * @} end of MatrixMult group 00424 */
Generated on Tue Jul 12 2022 13:15:25 by
1.7.2
