Added CMSIS5 DSP and NN folder. Needs some work

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_mat_cmplx_mult_q15.c Source File

arm_mat_cmplx_mult_q15.c

00001 /* ----------------------------------------------------------------------
00002  * Project:      CMSIS DSP Library
00003  * Title:        arm_mat_cmplx_mult_q15.c
00004  * Description:  Q15 complex matrix multiplication
00005  *
00006  * $Date:        27. January 2017
00007  * $Revision:    V.1.5.1
00008  *
00009  * Target Processor: Cortex-M cores
00010  * -------------------------------------------------------------------- */
00011 /*
00012  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
00013  *
00014  * SPDX-License-Identifier: Apache-2.0
00015  *
00016  * Licensed under the Apache License, Version 2.0 (the License); you may
00017  * not use this file except in compliance with the License.
00018  * You may obtain a copy of the License at
00019  *
00020  * www.apache.org/licenses/LICENSE-2.0
00021  *
00022  * Unless required by applicable law or agreed to in writing, software
00023  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
00024  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00025  * See the License for the specific language governing permissions and
00026  * limitations under the License.
00027  */
00028 
00029 #include "arm_math.h"
00030 
00031 /**
00032  * @ingroup groupMatrix
00033  */
00034 
00035 /**
00036  * @addtogroup CmplxMatrixMult
00037  * @{
00038  */
00039 
00040 
00041 /**
00042  * @brief Q15 Complex matrix multiplication
00043  * @param[in]       *pSrcA points to the first input complex matrix structure
00044  * @param[in]       *pSrcB points to the second input complex matrix structure
00045  * @param[out]      *pDst points to output complex matrix structure
00046  * @param[in]       *pScratch points to the array for storing intermediate results
00047  * @return          The function returns either
00048  * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
00049  *
00050  * \par Conditions for optimum performance
00051  *  Input, output and scratch buffers should be 32-bit aligned
00052  *
00053  * \par Restrictions
00054  *  If the silicon does not support unaligned memory access, enable the macro UNALIGNED_SUPPORT_DISABLE.
00055  *  In this case the input, output and scratch buffers should be 32-bit aligned.
00056  *
00057  * @details
00058  * <b>Scaling and Overflow Behavior:</b>
00059  *
00060  * \par
00061  * The function is implemented using a 64-bit internal accumulator. The inputs to the
00062  * multiplications are in 1.15 format and multiplications yield a 2.30 result.
00063  * The 2.30 intermediate
00064  * results are accumulated in a 64-bit accumulator in 34.30 format. This approach
00065  * provides 33 guard bits and there is no risk of overflow. The 34.30 result is then
00066  * truncated to 34.15 format by discarding the low 15 bits and then saturated to
00067  * 1.15 format.
00068  *
00069  * \par
00070  * Refer to <code>arm_mat_mult_fast_q15()</code> for a faster but less precise version of this function.
00071  *
00072  */
00073 
00074 
00075 
00076 
00077 arm_status arm_mat_cmplx_mult_q15(
00078   const arm_matrix_instance_q15 * pSrcA,
00079   const arm_matrix_instance_q15 * pSrcB,
00080   arm_matrix_instance_q15 * pDst,
00081   q15_t * pScratch)
00082 {
00083   /* Computes pDst = pSrcA * pSrcB on interleaved (real, imag) q15 data: pSrcB is first transposed into pScratch, then each output element is a dot product accumulated in 64 bits */
00084   q15_t *pSrcBT = pScratch;                      /* scratch buffer that receives the transpose of pSrcB */
00085   q15_t *pInA = pSrcA->pData;                    /* input data matrix pointer A of Q15 type */
00086   q15_t *pInB = pSrcB->pData;                    /* input data matrix pointer B of Q15 type */
00087   q15_t *px;                                     /* write pointer: transposed pSrcB first, then pDst data */
00088   uint16_t numRowsA = pSrcA->numRows;            /* number of rows of input matrix A    */
00089   uint16_t numColsB = pSrcB->numCols;            /* number of columns of input matrix B */
00090   uint16_t numColsA = pSrcA->numCols;            /* number of columns of input matrix A */
00091   uint16_t numRowsB = pSrcB->numRows;            /* number of rows of input matrix B    */
00092   uint16_t col, i = 0U, row = numRowsB, colCnt;  /* loop counters; row starts at numRowsB for the transpose pass */
00093   arm_status status;                             /* status of matrix multiplication */
00094   q63_t sumReal, sumImag;                        /* 64-bit accumulators for the real and imaginary parts */
00095 
00096 #ifdef UNALIGNED_SUPPORT_DISABLE
00097   q15_t in;                                      /* Temporary variable to hold one q15 input value */
00098   q15_t a, b, c, d;                              /* a, b: real/imag of the A element; c, d: real/imag of the B element */
00099 #else
00100   q31_t in;                                      /* Temporary variable to hold the value read through __SIMD32 */
00101   q31_t prod1, prod2;                            /* real (prod1) and imaginary (prod2) partial products */
00102   q31_t pSourceA, pSourceB;                      /* values read from pInA / pInB through __SIMD32 */
00103 #endif
00104 
00105 #ifdef ARM_MATH_MATRIX_CHECK
00106   /* Check for matrix mismatch condition */
00107   if ((pSrcA->numCols != pSrcB->numRows) ||
00108      (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
00109   {
00110     /* Set status as ARM_MATH_SIZE_MISMATCH */
00111     status = ARM_MATH_SIZE_MISMATCH;
00112   }
00113   else
00114 #endif
00115   {
00116     /* Matrix transpose: one pass of this loop per row of pSrcB */
00117     do
00118     {
00119       /* Apply loop unrolling and exchange the columns with row elements */
00120       col = numColsB >> 2;
00121 
00122       /* The pointer px is set to starting address of the column being processed */
00123       px = pSrcBT + i;
00124 
00125       /* First part of the processing with loop unrolling.  Copy 4 complex elements at a time.
00126        ** a second loop below copies the remaining 1 to 3 elements. */
00127       while (col > 0U)
00128       {
00129 #ifdef UNALIGNED_SUPPORT_DISABLE
00130         /* Copy two q15 values (one real/imag pair) from the row */
00131         in = *pInB++;
00132         *px = in;
00133         in = *pInB++;
00134         px[1] = in;
00135 
00136         /* Update the pointer px to point to the next row of the transposed matrix */
00137         px += numRowsB * 2;
00138 
00139         /* Copy two q15 values (one real/imag pair) from the row */
00140         in = *pInB++;
00141         *px = in;
00142         in = *pInB++;
00143         px[1] = in;
00144 
00145         /* Update the pointer px to point to the next row of the transposed matrix */
00146         px += numRowsB * 2;
00147 
00148         /* Copy two q15 values (one real/imag pair) from the row */
00149         in = *pInB++;
00150         *px = in;
00151         in = *pInB++;
00152         px[1] = in;
00153 
00154         /* Update the pointer px to point to the next row of the transposed matrix */
00155         px += numRowsB * 2;
00156 
00157         /* Copy two q15 values (one real/imag pair) from the row */
00158         in = *pInB++;
00159         *px = in;
00160         in = *pInB++;
00161         px[1] = in;
00162 
00163         /* Update the pointer px to point to the next row of the transposed matrix */
00164         px += numRowsB * 2;
00165 
00166         /* Decrement the column loop counter */
00167         col--;
00168       }
00169 
00170       /* If the number of columns of pSrcB is not a multiple of 4, copy the remaining elements here.
00171        ** No loop unrolling is used. */
00172       col = numColsB % 0x4U;
00173 
00174       while (col > 0U)
00175       {
00176         /* Copy two q15 values (one real/imag pair) from the row */
00177         in = *pInB++;
00178         *px = in;
00179         in = *pInB++;
00180         px[1] = in;
00181 #else
00182 
00183         /* Copy one real/imag pair from the row with a single __SIMD32 access */
00184         in = *__SIMD32(pInB)++;
00185 
00186         *__SIMD32(px) = in;
00187 
00188         /* Update the pointer px to point to the next row of the transposed matrix */
00189         px += numRowsB * 2;
00190 
00191 
00192         /* Copy one real/imag pair from the row with a single __SIMD32 access */
00193         in = *__SIMD32(pInB)++;
00194 
00195         *__SIMD32(px) = in;
00196 
00197         /* Update the pointer px to point to the next row of the transposed matrix */
00198         px += numRowsB * 2;
00199 
00200         /* Copy one real/imag pair from the row with a single __SIMD32 access */
00201         in = *__SIMD32(pInB)++;
00202 
00203         *__SIMD32(px) = in;
00204 
00205         /* Update the pointer px to point to the next row of the transposed matrix */
00206         px += numRowsB * 2;
00207 
00208         /* Copy one real/imag pair from the row with a single __SIMD32 access */
00209         in = *__SIMD32(pInB)++;
00210 
00211         *__SIMD32(px) = in;
00212 
00213         /* Update the pointer px to point to the next row of the transposed matrix */
00214         px += numRowsB * 2;
00215 
00216         /* Decrement the column loop counter */
00217         col--;
00218       }
00219 
00220       /* If the number of columns of pSrcB is not a multiple of 4, copy the remaining elements here.
00221        ** No loop unrolling is used. */
00222       col = numColsB % 0x4U;
00223 
00224       while (col > 0U)
00225       {
00226         /* Copy one real/imag pair from the row with a single __SIMD32 access */
00227         in = *__SIMD32(pInB)++;
00228 
00229         *__SIMD32(px) = in;
00230 #endif
00231 
00232         /* Update the pointer px to point to the next row of the transposed matrix */
00233         px += numRowsB * 2;
00234 
00235         /* Decrement the column loop counter */
00236         col--;
00237       }
00238 
00239       i = i + 2U;                                /* next row of pSrcB lands one real/imag pair (2 q15 values) further into pSrcBT */
00240 
00241       /* Decrement the row loop counter */
00242       row--;
00243 
00244     } while (row > 0U);
00245 
00246     /* Reset the variables for the usage in the following multiplication process */
00247     row = numRowsA;
00248     i = 0U;
00249     px = pDst->pData;
00250 
00251     /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
00252     /* row loop */
00253     do
00254     {
00255       /* Reset the column loop counter for every row of pSrcA */
00256       col = numColsB;
00257 
00258       /* For every row of pSrcA, the pInB pointer is set
00259        ** to the starting address of the transposed pSrcB data */
00260       pInB = pSrcBT;
00261 
00262       /* column loop */
00263       do
00264       {
00265         /* Clear the real and imaginary accumulators */
00266         sumReal = 0;
00267         sumImag = 0;
00268 
00269         /* Apply loop unrolling and process 2 complex elements per iteration. */
00270         colCnt = numColsA >> 1;
00271 
00272         /* Set pInA to the start of the pSrcA row being processed (i counts real/imag pairs, hence the * 2) */
00273         pInA = pSrcA->pData + i * 2;
00274 
00275 
00276         /* matrix multiplication */
00277         while (colCnt > 0U)
00278         {
00279           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
00280 
00281 #ifdef UNALIGNED_SUPPORT_DISABLE
00282 
00283           /* read real and imag values from pSrcA buffer */
00284           a = *pInA;
00285           b = *(pInA + 1U);
00286           /* read real and imag values from the transposed pSrcB buffer */
00287           c = *pInB;
00288           d = *(pInB + 1U);
00289 
00290           /* Multiply and accumulate: real += a*c - b*d, imag += a*d + b*c */
00291           sumReal += (q31_t) a *c;
00292           sumImag += (q31_t) a *d;
00293           sumReal -= (q31_t) b *d;
00294           sumImag += (q31_t) b *c;
00295 
00296           /* read next real and imag values from pSrcA buffer */
00297           a = *(pInA + 2U);
00298           b = *(pInA + 3U);
00299           /* read next real and imag values from the transposed pSrcB buffer */
00300           c = *(pInB + 2U);
00301           d = *(pInB + 3U);
00302 
00303           /* update pointer */
00304           pInA += 4U;
00305 
00306           /* Multiply and accumulate: real += a*c - b*d, imag += a*d + b*c */
00307           sumReal += (q31_t) a *c;
00308           sumImag += (q31_t) a *d;
00309           sumReal -= (q31_t) b *d;
00310           sumImag += (q31_t) b *c;
00311           /* update pointer */
00312           pInB += 4U;
00313 #else
00314           /* read real and imag values from pSrcA and transposed pSrcB buffer */
00315           pSourceA = *__SIMD32(pInA)++;
00316           pSourceB = *__SIMD32(pInB)++;
00317 
00318           /* Multiply and accumulate (the __SMUSD result is negated on big-endian builds) */
00319 #ifdef ARM_MATH_BIG_ENDIAN
00320           prod1 = -__SMUSD(pSourceA, pSourceB);
00321 #else
00322           prod1 = __SMUSD(pSourceA, pSourceB);
00323 #endif
00324           prod2 = __SMUADX(pSourceA, pSourceB);
00325           sumReal += (q63_t) prod1;
00326           sumImag += (q63_t) prod2;
00327 
00328           /* read real and imag values from pSrcA and transposed pSrcB buffer */
00329           pSourceA = *__SIMD32(pInA)++;
00330           pSourceB = *__SIMD32(pInB)++;
00331 
00332           /* Multiply and accumulate (the __SMUSD result is negated on big-endian builds) */
00333 #ifdef ARM_MATH_BIG_ENDIAN
00334           prod1 = -__SMUSD(pSourceA, pSourceB);
00335 #else
00336           prod1 = __SMUSD(pSourceA, pSourceB);
00337 #endif
00338           prod2 = __SMUADX(pSourceA, pSourceB);
00339           sumReal += (q63_t) prod1;
00340           sumImag += (q63_t) prod2;
00341 
00342 #endif /*      #ifdef UNALIGNED_SUPPORT_DISABLE */
00343 
00344           /* Decrement the loop counter */
00345           colCnt--;
00346         }
00347 
00348         /* Process the one remaining element when numColsA is odd */
00349         if ((numColsA & 0x1U) > 0U)
00350         {
00351           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
00352 
00353 #ifdef UNALIGNED_SUPPORT_DISABLE
00354 
00355           /* read real and imag values from pSrcA and transposed pSrcB buffer */
00356           a = *pInA++;
00357           b = *pInA++;
00358           c = *pInB++;
00359           d = *pInB++;
00360 
00361           /* Multiply and accumulate: real += a*c - b*d, imag += a*d + b*c */
00362           sumReal += (q31_t) a *c;
00363           sumImag += (q31_t) a *d;
00364           sumReal -= (q31_t) b *d;
00365           sumImag += (q31_t) b *c;
00366 
00367 #else
00368           /* read real and imag values from pSrcA and transposed pSrcB buffer */
00369           pSourceA = *__SIMD32(pInA)++;
00370           pSourceB = *__SIMD32(pInB)++;
00371 
00372           /* Multiply and accumulate (the __SMUSD result is negated on big-endian builds) */
00373 #ifdef ARM_MATH_BIG_ENDIAN
00374           prod1 = -__SMUSD(pSourceA, pSourceB);
00375 #else
00376           prod1 = __SMUSD(pSourceA, pSourceB);
00377 #endif
00378           prod2 = __SMUADX(pSourceA, pSourceB);
00379           sumReal += (q63_t) prod1;
00380           sumImag += (q63_t) prod2;
00381 
00382 #endif /*      #ifdef UNALIGNED_SUPPORT_DISABLE */
00383 
00384         }
00385 
00386         /* Shift each sum right by 15 bits, saturate to 16 bits and store the real part followed by the imaginary part */
00387 
00388         *px++ = (q15_t) (__SSAT(sumReal >> 15, 16));
00389         *px++ = (q15_t) (__SSAT(sumImag >> 15, 16));
00390 
00391         /* Decrement the column loop counter */
00392         col--;
00393 
00394       } while (col > 0U);
00395 
00396       i = i + numColsA;                          /* advance i to the first element of the next row of pSrcA */
00397 
00398       /* Decrement the row loop counter */
00399       row--;
00400 
00401     } while (row > 0U);
00402 
00403     /* set status as ARM_MATH_SUCCESS */
00404     status = ARM_MATH_SUCCESS;
00405   }
00406 
00407   /* Return to application */
00408   return (status);
00409 }
00410 
00411 /**
00412  * @} end of CmplxMatrixMult group
00413  */
00414