Robert Lopez / CMSIS5
Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_mat_mult_q31.c Source File

arm_mat_mult_q31.c

00001 /* ----------------------------------------------------------------------
00002  * Project:      CMSIS DSP Library
00003  * Title:        arm_mat_mult_q31.c
00004  * Description:  Q31 matrix multiplication
00005  *
00006  * $Date:        27. January 2017
00007  * $Revision:    V.1.5.1
00008  *
00009  * Target Processor: Cortex-M cores
00010  * -------------------------------------------------------------------- */
00011 /*
00012  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
00013  *
00014  * SPDX-License-Identifier: Apache-2.0
00015  *
00016  * Licensed under the Apache License, Version 2.0 (the License); you may
00017  * not use this file except in compliance with the License.
00018  * You may obtain a copy of the License at
00019  *
00020  * www.apache.org/licenses/LICENSE-2.0
00021  *
00022  * Unless required by applicable law or agreed to in writing, software
00023  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
00024  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00025  * See the License for the specific language governing permissions and
00026  * limitations under the License.
00027  */
00028 
00029 #include "arm_math.h"
00030 
00031 /**
00032  * @ingroup groupMatrix
00033  */
00034 
00035 /**
00036  * @addtogroup MatrixMult
00037  * @{
00038  */
00039 
00040 /**
00041  * @brief Q31 matrix multiplication
00042  * @param[in]       *pSrcA points to the first input matrix structure
00043  * @param[in]       *pSrcB points to the second input matrix structure
00044  * @param[out]      *pDst points to output matrix structure
00045  * @return          The function returns either
00046  * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
00047  *
00048  * @details
00049  * <b>Scaling and Overflow Behavior:</b>
00050  *
00051  * \par
00052  * The function is implemented using an internal 64-bit accumulator.
00053  * The accumulator has a 2.62 format and maintains full precision of the intermediate
00054  * multiplication results but provides only a single guard bit. There is no saturation
00055  * on intermediate additions. Thus, if the accumulator overflows it wraps around and
00056  * distorts the result. The input signals should be scaled down to avoid intermediate
00057  * overflows. The input is thus scaled down by log2(numColsA) bits
00058  * to avoid overflows, as a total of numColsA additions are performed internally.
00059  * The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
00060  *
00061  * \par
00062  * See <code>arm_mat_mult_fast_q31()</code> for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4.
00063  *
00064  */
00065 
00066 arm_status arm_mat_mult_q31(
00067   const arm_matrix_instance_q31 * pSrcA,
00068   const arm_matrix_instance_q31 * pSrcB,
00069   arm_matrix_instance_q31 * pDst)
00070 {
00071   q31_t *pIn1 = pSrcA->pData;                    /* input data matrix pointer A */
00072   q31_t *pIn2 = pSrcB->pData;                    /* input data matrix pointer B */
00073   q31_t *pInA = pSrcA->pData;                    /* input data matrix pointer A */
00074   q31_t *pOut = pDst->pData;                     /* output data matrix pointer */
00075   q31_t *px;                                     /* Temporary output data matrix pointer */
00076   q63_t sum;                                     /* Accumulator */
00077   uint16_t numRowsA = pSrcA->numRows;            /* number of rows of input matrix A    */
00078   uint16_t numColsB = pSrcB->numCols;            /* number of columns of input matrix B */
00079   uint16_t numColsA = pSrcA->numCols;            /* number of columns of input matrix A */
00080 
00081 #if defined (ARM_MATH_DSP)
00082 
00083   /* Run the below code for Cortex-M4 and Cortex-M3 */
00084 
00085   uint16_t col, i = 0U, j, row = numRowsA, colCnt;      /* loop counters */
00086   arm_status status;                             /* status of matrix multiplication */
00087   q31_t a0, a1, a2, a3, b0, b1, b2, b3;
00088 
00089 #ifdef ARM_MATH_MATRIX_CHECK
00090 
00091 
00092   /* Check for matrix mismatch condition */
00093   if ((pSrcA->numCols != pSrcB->numRows) ||
00094      (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
00095   {
00096     /* Set status as ARM_MATH_SIZE_MISMATCH */
00097     status = ARM_MATH_SIZE_MISMATCH;
00098   }
00099   else
00100 #endif /*    #ifdef ARM_MATH_MATRIX_CHECK    */
00101 
00102   {
00103     /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
00104     /* row loop */
00105     do
00106     {
00107       /* Output pointer is set to starting address of the row being processed */
00108       px = pOut + i;
00109 
00110       /* For every row wise process, the column loop counter is to be initiated */
00111       col = numColsB;
00112 
00113       /* For every row wise process, the pIn2 pointer is set
00114        ** to the starting address of the pSrcB data */
00115       pIn2 = pSrcB->pData;
00116 
00117       j = 0U;
00118 
00119       /* column loop */
00120       do
00121       {
00122         /* Set the variable sum, that acts as accumulator, to zero */
00123         sum = 0;
00124 
00125         /* Initiate the pointer pIn1 to point to the starting address of pInA */
00126         pIn1 = pInA;
00127 
00128         /* Apply loop unrolling and compute 4 MACs simultaneously. */
00129         colCnt = numColsA >> 2;
00130 
00131 
00132         /* matrix multiplication */
00133         while (colCnt > 0U)
00134         {
00135           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
00136           /* Perform the multiply-accumulates */
00137           b0 = *pIn2;
00138           pIn2 += numColsB;
00139 
00140           a0 = *pIn1++;
00141           a1 = *pIn1++;
00142 
00143           b1 = *pIn2;
00144           pIn2 += numColsB;
00145           b2 = *pIn2;
00146           pIn2 += numColsB;
00147 
00148           sum += (q63_t) a0 *b0;
00149           sum += (q63_t) a1 *b1;
00150 
00151           a2 = *pIn1++;
00152           a3 = *pIn1++;
00153 
00154           b3 = *pIn2;
00155           pIn2 += numColsB;
00156 
00157           sum += (q63_t) a2 *b2;
00158           sum += (q63_t) a3 *b3;
00159 
00160           /* Decrement the loop counter */
00161           colCnt--;
00162         }
00163 
00164         /* If the columns of pSrcA is not a multiple of 4, compute any remaining output samples here.
00165          ** No loop unrolling is used. */
00166         colCnt = numColsA % 0x4U;
00167 
00168         while (colCnt > 0U)
00169         {
00170           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
00171           /* Perform the multiply-accumulates */
00172           sum += (q63_t) * pIn1++ * *pIn2;
00173           pIn2 += numColsB;
00174 
00175           /* Decrement the loop counter */
00176           colCnt--;
00177         }
00178 
00179         /* Convert the result from 2.62 to 1.31 format and store in destination buffer */
00180         *px++ = (q31_t) (sum >> 31);
00181 
00182         /* Update the pointer pIn2 to point to the  starting address of the next column */
00183         j++;
00184         pIn2 = (pSrcB->pData) + j;
00185 
00186         /* Decrement the column loop counter */
00187         col--;
00188 
00189       } while (col > 0U);
00190 
00191 #else
00192 
00193   /* Run the below code for Cortex-M0 */
00194 
00195   q31_t *pInB = pSrcB->pData;                    /* input data matrix pointer B */
00196   uint16_t col, i = 0U, row = numRowsA, colCnt;  /* loop counters */
00197   arm_status status;                             /* status of matrix multiplication */
00198 
00199 
00200 #ifdef ARM_MATH_MATRIX_CHECK
00201 
00202   /* Check for matrix mismatch condition */
00203   if ((pSrcA->numCols != pSrcB->numRows) ||
00204      (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
00205   {
00206     /* Set status as ARM_MATH_SIZE_MISMATCH */
00207     status = ARM_MATH_SIZE_MISMATCH;
00208   }
00209   else
00210 #endif /*    #ifdef ARM_MATH_MATRIX_CHECK    */
00211 
00212   {
00213     /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
00214     /* row loop */
00215     do
00216     {
00217       /* Output pointer is set to starting address of the row being processed */
00218       px = pOut + i;
00219 
00220       /* For every row wise process, the column loop counter is to be initiated */
00221       col = numColsB;
00222 
00223       /* For every row wise process, the pIn2 pointer is set
00224        ** to the starting address of the pSrcB data */
00225       pIn2 = pSrcB->pData;
00226 
00227       /* column loop */
00228       do
00229       {
00230         /* Set the variable sum, that acts as accumulator, to zero */
00231         sum = 0;
00232 
00233         /* Initiate the pointer pIn1 to point to the starting address of pInA */
00234         pIn1 = pInA;
00235 
00236         /* Matrix A columns number of MAC operations are to be performed */
00237         colCnt = numColsA;
00238 
00239         /* matrix multiplication */
00240         while (colCnt > 0U)
00241         {
00242           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
00243           /* Perform the multiply-accumulates */
00244           sum += (q63_t) * pIn1++ * *pIn2;
00245           pIn2 += numColsB;
00246 
00247           /* Decrement the loop counter */
00248           colCnt--;
00249         }
00250 
00251         /* Convert the result from 2.62 to 1.31 format and store in destination buffer */
00252         *px++ = (q31_t) clip_q63_to_q31(sum >> 31);
00253 
00254         /* Decrement the column loop counter */
00255         col--;
00256 
00257         /* Update the pointer pIn2 to point to the  starting address of the next column */
00258         pIn2 = pInB + (numColsB - col);
00259 
00260       } while (col > 0U);
00261 
00262 #endif
00263 
00264       /* Update the pointer pInA to point to the  starting address of the next row */
00265       i = i + numColsB;
00266       pInA = pInA + numColsA;
00267 
00268       /* Decrement the row loop counter */
00269       row--;
00270 
00271     } while (row > 0U);
00272 
00273     /* set status as ARM_MATH_SUCCESS */
00274     status = ARM_MATH_SUCCESS;
00275   }
00276   /* Return to application */
00277   return (status);
00278 }
00279 
00280 /**
00281  * @} end of MatrixMult group
00282  */
00283