takashi kadono / Mbed OS Nucleo_446

Dependencies:   ssd1331

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_mat_mult_q15.c Source File

arm_mat_mult_q15.c

00001 /* ----------------------------------------------------------------------    
00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
00003 *    
00004 * $Date:        19. March 2015
00005 * $Revision:    V.1.4.5
00006 *    
00007 * Project:      CMSIS DSP Library    
00008 * Title:        arm_mat_mult_q15.c    
00009 *    
00010 * Description:   Q15 matrix multiplication.    
00011 *    
00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
00013 *  
00014 * Redistribution and use in source and binary forms, with or without 
00015 * modification, are permitted provided that the following conditions
00016 * are met:
00017 *   - Redistributions of source code must retain the above copyright
00018 *     notice, this list of conditions and the following disclaimer.
00019 *   - Redistributions in binary form must reproduce the above copyright
00020 *     notice, this list of conditions and the following disclaimer in
00021 *     the documentation and/or other materials provided with the 
00022 *     distribution.
00023 *   - Neither the name of ARM LIMITED nor the names of its contributors
00024 *     may be used to endorse or promote products derived from this
00025 *     software without specific prior written permission.
00026 *
00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00038 * POSSIBILITY OF SUCH DAMAGE.     
00039 * -------------------------------------------------------------------- */
00040 
00041 #include "arm_math.h"
00042 
00043 /**    
00044  * @ingroup groupMatrix    
00045  */
00046 
00047 /**    
00048  * @addtogroup MatrixMult    
00049  * @{    
00050  */
00051 
00052 
00053 /**    
00054  * @brief Q15 matrix multiplication    
00055  * @param[in]       *pSrcA points to the first input matrix structure    
00056  * @param[in]       *pSrcB points to the second input matrix structure    
00057  * @param[out]      *pDst points to output matrix structure    
00058  * @param[in]       *pState points to a scratch array of numRowsB*numColsB Q15 elements that receives the transpose of pSrcB on Cortex-M3/M4 builds; it is not accessed on Cortex-M0 builds
00059  * @return          The function returns either    
00060  * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.    
00061  *    
00062  * @details    
00063  * <b>Scaling and Overflow Behavior:</b>    
00064  *    
00065  * \par    
00066  * The function is implemented using a 64-bit internal accumulator. The inputs to the    
00067  * multiplications are in 1.15 format and multiplications yield a 2.30 result.    
00068  * The 2.30 intermediate    
00069  * results are accumulated in a 64-bit accumulator in 34.30 format. This approach    
00070  * provides 33 guard bits and there is no risk of overflow. The 34.30 result is then    
00071  * truncated to 34.15 format by discarding the low 15 bits and then saturated to    
00072  * 1.15 format.    
00073  *    
00074  * \par    
00075  * Refer to <code>arm_mat_mult_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.    
00076  *    
00077  */
00078 
00079 arm_status arm_mat_mult_q15(
00080   const arm_matrix_instance_q15 * pSrcA,
00081   const arm_matrix_instance_q15 * pSrcB,
00082   arm_matrix_instance_q15 * pDst,
00083   q15_t * pState CMSIS_UNUSED)
00084 {
00085   q63_t sum;                                     /* 64-bit accumulator for one output element */
00086   /* Computes pDst = pSrcA * pSrcB in Q15; dimensions are validated only when ARM_MATH_MATRIX_CHECK is defined. */
00087 #ifndef ARM_MATH_CM0_FAMILY
00088 
00089   /* Cortex-M3/M4 path: B is first transposed into pState so that both dot-product operands are read sequentially */
00090 
00091   q15_t *pSrcBT = pState;                        /* scratch buffer receiving the transpose of B; written below, so it must be valid in this path */
00092   q15_t *pInA = pSrcA->pData;                    /* input data matrix pointer A of Q15 type */
00093   q15_t *pInB = pSrcB->pData;                    /* input data matrix pointer B of Q15 type */
00094   q15_t *px;                                     /* write pointer: into pSrcBT during the transpose, into pDst afterwards */
00095   uint16_t numRowsA = pSrcA->numRows;            /* number of rows of input matrix A    */
00096   uint16_t numColsB = pSrcB->numCols;            /* number of columns of input matrix B */
00097   uint16_t numColsA = pSrcA->numCols;            /* number of columns of input matrix A */
00098   uint16_t numRowsB = pSrcB->numRows;            /* number of rows of input matrix B    */
00099   uint16_t col, i = 0u, row = numRowsB, colCnt;  /* loop counters; i is also an element offset */
00100   arm_status status;                             /* status of matrix multiplication */
00101 
00102 #ifndef UNALIGNED_SUPPORT_DISABLE
00103 
00104   q31_t in;                                      /* two adjacent q15 elements of B read as one 32-bit word */
00105   q31_t pSourceA1, pSourceB1, pSourceA2, pSourceB2;  /* packed pairs of q15 operands from A and from transposed B */
00106 
00107 #else
00108 
00109   q15_t in;                                      /* single element of B being copied */
00110   q15_t inA1, inB1, inA2, inB2;                  /* individual q15 operands from A and from transposed B */
00111 
00112 #endif  /*  #ifndef UNALIGNED_SUPPORT_DISABLE   */
00113 
00114 #ifdef ARM_MATH_MATRIX_CHECK
00115   /* Check for matrix mismatch condition */
00116   if((pSrcA->numCols != pSrcB->numRows) ||
00117      (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
00118   {
00119     /* Set status as ARM_MATH_SIZE_MISMATCH */
00120     status = ARM_MATH_SIZE_MISMATCH;
00121   }
00122   else
00123 #endif /*    #ifdef ARM_MATH_MATRIX_CHECK    */
00124   {
00125     /* Transpose pSrcB into pSrcBT, one row of B per pass. NOTE(review): the do/while tests row only after decrementing, so numRowsB == 0 would wrap the counter - confirm callers never pass it */
00126     do
00127     {
00128       /* Number of 4-element groups in one row of B */
00129       col = numColsB >> 2;
00130 
00131       /* px points at the top of column i of the transposed matrix (row i of B becomes column i) */
00132       px = pSrcBT + i;
00133 
00134       /* Unrolled part: copy 4 elements of the current row of B per iteration;
00135        ** the second loop below copies the remaining 0 to 3 elements. */
00136       while(col > 0u)
00137       {
00138 #ifndef UNALIGNED_SUPPORT_DISABLE
00139 
00140         /* Read two elements from the row */
00141         in = *__SIMD32(pInB)++;
00142 
00143         /* Unpack and store the first element (low half-word on little-endian builds) in the destination */
00144 #ifndef ARM_MATH_BIG_ENDIAN
00145 
00146         *px = (q15_t) in;
00147 
00148 #else
00149 
00150         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
00151 
00152 #endif /*    #ifndef ARM_MATH_BIG_ENDIAN    */
00153 
00154         /* Update the pointer px to point to the next row of the transposed matrix */
00155         px += numRowsB;
00156 
00157         /* Unpack and store the second element in the destination */
00158 #ifndef ARM_MATH_BIG_ENDIAN
00159 
00160         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
00161 
00162 #else
00163 
00164         *px = (q15_t) in;
00165 
00166 #endif /*    #ifndef ARM_MATH_BIG_ENDIAN    */
00167 
00168         /* Update the pointer px to point to the next row of the transposed matrix */
00169         px += numRowsB;
00170 
00171         /* Read the next two elements from the row */
00172         in = *__SIMD32(pInB)++;
00173 
00174         /* Unpack and store the third element in the destination */
00175 #ifndef ARM_MATH_BIG_ENDIAN
00176 
00177         *px = (q15_t) in;
00178 
00179 #else
00180 
00181         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
00182 
00183 #endif /*    #ifndef ARM_MATH_BIG_ENDIAN    */
00184 
00185         /* Update the pointer px to point to the next row of the transposed matrix */
00186         px += numRowsB;
00187 
00188         /* Unpack and store the fourth element in the destination */
00189 
00190 #ifndef ARM_MATH_BIG_ENDIAN
00191 
00192         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
00193 
00194 #else
00195 
00196         *px = (q15_t) in;
00197 
00198 #endif /*    #ifndef ARM_MATH_BIG_ENDIAN    */
00199 
00200         /* Update the pointer px to point to the next row of the transposed matrix */
00201         px += numRowsB;
00202 
00203 #else
00204 
00205         /* Read one element from the row */
00206         in = *pInB++;
00207 
00208         /* Store one element in the destination */
00209         *px = in;
00210  
00211         /* Update the pointer px to point to the next row of the transposed matrix */
00212         px += numRowsB;
00213 
00214         /* Read one element from the row */
00215         in = *pInB++;
00216 
00217         /* Store one element in the destination */
00218         *px = in;
00219  
00220         /* Update the pointer px to point to the next row of the transposed matrix */
00221         px += numRowsB;
00222 
00223         /* Read one element from the row */
00224         in = *pInB++;
00225 
00226         /* Store one element in the destination */
00227         *px = in;
00228  
00229         /* Update the pointer px to point to the next row of the transposed matrix */
00230         px += numRowsB;
00231 
00232         /* Read one element from the row */
00233         in = *pInB++;
00234 
00235         /* Store one element in the destination */
00236         *px = in;
00237  
00238         /* Update the pointer px to point to the next row of the transposed matrix */
00239         px += numRowsB;
00240 
00241 #endif  /*  #ifndef UNALIGNED_SUPPORT_DISABLE   */
00242 
00243        /* Decrement the column loop counter */
00244         col--;
00245       }
00246 
00247       /* If the number of columns of pSrcB is not a multiple of 4, copy the remaining elements here.
00248        ** No loop unrolling is used. */
00249       col = numColsB % 0x4u;
00250 
00251       while(col > 0u)
00252       {
00253         /* Read and store the input element in the destination */
00254         *px = *pInB++;
00255 
00256         /* Update the pointer px to point to the next row of the transposed matrix */
00257         px += numRowsB;
00258 
00259         /* Decrement the column loop counter */
00260         col--;
00261       }
00262 
00263       /* Next row of B goes into the next column of the transposed matrix */
00264       i++;
00265 
00266       /* Decrement the row loop counter */
00267       row--;
00268     } while(row > 0u);
00269 
00270     /* Reset the counters for the multiplication; px now walks the output matrix and i becomes the offset of the current row of A */
00271     row = numRowsA;
00272     i = 0u;
00273     px = pDst->pData;
00274 
00275     /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
00276     /* row loop */
00277     do
00278     {
00279       /* For every row wise process, the column loop counter is to be initiated */
00280       col = numColsB;
00281 
00282       /* For every row of A, pInB is reset
00283        ** to the starting address of the transposed pSrcB data */
00284       pInB = pSrcBT;
00285 
00286       /* column loop */
00287       do
00288       {
00289         /* Set the variable sum, that acts as accumulator, to zero */
00290         sum = 0;
00291 
00292         /* Apply loop unrolling: 4 multiply-accumulates per iteration */
00293         colCnt = numColsA >> 2;
00294 
00295         /* Point pInA at the start of the current row of A; pInB keeps advancing through the transposed B */
00296         pInA = pSrcA->pData + i;
00297 
00298 
00299         /* matrix multiplication */
00300         while(colCnt > 0u)
00301         {
00302           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
00303 #ifndef UNALIGNED_SUPPORT_DISABLE
00304 
00305           /* read two packed pairs of q15 values from the row of A and from the transposed B */
00306           pSourceA1 = *__SIMD32(pInA)++;
00307           pSourceB1 = *__SIMD32(pInB)++;
00308 
00309           pSourceA2 = *__SIMD32(pInA)++;
00310           pSourceB2 = *__SIMD32(pInB)++;
00311 
00312           /* Multiply and accumulate: each __SMLALD call takes one packed pair from A and one from B */
00313           sum = __SMLALD(pSourceA1, pSourceB1, sum);
00314           sum = __SMLALD(pSourceA2, pSourceB2, sum);
00315 
00316 #else
00317           /* read q15 values one at a time from the row of A and from the transposed B */
00318           inA1 = *pInA++;
00319           inB1 = *pInB++;
00320           inA2 = *pInA++;
00321           /* Multiply and accumulate */
00322           sum += inA1 * inB1;
00323           inB2 = *pInB++;
00324 
00325           inA1 = *pInA++;
00326           inB1 = *pInB++;
00327           /* Multiply and accumulate */
00328           sum += inA2 * inB2;
00329           inA2 = *pInA++;
00330           inB2 = *pInB++;
00331 
00332           /* Multiply and accumulate */
00333           sum += inA1 * inB1;
00334           sum += inA2 * inB2;
00335 
00336 #endif  /*  #ifndef UNALIGNED_SUPPORT_DISABLE   */
00337 
00338           /* Decrement the loop counter */
00339           colCnt--;
00340         }
00341 
00342         /* process the remaining 0 to 3 column samples */
00343         colCnt = numColsA & 3u;
00344 
00345         while(colCnt > 0u)
00346         {
00347           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
00348           sum += *pInA++ * *pInB++;
00349 
00350           /* Decrement the loop counter */
00351           colCnt--;
00352         }
00353 
00354         /* Drop the low 15 bits, saturate to 16 bits and store. NOTE(review): the 64-bit shift result is passed to __SSAT - confirm it cannot exceed 32 bits for the sizes in use */
00355         *px = (q15_t) (__SSAT((sum >> 15), 16));
00356         px++;
00357 
00358         /* Decrement the column loop counter */
00359         col--;
00360 
00361       } while(col > 0u);
00362 
00363       i = i + numColsA;                          /* offset of the next row of A */
00364 
00365       /* Decrement the row loop counter */
00366       row--;
00367 
00368     } while(row > 0u);
00369 
00370 #else
00371 
00372   /* Cortex-M0 path: no transpose; B is read column-wise with a stride of numColsB and pState is not referenced */
00373 
00374   q15_t *pIn1 = pSrcA->pData;                    /* read pointer within the current row of A */
00375   q15_t *pIn2 = pSrcB->pData;                    /* read pointer within the current column of B */
00376   q15_t *pInA = pSrcA->pData;                    /* start of the current row of A */
00377   q15_t *pInB = pSrcB->pData;                    /* start of matrix B data */
00378   q15_t *pOut = pDst->pData;                     /* output data matrix pointer */
00379   q15_t *px;                                     /* Temporary output data matrix pointer */
00380   uint16_t numColsB = pSrcB->numCols;            /* number of columns of input matrix B */
00381   uint16_t numColsA = pSrcA->numCols;            /* number of columns of input matrix A */
00382   uint16_t numRowsA = pSrcA->numRows;            /* number of rows of input matrix A    */
00383   uint16_t col, i = 0u, row = numRowsA, colCnt;  /* loop counters; i is the offset of the current output row */
00384   arm_status status;                             /* status of matrix multiplication */
00385 
00386 #ifdef ARM_MATH_MATRIX_CHECK
00387 
00388   /* Check for matrix mismatch condition */
00389   if((pSrcA->numCols != pSrcB->numRows) ||
00390      (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
00391   {
00392     /* Set status as ARM_MATH_SIZE_MISMATCH */
00393     status = ARM_MATH_SIZE_MISMATCH;
00394   }
00395   else
00396 #endif /*    #ifdef ARM_MATH_MATRIX_CHECK    */
00397 
00398   {
00399     /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
00400     /* row loop (NOTE(review): do/while, so the body runs once even if numRowsA is 0 - confirm callers never pass it) */
00401     do
00402     {
00403       /* Output pointer is set to starting address of the row being processed */
00404       px = pOut + i;
00405 
00406       /* For every row wise process, the column loop counter is to be initiated */
00407       col = numColsB;
00408 
00409       /* For every row wise process, the pIn2 pointer is set
00410        ** to the starting address of the pSrcB data */
00411       pIn2 = pSrcB->pData;
00412 
00413       /* column loop */
00414       do
00415       {
00416         /* Set the variable sum, that acts as accumulator, to zero */
00417         sum = 0;
00418 
00419         /* Point pIn1 at the start of the current row of pSrcA */
00420         pIn1 = pInA;
00421 
00422         /* Matrix A columns number of MAC operations are to be performed */
00423         colCnt = numColsA;
00424 
00425         /* matrix multiplication */
00426         while(colCnt > 0u)
00427         {
00428           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
00429           /* Perform the multiply-accumulate, then step pIn2 down one row within the same column of B */
00430           sum += (q31_t) * pIn1++ * *pIn2;
00431           pIn2 += numColsB;
00432 
00433           /* Decrement the loop counter */
00434           colCnt--;
00435         }
00436 
00437         /* Convert the result from 34.30 to 1.15 format and store the saturated value in destination buffer */
00438         /* Saturate and store the result in the destination buffer */
00439         *px++ = (q15_t) __SSAT((sum >> 15), 16);
00440 
00441         /* Decrement the column loop counter */
00442         col--;
00443 
00444         /* Update the pointer pIn2 to point to the  starting address of the next column */
00445         pIn2 = pInB + (numColsB - col);
00446 
00447       } while(col > 0u);
00448 
00449       /* Advance the output offset i and pInA to the starting address of the next row */
00450       i = i + numColsB;
00451       pInA = pInA + numColsA;
00452 
00453       /* Decrement the row loop counter */
00454       row--;
00455 
00456     } while(row > 0u);
00457 
00458 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
00459     /* Both build variants end inside the same else-block: set status as ARM_MATH_SUCCESS */
00460     status = ARM_MATH_SUCCESS;
00461   }
00462 
00463   /* Return to application */
00464   return (status);
00465 }
00466 
00467 /**        
00468  * @} end of MatrixMult group        
00469  */