Added CMSIS5 DSP and NN folder. Needs some work

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_mat_mult_fast_q15.c Source File

arm_mat_mult_fast_q15.c

00001 /* ----------------------------------------------------------------------
00002  * Project:      CMSIS DSP Library
00003  * Title:        arm_mat_mult_fast_q15.c
00004  * Description:  Q15 matrix multiplication (fast variant)
00005  *
00006  * $Date:        27. January 2017
00007  * $Revision:    V.1.5.1
00008  *
00009  * Target Processor: Cortex-M cores
00010  * -------------------------------------------------------------------- */
00011 /*
00012  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
00013  *
00014  * SPDX-License-Identifier: Apache-2.0
00015  *
00016  * Licensed under the Apache License, Version 2.0 (the License); you may
00017  * not use this file except in compliance with the License.
00018  * You may obtain a copy of the License at
00019  *
00020  * www.apache.org/licenses/LICENSE-2.0
00021  *
00022  * Unless required by applicable law or agreed to in writing, software
00023  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
00024  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00025  * See the License for the specific language governing permissions and
00026  * limitations under the License.
00027  */
00028 
00029 #include "arm_math.h"
00030 
00031 /**
00032  * @ingroup groupMatrix
00033  */
00034 
00035 /**
00036  * @addtogroup MatrixMult
00037  * @{
00038  */
00039 
00040 
00041 /**
00042  * @brief Q15 matrix multiplication (fast variant) for Cortex-M3 and Cortex-M4
00043  * @param[in]       *pSrcA points to the first input matrix structure
00044  * @param[in]       *pSrcB points to the second input matrix structure
00045  * @param[out]      *pDst points to output matrix structure
00046  * @param[in]       *pState points to the array for storing intermediate results
00047  * @return          The function returns either
00048  * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
00049  *
00050  * @details
00051  * <b>Scaling and Overflow Behavior:</b>
00052  *
00053  * \par
00054  * The difference between the function arm_mat_mult_q15() and this fast variant is that
00055  * the fast variant uses a 32-bit rather than a 64-bit accumulator.
00056  * The result of each 1.15 x 1.15 multiplication is truncated to
00057  * 2.30 format. These intermediate results are accumulated in a 32-bit register in 2.30
00058  * format. Finally, the accumulator is shifted right by 15 bits and cast to a 1.15 result; no explicit saturation is applied at that step in this implementation.
00059  *
00060  * \par
00061  * The fast version has the same overflow behavior as the standard version but provides
00062  * less precision since it discards the low 16 bits of each multiplication result.
00063  * In order to avoid overflows completely the input signals must be scaled down.
00064  * Scale down one of the input matrices by log2(numColsA) bits to
00065  * avoid overflows, as a total of numColsA additions are computed internally for each
00066  * output element.
00067  *
00068  * \par
00069  * See <code>arm_mat_mult_q15()</code> for a slower implementation of this function
00070  * which uses 64-bit accumulation to provide higher precision.
00071  */
00072 
00073 arm_status arm_mat_mult_fast_q15(
00074   const arm_matrix_instance_q15 * pSrcA,
00075   const arm_matrix_instance_q15 * pSrcB,
00076   arm_matrix_instance_q15 * pDst,
00077   q15_t * pState)
00078 {
00079   q31_t sum;                                     /* 32-bit accumulator for one output element */
00080   q15_t *pSrcBT = pState;                        /* scratch buffer that receives the transpose of B (numRowsB * numColsB elements are written) */
00081   q15_t *pInA = pSrcA->pData;                    /* read pointer into the data of matrix A */
00082   q15_t *pInB = pSrcB->pData;                    /* read pointer into matrix B, later into its transposed copy */
00083   q15_t *px;                                     /* write pointer: first the transpose buffer, then the output matrix */
00084   uint16_t numRowsA = pSrcA->numRows;            /* number of rows of input matrix A    */
00085   uint16_t numColsB = pSrcB->numCols;            /* number of columns of input matrix B */
00086   uint16_t numColsA = pSrcA->numCols;            /* number of columns of input matrix A */
00087   uint16_t numRowsB = pSrcB->numRows;            /* number of rows of input matrix B    */
00088   uint32_t col, i = 0U, row = numRowsB, colCnt;  /* loop counters; row starts at numRowsB for the transpose pass */
00089   arm_status status;                             /* status of matrix multiplication */
00090 
00091 #ifndef UNALIGNED_SUPPORT_DISABLE
00092 
00093   q31_t in;                                      /* holds two q15 elements read as one 32-bit word */
00094   q31_t inA1, inA2, inB1, inB2;                  /* operand words passed to __SMLAD (single values in the odd-element tail) */
00095   q31_t sum2, sum3, sum4;                        /* accumulators for the other three outputs of a 2x2 output tile */
00096   q15_t *pInA2, *pInB2, *px2;                    /* second row of A, second transposed column of B, second output row */
00097   uint32_t j = 0;                                /* offset of the current column pair inside the transposed B buffer */
00098 
00099 #else
00100 
00101   q15_t in;                                      /* Temporary variable to hold a single input value */
00102   q15_t inA1, inA2, inB1, inB2;                  /* single q15 operands of the scalar multiply-accumulate */
00103 
00104 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
00105 
00106 #ifdef ARM_MATH_MATRIX_CHECK
00107   /* Check for matrix mismatch condition: inner dimensions must agree and pDst must be numRowsA x numColsB */
00108   if ((pSrcA->numCols != pSrcB->numRows) ||
00109      (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
00110   {
00111     /* Set status as ARM_MATH_SIZE_MISMATCH */
00112     status = ARM_MATH_SIZE_MISMATCH;
00113   }
00114   else
00115 #endif
00116   {
00117     /* Matrix transpose: copy B into pState so that each column of B becomes contiguous. NOTE(review): the do/while body runs at least once, so numRowsB == 0 would wrap the row counter - confirm callers never pass an empty B. */
00118     do
00119     {
00120       /* Apply loop unrolling and exchange the columns with row elements */
00121       col = numColsB >> 2;
00122 
00123       /* px starts at element i of the first transposed row, where i is the index of the row of B being copied */
00124       px = pSrcBT + i;
00125 
00126       /* First part of the processing with loop unrolling.  Copy 4 elements per iteration.
00127        ** A second loop below copies the remaining 1 to 3 elements. */
00128       while (col > 0U)
00129       {
00130 #ifndef UNALIGNED_SUPPORT_DISABLE
00131         /* Read two adjacent elements of the row as one 32-bit word */
00132         in = *__SIMD32(pInB)++;
00133 
00134         /* Unpack and store the first element in the destination */
00135 #ifndef ARM_MATH_BIG_ENDIAN
00136 
00137         *px = (q15_t) in;
00138 
00139 #else
00140 
00141         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
00142 
00143 #endif /*    #ifndef ARM_MATH_BIG_ENDIAN    */
00144 
00145         /* Update the pointer px to point to the next row of the transposed matrix */
00146         px += numRowsB;
00147 
00148         /* Unpack and store the second element in the destination */
00149 #ifndef ARM_MATH_BIG_ENDIAN
00150 
00151         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
00152 
00153 #else
00154 
00155         *px = (q15_t) in;
00156 
00157 #endif /*    #ifndef ARM_MATH_BIG_ENDIAN    */
00158 
00159         /* Update the pointer px to point to the next row of the transposed matrix */
00160         px += numRowsB;
00161 
00162         /* Read the next two adjacent elements of the row as one 32-bit word */
00163         in = *__SIMD32(pInB)++;
00164 
00165         /* Unpack and store the third element in the destination */
00166 #ifndef ARM_MATH_BIG_ENDIAN
00167 
00168         *px = (q15_t) in;
00169 
00170 #else
00171 
00172         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
00173 
00174 #endif /*    #ifndef ARM_MATH_BIG_ENDIAN    */
00175 
00176         /* Update the pointer px to point to the next row of the transposed matrix */
00177         px += numRowsB;
00178 
00179         /* Unpack and store the fourth element in the destination */
00180 
00181 #ifndef ARM_MATH_BIG_ENDIAN
00182 
00183         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
00184 
00185 #else
00186 
00187         *px = (q15_t) in;
00188 
00189 #endif /*    #ifndef ARM_MATH_BIG_ENDIAN    */
00190 
00191 #else
00192 
00193         /* Read one element from the row */
00194         in = *pInB++;
00195 
00196         /* Store one element in the destination */
00197         *px = in;
00198 
00199         /* Update the pointer px to point to the next row of the transposed matrix */
00200         px += numRowsB;
00201 
00202         /* Read one element from the row */
00203         in = *pInB++;
00204 
00205         /* Store one element in the destination */
00206         *px = in;
00207 
00208         /* Update the pointer px to point to the next row of the transposed matrix */
00209         px += numRowsB;
00210 
00211         /* Read one element from the row */
00212         in = *pInB++;
00213 
00214         /* Store one element in the destination */
00215         *px = in;
00216 
00217         /* Update the pointer px to point to the next row of the transposed matrix */
00218         px += numRowsB;
00219 
00220         /* Read one element from the row */
00221         in = *pInB++;
00222 
00223         /* Store one element in the destination */
00224         *px = in;
00225 
00226 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
00227 
00228         /* Shared by both variants: advance px past the fourth element just stored */
00229         px += numRowsB;
00230 
00231         /* Decrement the column loop counter */
00232         col--;
00233       }
00234 
00235       /* If the number of columns of pSrcB is not a multiple of 4, copy the remaining elements here.
00236        ** No loop unrolling is used. */
00237       col = numColsB % 0x4U;
00238 
00239       while (col > 0U)
00240       {
00241         /* Read and store the input element in the destination */
00242         *px = *pInB++;
00243 
00244         /* Update the pointer px to point to the next row of the transposed matrix */
00245         px += numRowsB;
00246 
00247         /* Decrement the column loop counter */
00248         col--;
00249       }
00250 
00251       i++;
00252 
00253       /* Decrement the row loop counter */
00254       row--;
00255 
00256     } while (row > 0U);
00257 
00258     /* Reset the variables for the usage in the following multiplication process */
00259     row = numRowsA;
00260     i = 0U;
00261     px = pDst->pData;
00262 
00263 #ifndef UNALIGNED_SUPPORT_DISABLE
00264     /* Process two rows from matrix A at a time and output two rows at a time; an odd last row is handled after the main loop */
00265     row = row >> 1;
00266     px2 = px + numColsB;
00267 #endif
00268 
00269     /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
00270     /* row loop */
00271     while (row > 0U)
00272     {
00273       /* For every row wise process, the column loop counter is re-initialised */
00274       col = numColsB;
00275 
00276       /* For every row wise process, the pInB pointer is reset
00277        ** to the starting address of the transposed pSrcB data */
00278       pInB = pSrcBT;
00279 
00280 #ifndef UNALIGNED_SUPPORT_DISABLE
00281       /* Process two (transposed) columns from matrix B at a time; an odd last column is handled after the main loop */
00282       col = col >> 1;
00283       j = 0;
00284 #endif
00285 
00286       /* column loop */
00287       while (col > 0U)
00288       {
00289         /* Set the variable sum, that acts as accumulator, to zero */
00290         sum = 0;
00291 
00292         /* Point pInA at the start of the row of A being processed (i is that row's element offset) */
00293         pInA = pSrcA->pData + i;
00294 
00295 #ifndef UNALIGNED_SUPPORT_DISABLE
00296         sum2 = 0;
00297         sum3 = 0;
00298         sum4 = 0;
00299         pInB  = pSrcBT + j;
00300         pInA2 = pInA + numColsA;
00301         pInB2 = pInB + numRowsB;
00302 
00303         /* Read in two elements at once - allows dual MAC instruction */
00304         colCnt = numColsA >> 1;
00305 #else
00306         colCnt = numColsA >> 2;
00307 #endif
00308 
00309         /* matrix multiplication */
00310         while (colCnt > 0U)
00311         {
00312           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
00313 #ifndef UNALIGNED_SUPPORT_DISABLE
00314 
00315           inA1 = *__SIMD32(pInA)++;
00316           inB1 = *__SIMD32(pInB)++;
00317           inA2 = *__SIMD32(pInA2)++;
00318           inB2 = *__SIMD32(pInB2)++;
00319 
00320           sum  = __SMLAD(inA1, inB1, sum);
00321           sum2 = __SMLAD(inA1, inB2, sum2);
00322           sum3 = __SMLAD(inA2, inB1, sum3);
00323           sum4 = __SMLAD(inA2, inB2, sum4);
00324 
00325 #else
00326 
00327           inA1 = *pInA;
00328           inB1 = *pInB;
00329           sum += inA1 * inB1;
00330 
00331           inA2 = pInA[1];
00332           inB2 = pInB[1];
00333           sum += inA2 * inB2;
00334 
00335           inA1 = pInA[2];
00336           inB1 = pInB[2];
00337           sum += inA1 * inB1;
00338 
00339           inA2 = pInA[3];
00340           inB2 = pInB[3];
00341           sum += inA2 * inB2;
00342 
00343           pInA += 4;
00344           pInB += 4;
00345 
00346 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
00347 
00348           /* Decrement the loop counter */
00349           colCnt--;
00350         }
00351 
00352         /* process the leftover elements of the dot product that the unrolled loop did not cover */
00353 #ifndef UNALIGNED_SUPPORT_DISABLE
00354         if (numColsA & 1U) {
00355           inA1 = *pInA++;
00356           inB1 = *pInB++;
00357           inA2 = *pInA2++;
00358           inB2 = *pInB2++;
00359           sum  += inA1 * inB1;
00360           sum2 += inA1 * inB2;
00361           sum3 += inA2 * inB1;
00362           sum4 += inA2 * inB2;
00363         }
00364 #else
00365         colCnt = numColsA % 0x4U;
00366 
00367         while (colCnt > 0U)
00368         {
00369           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
00370           sum += (q31_t) (*pInA++) * (*pInB++);
00371 
00372           colCnt--;
00373         }
00374 #endif
00375 
00376         /* Shift the accumulator right by 15 and store it; the cast truncates, no explicit saturation is applied here */
00377         *px++  = (q15_t) (sum >> 15);
00378 
00379 #ifndef UNALIGNED_SUPPORT_DISABLE
00380         *px++  = (q15_t) (sum2 >> 15);
00381         *px2++ = (q15_t) (sum3 >> 15);
00382         *px2++ = (q15_t) (sum4 >> 15);
00383         j += numRowsB * 2;                       /* advance to the next pair of transposed columns */
00384 #endif
00385 
00386         /* Decrement the column loop counter */
00387         col--;
00388 
00389       }
00390 
00391       i = i + numColsA;
00392 
00393 #ifndef UNALIGNED_SUPPORT_DISABLE
00394       i = i + numColsA;                          /* two rows of A were consumed, so skip the second one as well */
00395       px = px2 + (numColsB & 1U);                /* px2 stopped before the odd last column (if any); step over it to the next row pair */
00396       px2 = px + numColsB;
00397 #endif
00398 
00399       /* Decrement the row loop counter */
00400       row--;
00401 
00402     }
00403 
00404     /* Compute any remaining odd row/column below */
00405 
00406 #ifndef UNALIGNED_SUPPORT_DISABLE
00407 
00408     /* Compute remaining output column */
00409     if (numColsB & 1U) {
00410 
00411       /* Only cover the rows handled by the paired-row loop; an odd last row is computed entirely by the block below */
00412       row = numRowsA & (~0x1);
00413 
00414       /* Point to remaining unfilled column in output matrix */
00415       px = pDst->pData+numColsB-1;
00416       pInA = pSrcA->pData;
00417 
00418       /* row loop */
00419       while (row > 0)
00420       {
00421 
00422         /* point to last column in matrix B (stored contiguously in the transposed copy) */
00423         pInB  = pSrcBT + numRowsB*(numColsB-1);
00424 
00425         /* Set the variable sum, that acts as accumulator, to zero */
00426         sum  = 0;
00427 
00428         /* Process 4 elements of the dot product per iteration */
00429         colCnt = numColsA >> 2;
00430 
00431         /* matrix multiplication */
00432         while (colCnt > 0U)
00433         {
00434           inA1 = *__SIMD32(pInA)++;
00435           inA2 = *__SIMD32(pInA)++;
00436           inB1 = *__SIMD32(pInB)++;
00437           inB2 = *__SIMD32(pInB)++;
00438 
00439           sum  = __SMLAD(inA1, inB1, sum);
00440           sum  = __SMLAD(inA2, inB2, sum);
00441 
00442           /* Decrement the loop counter */
00443           colCnt--;
00444         }
00445 
00446         colCnt = numColsA & 3U;                  /* leftover 0 to 3 elements, handled one at a time */
00447         while (colCnt > 0U) {
00448           sum += (q31_t) (*pInA++) * (*pInB++);
00449           colCnt--;
00450         }
00451 
00452         /* Store the result in the destination buffer (shift by 15 and truncate, no explicit saturation) */
00453         *px  = (q15_t) (sum  >> 15);
00454         px += numColsB;
00455 
00456         /* Decrement the row loop counter; pInA has advanced by numColsA and already points at the next row of A */
00457         row--;
00458       }
00459     }
00460 
00461     /* Compute remaining output row */
00462     if (numRowsA & 1U) {
00463 
00464       /* point to last row in output matrix */
00465       px = pDst->pData+(numColsB)*(numRowsA-1);
00466 
00467       pInB  = pSrcBT;
00468       col = numColsB;
00469       i = 0U;                                    /* i is not read again in this block */
00470 
00471       /* col loop */
00472       while (col > 0)
00473       {
00474 
00475         /* point to last row in matrix A */
00476         pInA = pSrcA->pData + (numRowsA-1)*numColsA;
00477 
00478         /* Set the variable sum, that acts as accumulator, to zero */
00479         sum  = 0;
00480 
00481         /* Process 4 elements of the dot product per iteration */
00482         colCnt = numColsA >> 2;
00483 
00484         /* matrix multiplication */
00485         while (colCnt > 0U)
00486         {
00487           inA1 = *__SIMD32(pInA)++;
00488           inA2 = *__SIMD32(pInA)++;
00489           inB1 = *__SIMD32(pInB)++;
00490           inB2 = *__SIMD32(pInB)++;
00491 
00492           sum  = __SMLAD(inA1, inB1, sum);
00493           sum  = __SMLAD(inA2, inB2, sum);
00494 
00495           /* Decrement the loop counter */
00496           colCnt--;
00497         }
00498 
00499         colCnt = numColsA & 3U;                  /* leftover 0 to 3 elements, handled one at a time */
00500         while (colCnt > 0U) {
00501           sum += (q31_t) (*pInA++) * (*pInB++);
00502           colCnt--;
00503         }
00504 
00505         /* Store the result in the destination buffer (shift by 15 and truncate, no explicit saturation) */
00506         *px++  = (q15_t) (sum  >> 15);
00507 
00508         /* Decrement the col loop counter; pInB is not reset and continues into the next transposed column */
00509         col--;
00510       }
00511     }
00512 
00513 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
00514 
00515     /* set status as ARM_MATH_SUCCESS */
00516     status = ARM_MATH_SUCCESS;
00517   }
00518 
00519   /* Return to application */
00520   return (status);
00521 }
00522 
00523 /**
00524  * @} end of MatrixMult group
00525  */
00526