arm_nnfunctions.h

00001 /*
00002  * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
00003  *
00004  * SPDX-License-Identifier: Apache-2.0
00005  *
00006  * Licensed under the Apache License, Version 2.0 (the License); you may
00007  * not use this file except in compliance with the License.
00008  * You may obtain a copy of the License at
00009  *
00010  * www.apache.org/licenses/LICENSE-2.0
00011  *
00012  * Unless required by applicable law or agreed to in writing, software
00013  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
00014  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  * See the License for the specific language governing permissions and
00016  * limitations under the License.
00017  */
00018 
00019 /* ----------------------------------------------------------------------
00020  * Project:      CMSIS NN Library
00021  * Title:        arm_nnfunctions.h
00022  * Description:  Public header file for CMSIS NN Library
00023  *
00024  * $Date:        17. January 2018
00025  * $Revision:    V.1.0.0
00026  *
00027  * Target Processor:  Cortex-M cores
00028  * -------------------------------------------------------------------- */
00029 
00030 /**
00031    \mainpage CMSIS NN Software Library
00032    *
00033    * Introduction
00034    * ------------
00035    *
00036    * This user manual describes the CMSIS NN software library,
00037    * a collection of efficient neural network kernels developed to maximize the 
00038    * performance and minimize the memory footprint of neural networks on Cortex-M processor cores.
00039    *
00040    * The library is divided into a number of function groups, each covering a specific category:
00041    * - Neural Network Convolution Functions
00042    * - Neural Network Activation Functions
00043    * - Fully-connected Layer Functions
00044    * - Neural Network Pooling Functions
00045    * - Softmax Functions
00046    * - Neural Network Support Functions
00047    *
00048    * The library has separate functions for operating on different weight and activation data
00049    * types including 8-bit integers (q7_t) and 16-bit integers (q15_t). The description of each
00050    * kernel is included in its function documentation. The implementation details are also
00051    * described in [1].
00052    *
00053    * Block Diagram
00054    * --------
00055    * \image html CMSIS-NN-OVERVIEW.PNG
00056    *
00057    * Examples
00058    * --------
00059    *
00060    * The library ships with a number of examples which demonstrate how to use the library functions.
00061    *
00062    * Pre-processor Macros
00063    * ------------
00064    *
00065    * Each library project has different pre-processor macros; an example configuration is sketched below.
00066    *
00067    * - ARM_MATH_DSP:
00068    *
00069    * Define the macro ARM_MATH_DSP if the silicon supports DSP instructions.
00070    *
00071    * - ARM_MATH_BIG_ENDIAN:
00072    *
00073    * Define the macro ARM_MATH_BIG_ENDIAN to build the library for big endian targets. By default, the library builds for little endian targets.
00074    *
00075    * - ARM_NN_TRUNCATE:
00076    *
00077    * Define the macro ARM_NN_TRUNCATE to use floor instead of round-to-nearest-integer for the computation.
00078    *
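   * As an illustration only: these macros are typically passed to the compiler
   * when the library is built (e.g. -DARM_MATH_DSP); expressed as preprocessor
   * defines, a hypothetical configuration might look like this:
   *
   *     #define ARM_MATH_DSP          // silicon supports DSP instructions
   *     //#define ARM_MATH_BIG_ENDIAN // uncomment for big endian targets
   *     //#define ARM_NN_TRUNCATE     // floor instead of round-to-nearest
   *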
00079    * Copyright Notice
00080    * ------------
00081    *
00082    * Copyright (C) 2010-2018 Arm Limited. All rights reserved.
00083    *
00084    * [1] CMSIS-NN: Efficient Neural Network Kernels for Arm Cortex-M CPUs https://arxiv.org/abs/1801.06601
00085    */
00086 
00087 /**
00088  * @defgroup groupNN Neural Network Functions
00089  * These functions perform basic operations for neural network layers. 
00090  */
00091 
00092 #ifndef _ARM_NNFUNCTIONS_H
00093 #define _ARM_NNFUNCTIONS_H
00094 
00095 #include "arm_nnsupportfunctions.h"
00096 #include "arm_nn_tables.h"
00097 
00098 #define USE_INTRINSIC
00099 
00101 //#define ARM_NN_TRUNCATE /* This configures the rounding mode: floor or round to the nearest integer */
00101 
00102 #ifdef __cplusplus
00103 extern    "C"
00104 {
00105 #endif
00106 
00107 /**
00108  * @defgroup NNConv Neural Network Convolution Functions
00109  *
00110  * Perform convolution layer
00111  *
00112  * The convolution is implemented in 2 steps: im2col and GEMM
00113  *
00114  * im2col is a process of converting each patch of image data into 
00115  * a column. After im2col, the convolution is computed as matrix-matrix
00116  * multiplication.
00117  * 
00118  * To reduce the memory footprint, im2col is performed partially.
00119  * In each iteration, only a few columns (i.e., patches) are generated and
00120  * computed with GEMM kernels similar to the CMSIS-DSP arm_mat_mult functions.
00121  *
00122  */
00123 
00124   /**
00125    * @brief Basic Q7 convolution function
00126    * @param[in]       Im_in       pointer to input tensor
00127    * @param[in]       dim_im_in   input tensor dimension
00128    * @param[in]       ch_im_in    number of input tensor channels
00129    * @param[in]       wt          pointer to kernel weights
00130    * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
00131    * @param[in]       dim_kernel  filter kernel size
00132    * @param[in]       padding     padding sizes
00133    * @param[in]       stride      convolution stride
00134    * @param[in]       bias        pointer to bias
00135    * @param[in]       bias_shift  amount of left-shift for bias
00136    * @param[in]       out_shift   amount of right-shift for output
00137    * @param[in,out]   Im_out      pointer to output tensor
00138    * @param[in]       dim_im_out  output tensor dimension
00139    * @param[in,out]   bufferA     pointer to buffer space for input 
00140    * @param[in,out]   bufferB     pointer to buffer space for output
00141    * @return     The function returns <code>ARM_MATH_SUCCESS</code> 
00142    *
00143    */
00144 
00145     arm_status arm_convolve_HWC_q7_basic(const q7_t * Im_in,
00146                                          const uint16_t dim_im_in,
00147                                          const uint16_t ch_im_in,
00148                                          const q7_t * wt,
00149                                          const uint16_t ch_im_out,
00150                                          const uint16_t dim_kernel,
00151                                          const uint16_t padding,
00152                                          const uint16_t stride,
00153                                          const q7_t * bias,
00154                                          const uint16_t bias_shift,
00155                                          const uint16_t out_shift,
00156                                          q7_t * Im_out, 
00157                                          const uint16_t dim_im_out, 
00158                                          q15_t * bufferA, 
00159                                          q7_t * bufferB);
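
  /*
   * Usage sketch (illustrative assumptions, not part of the API): running the
   * basic Q7 convolution on a 28x28 single-channel input with a 5x5 kernel and
   * 8 output channels. The shift values are hypothetical quantization
   * parameters; bufferA is assumed to hold two im2col columns of
   * ch_im_in * dim_kernel * dim_kernel q15_t entries, and bufferB is assumed
   * to be unused and passed as NULL.
   *
   *   q7_t  in[28 * 28 * 1];
   *   q7_t  wt[8 * 5 * 5 * 1];
   *   q7_t  bias[8];
   *   q7_t  out[28 * 28 * 8];
   *   q15_t col_buf[2 * 1 * 5 * 5];         // im2col scratch, two columns
   *
   *   arm_status status = arm_convolve_HWC_q7_basic(in, 28, 1, wt, 8, 5, 2, 1,
   *                                                 bias, 0, 9, out, 28,
   *                                                 col_buf, NULL);
   */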
00160 
00161   /**
00162    * @brief Basic Q15 convolution function
00163    * @param[in]       Im_in       pointer to input tensor
00164    * @param[in]       dim_im_in   input tensor dimension
00165    * @param[in]       ch_im_in    number of input tensor channels
00166    * @param[in]       wt          pointer to kernel weights
00167    * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
00168    * @param[in]       dim_kernel  filter kernel size
00169    * @param[in]       padding     padding sizes
00170    * @param[in]       stride      convolution stride
00171    * @param[in]       bias        pointer to bias
00172    * @param[in]       bias_shift  amount of left-shift for bias
00173    * @param[in]       out_shift   amount of right-shift for output
00174    * @param[in,out]   Im_out      pointer to output tensor
00175    * @param[in]       dim_im_out  output tensor dimension
00176    * @param[in,out]   bufferA     pointer to buffer space for input 
00177    * @param[in,out]   bufferB     pointer to buffer space for output
00178    * @return     The function returns <code>ARM_MATH_SUCCESS</code> 
00179    *
00180    */
00181 
00182     arm_status arm_convolve_HWC_q15_basic(const q15_t * Im_in,
00183                                           const uint16_t dim_im_in,
00184                                           const uint16_t ch_im_in,
00185                                           const q15_t * wt,
00186                                           const uint16_t ch_im_out,
00187                                           const uint16_t dim_kernel,
00188                                           const uint16_t padding,
00189                                           const uint16_t stride,
00190                                           const q15_t * bias,
00191                                           const uint16_t bias_shift,
00192                                           const uint16_t out_shift,
00193                                           q15_t * Im_out, 
00194                                           const uint16_t dim_im_out, 
00195                                           q15_t * bufferA, 
00196                                           q7_t * bufferB);
00197 
00198   /**
00199    * @brief Fast Q7 convolution function
00200    * @param[in]       Im_in       pointer to input tensor
00201    * @param[in]       dim_im_in   input tensor dimension
00202    * @param[in]       ch_im_in    number of input tensor channels
00203    * @param[in]       wt          pointer to kernel weights
00204    * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
00205    * @param[in]       dim_kernel  filter kernel size
00206    * @param[in]       padding     padding sizes
00207    * @param[in]       stride      convolution stride
00208    * @param[in]       bias        pointer to bias
00209    * @param[in]       bias_shift  amount of left-shift for bias
00210    * @param[in]       out_shift   amount of right-shift for output
00211    * @param[in,out]   Im_out      pointer to output tensor
00212    * @param[in]       dim_im_out  output tensor dimension
00213    * @param[in,out]   bufferA     pointer to buffer space for input 
00214    * @param[in,out]   bufferB     pointer to buffer space for output
00215    * @return     The function returns either
00216    * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
00217    *
00218    * This function is the version with the full list of optimization tricks, but with
00219    * some constraints:
00220    *   ch_im_in is a multiple of 4
00221    *   ch_im_out is a multiple of 2
00222    */
00223 
00224     arm_status arm_convolve_HWC_q7_fast(const q7_t * Im_in,
00225                                         const uint16_t dim_im_in,
00226                                         const uint16_t ch_im_in,
00227                                         const q7_t * wt,
00228                                         const uint16_t ch_im_out,
00229                                         const uint16_t dim_kernel,
00230                                         const uint16_t padding,
00231                                         const uint16_t stride,
00232                                         const q7_t * bias,
00233                                         const uint16_t bias_shift,
00234                                         const uint16_t out_shift,
00235                                         q7_t * Im_out, 
00236                                         const uint16_t dim_im_out, 
00237                                         q15_t * bufferA, 
00238                                         q7_t * bufferB);
00239 
00240   /**
00241    * @brief Fast Q7 convolution function (non-square shape)
00242    * @param[in]       Im_in        pointer to input tensor
00243    * @param[in]       dim_im_in_x  input tensor dimension x
00244    * @param[in]       dim_im_in_y  input tensor dimension y
00245    * @param[in]       ch_im_in     number of input tensor channels
00246    * @param[in]       wt           pointer to kernel weights
00247    * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
00248    * @param[in]       dim_kernel_x filter kernel size x
00249    * @param[in]       dim_kernel_y filter kernel size y
00250    * @param[in]       padding_x    padding size x
00251    * @param[in]       padding_y    padding size y
00252    * @param[in]       stride_x     convolution stride x
00253    * @param[in]       stride_y     convolution stride y
00254    * @param[in]       bias         pointer to bias
00255    * @param[in]       bias_shift   amount of left-shift for bias
00256    * @param[in]       out_shift    amount of right-shift for output
00257    * @param[in,out]   Im_out       pointer to output tensor
00258    * @param[in]       dim_im_out_x output tensor dimension x
00259    * @param[in]       dim_im_out_y output tensor dimension y
00260    * @param[in,out]   bufferA      pointer to buffer space for input 
00261    * @param[in,out]   bufferB      pointer to buffer space for output
00262    * @return     The function returns either
00263    * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
00264    *
00265    * This function is the version with the full list of optimization tricks, but with
00266    * some constraints:
00267    *   ch_im_in is a multiple of 4
00268    *   ch_im_out is a multiple of 2
00269    */
00270 
00271     arm_status arm_convolve_HWC_q7_fast_nonsquare(const q7_t * Im_in,
00272                                                   const uint16_t dim_im_in_x,
00273                                                   const uint16_t dim_im_in_y,
00274                                                   const uint16_t ch_im_in,
00275                                                   const q7_t * wt,
00276                                                   const uint16_t ch_im_out,
00277                                                   const uint16_t dim_kernel_x,
00278                                                   const uint16_t dim_kernel_y,
00279                                                   const uint16_t padding_x,
00280                                                   const uint16_t padding_y,
00281                                                   const uint16_t stride_x,
00282                                                   const uint16_t stride_y,
00283                                                   const q7_t * bias,
00284                                                   const uint16_t bias_shift,
00285                                                   const uint16_t out_shift,
00286                                                   q7_t * Im_out,
00287                                                   const uint16_t dim_im_out_x,
00288                                                   const uint16_t dim_im_out_y,
00289                                                   q15_t * bufferA,
00290                                                   q7_t * bufferB);
00291 
00292   /**
00293    * @brief Fast Q7 version of 1x1 convolution (non-square shape)
00294    * @param[in]       Im_in        pointer to input tensor
00295    * @param[in]       dim_im_in_x  input tensor dimension x
00296    * @param[in]       dim_im_in_y  input tensor dimension y
00297    * @param[in]       ch_im_in     number of input tensor channels
00298    * @param[in]       wt           pointer to kernel weights
00299    * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
00300    * @param[in]       dim_kernel_x filter kernel size x
00301    * @param[in]       dim_kernel_y filter kernel size y
00302    * @param[in]       padding_x    padding size x
00303    * @param[in]       padding_y    padding size y
00304    * @param[in]       stride_x     convolution stride x
00305    * @param[in]       stride_y     convolution stride y
00306    * @param[in]       bias         pointer to bias
00307    * @param[in]       bias_shift   amount of left-shift for bias
00308    * @param[in]       out_shift    amount of right-shift for output
00309    * @param[in,out]   Im_out       pointer to output tensor
00310    * @param[in]       dim_im_out_x output tensor dimension x
00311    * @param[in]       dim_im_out_y output tensor dimension y
00312    * @param[in,out]   bufferA      pointer to buffer space for input 
00313    * @param[in,out]   bufferB      pointer to buffer space for output
00314    * @return     The function returns either
00315    * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
00316    *
00317    * This function implements convolution with a 1x1 kernel size (i.e., dim_kernel_x=1
00318    * and dim_kernel_y=1). It can be used for the
00319    * second half of a MobileNet block, i.e., the pointwise convolution after the depthwise convolution.
00320    *
00321    * This function is the version with the full list of optimization tricks, but with
00322    * some constraints:
00323    *   ch_im_in is a multiple of 4
00324    *   ch_im_out is a multiple of 2
00325    */
00326     arm_status arm_convolve_1x1_HWC_q7_fast_nonsquare(const q7_t * Im_in,
00327                                                       const uint16_t dim_im_in_x,
00328                                                       const uint16_t dim_im_in_y,
00329                                                       const uint16_t ch_im_in,
00330                                                       const q7_t * wt,
00331                                                       const uint16_t ch_im_out,
00332                                                       const uint16_t dim_kernel_x,
00333                                                       const uint16_t dim_kernel_y,
00334                                                       const uint16_t padding_x,
00335                                                       const uint16_t padding_y,
00336                                                       const uint16_t stride_x,
00337                                                       const uint16_t stride_y,
00338                                                       const q7_t * bias,
00339                                                       const uint16_t bias_shift,
00340                                                       const uint16_t out_shift,
00341                                                       q7_t * Im_out,
00342                                                       const uint16_t dim_im_out_x,
00343                                                       const uint16_t dim_im_out_y,
00344                                                       q15_t * bufferA,
00345                                                       q7_t * bufferB);
00346 
00347   /**
00348    * @brief Q7 version of convolution for RGB image
00349    * @param[in]       Im_in       pointer to input tensor
00350    * @param[in]       dim_im_in   input tensor dimension
00351    * @param[in]       ch_im_in    number of input tensor channels
00352    * @param[in]       wt          pointer to kernel weights
00353    * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
00354    * @param[in]       dim_kernel  filter kernel size
00355    * @param[in]       padding     padding sizes
00356    * @param[in]       stride      convolution stride
00357    * @param[in]       bias        pointer to bias
00358    * @param[in]       bias_shift  amount of left-shift for bias
00359    * @param[in]       out_shift   amount of right-shift for output
00360    * @param[in,out]   Im_out      pointer to output tensor
00361    * @param[in]       dim_im_out  output tensor dimension
00362    * @param[in,out]   bufferA     pointer to buffer space for input 
00363    * @param[in,out]   bufferB     pointer to buffer space for output
00364    * @return     The function returns either
00365    * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
00366    *
00367    * This kernel is written exclusively for convolutions with ch_im_in
00368    * equal to 3. This applies to the first layer of CNNs whose input
00369    * is an RGB image.
00370    */
00371 
00372     arm_status arm_convolve_HWC_q7_RGB(const q7_t * Im_in,
00373                                        const uint16_t dim_im_in,
00374                                        const uint16_t ch_im_in,
00375                                        const q7_t * wt,
00376                                        const uint16_t ch_im_out,
00377                                        const uint16_t dim_kernel,
00378                                        const uint16_t padding,
00379                                        const uint16_t stride,
00380                                        const q7_t * bias,
00381                                        const uint16_t bias_shift,
00382                                        const uint16_t out_shift,
00383                                        q7_t * Im_out, 
00384                                        const uint16_t dim_im_out, 
00385                                        q15_t * bufferA, 
00386                                        q7_t * bufferB);
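
  /*
   * Usage sketch (illustrative assumptions): the first layer of a CIFAR-10
   * style network, a 5x5 convolution over a 32x32 RGB input producing 32
   * output channels. Shift values, the scratch-buffer size, and the NULL
   * bufferB are assumptions, not API guarantees.
   *
   *   q7_t  img[32 * 32 * 3];
   *   q7_t  wt[32 * 5 * 5 * 3];
   *   q7_t  bias[32];
   *   q7_t  out[32 * 32 * 32];
   *   q15_t col_buf[2 * 3 * 5 * 5];         // im2col scratch, two columns
   *
   *   arm_convolve_HWC_q7_RGB(img, 32, 3, wt, 32, 5, 2, 1,
   *                           bias, 0, 9, out, 32, col_buf, NULL);
   */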
00387 
00388   /**
00389    * @brief Fast Q15 convolution function
00390    * @param[in]       Im_in       pointer to input tensor
00391    * @param[in]       dim_im_in   input tensor dimension
00392    * @param[in]       ch_im_in    number of input tensor channels
00393    * @param[in]       wt          pointer to kernel weights
00394    * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
00395    * @param[in]       dim_kernel  filter kernel size
00396    * @param[in]       padding     padding sizes
00397    * @param[in]       stride      convolution stride
00398    * @param[in]       bias        pointer to bias
00399    * @param[in]       bias_shift  amount of left-shift for bias
00400    * @param[in]       out_shift   amount of right-shift for output
00401    * @param[in,out]   Im_out      pointer to output tensor
00402    * @param[in]       dim_im_out  output tensor dimension
00403    * @param[in,out]   bufferA     pointer to buffer space for input 
00404    * @param[in,out]   bufferB     pointer to buffer space for output
00405    * @return     The function returns either
00406    * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
00407    *
00408    * This function is the version with the full list of optimization tricks, but with
00409    * some constraints:
00410    *   ch_im_in is a multiple of 2
00411    *   ch_im_out is a multiple of 2
00412    */
00413 
00414     arm_status arm_convolve_HWC_q15_fast(const q15_t * Im_in,
00415                                          const uint16_t dim_im_in,
00416                                          const uint16_t ch_im_in,
00417                                          const q15_t * wt,
00418                                          const uint16_t ch_im_out,
00419                                          const uint16_t dim_kernel,
00420                                          const uint16_t padding,
00421                                          const uint16_t stride,
00422                                          const q15_t * bias,
00423                                          const uint16_t bias_shift,
00424                                          const uint16_t out_shift,
00425                                          q15_t * Im_out, 
00426                                          const uint16_t dim_im_out, 
00427                                          q15_t * bufferA, 
00428                                          q7_t * bufferB);
00429 
00430   /**
00431    * @brief Q7 depthwise separable convolution function
00432    * @param[in]       Im_in       pointer to input tensor
00433    * @param[in]       dim_im_in   input tensor dimension
00434    * @param[in]       ch_im_in    number of input tensor channels
00435    * @param[in]       wt          pointer to kernel weights
00436    * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
00437    * @param[in]       dim_kernel  filter kernel size
00438    * @param[in]       padding     padding sizes
00439    * @param[in]       stride      convolution stride
00440    * @param[in]       bias        pointer to bias
00441    * @param[in]       bias_shift  amount of left-shift for bias
00442    * @param[in]       out_shift   amount of right-shift for output
00443    * @param[in,out]   Im_out      pointer to output tensor
00444    * @param[in]       dim_im_out  output tensor dimension
00445    * @param[in,out]   bufferA     pointer to buffer space for input 
00446    * @param[in,out]   bufferB     pointer to buffer space for output
00447    * @return     The function returns either
00448    * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
00449    *
00450    * This function is the version with the full list of optimization tricks, but with
00451    * some constraints:
00452    *   ch_im_in is a multiple of 2
00453    *   ch_im_out is a multiple of 2
00454    */
00455 
00456     arm_status arm_depthwise_separable_conv_HWC_q7(const q7_t * Im_in,
00457                                                    const uint16_t dim_im_in,
00458                                                    const uint16_t ch_im_in,
00459                                                    const q7_t * wt,
00460                                                    const uint16_t ch_im_out,
00461                                                    const uint16_t dim_kernel,
00462                                                    const uint16_t padding,
00463                                                    const uint16_t stride,
00464                                                    const q7_t * bias,
00465                                                    const uint16_t bias_shift,
00466                                                    const uint16_t out_shift,
00467                                                    q7_t * Im_out,
00468                                                    const uint16_t dim_im_out, 
00469                                                    q15_t * bufferA, 
00470                                                    q7_t * bufferB);
00471 
00472   /**
00473    * @brief Q7 depthwise separable convolution function (non-square shape)
00474    * @param[in]       Im_in         pointer to input tensor
00475    * @param[in]       dim_im_in_x   input tensor dimension x
00476    * @param[in]       dim_im_in_y   input tensor dimension y
00477    * @param[in]       ch_im_in      number of input tensor channels
00478    * @param[in]       wt            pointer to kernel weights
00479    * @param[in]       ch_im_out     number of filters, i.e., output tensor channels
00480    * @param[in]       dim_kernel_x  filter kernel size x
00481    * @param[in]       dim_kernel_y  filter kernel size y
00482    * @param[in]       padding_x     padding sizes x
00483    * @param[in]       padding_y     padding sizes y
00484    * @param[in]       stride_x      convolution stride x
00485    * @param[in]       stride_y      convolution stride y
00486    * @param[in]       bias          pointer to bias
00487    * @param[in]       bias_shift    amount of left-shift for bias
00488    * @param[in]       out_shift     amount of right-shift for output
00489    * @param[in,out]   Im_out        pointer to output tensor
00490    * @param[in]       dim_im_out_x  output tensor dimension x
00491    * @param[in]       dim_im_out_y  output tensor dimension y
00492    * @param[in,out]   bufferA       pointer to buffer space for input 
00493    * @param[in,out]   bufferB       pointer to buffer space for output
00494    * @return     The function returns either
00495    * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
00496    *
00497    * This function is the version with the full list of optimization tricks, but with
00498    * some constraints:
00499    *   ch_im_in is a multiple of 2
00500    *   ch_im_out is a multiple of 2
00501    */
00502     arm_status arm_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t * Im_in,
00503                                                              const uint16_t dim_im_in_x,
00504                                                              const uint16_t dim_im_in_y,
00505                                                              const uint16_t ch_im_in,
00506                                                              const q7_t * wt,
00507                                                              const uint16_t ch_im_out,
00508                                                              const uint16_t dim_kernel_x,
00509                                                              const uint16_t dim_kernel_y,
00510                                                              const uint16_t padding_x,
00511                                                              const uint16_t padding_y,
00512                                                              const uint16_t stride_x,
00513                                                              const uint16_t stride_y,
00514                                                              const q7_t * bias,
00515                                                              const uint16_t bias_shift,
00516                                                              const uint16_t out_shift,
00517                                                              q7_t * Im_out,
00518                                                              const uint16_t dim_im_out_x,
00519                                                              const uint16_t dim_im_out_y,
00520                                                              q15_t * bufferA,
00521                                                              q7_t * bufferB);
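
  /*
   * Sketch of a MobileNet-style depthwise separable block (illustrative
   * assumptions only): a 3x3 depthwise convolution on a 16x16x32 Q7 feature
   * map, followed by a 1x1 pointwise convolution expanding 32 to 64 channels.
   * Shift values, the shared scratch-buffer size, and the NULL bufferB are
   * assumptions; the channel counts respect the multiple-of-4 / multiple-of-2
   * constraints noted above.
   *
   *   q7_t  in[16 * 16 * 32], dw_out[16 * 16 * 32], pw_out[16 * 16 * 64];
   *   q7_t  dw_wt[32 * 3 * 3], dw_bias[32];
   *   q7_t  pw_wt[64 * 32], pw_bias[64];
   *   q15_t scratch[2 * 32 * 3 * 3];        // im2col scratch (assumed size)
   *
   *   arm_depthwise_separable_conv_HWC_q7(in, 16, 32, dw_wt, 32, 3, 1, 1,
   *                                       dw_bias, 0, 7, dw_out, 16,
   *                                       scratch, NULL);
   *   arm_convolve_1x1_HWC_q7_fast_nonsquare(dw_out, 16, 16, 32, pw_wt, 64,
   *                                          1, 1, 0, 0, 1, 1,
   *                                          pw_bias, 0, 8, pw_out, 16, 16,
   *                                          scratch, NULL);
   */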
00522 
00523 
00524 /**
00525  * @defgroup FC Fully-connected Layer Functions
00526  *
00527  * Perform fully-connected layer
00528  *
00529  * Fully-connected layer is basically a matrix-vector multiplication
00530  * with bias. The matrix is the weights and the input/output vectors
00531  * are the activation values. Supported {weight, activation} precisions
00532  * include {8-bit, 8-bit}, {16-bit, 16-bit}, and {8-bit, 16-bit}.
00533  *
00534  * Here we have two types of kernel functions. The basic functions
00535  * implement the layer using a regular GEMV approach. The opt functions
00536  * operate on weights stored in an interleaved format.
00537  *
00538  */
00539 
00540   /**
00541    * @brief Q7 basic fully-connected layer function
00542    * @param[in]       pV          pointer to input vector
00543    * @param[in]       pM          pointer to matrix weights
00544    * @param[in]       dim_vec     length of the vector
00545    * @param[in]       num_of_rows number of rows in weight matrix
00546    * @param[in]       bias_shift  amount of left-shift for bias
00547    * @param[in]       out_shift   amount of right-shift for output
00548    * @param[in]       bias        pointer to bias
00549    * @param[in,out]   pOut        pointer to output vector
00550    * @param[in,out]   vec_buffer  pointer to buffer space for input
00551    * @return     The function returns <code>ARM_MATH_SUCCESS</code>
00552    *
00553    */
00554 
00555     arm_status arm_fully_connected_q7(const q7_t * pV,
00556                                       const q7_t * pM,
00557                                       const uint16_t dim_vec,
00558                                       const uint16_t num_of_rows,
00559                                       const uint16_t bias_shift,
00560                                       const uint16_t out_shift, 
00561                                       const q7_t * bias, 
00562                                       q7_t * pOut, 
00563                                       q15_t * vec_buffer);
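
  /*
   * Usage sketch (illustrative assumptions): a 10-class classifier head on a
   * 256-element Q7 activation vector. vec_buffer is assumed to hold dim_vec
   * q15_t entries; the shift values are hypothetical quantization parameters.
   *
   *   q7_t  activations[256];
   *   q7_t  weights[10 * 256];              // num_of_rows x dim_vec
   *   q7_t  bias[10];
   *   q7_t  scores[10];
   *   q15_t vec_buffer[256];
   *
   *   arm_fully_connected_q7(activations, weights, 256, 10, 0, 7,
   *                          bias, scores, vec_buffer);
   */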
00564 
00565   /**
00566    * @brief Q7 opt fully-connected layer function
00567    * @param[in]       pV          pointer to input vector
00568    * @param[in]       pM          pointer to matrix weights
00569    * @param[in]       dim_vec     length of the vector
00570    * @param[in]       num_of_rows number of rows in weight matrix
00571    * @param[in]       bias_shift  amount of left-shift for bias
00572    * @param[in]       out_shift   amount of right-shift for output
00573    * @param[in]       bias        pointer to bias
00574    * @param[in,out]   pOut        pointer to output vector
00575    * @param[in,out]   vec_buffer  pointer to buffer space for input
00576    * @return     The function returns <code>ARM_MATH_SUCCESS</code>
00577    *
00578    */
00579 
00580     arm_status arm_fully_connected_q7_opt(const q7_t * pV,
00581                                           const q7_t * pM,
00582                                           const uint16_t dim_vec,
00583                                           const uint16_t num_of_rows,
00584                                           const uint16_t bias_shift,
00585                                           const uint16_t out_shift, 
00586                                           const q7_t * bias, 
00587                                           q7_t * pOut, 
00588                                           q15_t * vec_buffer);
00589 
00590   /**
00591    * @brief Q15 basic fully-connected layer function
00592    * @param[in]       pV          pointer to input vector
00593    * @param[in]       pM          pointer to matrix weights
00594    * @param[in]       dim_vec     length of the vector
00595    * @param[in]       num_of_rows number of rows in weight matrix
00596    * @param[in]       bias_shift  amount of left-shift for bias
00597    * @param[in]       out_shift   amount of right-shift for output
00598    * @param[in]       bias        pointer to bias
00599    * @param[in,out]   pOut        pointer to output vector
00600    * @param[in,out]   vec_buffer  pointer to buffer space for input
00601    * @return     The function returns <code>ARM_MATH_SUCCESS</code>
00602    *
00603    */
00604 
00605     arm_status arm_fully_connected_q15(const q15_t * pV,
00606                                        const q15_t * pM,
00607                                        const uint16_t dim_vec,
00608                                        const uint16_t num_of_rows,
00609                                        const uint16_t bias_shift,
00610                                        const uint16_t out_shift, 
00611                                        const q15_t * bias, 
00612                                        q15_t * pOut, 
00613                                        q15_t * vec_buffer);
00614 
00615   /**
00616    * @brief Q15 opt fully-connected layer function
00617    * @param[in]       pV          pointer to input vector
00618    * @param[in]       pM          pointer to matrix weights
00619    * @param[in]       dim_vec     length of the vector
00620    * @param[in]       num_of_rows number of rows in weight matrix
00621    * @param[in]       bias_shift  amount of left-shift for bias
00622    * @param[in]       out_shift   amount of right-shift for output
00623    * @param[in]       bias        pointer to bias
00624    * @param[in,out]   pOut        pointer to output vector
00625    * @param[in,out]   vec_buffer  pointer to buffer space for input
00626    * @return     The function returns <code>ARM_MATH_SUCCESS</code>
00627    *
00628    */
00629 
00630     arm_status arm_fully_connected_q15_opt(const q15_t * pV,
00631                                            const q15_t * pM,
00632                                            const uint16_t dim_vec,
00633                                            const uint16_t num_of_rows,
00634                                            const uint16_t bias_shift,
00635                                            const uint16_t out_shift,
00636                                            const q15_t * bias, 
00637                                            q15_t * pOut, 
00638                                            q15_t * vec_buffer);
00639 
00640   /**
00641    * @brief Mixed Q15-Q7 fully-connected layer function
00642    * @param[in]       pV          pointer to input vector
00643    * @param[in]       pM          pointer to matrix weights
00644    * @param[in]       dim_vec     length of the vector
00645    * @param[in]       num_of_rows number of rows in weight matrix
00646    * @param[in]       bias_shift  amount of left-shift for bias
00647    * @param[in]       out_shift   amount of right-shift for output
00648    * @param[in]       bias        pointer to bias
00649    * @param[in,out]   pOut        pointer to output vector
00650    * @param[in,out]   vec_buffer  pointer to buffer space for input
00651    * @return     The function returns <code>ARM_MATH_SUCCESS</code>
00652    *
00653    */
00654 
00655     arm_status arm_fully_connected_mat_q7_vec_q15(const q15_t * pV,
00656                                                   const q7_t * pM,
00657                                                   const uint16_t dim_vec,
00658                                                   const uint16_t num_of_rows,
00659                                                   const uint16_t bias_shift,
00660                                                   const uint16_t out_shift,
00661                                                   const q7_t * bias, 
00662                                                   q15_t * pOut, 
00663                                                   q15_t * vec_buffer);
00664 
00665   /**
00666    * @brief Mixed Q15-Q7 opt fully-connected layer function
00667    * @param[in]       pV          pointer to input vector
00668    * @param[in]       pM          pointer to matrix weights
00669    * @param[in]       dim_vec     length of the vector
00670    * @param[in]       num_of_rows number of rows in weight matrix
00671    * @param[in]       bias_shift  amount of left-shift for bias
00672    * @param[in]       out_shift   amount of right-shift for output
00673    * @param[in]       bias        pointer to bias
00674    * @param[in,out]   pOut        pointer to output vector
00675    * @param[in,out]   vec_buffer  pointer to buffer space for input
00676    * @return     The function returns <code>ARM_MATH_SUCCESS</code>
00677    *
00678    */
00679 
00680     arm_status arm_fully_connected_mat_q7_vec_q15_opt(const q15_t * pV,
00681                                                       const q7_t * pM,
00682                                                       const uint16_t dim_vec,
00683                                                       const uint16_t num_of_rows,
00684                                                       const uint16_t bias_shift,
00685                                                       const uint16_t out_shift,
00686                                                       const q7_t * bias, 
00687                                                       q15_t * pOut, 
00688                                                       q15_t * vec_buffer);
00689 
00690 /**
00691  * @brief Matrix-Multiplication Kernels for Convolution
00692  *
00693  * These functions are used within convolution layer functions for 
00694  * matrix multiplication.
00695  * 
00696  * The implementation is similar to the CMSIS-DSP arm_mat_mult functions,
00697  * with one Q7 and one Q15 operand. The Q15 operand is the im2col
00698  * output, which always consists of 2 columns.
00699  *
00700  */
00701 
00702   /**
00703    * @brief Matrix-multiplication function for convolution
00704    * @param[in]       pA          pointer to operand A
00705    * @param[in]       pInBuffer   pointer to operand B, always consists of 2 vectors
00706    * @param[in]       ch_im_out   numRow of A
00707    * @param[in]       numCol_A    numCol of A
00708    * @param[in]       bias_shift  amount of left-shift for bias
00709    * @param[in]       out_shift   amount of right-shift for output
00710    * @param[in]       bias        the bias
00711    * @param[in,out]   pOut        pointer to output
00712    * @return     The function returns the incremented output pointer
00713    */
00714 
00715     q7_t     *arm_nn_mat_mult_kernel_q7_q15(const q7_t * pA,
00716                                             const q15_t * pInBuffer,
00717                                             const uint16_t ch_im_out,
00718                                             const uint16_t numCol_A,
00719                                             const uint16_t bias_shift,
00720                                             const uint16_t out_shift, 
00721                                             const q7_t * bias, 
00722                                             q7_t * pOut);
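
  /*
   * Sketch of how a convolution layer might drive this kernel (illustrative
   * only; the loop condition and variables are assumed to come from the
   * enclosing convolution function): bufferA holds two freshly generated
   * im2col columns of numCol_A q15_t entries each, and the output pointer
   * advances by two output columns per call, as indicated by the return value.
   *
   *   q7_t *pOut = Im_out;
   *   while (more_patches_remain) {
   *       // ... expand the next two input patches into bufferA (im2col) ...
   *       pOut = arm_nn_mat_mult_kernel_q7_q15(wt, bufferA, ch_im_out,
   *                                            numCol_A, bias_shift,
   *                                            out_shift, bias, pOut);
   *   }
   */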
00723 
00724   /**
00725    * @brief Matrix-multiplication function for convolution with reordered columns
00726    * @param[in]       pA          pointer to operand A
00727    * @param[in]       pInBuffer   pointer to operand B, always consists of 2 vectors
00728    * @param[in]       ch_im_out   numRow of A
00729    * @param[in]       numCol_A    numCol of A
00730    * @param[in]       bias_shift  amount of left-shift for bias
00731    * @param[in]       out_shift   amount of right-shift for output
00732    * @param[in]       bias        the bias
00733    * @param[in,out]   pOut        pointer to output
00734    * @return     The function returns the incremented output pointer
00735    */
00736 
00737     q7_t     *arm_nn_mat_mult_kernel_q7_q15_reordered(const q7_t * pA,
00738                                                       const q15_t * pInBuffer,
00739                                                       const uint16_t ch_im_out,
00740                                                       const uint16_t numCol_A,
00741                                                       const uint16_t bias_shift,
00742                                                       const uint16_t out_shift, 
00743                                                       const q7_t * bias, 
00744                                                       q7_t * pOut);
00745 
00746 #ifdef __cplusplus
00747 }
00748 #endif
00749 
00750 /*
00751  *  Other functions
00752  *  These layers are typically not timing critical.
00753  *  Only basic implementations are provided here.
00754  */
00755 
00756 #ifdef __cplusplus
00757 extern    "C"
00758 {
00759 #endif
00760 
00761 /**
00762  * @defgroup Acti Neural Network Activation Functions
00763  *
00764  * Perform activation layers, including ReLU (Rectified Linear Unit),
00765  * sigmoid and tanh
00766  *
00767  */
00768 
00769   /**
00770    * @brief Q7 RELU function
00771    * @param[in,out]   data        pointer to input
00772    * @param[in]       size        number of elements
00773    * @return none.
00774    */
00775 
00776     void      arm_relu_q7(q7_t * data, uint16_t size);
00777 
00778   /**
00779    * @brief Q15 RELU function
00780    * @param[in,out]   data        pointer to input
00781    * @param[in]       size        number of elements
00782    * @return none.
00783    */
00784 
00785     void      arm_relu_q15(q15_t * data, uint16_t size);
00786 
00787   /**
00788    * @brief Q7 neural network activation function using direct table look-up
00789    * @param[in,out]   data        pointer to input
00790    * @param[in]       size        number of elements
00791    * @param[in]       int_width   bit-width of the integer part, assumed to be smaller than 3
00792    * @param[in]       type        type of activation functions
00793    * @return none.
00794    */
00795 
00796     void      arm_nn_activations_direct_q7(q7_t * data, uint16_t size, uint16_t int_width, 
00797                                            arm_nn_activation_type type);
00798 
00799   /**
00800    * @brief Q15 neural network activation function using direct table look-up
00801    * @param[in,out]   data        pointer to input
00802    * @param[in]       size        number of elements
00803    * @param[in]       int_width   bit-width of the integer part, assumed to be smaller than 3
00804    * @param[in]       type        type of activation functions
00805    * @return none.
00806    */
00807 
00808     void      arm_nn_activations_direct_q15(q15_t * data, uint16_t size, uint16_t int_width,
00809                                             arm_nn_activation_type type);
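
  /*
   * Usage sketch (illustrative assumptions): an in-place ReLU on a Q7 layer
   * output, followed by a table-based sigmoid with a 2-bit integer part.
   * The buffer size and int_width are assumptions; ARM_SIGMOID is taken from
   * the arm_nn_activation_type enumeration in arm_nnsupportfunctions.h.
   *
   *   q7_t out[64 * 8];
   *   arm_relu_q7(out, 64 * 8);
   *   arm_nn_activations_direct_q7(out, 64 * 8, 2, ARM_SIGMOID);
   */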
00810 
00811 /**
00812  * @defgroup Pooling Neural Network Pooling Functions
00813  *
00814  * Perform pooling functions, including max pooling and average pooling
00815  *
00816  */
00817 
00818   /**
00819    * @brief Q7 max pooling function
00820    * @param[in]       Im_in       pointer to input tensor
00821    * @param[in]       dim_im_in   input tensor dimension
00822    * @param[in]       ch_im_in    number of input tensor channels
00823    * @param[in]       dim_kernel  pooling kernel size
00824    * @param[in]       padding     padding size
00825    * @param[in]       stride      pooling stride
00826    * @param[in]       dim_im_out  output tensor dimension
00827    * @param[in,out]   bufferA     pointer to buffer space for input
00828    * @param[in,out]   Im_out      pointer to output tensor
00829    * @return none.
00830    *
00831    */
00832 
00833     void      arm_maxpool_q7_HWC(q7_t * Im_in,
00834                                  const uint16_t dim_im_in,
00835                                  const uint16_t ch_im_in,
00836                                  const uint16_t dim_kernel,
00837                                  const uint16_t padding,
00838                                  const uint16_t stride, 
00839                                  const uint16_t dim_im_out, 
00840                                  q7_t * bufferA, 
00841                                  q7_t * Im_out);
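
  /*
   * Usage sketch (illustrative assumptions): 2x2 max pooling with stride 2 on
   * a 24x24x16 Q7 feature map. bufferA is assumed to be unused by the max
   * pooling kernel and is passed as NULL; provide a scratch buffer if the
   * implementation in use requires one.
   *
   *   q7_t feat[24 * 24 * 16];
   *   q7_t pooled[12 * 12 * 16];
   *   arm_maxpool_q7_HWC(feat, 24, 16, 2, 0, 2, 12, NULL, pooled);
   */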
00842 
00843   /**
00844    * @brief Q7 average pooling function
00845    * @param[in]       Im_in       pointer to input tensor
00846    * @param[in]       dim_im_in   input tensor dimension
00847    * @param[in]       ch_im_in    number of input tensor channels
00848    * @param[in]       dim_kernel  pooling kernel size
00849    * @param[in]       padding     padding size
00850    * @param[in]       stride      pooling stride
00851    * @param[in]       dim_im_out  output tensor dimension
00852    * @param[in,out]   bufferA     pointer to buffer space for input
00853    * @param[in,out]   Im_out      pointer to output tensor
00854    * @return none.
00855    *
00856    */
00857 
00858     void      arm_avepool_q7_HWC(q7_t * Im_in,
00859                                  const uint16_t dim_im_in,
00860                                  const uint16_t ch_im_in,
00861                                  const uint16_t dim_kernel,
00862                                  const uint16_t padding,
00863                                  const uint16_t stride, 
00864                                  const uint16_t dim_im_out, 
00865                                  q7_t * bufferA, 
00866                                  q7_t * Im_out);
00867 
00868 /**
00869  * @defgroup Softmax Softmax Functions
00870  *
00871  * EXP(2)-based softmax functions
00872  *
00873  */
00874 
00875   /**
00876    * @brief Q7 softmax function
00877    * @param[in]       vec_in      pointer to input vector
00878    * @param[in]       dim_vec     input vector dimension
00879    * @param[out]      p_out       pointer to output vector
00880    * @return none.
00881    *
00882    */
00883 
00884     void      arm_softmax_q7(const q7_t * vec_in, const uint16_t dim_vec, q7_t * p_out);
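
  /*
   * Usage sketch (illustrative): normalizing a 10-element Q7 score vector,
   * e.g. the output of a final fully-connected layer.
   *
   *   q7_t scores[10];
   *   q7_t probs[10];
   *   arm_softmax_q7(scores, 10, probs);
   */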
00885 
00886   /**
00887    * @brief Q15 softmax function
00888    * @param[in]       vec_in      pointer to input vector
00889    * @param[in]       dim_vec     input vector dimension
00890    * @param[out]      p_out       pointer to output vector
00891    * @return none.
00892    *
00893    */
00894 
00895     void      arm_softmax_q15(const q15_t * vec_in, const uint16_t dim_vec, q15_t * p_out);
00896 
00897 #ifdef __cplusplus
00898 }
00899 #endif
00900 
00901 #endif
00902