Important changes to repositories hosted on mbed.com
Mbed hosted mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software download the repository Zip archive or clone locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
Neural Network Convolution Functions
[Neural Network Functions]
Perform convolution layer. More...
Functions | |
arm_status | arm_convolve_1x1_HWC_q7_fast_nonsquare (const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, const uint16_t stride_x, const uint16_t stride_y, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out_x, const uint16_t dim_im_out_y, q15_t *bufferA, q7_t *bufferB) |
Fast Q7 version of 1x1 convolution (non-square shape) | |
arm_status | arm_convolve_HWC_q15_basic (const q15_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q15_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q15_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q15_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB) |
Basic Q15 convolution function. | |
arm_status | arm_convolve_HWC_q15_fast (const q15_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q15_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q15_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q15_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB) |
Fast Q15 convolution function. | |
arm_status | arm_convolve_HWC_q7_basic (const q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB) |
Basic Q7 convolution function. | |
arm_status | arm_convolve_HWC_q7_fast (const q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB) |
Fast Q7 convolution function. | |
arm_status | arm_convolve_HWC_q7_fast_nonsquare (const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, const uint16_t stride_x, const uint16_t stride_y, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out_x, const uint16_t dim_im_out_y, q15_t *bufferA, q7_t *bufferB) |
Fast Q7 convolution function (non-square shape) | |
arm_status | arm_convolve_HWC_q7_RGB (const q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB) |
Q7 convolution function for RGB image. | |
arm_status | arm_depthwise_separable_conv_HWC_q7 (const q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB) |
Q7 depthwise separable convolution function. | |
arm_status | arm_depthwise_separable_conv_HWC_q7_nonsquare (const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, const uint16_t stride_x, const uint16_t stride_y, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out_x, const uint16_t dim_im_out_y, q15_t *bufferA, q7_t *bufferB) |
Q7 depthwise separable convolution function (non-square shape) |
Detailed Description
Perform convolution layer.
The convolution is implemented in 2 steps: im2col and GEMM
im2col is a process of converting each patch of image data into a column. After im2col, the convolution is computed as matrix-matrix multiplication.
To reduce the memory footprint, the im2col is performed partially. In each iteration, only a few columns (i.e., patches) are generated and computed with GEMM kernels similar to the CMSIS-DSP arm_mat_mult functions.
Function Documentation
arm_status arm_convolve_1x1_HWC_q7_fast_nonsquare | ( | const q7_t * | Im_in, |
const uint16_t | dim_im_in_x, | ||
const uint16_t | dim_im_in_y, | ||
const uint16_t | ch_im_in, | ||
const q7_t * | wt, | ||
const uint16_t | ch_im_out, | ||
const uint16_t | dim_kernel_x, | ||
const uint16_t | dim_kernel_y, | ||
const uint16_t | padding_x, | ||
const uint16_t | padding_y, | ||
const uint16_t | stride_x, | ||
const uint16_t | stride_y, | ||
const q7_t * | bias, | ||
const uint16_t | bias_shift, | ||
const uint16_t | out_shift, | ||
q7_t * | Im_out, | ||
const uint16_t | dim_im_out_x, | ||
const uint16_t | dim_im_out_y, | ||
q15_t * | bufferA, | ||
q7_t * | bufferB | ||
) |
Fast Q7 version of 1x1 convolution (non-square shape)
- Parameters:
-
[in] Im_in pointer to input tensor [in] dim_im_in_x input tensor dimension x [in] dim_im_in_y input tensor dimension y [in] ch_im_in number of input tensor channels [in] wt pointer to kernel weights [in] ch_im_out number of filters, i.e., output tensor channels [in] dim_kernel_x filter kernel size x [in] dim_kernel_y filter kernel size y [in] padding_x padding size x [in] padding_y padding size y [in] stride_x convolution stride x [in] stride_y convolution stride y [in] bias pointer to bias [in] bias_shift amount of left-shift for bias [in] out_shift amount of right-shift for output [in,out] Im_out pointer to output tensor [in] dim_im_out_x output tensor dimension x [in] dim_im_out_y output tensor dimension y [in,out] bufferA pointer to buffer space for input [in,out] bufferB pointer to buffer space for output
- Returns:
- The function returns either
ARM_MATH_SIZE_MISMATCH
or ARM_MATH_SUCCESS
based on the outcome of size checking.
This function is optimized for convolution with 1x1 kernel size (i.e., dim_kernel_x=1 and dim_kernel_y=1). It can be used for the second half of MobileNets [1] after depthwise separable convolution.
This function is the version with the full list of optimization tricks, but with some constraints: ch_im_in is a multiple of 4; ch_im_out is a multiple of 2.
[1] MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications https://arxiv.org/abs/1704.04861
Definition at line 81 of file arm_convolve_1x1_HWC_q7_fast_nonsquare.c.
arm_status arm_convolve_HWC_q15_basic | ( | const q15_t * | Im_in, |
const uint16_t | dim_im_in, | ||
const uint16_t | ch_im_in, | ||
const q15_t * | wt, | ||
const uint16_t | ch_im_out, | ||
const uint16_t | dim_kernel, | ||
const uint16_t | padding, | ||
const uint16_t | stride, | ||
const q15_t * | bias, | ||
const uint16_t | bias_shift, | ||
const uint16_t | out_shift, | ||
q15_t * | Im_out, | ||
const uint16_t | dim_im_out, | ||
q15_t * | bufferA, | ||
q7_t * | bufferB | ||
) |
Basic Q15 convolution function.
- Parameters:
-
[in] Im_in pointer to input tensor [in] dim_im_in input tensor dimension [in] ch_im_in number of input tensor channels [in] wt pointer to kernel weights [in] ch_im_out number of filters, i.e., output tensor channels [in] dim_kernel filter kernel size [in] padding padding sizes [in] stride convolution stride [in] bias pointer to bias [in] bias_shift amount of left-shift for bias [in] out_shift amount of right-shift for output [in,out] Im_out pointer to output tensor [in] dim_im_out output tensor dimension [in,out] bufferA pointer to buffer space for input [in,out] bufferB pointer to buffer space for output
- Returns:
- The function returns
ARM_MATH_SUCCESS
Buffer size:
bufferA size: ch_im_in*dim_kernel*dim_kernel
bufferB size: 0
This basic version is designed to work for any input tensor and weight dimension.
Definition at line 75 of file arm_convolve_HWC_q15_basic.c.
arm_status arm_convolve_HWC_q15_fast | ( | const q15_t * | Im_in, |
const uint16_t | dim_im_in, | ||
const uint16_t | ch_im_in, | ||
const q15_t * | wt, | ||
const uint16_t | ch_im_out, | ||
const uint16_t | dim_kernel, | ||
const uint16_t | padding, | ||
const uint16_t | stride, | ||
const q15_t * | bias, | ||
const uint16_t | bias_shift, | ||
const uint16_t | out_shift, | ||
q15_t * | Im_out, | ||
const uint16_t | dim_im_out, | ||
q15_t * | bufferA, | ||
q7_t * | bufferB | ||
) |
Fast Q15 convolution function.
- Parameters:
-
[in] Im_in pointer to input tensor [in] dim_im_in input tensor dimension [in] ch_im_in number of input tensor channels [in] wt pointer to kernel weights [in] ch_im_out number of filters, i.e., output tensor channels [in] dim_kernel filter kernel size [in] padding padding sizes [in] stride convolution stride [in] bias pointer to bias [in] bias_shift amount of left-shift for bias [in] out_shift amount of right-shift for output [in,out] Im_out pointer to output tensor [in] dim_im_out output tensor dimension [in,out] bufferA pointer to buffer space for input [in,out] bufferB pointer to buffer space for output
- Returns:
- The function returns either
ARM_MATH_SIZE_MISMATCH
or ARM_MATH_SUCCESS
based on the outcome of size checking.
Buffer size:
bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
bufferB size: 0
Input dimension constraints:
ch_im_in is multiple of 2
ch_im_out is multiple of 2
Definition at line 80 of file arm_convolve_HWC_q15_fast.c.
arm_status arm_convolve_HWC_q7_basic | ( | const q7_t * | Im_in, |
const uint16_t | dim_im_in, | ||
const uint16_t | ch_im_in, | ||
const q7_t * | wt, | ||
const uint16_t | ch_im_out, | ||
const uint16_t | dim_kernel, | ||
const uint16_t | padding, | ||
const uint16_t | stride, | ||
const q7_t * | bias, | ||
const uint16_t | bias_shift, | ||
const uint16_t | out_shift, | ||
q7_t * | Im_out, | ||
const uint16_t | dim_im_out, | ||
q15_t * | bufferA, | ||
q7_t * | bufferB | ||
) |
Basic Q7 convolution function.
- Parameters:
-
[in] Im_in pointer to input tensor [in] dim_im_in input tensor dimension [in] ch_im_in number of input tensor channels [in] wt pointer to kernel weights [in] ch_im_out number of filters, i.e., output tensor channels [in] dim_kernel filter kernel size [in] padding padding sizes [in] stride convolution stride [in] bias pointer to bias [in] bias_shift amount of left-shift for bias [in] out_shift amount of right-shift for output [in,out] Im_out pointer to output tensor [in] dim_im_out output tensor dimension [in,out] bufferA pointer to buffer space for input [in,out] bufferB pointer to buffer space for output
- Returns:
- The function returns
ARM_MATH_SUCCESS
Buffer size:
bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
bufferB size: 0
This basic version is designed to work for any input tensor and weight dimension.
Definition at line 74 of file arm_convolve_HWC_q7_basic.c.
arm_status arm_convolve_HWC_q7_fast | ( | const q7_t * | Im_in, |
const uint16_t | dim_im_in, | ||
const uint16_t | ch_im_in, | ||
const q7_t * | wt, | ||
const uint16_t | ch_im_out, | ||
const uint16_t | dim_kernel, | ||
const uint16_t | padding, | ||
const uint16_t | stride, | ||
const q7_t * | bias, | ||
const uint16_t | bias_shift, | ||
const uint16_t | out_shift, | ||
q7_t * | Im_out, | ||
const uint16_t | dim_im_out, | ||
q15_t * | bufferA, | ||
q7_t * | bufferB | ||
) |
Fast Q7 convolution function.
- Parameters:
-
[in] Im_in pointer to input tensor [in] dim_im_in input tensor dimension [in] ch_im_in number of input tensor channels [in] wt pointer to kernel weights [in] ch_im_out number of filters, i.e., output tensor channels [in] dim_kernel filter kernel size [in] padding padding sizes [in] stride convolution stride [in] bias pointer to bias [in] bias_shift amount of left-shift for bias [in] out_shift amount of right-shift for output [in,out] Im_out pointer to output tensor [in] dim_im_out output tensor dimension [in,out] bufferA pointer to buffer space for input [in,out] bufferB pointer to buffer space for output
- Returns:
- The function returns either
ARM_MATH_SIZE_MISMATCH
or ARM_MATH_SUCCESS
based on the outcome of size checking.
Buffer size:
bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
bufferB size: 0
Input dimension constraints:
ch_im_in is multiple of 4 ( because of the SIMD32 read and swap )
ch_im_out is multiple of 2 ( because of the 2x2 mat_mult kernel )
The im2col converts the Q7 tensor input into Q15 columns, which are stored in bufferA. Reordering happens during this im2col process with arm_q7_to_q15_reordered_no_shift. For every four elements, the second and third elements are swapped.
The computation kernel arm_nn_mat_mult_kernel_q7_q15_reordered does the GEMM computation with the reordered columns.
To speed-up the determination of the padding condition, we split the computation into 3x3 parts, i.e., {top, mid, bottom} X {left, mid, right}. This reduces the total number of boundary condition checks and improves the data copying performance.
Definition at line 92 of file arm_convolve_HWC_q7_fast.c.
arm_status arm_convolve_HWC_q7_fast_nonsquare | ( | const q7_t * | Im_in, |
const uint16_t | dim_im_in_x, | ||
const uint16_t | dim_im_in_y, | ||
const uint16_t | ch_im_in, | ||
const q7_t * | wt, | ||
const uint16_t | ch_im_out, | ||
const uint16_t | dim_kernel_x, | ||
const uint16_t | dim_kernel_y, | ||
const uint16_t | padding_x, | ||
const uint16_t | padding_y, | ||
const uint16_t | stride_x, | ||
const uint16_t | stride_y, | ||
const q7_t * | bias, | ||
const uint16_t | bias_shift, | ||
const uint16_t | out_shift, | ||
q7_t * | Im_out, | ||
const uint16_t | dim_im_out_x, | ||
const uint16_t | dim_im_out_y, | ||
q15_t * | bufferA, | ||
q7_t * | bufferB | ||
) |
Fast Q7 convolution function (non-square shape)
- Parameters:
-
[in] Im_in pointer to input tensor [in] dim_im_in_x input tensor dimension x [in] dim_im_in_y input tensor dimension y [in] ch_im_in number of input tensor channels [in] wt pointer to kernel weights [in] ch_im_out number of filters, i.e., output tensor channels [in] dim_kernel_x filter kernel size x [in] dim_kernel_y filter kernel size y [in] padding_x padding size x [in] padding_y padding size y [in] stride_x convolution stride x [in] stride_y convolution stride y [in] bias pointer to bias [in] bias_shift amount of left-shift for bias [in] out_shift amount of right-shift for output [in,out] Im_out pointer to output tensor [in] dim_im_out_x output tensor dimension x [in] dim_im_out_y output tensor dimension y [in,out] bufferA pointer to buffer space for input [in,out] bufferB pointer to buffer space for output
- Returns:
- The function returns either
ARM_MATH_SIZE_MISMATCH
or ARM_MATH_SUCCESS
based on the outcome of size checking.
This function is the version with the full list of optimization tricks, but with some constraints: ch_im_in is a multiple of 4; ch_im_out is a multiple of 2.
Definition at line 74 of file arm_convolve_HWC_q7_fast_nonsquare.c.
arm_status arm_convolve_HWC_q7_RGB | ( | const q7_t * | Im_in, |
const uint16_t | dim_im_in, | ||
const uint16_t | ch_im_in, | ||
const q7_t * | wt, | ||
const uint16_t | ch_im_out, | ||
const uint16_t | dim_kernel, | ||
const uint16_t | padding, | ||
const uint16_t | stride, | ||
const q7_t * | bias, | ||
const uint16_t | bias_shift, | ||
const uint16_t | out_shift, | ||
q7_t * | Im_out, | ||
const uint16_t | dim_im_out, | ||
q15_t * | bufferA, | ||
q7_t * | bufferB | ||
) |
Q7 convolution function for RGB image.
Q7 version of convolution for RGB image.
- Parameters:
-
[in] Im_in pointer to input tensor [in] dim_im_in input tensor dimension [in] ch_im_in number of input tensor channels [in] wt pointer to kernel weights [in] ch_im_out number of filters, i.e., output tensor channels [in] dim_kernel filter kernel size [in] padding padding sizes [in] stride convolution stride [in] bias pointer to bias [in] bias_shift amount of left-shift for bias [in] out_shift amount of right-shift for output [in,out] Im_out pointer to output tensor [in] dim_im_out output tensor dimension [in,out] bufferA pointer to buffer space for input [in,out] bufferB pointer to buffer space for output
- Returns:
- The function returns either
ARM_MATH_SIZE_MISMATCH
or ARM_MATH_SUCCESS
based on the outcome of size checking.
Buffer size:
bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
bufferB size: 0
Input dimension constraints:
ch_im_in equals 3
This kernel is written exclusively for convolution with ch_im_in equals 3. This applies on the first layer of CNNs which has input image with RGB format.
Definition at line 80 of file arm_convolve_HWC_q7_RGB.c.
arm_status arm_depthwise_separable_conv_HWC_q7 | ( | const q7_t * | Im_in, |
const uint16_t | dim_im_in, | ||
const uint16_t | ch_im_in, | ||
const q7_t * | wt, | ||
const uint16_t | ch_im_out, | ||
const uint16_t | dim_kernel, | ||
const uint16_t | padding, | ||
const uint16_t | stride, | ||
const q7_t * | bias, | ||
const uint16_t | bias_shift, | ||
const uint16_t | out_shift, | ||
q7_t * | Im_out, | ||
const uint16_t | dim_im_out, | ||
q15_t * | bufferA, | ||
q7_t * | bufferB | ||
) |
Q7 depthwise separable convolution function.
- Parameters:
-
[in] Im_in pointer to input tensor [in] dim_im_in input tensor dimension [in] ch_im_in number of input tensor channels [in] wt pointer to kernel weights [in] ch_im_out number of filters, i.e., output tensor channels [in] dim_kernel filter kernel size [in] padding padding sizes [in] stride convolution stride [in] bias pointer to bias [in] bias_shift amount of left-shift for bias [in] out_shift amount of right-shift for output [in,out] Im_out pointer to output tensor [in] dim_im_out output tensor dimension [in,out] bufferA pointer to buffer space for input [in,out] bufferB pointer to buffer space for output
- Returns:
- The function returns either
ARM_MATH_SIZE_MISMATCH
or ARM_MATH_SUCCESS
based on the outcome of size checking.
Buffer size:
bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
bufferB size: 0
Input dimension constraints:
ch_im_in equals ch_im_out
Implementation: There are 3 nested loop here: Inner loop: calculate each output value with MAC instruction over an accumulator Mid loop: loop over different output channel Outer loop: loop over different output (x, y)
Definition at line 82 of file arm_depthwise_separable_conv_HWC_q7.c.
arm_status arm_depthwise_separable_conv_HWC_q7_nonsquare | ( | const q7_t * | Im_in, |
const uint16_t | dim_im_in_x, | ||
const uint16_t | dim_im_in_y, | ||
const uint16_t | ch_im_in, | ||
const q7_t * | wt, | ||
const uint16_t | ch_im_out, | ||
const uint16_t | dim_kernel_x, | ||
const uint16_t | dim_kernel_y, | ||
const uint16_t | padding_x, | ||
const uint16_t | padding_y, | ||
const uint16_t | stride_x, | ||
const uint16_t | stride_y, | ||
const q7_t * | bias, | ||
const uint16_t | bias_shift, | ||
const uint16_t | out_shift, | ||
q7_t * | Im_out, | ||
const uint16_t | dim_im_out_x, | ||
const uint16_t | dim_im_out_y, | ||
q15_t * | bufferA, | ||
q7_t * | bufferB | ||
) |
Q7 depthwise separable convolution function (non-square shape)
- Parameters:
-
[in] Im_in pointer to input tensor [in] dim_im_in_x input tensor dimension x [in] dim_im_in_y input tensor dimension y [in] ch_im_in number of input tensor channels [in] wt pointer to kernel weights [in] ch_im_out number of filters, i.e., output tensor channels [in] dim_kernel_x filter kernel size x [in] dim_kernel_y filter kernel size y [in] padding_x padding sizes x [in] padding_y padding sizes y [in] stride_x convolution stride x [in] stride_y convolution stride y [in] bias pointer to bias [in] bias_shift amount of left-shift for bias [in] out_shift amount of right-shift for output [in,out] Im_out pointer to output tensor [in] dim_im_out_x output tensor dimension x [in] dim_im_out_y output tensor dimension y [in,out] bufferA pointer to buffer space for input [in,out] bufferB pointer to buffer space for output
- Returns:
- The function returns either
ARM_MATH_SIZE_MISMATCH
or ARM_MATH_SUCCESS
based on the outcome of size checking.
This function is the version with the full list of optimization tricks, but with some constraints: ch_im_in is a multiple of 2; ch_im_out is a multiple of 2.
Definition at line 74 of file arm_depthwise_separable_conv_HWC_q7_nonsquare.c.
Generated on Tue Jul 12 2022 16:47:30 by
