Added CMSIS5 DSP and NN folder. Needs some work.
Embed:
(wiki syntax)
Show/hide line numbers
arm_convolve_HWC_q7_fast_nonsquare.c
00001 /* 00002 * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved. 00003 * 00004 * SPDX-License-Identifier: Apache-2.0 00005 * 00006 * Licensed under the Apache License, Version 2.0 (the License); you may 00007 * not use this file except in compliance with the License. 00008 * You may obtain a copy of the License at 00009 * 00010 * www.apache.org/licenses/LICENSE-2.0 00011 * 00012 * Unless required by applicable law or agreed to in writing, software 00013 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 00014 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 * See the License for the specific language governing permissions and 00016 * limitations under the License. 00017 */ 00018 00019 /* ---------------------------------------------------------------------- 00020 * Project: CMSIS NN Library 00021 * Title: arm_convolve_HWC_q7_fast_nonsquare.c 00022 * Description: Fast Q7 version of convolution (non-sqaure shape) 00023 * 00024 * $Date: 17. 
January 2018 00025 * $Revision: V.1.0.0 00026 * 00027 * Target Processor: Cortex-M cores 00028 * 00029 * -------------------------------------------------------------------- */ 00030 00031 #include "arm_math.h" 00032 #include "arm_nnfunctions.h" 00033 00034 /** 00035 * @ingroup groupNN 00036 */ 00037 00038 /** 00039 * @addtogroup NNConv 00040 * @{ 00041 */ 00042 00043 /** 00044 * @brief Fast Q7 convolution function (non-sqaure shape) 00045 * @param[in] Im_in pointer to input tensor 00046 * @param[in] dim_im_in_x input tensor dimention x 00047 * @param[in] dim_im_in_y input tensor dimention y 00048 * @param[in] ch_im_in number of input tensor channels 00049 * @param[in] wt pointer to kernel weights 00050 * @param[in] ch_im_out number of filters, i.e., output tensor channels 00051 * @param[in] dim_kernel_x filter kernel size x 00052 * @param[in] dim_kernel_y filter kernel size y 00053 * @param[in] padding_x padding size x 00054 * @param[in] padding_y padding size y 00055 * @param[in] stride_x convolution stride x 00056 * @param[in] stride_y convolution stride y 00057 * @param[in] bias pointer to bias 00058 * @param[in] bias_shift amount of left-shift for bias 00059 * @param[in] out_shift amount of right-shift for output 00060 * @param[in,out] Im_out pointer to output tensor 00061 * @param[in] dim_im_out_x output tensor dimension x 00062 * @param[in] dim_im_out_y output tensor dimension y 00063 * @param[in,out] bufferA pointer to buffer space for input 00064 * @param[in,out] bufferB pointer to buffer space for output 00065 * @return The function returns either 00066 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. 
00067 * 00068 * This function is the version with full list of optimization tricks, but with 00069 * some contraints: 00070 * ch_im_in is multiple of 4 00071 * ch_im_out is multiple of 2 00072 */ 00073 00074 arm_status arm_convolve_HWC_q7_fast_nonsquare(const q7_t * Im_in, 00075 const uint16_t dim_im_in_x, 00076 const uint16_t dim_im_in_y, 00077 const uint16_t ch_im_in, 00078 const q7_t * wt, 00079 const uint16_t ch_im_out, 00080 const uint16_t dim_kernel_x, 00081 const uint16_t dim_kernel_y, 00082 const uint16_t padding_x, 00083 const uint16_t padding_y, 00084 const uint16_t stride_x, 00085 const uint16_t stride_y, 00086 const q7_t * bias, 00087 const uint16_t bias_shift, 00088 const uint16_t out_shift, 00089 q7_t * Im_out, 00090 const uint16_t dim_im_out_x, 00091 const uint16_t dim_im_out_y, 00092 q15_t * bufferA, 00093 q7_t * bufferB) 00094 { 00095 00096 #if defined (ARM_MATH_DSP) 00097 /* Run the following code for Cortex-M4 and Cortex-M7 */ 00098 00099 int16_t i_out_y, i_out_x, i_ker_y, i_ker_x; 00100 00101 /* ----------------------- 00102 * Here we use bufferA as q15_t internally as computation are done with q15_t level 00103 * im2col are done to output in q15_t format from q7_t input 00104 */ 00105 00106 q15_t *pBuffer = bufferA; 00107 q7_t *pOut = Im_out; 00108 00109 if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0) 00110 { 00111 /* check if the input dimension meets the constraints */ 00112 return ARM_MATH_SIZE_MISMATCH; 00113 } 00114 00115 /* 00116 * Here we split the entire matrix into three regions depending on the padding situation 00117 * Top: i_out_y from 0 to padding - 1 00118 * Middle: i_out_y from padding to dim_im_out-padding-1 00119 * Bottom: i_out_y from dim_im_out-padding to dim_im_out-1 00120 */ 00121 00122 /* top part */ 00123 for (i_out_y = 0; i_out_y < padding_y; i_out_y++) 00124 { 00125 for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) 00126 { 00127 /* This part implements the im2col function */ 00128 for (i_ker_y = i_out_y * stride_y - 
padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; 00129 i_ker_y++) 00130 { 00131 for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x; 00132 i_ker_x++) 00133 { 00134 if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x) 00135 { 00136 /* arm_fill_q15(0, pBuffer, ch_im_in); */ 00137 memset(pBuffer, 0, sizeof(q15_t)*ch_im_in); 00138 } else 00139 { 00140 arm_q7_to_q15_reordered_no_shift((q7_t *) Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, 00141 pBuffer, ch_im_in); 00142 } 00143 pBuffer += ch_im_in; 00144 } 00145 } 00146 00147 if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y) 00148 { 00149 pOut = 00150 arm_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, 00151 bias_shift, out_shift, bias, pOut); 00152 /* counter reset */ 00153 pBuffer = bufferA; 00154 } 00155 } 00156 } 00157 00158 /* middle part, here we also divide the x into left, mid and right */ 00159 for (; i_out_y < dim_im_out_y - padding_y; i_out_y++) 00160 { 00161 00162 /* left part */ 00163 for (i_out_x = 0; i_out_x < padding_x; i_out_x++) 00164 { 00165 /* This part implements the im2col function */ 00166 for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; 00167 i_ker_y++) 00168 { 00169 for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x; 00170 i_ker_x++) 00171 { 00172 if (i_ker_x < 0 || i_ker_x >= dim_im_in_x) 00173 { 00174 /* arm_fill_q15(0, pBuffer, ch_im_in); */ 00175 memset(pBuffer, 0, sizeof(q15_t)*ch_im_in); 00176 } else 00177 { 00178 arm_q7_to_q15_reordered_no_shift((q7_t *) Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, 00179 pBuffer, ch_im_in); 00180 } 00181 pBuffer += ch_im_in; 00182 } 00183 } 00184 00185 if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y) 00186 { 00187 pOut = 00188 
arm_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, 00189 bias_shift, out_shift, bias, pOut); 00190 /* counter reset */ 00191 pBuffer = bufferA; 00192 } 00193 } 00194 00195 /* mid part */ 00196 for (; i_out_x < dim_im_out_x - padding_x; i_out_x++) 00197 { 00198 /* This part implements the im2col function */ 00199 for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; 00200 i_ker_y++) 00201 { 00202 arm_q7_to_q15_reordered_no_shift((q7_t *) Im_in + 00203 (i_ker_y * dim_im_in_x + i_out_x * stride_x - padding_x) * ch_im_in, 00204 pBuffer, ch_im_in * dim_kernel_x); 00205 pBuffer += ch_im_in * dim_kernel_x; 00206 } 00207 00208 if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y) 00209 { 00210 pOut = 00211 arm_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, 00212 bias_shift, out_shift, bias, pOut); 00213 /* counter reset */ 00214 pBuffer = bufferA; 00215 } 00216 } 00217 00218 /* right part */ 00219 for (; i_out_x < dim_im_out_x; i_out_x++) 00220 { 00221 /* This part implements the im2col function */ 00222 for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; 00223 i_ker_y++) 00224 { 00225 for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x; 00226 i_ker_x++) 00227 { 00228 if (i_ker_x < 0 || i_ker_x >= dim_im_in_x) 00229 { 00230 /* arm_fill_q15(0, pBuffer, ch_im_in); */ 00231 memset(pBuffer, 0, sizeof(q15_t)*ch_im_in); 00232 } else 00233 { 00234 arm_q7_to_q15_reordered_no_shift((q7_t *) Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, 00235 pBuffer, ch_im_in); 00236 } 00237 pBuffer += ch_im_in; 00238 } 00239 } 00240 00241 if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y) 00242 { 00243 pOut = 00244 arm_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * 
dim_kernel_y, 00245 bias_shift, out_shift, bias, pOut); 00246 /* counter reset */ 00247 pBuffer = bufferA; 00248 } 00249 } 00250 } 00251 00252 for (; i_out_y < dim_im_out_y; i_out_y++) 00253 { 00254 for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) 00255 { 00256 /* This part implements the im2col function */ 00257 for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; 00258 i_ker_y++) 00259 { 00260 for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x; 00261 i_ker_x++) 00262 { 00263 if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x) 00264 { 00265 /* arm_fill_q15(0, pBuffer, ch_im_in); */ 00266 memset(pBuffer, 0, sizeof(q15_t)*ch_im_in); 00267 } else 00268 { 00269 arm_q7_to_q15_reordered_no_shift((q7_t *) Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, 00270 pBuffer, ch_im_in); 00271 } 00272 pBuffer += ch_im_in; 00273 } 00274 } 00275 00276 if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y) 00277 { 00278 pOut = 00279 arm_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, 00280 bias_shift, out_shift, bias, pOut); 00281 /* counter reset */ 00282 pBuffer = bufferA; 00283 } 00284 } 00285 } 00286 00287 /* check if there is left-over for compute */ 00288 if (pBuffer != bufferA) 00289 { 00290 const q7_t *pA = wt; 00291 int i; 00292 for (i = 0; i < ch_im_out; i++) 00293 { 00294 q31_t sum = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); 00295 q15_t *pB = bufferA; 00296 /* basically each time it process 4 entries */ 00297 uint16_t colCnt = ch_im_in * dim_kernel_x * dim_kernel_y >> 2; 00298 00299 while (colCnt) 00300 { 00301 00302 q31_t inA1, inA2; 00303 q31_t inB1, inB2; 00304 00305 pA = (const q7_t *)read_and_pad_reordered((void *)pA, &inA1, &inA2); 00306 00307 inB1 = *__SIMD32(pB)++; 00308 sum = __SMLAD(inA1, inB1, sum); 00309 inB2 = *__SIMD32(pB)++; 00310 sum = 
__SMLAD(inA2, inB2, sum); 00311 00312 colCnt--; 00313 } 00314 colCnt = (ch_im_in * dim_kernel_y * dim_kernel_x) & 0x3; 00315 while (colCnt) 00316 { 00317 q7_t inA1 = *pA++; 00318 q15_t inB1 = *pB++; 00319 sum += inA1 * inB1; 00320 colCnt--; 00321 } 00322 *pOut = (q7_t) __SSAT((sum >> out_shift), 8); 00323 pOut++; 00324 00325 } 00326 00327 } 00328 00329 #else 00330 /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ 00331 int i, j, k, l, m, n; 00332 int conv_out; 00333 int in_row, in_col; 00334 00335 if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0) 00336 { 00337 /* check if the input dimension meets the constraints */ 00338 return ARM_MATH_SIZE_MISMATCH; 00339 } 00340 00341 for (i = 0; i < ch_im_out; i++) 00342 { 00343 for (j = 0; j < dim_im_out_y; j++) 00344 { 00345 for (k = 0; k < dim_im_out_x; k++) 00346 { 00347 conv_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); 00348 for (m = 0; m < dim_kernel_y; m++) 00349 { 00350 for (n = 0; n < dim_kernel_x; n++) 00351 { 00352 /* if-for implementation */ 00353 in_row = stride_y * j + m - padding_y; 00354 in_col = stride_x * k + n - padding_x; 00355 if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x) 00356 { 00357 for (l = 0; l < ch_im_in; l++) 00358 { 00359 conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] * 00360 wt[i * ch_im_in * dim_kernel_y * dim_kernel_x + (m * dim_kernel_x + n) * ch_im_in + l]; 00361 } 00362 } 00363 } 00364 } 00365 Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q7_t) __SSAT((conv_out >> out_shift), 8); 00366 } 00367 } 00368 } 00369 00370 00371 #endif /* ARM_MATH_DSP */ 00372 00373 /* Return to application */ 00374 return ARM_MATH_SUCCESS; 00375 } 00376 00377 /** 00378 * @} end of NNConv group 00379 */ 00380
Generated on Tue Jul 12 2022 16:46:23 by 1.7.2