Important changes to repositories hosted on mbed.com
Mbed-hosted Mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software download the repository Zip archive or clone locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
arm_pool_q7_HWC.c
00001 /* 00002 * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved. 00003 * 00004 * SPDX-License-Identifier: Apache-2.0 00005 * 00006 * Licensed under the Apache License, Version 2.0 (the License); you may 00007 * not use this file except in compliance with the License. 00008 * You may obtain a copy of the License at 00009 * 00010 * www.apache.org/licenses/LICENSE-2.0 00011 * 00012 * Unless required by applicable law or agreed to in writing, software 00013 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 00014 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 * See the License for the specific language governing permissions and 00016 * limitations under the License. 00017 */ 00018 00019 /* ---------------------------------------------------------------------- 00020 * Project: CMSIS NN Library 00021 * Title: arm_pool_q7_HWC.c 00022 * Description: Pooling function implementations 00023 * 00024 * $Date: 17. January 2018 00025 * $Revision: V.1.0.0 00026 * 00027 * Target Processor: Cortex-M cores 00028 * 00029 * -------------------------------------------------------------------- */ 00030 00031 #include "arm_math.h" 00032 #include "arm_nnfunctions.h" 00033 00034 #if defined (ARM_MATH_DSP) 00035 00036 /** 00037 * @brief A few utility functions used by pooling functions 00038 * 00039 * 00040 */ 00041 00042 static void buffer_scale_back_q15_to_q7(q15_t * buffer, q7_t * target, uint16_t length, uint16_t scale) 00043 { 00044 int i; 00045 00046 for (i = 0; i < length; i++) 00047 { 00048 target[i] = (q7_t) (buffer[i] / scale); 00049 } 00050 } 00051 00052 static void compare_and_replace_if_larger_q7(q7_t * base, // base data 00053 q7_t * target, // compare target 00054 const uint16_t length // data length 00055 ) 00056 { 00057 q7_t *pIn = base; 00058 q7_t *pCom = target; 00059 union arm_nnword in; 00060 union arm_nnword com; 00061 uint16_t cnt = length >> 2; 00062 00063 while (cnt > 0u) 00064 { 00065 
in.word = *__SIMD32(pIn); 00066 com.word = *__SIMD32(pCom)++; 00067 00068 // if version 00069 if (com.bytes[0] > in.bytes[0]) 00070 in.bytes[0] = com.bytes[0]; 00071 if (com.bytes[1] > in.bytes[1]) 00072 in.bytes[1] = com.bytes[1]; 00073 if (com.bytes[2] > in.bytes[2]) 00074 in.bytes[2] = com.bytes[2]; 00075 if (com.bytes[3] > in.bytes[3]) 00076 in.bytes[3] = com.bytes[3]; 00077 00078 *__SIMD32(pIn)++ = in.word; 00079 00080 cnt--; 00081 } 00082 } 00083 00084 static void accumulate_q7_to_q15(q15_t * base, q7_t * target, const uint16_t length) 00085 { 00086 q15_t *pCnt = base; 00087 q7_t *pV = target; 00088 q31_t v1, v2, vo1, vo2; 00089 uint16_t cnt = length >> 2; 00090 q31_t in; 00091 00092 while (cnt > 0u) 00093 { 00094 q31_t value = *__SIMD32(pV)++; 00095 v1 = __SXTB16(__ROR(value, 8)); 00096 v2 = __SXTB16(value); 00097 #ifndef ARM_MATH_BIG_ENDIAN 00098 00099 vo2 = __PKHTB(v1, v2, 16); 00100 vo1 = __PKHBT(v2, v1, 16); 00101 00102 #else 00103 00104 vo1 = __PKHTB(v1, v2, 16); 00105 vo2 = __PKHBT(v2, v1, 16); 00106 00107 #endif 00108 00109 in = *__SIMD32(pCnt); 00110 *__SIMD32(pCnt)++ = __QADD16(vo1, in); 00111 00112 in = *__SIMD32(pCnt); 00113 *__SIMD32(pCnt)++ = __QADD16(vo2, in); 00114 00115 cnt--; 00116 } 00117 cnt = length & 0x3; 00118 while (cnt > 0u) 00119 { 00120 *pCnt++ += *pV++; 00121 cnt--; 00122 } 00123 } 00124 00125 #endif // ARM_MATH_DSP 00126 00127 /** 00128 * @ingroup groupNN 00129 */ 00130 00131 /** 00132 * @addtogroup Pooling 00133 * @{ 00134 */ 00135 00136 /** 00137 * @brief Q7 max pooling function 00138 * @param[in, out] Im_in pointer to input tensor 00139 * @param[in] dim_im_in input tensor dimention 00140 * @param[in] ch_im_in number of input tensor channels 00141 * @param[in] dim_kernel filter kernel size 00142 * @param[in] padding padding sizes 00143 * @param[in] stride convolution stride 00144 * @param[in] dim_im_out output tensor dimension 00145 * @param[in,out] bufferA pointer to buffer space for input 00146 * @param[in,out] Im_out pointer 
to output tensor 00147 * @return none. 00148 * 00149 * @details 00150 * 00151 * <b>Buffer size:</b> 00152 * 00153 * bufferA size: 0 00154 * 00155 * The pooling function is implemented as split x-pooling then 00156 * y-pooling. 00157 * 00158 * This pooling function is input-destructive. Input data is undefined 00159 * after calling this function. 00160 * 00161 */ 00162 00163 void 00164 arm_maxpool_q7_HWC(q7_t * Im_in, 00165 const uint16_t dim_im_in, 00166 const uint16_t ch_im_in, 00167 const uint16_t dim_kernel, 00168 const uint16_t padding, 00169 const uint16_t stride, const uint16_t dim_im_out, q7_t * bufferA, q7_t * Im_out) 00170 { 00171 00172 #if defined (ARM_MATH_DSP) 00173 /* Run the following code for Cortex-M4 and Cortex-M7 */ 00174 00175 int16_t i_x, i_y; 00176 00177 /* first does the pooling along x axis */ 00178 for (i_y = 0; i_y < dim_im_in; i_y++) 00179 { 00180 00181 for (i_x = 0; i_x < dim_im_out; i_x++) 00182 { 00183 /* for each output pixel */ 00184 q7_t *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in; 00185 q7_t *win_start; 00186 q7_t *win_stop; 00187 if (i_x * stride - padding < 0) 00188 { 00189 win_start = target; 00190 } else 00191 { 00192 win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in; 00193 } 00194 00195 if (i_x * stride - padding + dim_kernel >= dim_im_in) 00196 { 00197 win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in; 00198 } else 00199 { 00200 win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in; 00201 } 00202 00203 /* first step is to copy over initial data */ 00204 /* arm_copy_q7(win_start, target, ch_im_in); */ 00205 memmove(target, win_start, ch_im_in); 00206 00207 /* start the max operation from the second part */ 00208 win_start += ch_im_in; 00209 for (; win_start < win_stop; win_start += ch_im_in) 00210 { 00211 compare_and_replace_if_larger_q7(target, win_start, ch_im_in); 00212 } 00213 } 00214 } 00215 00216 /* then does the pooling along y axis */ 00217 for 
(i_y = 0; i_y < dim_im_out; i_y++) 00218 { 00219 00220 /* for each output row */ 00221 q7_t *target = Im_out + i_y * dim_im_out * ch_im_in; 00222 q7_t *row_start; 00223 q7_t *row_end; 00224 /* setting the starting row */ 00225 if (i_y * stride - padding < 0) 00226 { 00227 row_start = Im_in; 00228 } else 00229 { 00230 row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in; 00231 } 00232 /* setting the stopping row */ 00233 if (i_y * stride - padding + dim_kernel >= dim_im_in) 00234 { 00235 row_end = Im_in + dim_im_in * dim_im_in * ch_im_in; 00236 } else 00237 { 00238 row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in; 00239 } 00240 00241 /* copy over the first row */ 00242 /* arm_copy_q7(row_start, target, dim_im_out * ch_im_in); */ 00243 memmove(target, row_start, dim_im_out * ch_im_in); 00244 00245 /* move over to next row */ 00246 row_start += ch_im_in * dim_im_in; 00247 00248 for (; row_start < row_end; row_start += dim_im_in * ch_im_in) 00249 { 00250 compare_and_replace_if_larger_q7(target, row_start, dim_im_out * ch_im_in); 00251 } 00252 } 00253 00254 #else 00255 /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ 00256 00257 int16_t i_ch_in, i_x, i_y; 00258 int16_t k_x, k_y; 00259 00260 for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) 00261 { 00262 for (i_y = 0; i_y < dim_im_out; i_y++) 00263 { 00264 for (i_x = 0; i_x < dim_im_out; i_x++) 00265 { 00266 int max = -129; 00267 for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++) 00268 { 00269 for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++) 00270 { 00271 if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in) 00272 { 00273 if (Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)] > max) 00274 { 00275 max = Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)]; 00276 } 00277 } 00278 } 00279 } 00280 Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = max; 00281 } 
00282 } 00283 } 00284 00285 #endif /* ARM_MATH_DSP */ 00286 00287 } 00288 00289 /** 00290 * @brief Q7 average pooling function 00291 * @param[in,out] Im_in pointer to input tensor 00292 * @param[in] dim_im_in input tensor dimention 00293 * @param[in] ch_im_in number of input tensor channels 00294 * @param[in] dim_kernel filter kernel size 00295 * @param[in] padding padding sizes 00296 * @param[in] stride convolution stride 00297 * @param[in] dim_im_out output tensor dimension 00298 * @param[in,out] bufferA pointer to buffer space for input 00299 * @param[in,out] Im_out pointer to output tensor 00300 * @return none. 00301 * 00302 * @details 00303 * 00304 * <b>Buffer size:</b> 00305 * 00306 * bufferA size: 2*dim_im_out*ch_im_in 00307 * 00308 * The pooling function is implemented as split x-pooling then 00309 * y-pooling. 00310 * 00311 * This pooling function is input-destructive. Input data is undefined 00312 * after calling this function. 00313 * 00314 */ 00315 00316 void 00317 arm_avepool_q7_HWC(q7_t * Im_in, 00318 const uint16_t dim_im_in, 00319 const uint16_t ch_im_in, 00320 const uint16_t dim_kernel, 00321 const uint16_t padding, 00322 const uint16_t stride, const uint16_t dim_im_out, q7_t * bufferA, q7_t * Im_out) 00323 { 00324 00325 #if defined (ARM_MATH_DSP) 00326 /* Run the following code for Cortex-M4 and Cortex-M7 */ 00327 00328 q15_t *buffer = (q15_t *) bufferA; 00329 int16_t i_x, i_y; 00330 int16_t count = 0; 00331 00332 /* first does the pooling along x axis */ 00333 for (i_y = 0; i_y < dim_im_in; i_y++) 00334 { 00335 00336 for (i_x = 0; i_x < dim_im_out; i_x++) 00337 { 00338 /* for each output pixel */ 00339 q7_t *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in; 00340 q7_t *win_start; 00341 q7_t *win_stop; 00342 if (i_x * stride - padding < 0) 00343 { 00344 win_start = target; 00345 } else 00346 { 00347 win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in; 00348 } 00349 00350 if (i_x * stride - padding + dim_kernel >= 
dim_im_in) 00351 { 00352 win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in; 00353 } else 00354 { 00355 win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in; 00356 } 00357 00358 /* first step is to copy over initial data */ 00359 arm_q7_to_q15_no_shift(win_start, buffer, ch_im_in); 00360 count = 1; 00361 00362 /* start the max operation from the second part */ 00363 win_start += ch_im_in; 00364 for (; win_start < win_stop; win_start += ch_im_in) 00365 { 00366 accumulate_q7_to_q15(buffer, win_start, ch_im_in); 00367 count++; 00368 } 00369 buffer_scale_back_q15_to_q7(buffer, target, ch_im_in, count); 00370 } 00371 } 00372 00373 /* then does the pooling along y axis */ 00374 for (i_y = 0; i_y < dim_im_out; i_y++) 00375 { 00376 /* for each output row */ 00377 q7_t *target = Im_out + i_y * dim_im_out * ch_im_in; 00378 q7_t *row_start; 00379 q7_t *row_end; 00380 /* setting the starting row */ 00381 if (i_y * stride - padding < 0) 00382 { 00383 row_start = Im_in; 00384 } else 00385 { 00386 row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in; 00387 } 00388 /* setting the stopping row */ 00389 if (i_y * stride - padding + dim_kernel >= dim_im_in) 00390 { 00391 row_end = Im_in + dim_im_in * dim_im_in * ch_im_in; 00392 } else 00393 { 00394 row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in; 00395 } 00396 00397 /* copy over the first row */ 00398 arm_q7_to_q15_no_shift(row_start, buffer, dim_im_out * ch_im_in); 00399 count = 1; 00400 00401 /* move over to next row */ 00402 row_start += ch_im_in * dim_im_in; 00403 00404 for (; row_start < row_end; row_start += dim_im_in * ch_im_in) 00405 { 00406 accumulate_q7_to_q15(buffer, row_start, dim_im_out * ch_im_in); 00407 count++; 00408 } 00409 buffer_scale_back_q15_to_q7(buffer, target, dim_im_out * ch_im_in, count); 00410 } 00411 00412 #else 00413 /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ 00414 00415 
int16_t i_ch_in, i_x, i_y; 00416 int16_t k_x, k_y; 00417 00418 for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) 00419 { 00420 for (i_y = 0; i_y < dim_im_out; i_y++) 00421 { 00422 for (i_x = 0; i_x < dim_im_out; i_x++) 00423 { 00424 int sum = 0; 00425 int count = 0; 00426 for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++) 00427 { 00428 for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++) 00429 { 00430 if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in) 00431 { 00432 sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)]; 00433 count++; 00434 } 00435 } 00436 } 00437 Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = sum / count; 00438 } 00439 } 00440 } 00441 00442 #endif /* ARM_MATH_DSP */ 00443 00444 } 00445 00446 /** 00447 * @} end of Pooling group 00448 */ 00449
Generated on Tue Jul 12 2022 16:47:27 by
