fully_connected.h
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_

#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/round.h"
#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {
namespace reference_ops {

const int kReverseShift = -1;

inline void FullyConnected(
    const FullyConnectedParams& params, const RuntimeShape& input_shape,
    const float* input_data, const RuntimeShape& weights_shape,
    const float* weights_data, const RuntimeShape& bias_shape,
    const float* bias_data, const RuntimeShape& output_shape,
    float* output_data) {
  const float output_activation_min = params.float_activation_min;
  const float output_activation_max = params.float_activation_max;
  // TODO(benoitjacob): This really should be:
  //     const int batches = ArraySize(output_dims, 1);
  // but the current --variable_batch hack consists in overwriting the 3rd
  // dimension with the runtime batch size, as we don't keep track for each
  // array of which dimension is the batch dimension in it.
  const int output_dims_count = output_shape.DimensionsCount();
  const int weights_dims_count = weights_shape.DimensionsCount();
  const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1);
  const int output_depth = MatchingDim(weights_shape, weights_dims_count - 2,
                                       output_shape, output_dims_count - 1);
  const int accum_depth = weights_shape.Dims(weights_dims_count - 1);
  for (int b = 0; b < batches; ++b) {
    for (int out_c = 0; out_c < output_depth; ++out_c) {
      float total = 0.f;
      for (int d = 0; d < accum_depth; ++d) {
        total += input_data[b * accum_depth + d] *
                 weights_data[out_c * accum_depth + d];
      }
      float bias_value = 0.0f;
      if (bias_data) {
        bias_value = bias_data[out_c];
      }
      output_data[out_c + output_depth * b] = ActivationFunctionWithMinMax(
          total + bias_value, output_activation_min, output_activation_max);
    }
  }
}

inline void FullyConnected(
    const FullyConnectedParams& params, const RuntimeShape& input_shape,
    const uint8* input_data, const RuntimeShape& filter_shape,
    const uint8* filter_data, const RuntimeShape& bias_shape,
    const int32* bias_data, const RuntimeShape& output_shape,
    uint8* output_data) {
  const int32 input_offset = params.input_offset;
  const int32 filter_offset = params.weights_offset;
  const int32 output_offset = params.output_offset;
  const int32 output_multiplier = params.output_multiplier;
  const int output_shift = params.output_shift;
  const int32 output_activation_min = params.quantized_activation_min;
  const int32 output_activation_max = params.quantized_activation_max;
  TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
  TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);

  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  // TODO(benoitjacob): This really should be:
  //     const int batches = ArraySize(output_dims, 1);
  // but the current --variable_batch hack consists in overwriting the 3rd
  // dimension with the runtime batch size, as we don't keep track for each
  // array of which dimension is the batch dimension in it.
  const int output_dim_count = output_shape.DimensionsCount();
  const int filter_dim_count = filter_shape.DimensionsCount();
  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
  const int output_depth = MatchingDim(filter_shape, filter_dim_count - 2,
                                       output_shape, output_dim_count - 1);
  const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
  for (int b = 0; b < batches; ++b) {
    for (int out_c = 0; out_c < output_depth; ++out_c) {
      int32 acc = 0;
      for (int d = 0; d < accum_depth; ++d) {
        int32 input_val = input_data[b * accum_depth + d];
        int32 filter_val = filter_data[out_c * accum_depth + d];
        acc += (filter_val + filter_offset) * (input_val + input_offset);
      }
      if (bias_data) {
        acc += bias_data[out_c];
      }
      acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
      acc += output_offset;
      acc = std::max(acc, output_activation_min);
      acc = std::min(acc, output_activation_max);
      output_data[out_c + output_depth * b] = static_cast<uint8>(acc);
    }
  }
}

inline void FullyConnected(
    const FullyConnectedParams& params, const RuntimeShape& input_shape,
    const uint8* input_data, const RuntimeShape& filter_shape,
    const uint8* filter_data, const RuntimeShape& bias_shape,
    const int32* bias_data, const RuntimeShape& output_shape,
    int16* output_data) {
  const int32 input_offset = params.input_offset;
  const int32 filter_offset = params.weights_offset;
  const int32 output_offset = params.output_offset;
  const int32 output_multiplier = params.output_multiplier;
  const int output_shift = params.output_shift;
  const int32 output_activation_min = params.quantized_activation_min;
  const int32 output_activation_max = params.quantized_activation_max;

  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  TFLITE_DCHECK_EQ(output_offset, 0);
  // TODO(benoitjacob): This really should be:
  //     const int batches = ArraySize(output_dims, 1);
  // but the current --variable_batch hack consists in overwriting the 3rd
  // dimension with the runtime batch size, as we don't keep track for each
  // array of which dimension is the batch dimension in it.
  const int output_dim_count = output_shape.DimensionsCount();
  const int filter_dim_count = filter_shape.DimensionsCount();
  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
  const int output_depth = MatchingDim(filter_shape, filter_dim_count - 2,
                                       output_shape, output_dim_count - 1);
  const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
  for (int b = 0; b < batches; ++b) {
    for (int out_c = 0; out_c < output_depth; ++out_c) {
      // Internal accumulation.
      // Initialize accumulator with the bias-value.
      int32 accum = bias_data[out_c];
      // Accumulation loop.
      for (int d = 0; d < accum_depth; ++d) {
        int16 input_val = input_data[b * accum_depth + d] + input_offset;
        int16 filter_val = filter_data[out_c * accum_depth + d] + filter_offset;
        accum += filter_val * input_val;
      }
      // Down-scale the final int32 accumulator to the scale used by our
      // (16-bit, typically 3 integer bits) fixed-point format. The quantized
      // multiplier and shift here have been pre-computed offline
      // (e.g. by toco).
      accum =
          MultiplyByQuantizedMultiplier(accum, output_multiplier, output_shift);
      // Saturate, cast to int16, and store to output array.
      accum = std::max(accum, output_activation_min - output_offset);
      accum = std::min(accum, output_activation_max - output_offset);
      accum += output_offset;
      output_data[out_c + output_depth * b] = accum;
    }
  }
}

inline void ShuffledFullyConnected(
    const FullyConnectedParams& params, const RuntimeShape& input_shape,
    const uint8* input_data, const RuntimeShape& weights_shape,
    const uint8* shuffled_weights_data, const RuntimeShape& bias_shape,
    const int32* bias_data, const RuntimeShape& output_shape,
    int16* output_data, uint8* shuffled_input_workspace_data) {
  const int32 output_multiplier = params.output_multiplier;
  const int output_shift = params.output_shift;
  const int32 output_activation_min = params.quantized_activation_min;
  const int32 output_activation_max = params.quantized_activation_max;
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);

  TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1);
  TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
  TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
  // TODO(benoitjacob): This really should be:
  //     const int batches = ArraySize(output_dims, 1);
  // but the current --variable_batch hack consists in overwriting the 3rd
  // dimension with the runtime batch size, as we don't keep track for each
  // array of which dimension is the batch dimension in it.
  const int output_dim_count = output_shape.DimensionsCount();
  const int weights_dim_count = weights_shape.DimensionsCount();
  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
  const int output_depth = MatchingDim(weights_shape, weights_dim_count - 2,
                                       output_shape, output_dim_count - 1);
  const int accum_depth = weights_shape.Dims(weights_dim_count - 1);
  TFLITE_DCHECK((accum_depth % 16) == 0);
  TFLITE_DCHECK((output_depth % 4) == 0);

  // Shuffling and xoring of input activations into the workspace buffer
  uint8* shuffled_input_workspace_ptr = shuffled_input_workspace_data;
  if (batches == 1) {
    for (int i = 0; i < accum_depth; i++) {
      shuffled_input_workspace_data[i] = input_data[i] ^ 0x80;
    }
  } else if (batches == 4) {
    for (int c = 0; c < accum_depth; c += 16) {
      for (int b = 0; b < 4; b++) {
        const uint8* src_data_ptr = input_data + b * accum_depth + c;
        for (int j = 0; j < 16; j++) {
          uint8 src_val = *src_data_ptr++;
          // Flip the sign bit, so that the kernel will only need to
          // reinterpret these uint8 values as int8, getting for free the
          // subtraction of the zero_point value 128.
          uint8 dst_val = src_val ^ 0x80;
          *shuffled_input_workspace_ptr++ = dst_val;
        }
      }
    }
  } else {
    TFLITE_DCHECK(false);
    return;
  }

  // Actual computation
  if (batches == 1) {
    int16* output_ptr = output_data;
    // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
    // so that just reinterpreting them as int8 values is equivalent to
    // subtracting 128 from them, thus implementing for free the subtraction of
    // the zero_point value 128.
    const int8* shuffled_weights_ptr =
        reinterpret_cast<const int8*>(shuffled_weights_data);
    // Likewise, we preshuffled and pre-xored the input data above.
    const int8* shuffled_input_data =
        reinterpret_cast<const int8*>(shuffled_input_workspace_data);
    for (int c = 0; c < output_depth; c += 4) {
      // Internal accumulation: accumulators start at zero; the bias is
      // added after the accumulation loop.
      int32 accum[4] = {0};
      // Accumulation loop.
      for (int d = 0; d < accum_depth; d += 16) {
        for (int i = 0; i < 4; i++) {
          for (int j = 0; j < 16; j++) {
            int8 input_val = shuffled_input_data[d + j];
            int8 weights_val = *shuffled_weights_ptr++;
            accum[i] += weights_val * input_val;
          }
        }
      }
      for (int i = 0; i < 4; i++) {
        // Add bias value
        int32 acc = accum[i] + bias_data[c + i];
        // Down-scale the final int32 accumulator to the scale used by our
        // (16-bit, typically 3 integer bits) fixed-point format. The quantized
        // multiplier and shift here have been pre-computed offline
        // (e.g. by toco).
        acc =
            MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
        // Saturate, cast to int16, and store to output array.
        acc = std::max(acc, output_activation_min);
        acc = std::min(acc, output_activation_max);
        output_ptr[c + i] = acc;
      }
    }
  } else if (batches == 4) {
    int16* output_ptr = output_data;
    // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
    // so that just reinterpreting them as int8 values is equivalent to
    // subtracting 128 from them, thus implementing for free the subtraction of
    // the zero_point value 128.
    const int8* shuffled_weights_ptr =
        reinterpret_cast<const int8*>(shuffled_weights_data);
    // Likewise, we preshuffled and pre-xored the input data above.
    const int8* shuffled_input_data =
        reinterpret_cast<const int8*>(shuffled_input_workspace_data);
    for (int c = 0; c < output_depth; c += 4) {
      const int8* shuffled_input_ptr = shuffled_input_data;
      // Internal accumulation: accumulators start at zero; the bias is
      // added after the accumulation loop.
      int32 accum[4][4];
      for (int i = 0; i < 4; i++) {
        for (int b = 0; b < 4; b++) {
          accum[i][b] = 0;
        }
      }
      // Accumulation loop.
      for (int d = 0; d < accum_depth; d += 16) {
        for (int i = 0; i < 4; i++) {
          for (int b = 0; b < 4; b++) {
            for (int j = 0; j < 16; j++) {
              int8 input_val = shuffled_input_ptr[16 * b + j];
              int8 weights_val = shuffled_weights_ptr[16 * i + j];
              accum[i][b] += weights_val * input_val;
            }
          }
        }
        shuffled_input_ptr += 64;
        shuffled_weights_ptr += 64;
      }
      for (int i = 0; i < 4; i++) {
        for (int b = 0; b < 4; b++) {
          // Add bias value
          int32 acc = accum[i][b] + bias_data[c + i];
          // Down-scale the final int32 accumulator to the scale used by our
          // (16-bit, typically 3 integer bits) fixed-point format. The
          // quantized multiplier and shift here have been pre-computed offline
          // (e.g. by toco).
          acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
                                              output_shift);
          // Saturate, cast to int16, and store to output array.
          acc = std::max(acc, output_activation_min);
          acc = std::min(acc, output_activation_max);
          output_ptr[b * output_depth + c + i] = acc;
        }
      }
    }
  } else {
    TFLITE_DCHECK(false);
    return;
  }
}

}  // namespace reference_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_
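The listing above is the portable (non-optimized) TensorFlow Lite reference implementation of the fully connected layer, in float, uint8, and shuffled-weights int16 variants. Below is a minimal sketch of how the float overload might be called. It is not part of the header: the shapes, the data values, the function name ExampleFloatFullyConnected, and the include path are illustrative assumptions; RuntimeShape and FullyConnectedParams are the TFLite internal types pulled in via types.h.

// Hedged usage sketch, not part of fully_connected.h.
// The include path and all data below are assumptions for illustration only.
#include <limits>

#include "tensorflow/lite/kernels/internal/reference/fully_connected.h"

void ExampleFloatFullyConnected() {
  // One batch of three input features, two output units. Weights are laid
  // out row-major as [output_depth, accum_depth], matching the kernel's
  // weights_data[out_c * accum_depth + d] indexing.
  const float input[3] = {1.0f, 2.0f, 3.0f};
  const float weights[2 * 3] = {0.1f, 0.2f, 0.3f,
                                0.4f, 0.5f, 0.6f};
  const float bias[2] = {0.5f, -0.5f};
  float output[2] = {};

  tflite::FullyConnectedParams params;
  // No fused activation: leave the clamp range wide open.
  params.float_activation_min = std::numeric_limits<float>::lowest();
  params.float_activation_max = std::numeric_limits<float>::max();

  tflite::reference_ops::FullyConnected(
      params, tflite::RuntimeShape({1, 3}), input,
      tflite::RuntimeShape({2, 3}), weights,
      tflite::RuntimeShape({2}), bias,
      tflite::RuntimeShape({1, 2}), output);
  // Expected: output[0] = 1*0.1 + 2*0.2 + 3*0.3 + 0.5 = 1.9
  //           output[1] = 1*0.4 + 2*0.5 + 3*0.6 - 0.5 = 2.7
}

The quantized overloads follow the same calling pattern, but additionally require the offsets, the pre-computed output_multiplier/output_shift, and the quantized activation range to be filled in on FullyConnectedParams, as read at the top of each function above.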
