add.h
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_H_

#include "fixedpoint/fixedpoint.h"
#include "tensorflow/lite/kernels/internal/common.h"

namespace tflite {

namespace reference_ops {

template <typename T>
inline void Add(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const T* input1_data,
                const RuntimeShape& input2_shape, const T* input2_data,
                const RuntimeShape& output_shape, T* output_data) {
  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);
  for (int i = 0; i < flat_size; ++i) {
    output_data[i] = ActivationFunctionWithMinMax(
        input1_data[i] + input2_data[i], params.quantized_activation_min,
        params.quantized_activation_max);
  }
}

inline void Add(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const float* input1_data,
                const RuntimeShape& input2_shape, const float* input2_data,
                const RuntimeShape& output_shape, float* output_data) {
  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);
  for (int i = 0; i < flat_size; i++) {
    auto x = input1_data[i] + input2_data[i];
    output_data[i] = ActivationFunctionWithMinMax(
        x, params.float_activation_min, params.float_activation_max);
  }
}

// Element-wise add that can often be used for inner loop of broadcast add as
// well as the non-broadcast add.
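// Overview of the quantized math in AddElementwise and AddScalarBroadcast
// below (descriptive summary of the code, not part of the upstream source):
// a quantized tensor encodes real values as real = scale * (q - zero_point),
// and the inputN_offset fields hold the negated zero points in the TFLite
// convention. Each raw uint8 value is therefore re-centered by its offset,
// scaled up by (1 << params.left_shift) to preserve precision through the
// fixed-point rescaling, and multiplied by a per-input quantized multiplier
// so both operands share one common scale. The 32-bit sum is then rescaled
// to the output scale, shifted by params.output_offset, and clamped to the
// quantized activation range before being narrowed back to uint8.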
inline void AddElementwise(int size, const ArithmeticParams& params,
                           const uint8* input1_data, const uint8* input2_data,
                           uint8* output_data) {
  TFLITE_DCHECK_GT(params.input1_offset, -256);
  TFLITE_DCHECK_GT(params.input2_offset, -256);
  TFLITE_DCHECK_LT(params.input1_offset, 256);
  TFLITE_DCHECK_LT(params.input2_offset, 256);

  for (int i = 0; i < size; ++i) {
    const int32 input1_val = params.input1_offset + input1_data[i];
    const int32 input2_val = params.input2_offset + input2_data[i];
    const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
    const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
    const int32 scaled_input1_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input1_val, params.input1_multiplier, params.input1_shift);
    const int32 scaled_input2_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input2_val, params.input2_multiplier, params.input2_shift);
    const int32 raw_sum = scaled_input1_val + scaled_input2_val;
    const int32 raw_output =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            raw_sum, params.output_multiplier, params.output_shift) +
        params.output_offset;
    const int32 clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, raw_output));
    output_data[i] = static_cast<uint8>(clamped_output);
  }
}

// Scalar-broadcast add that can be used for inner loop of more general
// broadcast add, so that, for example, scalar-broadcast with batch will still
// be fast.
inline void AddScalarBroadcast(int size, const ArithmeticParams& params,
                               uint8 input1_data, const uint8* input2_data,
                               uint8* output_data) {
  TFLITE_DCHECK_GT(params.input1_offset, -256);
  TFLITE_DCHECK_GT(params.input2_offset, -256);
  TFLITE_DCHECK_LT(params.input1_offset, 256);
  TFLITE_DCHECK_LT(params.input2_offset, 256);

  const int32 input1_val = params.input1_offset + input1_data;
  const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
  const int32 scaled_input1_val =
      MultiplyByQuantizedMultiplierSmallerThanOneExp(
          shifted_input1_val, params.input1_multiplier, params.input1_shift);
  for (int i = 0; i < size; ++i) {
    const int32 input2_val = params.input2_offset + input2_data[i];
    const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
    const int32 scaled_input2_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input2_val, params.input2_multiplier, params.input2_shift);
    const int32 raw_sum = scaled_input1_val + scaled_input2_val;
    const int32 raw_output =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            raw_sum, params.output_multiplier, params.output_shift) +
        params.output_offset;
    const int32 clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, raw_output));
    output_data[i] = static_cast<uint8>(clamped_output);
  }
}

inline void Add(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const uint8* input1_data,
                const RuntimeShape& input2_shape, const uint8* input2_data,
                const RuntimeShape& output_shape, uint8* output_data) {
  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);
  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);

  TFLITE_DCHECK_GT(params.input1_offset, -256);
  TFLITE_DCHECK_GT(params.input2_offset, -256);
  TFLITE_DCHECK_LT(params.input1_offset, 256);
  TFLITE_DCHECK_LT(params.input2_offset, 256);
  AddElementwise(flat_size, params, input1_data, input2_data, output_data);
}

inline void Add(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const int16* input1_data,
                const RuntimeShape& input2_shape, const int16* input2_data,
                const RuntimeShape& output_shape, int16* output_data) {
  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);

  const int input1_shift = params.input1_shift;
  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);
  const int16 output_activation_min = params.quantized_activation_min;
  const int16 output_activation_max = params.quantized_activation_max;

  TFLITE_DCHECK(input1_shift == 0 || params.input2_shift == 0);
  TFLITE_DCHECK_LE(input1_shift, 0);
  TFLITE_DCHECK_LE(params.input2_shift, 0);
  const int16* not_shift_input = input1_shift == 0 ? input1_data : input2_data;
  const int16* shift_input = input1_shift == 0 ? input2_data : input1_data;
  const int input_right_shift =
      input1_shift == 0 ? -params.input2_shift : -input1_shift;

  for (int i = 0; i < flat_size; i++) {
    // F0 uses 0 integer bits, range [-1, 1].
    using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;

    F0 input_ready_scaled = F0::FromRaw(not_shift_input[i]);
    F0 scaled_input = F0::FromRaw(
        gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift));
    F0 result = gemmlowp::SaturatingAdd(scaled_input, input_ready_scaled);
    const int16 raw_output = result.raw();
    const int16 clamped_output = std::min(
        output_activation_max, std::max(output_activation_min, raw_output));
    output_data[i] = clamped_output;
  }
}

// TODO(jiawen): We can implement BroadcastAdd on buffers of arbitrary
// dimensionality if the runtime code does a single loop over one dimension
// that handles broadcasting as the base case. The code generator would then
// generate max(D1, D2) nested for loops.
// TODO(benoitjacob): BroadcastAdd is intentionally duplicated from
// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
// reference_ops.h.
inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
                               const RuntimeShape& input1_shape,
                               const float* input1_data,
                               const RuntimeShape& input2_shape,
                               const float* input2_data,
                               const RuntimeShape& output_shape,
                               float* output_data) {
  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  const RuntimeShape extended_output_shape =
      RuntimeShape::ExtendedShape(4, output_shape);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
          output_data[Offset(extended_output_shape, b, y, x, c)] =
              ActivationFunctionWithMinMax(
                  input1_data[SubscriptToIndex(desc1, b, y, x, c)] +
                      input2_data[SubscriptToIndex(desc2, b, y, x, c)],
                  params.float_activation_min, params.float_activation_max);
        }
      }
    }
  }
}

inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
                               const RuntimeShape& input1_shape,
                               const int32* input1_data,
                               const RuntimeShape& input2_shape,
                               const int32* input2_data,
                               const RuntimeShape& output_shape,
                               int32* output_data) {
  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  const RuntimeShape extended_output_shape =
      RuntimeShape::ExtendedShape(4, output_shape);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
          output_data[Offset(extended_output_shape, b, y, x, c)] =
              ActivationFunctionWithMinMax(
                  input1_data[SubscriptToIndex(desc1, b, y, x, c)] +
                      input2_data[SubscriptToIndex(desc2, b, y, x, c)],
                  params.quantized_activation_min,
                  params.quantized_activation_max);
        }
      }
    }
  }
}

inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
                               const RuntimeShape& input1_shape,
                               const uint8* input1_data,
                               const RuntimeShape& input2_shape,
                               const uint8* input2_data,
                               const RuntimeShape& output_shape,
                               uint8* output_data) {
  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  const RuntimeShape extended_output_shape =
      RuntimeShape::ExtendedShape(4, output_shape);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
          const int32 input1_val =
              params.input1_offset +
              input1_data[SubscriptToIndex(desc1, b, y, x, c)];
          const int32 input2_val =
              params.input2_offset +
              input2_data[SubscriptToIndex(desc2, b, y, x, c)];
          const int32 shifted_input1_val =
              input1_val * (1 << params.left_shift);
          const int32 shifted_input2_val =
              input2_val * (1 << params.left_shift);
          const int32 scaled_input1_val =
              MultiplyByQuantizedMultiplierSmallerThanOneExp(
                  shifted_input1_val, params.input1_multiplier,
                  params.input1_shift);
          const int32 scaled_input2_val =
              MultiplyByQuantizedMultiplierSmallerThanOneExp(
                  shifted_input2_val, params.input2_multiplier,
                  params.input2_shift);
          const int32 raw_sum = scaled_input1_val + scaled_input2_val;
          const int32 raw_output =
              MultiplyByQuantizedMultiplierSmallerThanOneExp(
                  raw_sum, params.output_multiplier, params.output_shift) +
              params.output_offset;
          const int32 clamped_output =
              std::min(params.quantized_activation_max,
                       std::max(params.quantized_activation_min, raw_output));
          output_data[Offset(extended_output_shape, b, y, x, c)] =
              static_cast<uint8>(clamped_output);
        }
      }
    }
  }
}

inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
                                 const RuntimeShape& unswitched_input1_shape,
                                 const uint8* unswitched_input1_data,
                                 const RuntimeShape& unswitched_input2_shape,
                                 const uint8* unswitched_input2_data,
                                 const RuntimeShape& output_shape,
                                 uint8* output_data) {
  ArithmeticParams switched_params = unswitched_params;
  switched_params.input1_offset = unswitched_params.input2_offset;
  switched_params.input1_multiplier = unswitched_params.input2_multiplier;
  switched_params.input1_shift = unswitched_params.input2_shift;
  switched_params.input2_offset = unswitched_params.input1_offset;
  switched_params.input2_multiplier = unswitched_params.input1_multiplier;
  switched_params.input2_shift = unswitched_params.input1_shift;

  const bool use_unswitched =
      unswitched_params.broadcast_category ==
      tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;

  const ArithmeticParams& params =
      use_unswitched ? unswitched_params : switched_params;
  const uint8* input1_data =
      use_unswitched ? unswitched_input1_data : unswitched_input2_data;
  const uint8* input2_data =
      use_unswitched ? unswitched_input2_data : unswitched_input1_data;

  // Fivefold nested loops. The second input resets its position for each
  // iteration of the second loop. The first input resets its position at the
  // beginning of the fourth loop. The innermost loop is an elementwise add of
  // sections of the arrays.
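  // Illustrative example with made-up extents: for a broadcast_shape
  // {y0, y1, y2, y3, y4} = {2, 3, 4, 5, 8} (see the comment below for what
  // each yN means), input1 supplies y0*y1*y2*y4 = 192 values, input2 supplies
  // y0*y2*y3*y4 = 320 values, and the output receives y0*y1*y2*y3*y4 = 960
  // values; each AddElementwise call then handles one contiguous run of
  // y4 = 8 elements.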
  uint8* output_data_ptr = output_data;
  const uint8* input1_data_ptr = input1_data;
  const uint8* input2_data_reset = input2_data;
  // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared
  // between input shapes. y3 for input 1 is always broadcast, and so the
  // dimension there is 1, whereas optionally y1 might be broadcast for input 2.
  // Put another way,
  // input1.shape.FlatSize = y0 * y1 * y2 * y4,
  // input2.shape.FlatSize = y0 * y2 * y3 * y4.
  int y0 = params.broadcast_shape[0];
  int y1 = params.broadcast_shape[1];
  int y2 = params.broadcast_shape[2];
  int y3 = params.broadcast_shape[3];
  int y4 = params.broadcast_shape[4];
  if (y4 > 1) {
    // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner
    // dimension.
    for (int i0 = 0; i0 < y0; ++i0) {
      const uint8* input2_data_ptr;
      for (int i1 = 0; i1 < y1; ++i1) {
        input2_data_ptr = input2_data_reset;
        for (int i2 = 0; i2 < y2; ++i2) {
          for (int i3 = 0; i3 < y3; ++i3) {
            AddElementwise(y4, params, input1_data_ptr, input2_data_ptr,
                           output_data_ptr);
            input2_data_ptr += y4;
            output_data_ptr += y4;
          }
          // We have broadcast y4 of input1 data y3 times, and now move on.
          input1_data_ptr += y4;
        }
      }
      // We have broadcast y2*y3*y4 of input2 data y1 times, and now move on.
      input2_data_reset = input2_data_ptr;
    }
  } else {
    // Special case of y4 == 1, in which the innermost loop is a single element
    // and can be combined with the next (y3) as an inner broadcast.
    //
    // Note that this handles the case of pure scalar broadcast when
    // y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar
    // broadcast with batch (as y2 > 1).
    //
    // NOTE The process is the same as the above general case except simplified
    // for y4 == 1 and the loop over y3 is contained within the
    // AddScalarBroadcast function.
    for (int i0 = 0; i0 < y0; ++i0) {
      const uint8* input2_data_ptr;
      for (int i1 = 0; i1 < y1; ++i1) {
        input2_data_ptr = input2_data_reset;
        for (int i2 = 0; i2 < y2; ++i2) {
          AddScalarBroadcast(y3, params, *input1_data_ptr, input2_data_ptr,
                             output_data_ptr);
          input2_data_ptr += y3;
          output_data_ptr += y3;
          input1_data_ptr += 1;
        }
      }
      input2_data_reset = input2_data_ptr;
    }
  }
}

}  // namespace reference_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_H_
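As a quick usage illustration, the sketch below drives the float overload of reference_ops::Add. It is a minimal example, not part of this header: the include path, tensor shape, data values, and ReLU6-style activation bounds are all assumptions made for the demonstration, and it assumes the TensorFlow Lite sources are on the include path at their upstream locations.

// Minimal sketch of calling the float overload of reference_ops::Add.
// Everything here (include path, shape, values, bounds) is illustrative.
#include <cstdio>

#include "tensorflow/lite/kernels/internal/reference/add.h"  // assumed path

int main() {
  // One flat 4-element shape shared by both inputs and the output, so the
  // non-broadcast elementwise overload applies.
  const tflite::RuntimeShape shape({1, 1, 1, 4});
  const float input1[4] = {0.5f, -1.0f, 2.0f, 3.5f};
  const float input2[4] = {1.0f, 0.25f, -0.5f, 10.0f};
  float output[4];

  tflite::ArithmeticParams params = {};  // zero the fields we do not set
  params.float_activation_min = 0.0f;    // ReLU6-style clamp to [0, 6]
  params.float_activation_max = 6.0f;

  tflite::reference_ops::Add(params, shape, input1, shape, input2, shape,
                             output);

  for (int i = 0; i < 4; ++i) {
    std::printf("%f\n", output[i]);  // 1.5, 0.0, 1.5, 6.0 after clamping
  }
  return 0;
}

Note that the float path only reads float_activation_min and float_activation_max from ArithmeticParams; the offset, multiplier, and shift fields matter only for the quantized overloads above.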