Daniel Konegen / MNIST_example

Dependencies:   mbed-os

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers add.h Source File

add.h

00001 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
00002 
00003 Licensed under the Apache License, Version 2.0 (the "License");
00004 you may not use this file except in compliance with the License.
00005 You may obtain a copy of the License at
00006 
00007     http://www.apache.org/licenses/LICENSE-2.0
00008 
00009 Unless required by applicable law or agreed to in writing, software
00010 distributed under the License is distributed on an "AS IS" BASIS,
00011 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00012 See the License for the specific language governing permissions and
00013 limitations under the License.
00014 ==============================================================================*/
00015 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_H_
00016 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_H_
00017 
00018 #include "fixedpoint/fixedpoint.h"
00019 #include "tensorflow/lite/kernels/internal/common.h"
00020 
00021 namespace tflite {
00022 
00023 namespace reference_ops {
00024 
00025 template <typename T>
00026 inline void Add(const ArithmeticParams& params,
00027                 const RuntimeShape& input1_shape, const T* input1_data,
00028                 const RuntimeShape& input2_shape, const T* input2_data,
00029                 const RuntimeShape& output_shape, T* output_data) {
00030   const int flat_size =
00031       MatchingElementsSize(input1_shape, input2_shape, output_shape);
00032   for (int i = 0; i < flat_size; ++i) {
00033     output_data[i] = ActivationFunctionWithMinMax(
00034         input1_data[i] + input2_data[i], params.quantized_activation_min,
00035         params.quantized_activation_max);
00036   }
00037 }
00038 
00039 inline void Add(const ArithmeticParams& params,
00040                 const RuntimeShape& input1_shape, const float* input1_data,
00041                 const RuntimeShape& input2_shape, const float* input2_data,
00042                 const RuntimeShape& output_shape, float* output_data) {
00043   const int flat_size =
00044       MatchingElementsSize(input1_shape, input2_shape, output_shape);
00045   for (int i = 0; i < flat_size; i++) {
00046     auto x = input1_data[i] + input2_data[i];
00047     output_data[i] = ActivationFunctionWithMinMax(
00048         x, params.float_activation_min, params.float_activation_max);
00049   }
00050 }
00051 
// Element-wise add that can often be used for inner loop of broadcast add as
// well as the non-broadcast add.
//
// Quantized (uint8) add, per element:
//   1. Add each input's offset (zero-point adjustment) to widen to int32.
//   2. Left-shift both values by params.left_shift for extra headroom so the
//      fixed-point rescales below keep low-order precision.
//   3. Rescale each input into a common scale via its quantized multiplier
//      and shift.
//   4. Sum, rescale to the output scale, and add the output offset.
//   5. Clamp to the fused activation range and narrow back to uint8.
inline void AddElementwise(int size, const ArithmeticParams& params,
                           const uint8* input1_data, const uint8* input2_data,
                           uint8* output_data) {
  // Offsets applied to 8-bit data must lie strictly inside (-256, 256) so the
  // widened values fit comfortably in int32 after the left shift.
  TFLITE_DCHECK_GT(params.input1_offset, -256);
  TFLITE_DCHECK_GT(params.input2_offset, -256);
  TFLITE_DCHECK_LT(params.input1_offset, 256);
  TFLITE_DCHECK_LT(params.input2_offset, 256);

  for (int i = 0; i < size; ++i) {
    const int32 input1_val = params.input1_offset + input1_data[i];
    const int32 input2_val = params.input2_offset + input2_data[i];
    // Pre-shift so the fixed-point multiplies don't round away small values.
    const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
    const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
    const int32 scaled_input1_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input1_val, params.input1_multiplier, params.input1_shift);
    const int32 scaled_input2_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input2_val, params.input2_multiplier, params.input2_shift);
    const int32 raw_sum = scaled_input1_val + scaled_input2_val;
    // Rescale the sum into the output's quantized domain, then re-apply the
    // output offset.
    const int32 raw_output =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            raw_sum, params.output_multiplier, params.output_shift) +
        params.output_offset;
    // Fused activation clamp before narrowing to uint8.
    const int32 clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, raw_output));
    output_data[i] = static_cast<uint8>(clamped_output);
  }
}
00084 
// Scalar-broadcast add that can be used for inner loop of more general
// broadcast add, so that, for example, scalar-broadcast with batch will still
// be fast.
//
// Same quantized pipeline as AddElementwise, except input1 is a single scalar
// broadcast against `size` elements of input2; its offset/shift/rescale is
// therefore hoisted out of the loop and computed once.
inline void AddScalarBroadcast(int size, const ArithmeticParams& params,
                               uint8 input1_data, const uint8* input2_data,
                               uint8* output_data) {
  // Offsets applied to 8-bit data must lie strictly inside (-256, 256).
  TFLITE_DCHECK_GT(params.input1_offset, -256);
  TFLITE_DCHECK_GT(params.input2_offset, -256);
  TFLITE_DCHECK_LT(params.input1_offset, 256);
  TFLITE_DCHECK_LT(params.input2_offset, 256);

  // Rescale the broadcast scalar once, outside the loop.
  const int32 input1_val = params.input1_offset + input1_data;
  const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
  const int32 scaled_input1_val =
      MultiplyByQuantizedMultiplierSmallerThanOneExp(
          shifted_input1_val, params.input1_multiplier, params.input1_shift);
  for (int i = 0; i < size; ++i) {
    // Per-element path for input2 mirrors AddElementwise.
    const int32 input2_val = params.input2_offset + input2_data[i];
    const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
    const int32 scaled_input2_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input2_val, params.input2_multiplier, params.input2_shift);
    const int32 raw_sum = scaled_input1_val + scaled_input2_val;
    // Rescale into the output domain and re-apply the output offset.
    const int32 raw_output =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            raw_sum, params.output_multiplier, params.output_shift) +
        params.output_offset;
    // Fused activation clamp before narrowing to uint8.
    const int32 clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, raw_output));
    output_data[i] = static_cast<uint8>(clamped_output);
  }
}
00118 
00119 inline void Add(const ArithmeticParams& params,
00120                 const RuntimeShape& input1_shape, const uint8* input1_data,
00121                 const RuntimeShape& input2_shape, const uint8* input2_data,
00122                 const RuntimeShape& output_shape, uint8* output_data) {
00123   TFLITE_DCHECK_LE(params.quantized_activation_min,
00124                    params.quantized_activation_max);
00125   const int flat_size =
00126       MatchingElementsSize(input1_shape, input2_shape, output_shape);
00127 
00128   TFLITE_DCHECK_GT(params.input1_offset, -256);
00129   TFLITE_DCHECK_GT(params.input2_offset, -256);
00130   TFLITE_DCHECK_LT(params.input1_offset, 256);
00131   TFLITE_DCHECK_LT(params.input2_offset, 256);
00132   AddElementwise(flat_size, params, input1_data, input2_data, output_data);
00133 }
00134 
// int16 Add using gemmlowp Q0.15 fixed-point arithmetic.
//
// Exactly one of the two inputs may carry a (non-positive) shift; that input
// is right-shifted with rounding into the common scale, while the other is
// used as-is. The two are then combined with a saturating fixed-point add
// and clamped to the fused activation range.
inline void Add(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const int16* input1_data,
                const RuntimeShape& input2_shape, const int16* input2_data,
                const RuntimeShape& output_shape, int16* output_data) {
  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);

  const int input1_shift = params.input1_shift;
  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);
  const int16 output_activation_min = params.quantized_activation_min;
  const int16 output_activation_max = params.quantized_activation_max;

  // At most one input needs rescaling, and shifts are expressed as
  // non-positive exponents (i.e. right shifts).
  TFLITE_DCHECK(input1_shift == 0 || params.input2_shift == 0);
  TFLITE_DCHECK_LE(input1_shift, 0);
  TFLITE_DCHECK_LE(params.input2_shift, 0);
  // Select which operand is already in the target scale and which one must be
  // right-shifted by |shift| before the add.
  const int16* not_shift_input = input1_shift == 0 ? input1_data : input2_data;
  const int16* shift_input = input1_shift == 0 ? input2_data : input1_data;
  const int input_right_shift =
      input1_shift == 0 ? -params.input2_shift : -input1_shift;

  for (int i = 0; i < flat_size; i++) {
    // F0 uses 0 integer bits, range [-1, 1].
    using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;

    F0 input_ready_scaled = F0::FromRaw(not_shift_input[i]);
    // Rounding right shift brings the other operand into the same scale.
    F0 scaled_input = F0::FromRaw(
        gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift));
    // Saturating add avoids int16 wraparound on overflow.
    F0 result = gemmlowp::SaturatingAdd(scaled_input, input_ready_scaled);
    const int16 raw_output = result.raw();
    const int16 clamped_output = std::min(
        output_activation_max, std::max(output_activation_min, raw_output));
    output_data[i] = clamped_output;
  }
}
00170 
00171 // TODO(jiawen): We can implement BroadcastAdd on buffers of arbitrary
00172 // dimensionality if the runtime code does a single loop over one dimension
00173 // that handles broadcasting as the base case. The code generator would then
00174 // generate max(D1, D2) nested for loops.
00175 // TODO(benoitjacob): BroadcastAdd is intentionally duplicated from
00176 // reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
00177 // is no longer referenced in this file, move NdArrayDesc<T> from types.h to
00178 // reference_ops.h.
00179 inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
00180                                const RuntimeShape& input1_shape,
00181                                const float* input1_data,
00182                                const RuntimeShape& input2_shape,
00183                                const float* input2_data,
00184                                const RuntimeShape& output_shape,
00185                                float* output_data) {
00186   NdArrayDesc<4> desc1;
00187   NdArrayDesc<4> desc2;
00188   NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
00189                                       &desc2);
00190   const RuntimeShape extended_output_shape =
00191       RuntimeShape::ExtendedShape(4, output_shape);
00192 
00193   // In Tensorflow, the dimensions are canonically named (batch_number, row,
00194   // col, channel), with extents (batches, height, width, depth), with the
00195   // trailing dimension changing most rapidly (channels has the smallest stride,
00196   // typically 1 element).
00197   //
00198   // In generated C code, we store arrays with the dimensions reversed. The
00199   // first dimension has smallest stride.
00200   //
00201   // We name our variables by their Tensorflow convention, but generate C code
00202   // nesting loops such that the innermost loop has the smallest stride for the
00203   // best cache behavior.
00204   for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
00205     for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
00206       for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
00207         for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
00208           output_data[Offset(extended_output_shape, b, y, x, c)] =
00209               ActivationFunctionWithMinMax(
00210                   input1_data[SubscriptToIndex(desc1, b, y, x, c)] +
00211                       input2_data[SubscriptToIndex(desc2, b, y, x, c)],
00212                   params.float_activation_min, params.float_activation_max);
00213         }
00214       }
00215     }
00216   }
00217 }
00218 
00219 inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
00220                                const RuntimeShape& input1_shape,
00221                                const int32* input1_data,
00222                                const RuntimeShape& input2_shape,
00223                                const int32* input2_data,
00224                                const RuntimeShape& output_shape,
00225                                int32* output_data) {
00226   NdArrayDesc<4> desc1;
00227   NdArrayDesc<4> desc2;
00228   NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
00229                                       &desc2);
00230   const RuntimeShape extended_output_shape =
00231       RuntimeShape::ExtendedShape(4, output_shape);
00232 
00233   // In Tensorflow, the dimensions are canonically named (batch_number, row,
00234   // col, channel), with extents (batches, height, width, depth), with the
00235   // trailing dimension changing most rapidly (channels has the smallest stride,
00236   // typically 1 element).
00237   //
00238   // In generated C code, we store arrays with the dimensions reversed. The
00239   // first dimension has smallest stride.
00240   //
00241   // We name our variables by their Tensorflow convention, but generate C code
00242   // nesting loops such that the innermost loop has the smallest stride for the
00243   // best cache behavior.
00244   for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
00245     for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
00246       for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
00247         for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
00248           output_data[Offset(extended_output_shape, b, y, x, c)] =
00249               ActivationFunctionWithMinMax(
00250                   input1_data[SubscriptToIndex(desc1, b, y, x, c)] +
00251                       input2_data[SubscriptToIndex(desc2, b, y, x, c)],
00252                   params.quantized_activation_min,
00253                   params.quantized_activation_max);
00254         }
00255       }
00256     }
00257   }
00258 }
00259 
// Reference 4-D broadcasting quantized (uint8) Add. Applies the same
// offset / left-shift / rescale / sum / clamp pipeline as AddElementwise,
// but indexes each input through NdArrayDesc strides so either input can be
// broadcast against the (extended-to-4D) output shape.
inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
                               const RuntimeShape& input1_shape,
                               const uint8* input1_data,
                               const RuntimeShape& input2_shape,
                               const uint8* input2_data,
                               const RuntimeShape& output_shape,
                               uint8* output_data) {
  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  const RuntimeShape extended_output_shape =
      RuntimeShape::ExtendedShape(4, output_shape);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
          // Widen to int32 with each input's offset applied.
          const int32 input1_val =
              params.input1_offset +
              input1_data[SubscriptToIndex(desc1, b, y, x, c)];
          const int32 input2_val =
              params.input2_offset +
              input2_data[SubscriptToIndex(desc2, b, y, x, c)];
          // Pre-shift for headroom, then rescale each input into the common
          // scale (see AddElementwise for the full pipeline description).
          const int32 shifted_input1_val =
              input1_val * (1 << params.left_shift);
          const int32 shifted_input2_val =
              input2_val * (1 << params.left_shift);
          const int32 scaled_input1_val =
              MultiplyByQuantizedMultiplierSmallerThanOneExp(
                  shifted_input1_val, params.input1_multiplier,
                  params.input1_shift);
          const int32 scaled_input2_val =
              MultiplyByQuantizedMultiplierSmallerThanOneExp(
                  shifted_input2_val, params.input2_multiplier,
                  params.input2_shift);
          const int32 raw_sum = scaled_input1_val + scaled_input2_val;
          // Rescale into the output domain and re-apply the output offset.
          const int32 raw_output =
              MultiplyByQuantizedMultiplierSmallerThanOneExp(
                  raw_sum, params.output_multiplier, params.output_shift) +
              params.output_offset;
          // Fused activation clamp before narrowing to uint8.
          const int32 clamped_output =
              std::min(params.quantized_activation_max,
                       std::max(params.quantized_activation_min, raw_output));
          output_data[Offset(extended_output_shape, b, y, x, c)] =
              static_cast<uint8>(clamped_output);
        }
      }
    }
  }
}
00322 
// Quantized (uint8) broadcast Add specialized for the "fivefold" broadcast
// pattern, where the two input shapes interleave broadcast and non-broadcast
// dimensions as y0..y4 (see params.broadcast_shape). Walks raw pointers
// through both inputs and reuses AddElementwise / AddScalarBroadcast for the
// inner spans.
inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
                                 const RuntimeShape& unswitched_input1_shape,
                                 const uint8* unswitched_input1_data,
                                 const RuntimeShape& unswitched_input2_shape,
                                 const uint8* unswitched_input2_data,
                                 const RuntimeShape& output_shape,
                                 uint8* output_data) {
  // Build a parameter set with the two inputs' quantization params swapped,
  // so the loops below can always treat "input1" as the fast-broadcasting
  // operand regardless of which input actually broadcasts.
  ArithmeticParams switched_params = unswitched_params;
  switched_params.input1_offset = unswitched_params.input2_offset;
  switched_params.input1_multiplier = unswitched_params.input2_multiplier;
  switched_params.input1_shift = unswitched_params.input2_shift;
  switched_params.input2_offset = unswitched_params.input1_offset;
  switched_params.input2_multiplier = unswitched_params.input1_multiplier;
  switched_params.input2_shift = unswitched_params.input1_shift;

  const bool use_unswitched =
      unswitched_params.broadcast_category ==
      tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;

  // Addition commutes, so swapping data pointers together with the params is
  // safe.
  const ArithmeticParams& params =
      use_unswitched ? unswitched_params : switched_params;
  const uint8* input1_data =
      use_unswitched ? unswitched_input1_data : unswitched_input2_data;
  const uint8* input2_data =
      use_unswitched ? unswitched_input2_data : unswitched_input1_data;

  // Fivefold nested loops. The second input resets its position for each
  // iteration of the second loop. The first input resets its position at the
  // beginning of the fourth loop. The innermost loop is an elementwise add of
  // sections of the arrays.
  uint8* output_data_ptr = output_data;
  const uint8* input1_data_ptr = input1_data;
  const uint8* input2_data_reset = input2_data;
  // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared
  // between input shapes. y3 for input 1 is always broadcast, and so the
  // dimension there is 1, whereas optionally y1 might be broadcast for input 2.
  // Put another way,
  // input1.shape.FlatSize = y0 * y1 * y2 * y4,
  // input2.shape.FlatSize = y0 * y2 * y3 * y4.
  int y0 = params.broadcast_shape[0];
  int y1 = params.broadcast_shape[1];
  int y2 = params.broadcast_shape[2];
  int y3 = params.broadcast_shape[3];
  int y4 = params.broadcast_shape[4];
  if (y4 > 1) {
    // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner
    // dimension.
    for (int i0 = 0; i0 < y0; ++i0) {
      const uint8* input2_data_ptr;
      for (int i1 = 0; i1 < y1; ++i1) {
        // input2 replays the same y2*y3*y4 span for every i1.
        input2_data_ptr = input2_data_reset;
        for (int i2 = 0; i2 < y2; ++i2) {
          for (int i3 = 0; i3 < y3; ++i3) {
            AddElementwise(y4, params, input1_data_ptr, input2_data_ptr,
                           output_data_ptr);
            input2_data_ptr += y4;
            output_data_ptr += y4;
          }
          // We have broadcast y4 of input1 data y3 times, and now move on.
          input1_data_ptr += y4;
        }
      }
      // We have broadcast y2*y3*y4 of input2 data y1 times, and now move on.
      input2_data_reset = input2_data_ptr;
    }
  } else {
    // Special case of y4 == 1, in which the innermost loop is a single element
    // and can be combined with the next (y3) as an inner broadcast.
    //
    // Note that this handles the case of pure scalar broadcast when
    // y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar
    // broadcast with batch (as y2 > 1).
    //
    // NOTE The process is the same as the above general case except simplified
    // for y4 == 1 and the loop over y3 is contained within the
    // AddScalarBroadcast function.
    for (int i0 = 0; i0 < y0; ++i0) {
      const uint8* input2_data_ptr;
      for (int i1 = 0; i1 < y1; ++i1) {
        input2_data_ptr = input2_data_reset;
        for (int i2 = 0; i2 < y2; ++i2) {
          // Broadcast the single current input1 element over y3 elements of
          // input2.
          AddScalarBroadcast(y3, params, *input1_data_ptr, input2_data_ptr,
                             output_data_ptr);
          input2_data_ptr += y3;
          output_data_ptr += y3;
          input1_data_ptr += 1;
        }
      }
      input2_data_reset = input2_data_ptr;
    }
  }
}
00415 
00416 }  // namespace reference_ops
00417 }  // namespace tflite
00418 
00419 #endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_H_