Daniel Konegen / MNIST_example

Dependencies: mbed-os

add.h

/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_ADD_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_ADD_H_

#include <limits>

#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {
namespace reference_integer_ops {

// Element-wise add that can often be used for the inner loop of a broadcast
// add as well as the non-broadcast add. A usage sketch follows the function.
inline void AddElementwise(int size, const ArithmeticParams& params,
                           const int8_t* input1_data, const int8_t* input2_data,
                           int8_t* output_data) {
  const int32_t int8_max_value = std::numeric_limits<int8_t>::max();
  TFLITE_DCHECK_GE(params.input1_offset, -1 * int8_max_value);
  TFLITE_DCHECK_GE(params.input2_offset, -1 * int8_max_value);
  TFLITE_DCHECK_LE(params.input1_offset, int8_max_value);
  TFLITE_DCHECK_LE(params.input2_offset, int8_max_value);

  for (int i = 0; i < size; ++i) {
    const int32_t input1_val = params.input1_offset + input1_data[i];
    const int32_t input2_val = params.input2_offset + input2_data[i];
    const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
    const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
    const int32_t scaled_input1_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input1_val, params.input1_multiplier, params.input1_shift);
    const int32_t scaled_input2_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input2_val, params.input2_multiplier, params.input2_shift);
    const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
    const int32_t raw_output =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            raw_sum, params.output_multiplier, params.output_shift) +
        params.output_offset;
    const int32_t clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, raw_output));
    output_data[i] = static_cast<int8_t>(clamped_output);
  }
}
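
// Usage sketch (illustrative, not part of the original header): shows how the
// ArithmeticParams fields read by AddElementwise might be filled. In a real
// model these values come from the tensors' quantization parameters when the
// op is prepared; the numbers and the function name here are placeholders.
inline void ExampleAddElementwiseUsage() {
  const int8_t input1[4] = {-20, -1, 0, 50};
  const int8_t input2[4] = {10, 1, 0, -50};
  int8_t output[4] = {};

  ArithmeticParams params;
  // Zero-point corrections added to the raw int8 values; must stay within
  // [-127, 127], as checked by the DCHECKs above.
  params.input1_offset = 5;
  params.input2_offset = -3;
  params.output_offset = 2;
  // Extra headroom applied to both inputs before rescaling.
  params.left_shift = 20;
  // Fixed-point multipliers (Q31) and shifts that bring each input and the
  // final sum onto the output scale; 1073741824 represents 0.5 (placeholder).
  params.input1_multiplier = 1073741824;
  params.input1_shift = 0;
  params.input2_multiplier = 1073741824;
  params.input2_shift = 0;
  params.output_multiplier = 1073741824;
  params.output_shift = -20;
  // Clamp to the full int8 range (no fused activation).
  params.quantized_activation_min = std::numeric_limits<int8_t>::min();
  params.quantized_activation_max = std::numeric_limits<int8_t>::max();

  AddElementwise(4, params, input1, input2, output);
}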

inline void Add(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const int8_t* input1_data,
                const RuntimeShape& input2_shape, const int8_t* input2_data,
                const RuntimeShape& output_shape, int8_t* output_data) {
  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);
  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);

  const int32_t int8_max_value = std::numeric_limits<int8_t>::max();
  TFLITE_DCHECK_GE(params.input1_offset, -1 * int8_max_value);
  TFLITE_DCHECK_GE(params.input2_offset, -1 * int8_max_value);
  TFLITE_DCHECK_LE(params.input1_offset, int8_max_value);
  TFLITE_DCHECK_LE(params.input2_offset, int8_max_value);
  AddElementwise(flat_size, params, input1_data, input2_data, output_data);
}

inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
                               const RuntimeShape& input1_shape,
                               const int8_t* input1_data,
                               const RuntimeShape& input2_shape,
                               const int8_t* input2_data,
                               const RuntimeShape& output_shape,
                               int8_t* output_data) {
  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  const RuntimeShape extended_output_shape =
      RuntimeShape::ExtendedShape(4, output_shape);

  // In TensorFlow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest
  // stride, typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has the smallest stride.
  //
  // We name our variables by their TensorFlow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for
  // the best cache behavior.
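  // For example, with extents (batches, height, width, depth) = (N, H, W, C),
  // Offset(extended_output_shape, b, y, x, c) below evaluates to
  // ((b * H + y) * W + x) * C + c, so consecutive values of c address
  // consecutive elements and the innermost loop walks the output contiguously.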
  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
          const int32_t input1_val =
              params.input1_offset +
              input1_data[SubscriptToIndex(desc1, b, y, x, c)];
          const int32_t input2_val =
              params.input2_offset +
              input2_data[SubscriptToIndex(desc2, b, y, x, c)];
          const int32_t shifted_input1_val =
              input1_val * (1 << params.left_shift);
          const int32_t shifted_input2_val =
              input2_val * (1 << params.left_shift);
          const int32_t scaled_input1_val =
              MultiplyByQuantizedMultiplierSmallerThanOneExp(
                  shifted_input1_val, params.input1_multiplier,
                  params.input1_shift);
          const int32_t scaled_input2_val =
              MultiplyByQuantizedMultiplierSmallerThanOneExp(
                  shifted_input2_val, params.input2_multiplier,
                  params.input2_shift);
          const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
          const int32_t raw_output =
              MultiplyByQuantizedMultiplierSmallerThanOneExp(
                  raw_sum, params.output_multiplier, params.output_shift) +
              params.output_offset;
          const int32_t clamped_output =
              std::min(params.quantized_activation_max,
                       std::max(params.quantized_activation_min, raw_output));
          output_data[Offset(extended_output_shape, b, y, x, c)] =
              static_cast<int8_t>(clamped_output);
        }
      }
    }
  }
}
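
// Usage sketch (illustrative, not part of the original header): adds a
// 1x2x2x1 tensor to a 1x1x1x1 bias with BroadcastAdd4DSlow, then adds two
// same-shaped tensors with Add. The quantization parameters and the function
// name are placeholders, as in the sketch above.
inline void ExampleAddUsage() {
  ArithmeticParams params;
  params.input1_offset = 0;
  params.input2_offset = 0;
  params.output_offset = 0;
  params.left_shift = 20;
  params.input1_multiplier = 1073741824;  // 0.5 in Q31 (placeholder)
  params.input1_shift = 0;
  params.input2_multiplier = 1073741824;
  params.input2_shift = 0;
  params.output_multiplier = 1073741824;
  params.output_shift = -20;
  params.quantized_activation_min = std::numeric_limits<int8_t>::min();
  params.quantized_activation_max = std::numeric_limits<int8_t>::max();

  const RuntimeShape tensor_shape({1, 2, 2, 1});
  const RuntimeShape bias_shape({1, 1, 1, 1});
  const int8_t tensor_data[4] = {1, 2, 3, 4};
  const int8_t bias_data[1] = {10};
  int8_t output_data[4] = {};

  // Shapes differ, so the broadcasting variant is required.
  BroadcastAdd4DSlow(params, tensor_shape, tensor_data, bias_shape, bias_data,
                     tensor_shape, output_data);

  // With identical shapes, the flat element-wise variant can be used instead.
  Add(params, tensor_shape, tensor_data, tensor_shape, tensor_data,
      tensor_shape, output_data);
}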

}  // namespace reference_integer_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_ADD_H_