Daniel Konegen / MNIST_example

Dependencies:   mbed-os

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers quantization_util.h Source File

quantization_util.h

00001 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
00002 
00003 Licensed under the Apache License, Version 2.0 (the "License");
00004 you may not use this file except in compliance with the License.
00005 You may obtain a copy of the License at
00006 
00007     http://www.apache.org/licenses/LICENSE-2.0
00008 
00009 Unless required by applicable law or agreed to in writing, software
00010 distributed under the License is distributed on an "AS IS" BASIS,
00011 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00012 See the License for the specific language governing permissions and
00013 limitations under the License.
00014 ==============================================================================*/
00015 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_
00016 #define TENSORFLOW_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_
00017 
00018 #include <cmath>
00019 #include <cstdint>
00020 #include <limits>
00021 
00022 #include "tensorflow/lite/kernels/internal/compatibility.h"
00023 #include "tensorflow/lite/kernels/internal/round.h"
00024 #include "tensorflow/lite/kernels/internal/types.h"
00025 
00026 namespace tflite {
00027 
00028 // Given the min and max values of a float array, return
00029 // reasonable quantization parameters to use for this array.
00030 template <typename T>
00031 QuantizationParams ChooseQuantizationParams(double rmin, double rmax,
00032                                             bool narrow_range) {
00033   const T qmin = std::numeric_limits<T>::min() + (narrow_range ? 1 : 0);
00034   const T qmax = std::numeric_limits<T>::max();
00035   const double qmin_double = qmin;
00036   const double qmax_double = qmax;
00037   // 0 should always be a representable value. Let's assume that the initial
00038   // min,max range contains 0.
00039   TFLITE_CHECK_LE(rmin, 0.);
00040   TFLITE_CHECK_GE(rmax, 0.);
00041   if (rmin == rmax) {
00042     // Special case where the min,max range is a point. Should be {0}.
00043     TFLITE_CHECK_EQ(rmin, 0.);
00044     TFLITE_CHECK_EQ(rmax, 0.);
00045     QuantizationParams quantization_params;
00046     quantization_params.zero_point = 0;
00047     quantization_params.scale = 0.;
00048     return quantization_params;
00049   }
00050 
00051   // General case.
00052   //
00053   // First determine the scale.
00054   const double scale = (rmax - rmin) / (qmax_double - qmin_double);
00055 
00056   // Zero-point computation.
00057   // First the initial floating-point computation. The zero-point can be
00058   // determined from solving an affine equation for any known pair
00059   // (real value, corresponding quantized value).
00060   // We know two such pairs: (rmin, qmin) and (rmax, qmax).
00061   // The arithmetic error on the zero point computed from either pair
00062   // will be roughly machine_epsilon * (sum of absolute values of terms)
00063   // so we want to use the variant that adds the smaller terms.
00064   const double zero_point_from_min = qmin_double - rmin / scale;
00065   const double zero_point_from_max = qmax_double - rmax / scale;
00066   const double zero_point_from_min_error =
00067       std::abs(qmin_double) + std::abs(rmin / scale);
00068   const double zero_point_from_max_error =
00069       std::abs(qmax_double) + std::abs(rmax / scale);
00070 
00071   const double zero_point_double =
00072       zero_point_from_min_error < zero_point_from_max_error
00073           ? zero_point_from_min
00074           : zero_point_from_max;
00075 
00076   // Now we need to nudge the zero point to be an integer
00077   // (our zero points are integer, and this is motivated by the requirement
00078   // to be able to represent the real value "0" exactly as a quantized value,
00079   // which is required in multiple places, for example in Im2col with SAME
00080   // padding).
00081   T nudged_zero_point = 0;
00082   if (zero_point_double < qmin_double) {
00083     nudged_zero_point = qmin;
00084   } else if (zero_point_double > qmax_double) {
00085     nudged_zero_point = qmax;
00086   } else {
00087     nudged_zero_point = static_cast<T>(round(zero_point_double));
00088   }
00089   // The zero point should always be in the range of quantized value,
00090   // [qmin, qmax].
00091   TFLITE_CHECK_GE(nudged_zero_point, qmin);
00092   TFLITE_CHECK_LE(nudged_zero_point, qmax);
00093 
00094   // Finally, store the result nudged quantization params.
00095   QuantizationParams quantization_params;
00096   quantization_params.zero_point = nudged_zero_point;
00097   quantization_params.scale = scale;
00098   return quantization_params;
00099 }
00100 
00101 template <typename T>
00102 QuantizationParams ChooseQuantizationParams(double rmin, double rmax) {
00103   return ChooseQuantizationParams<T>(rmin, rmax, false);
00104 }
00105 
// Converts a floating-point number to an integer, saturating where a plain
// static_cast<IntOut>(x) would be undefined behavior.
//
// For every input x where static_cast<IntOut>(x) is legal according to the
// C++ standard, the result is identical to that cast (i.e. x with its
// fractional part truncated, whenever that is representable as IntOut).
//
// The remaining cases have the following well-defined results:
//
//  1. If x is NaN, the result is zero.
//
//  2. If the truncated form of x is above the representable range of IntOut,
//     the result is std::numeric_limits<IntOut>::max().
//
//  3. If the truncated form of x is below the representable range of IntOut,
//     the result is std::numeric_limits<IntOut>::min() (which is zero for
//     unsigned IntOut).
//
// Cases #2 and #3 cover infinities as well as finite numbers.
//
// The range of FloatIn must include the range of IntOut, otherwise
// the results are undefined.
// TODO(sfeuz): Replace by absl::SafeCast once available.
template <class IntOut, class FloatIn>
IntOut SafeCast(FloatIn x) {
  using IntLimits = std::numeric_limits<IntOut>;

  static_assert(!std::numeric_limits<FloatIn>::is_integer,
                "FloatIn is integer");
  static_assert(IntLimits::is_integer, "IntOut is not integer");
  static_assert(IntLimits::radix == 2, "IntOut is not base 2");

  // NaN compares false against everything, so handle it before the range
  // logic below.
  if (std::isnan(x)) {
    return 0;
  }

  // Negative values all clip to zero for unsigned results.
  if (!IntLimits::is_signed && x < 0) {
    return 0;
  }

  if (!std::isinf(x)) {
    // frexp yields x == f * 2^exp with |f| in [0.5, 1.0) (exp == 0 when x is
    // zero), hence |x| < 2^exp.
    int exp = 0;
    std::frexp(x, &exp);

    // With N = number of non-sign bits of IntOut, |x| < 2^N guarantees that
    // the truncated value is representable. The only representable integer
    // this misses is min() of a signed type (-2^N), which the saturating
    // fall-through below maps to min() anyway.
    if (exp <= IntLimits::digits) {
      return static_cast<IntOut>(x);
    }
  }

  // Infinities and finite values with magnitude >= 2^N saturate.
  return x < 0 ? IntLimits::min() : IntLimits::max();
}
00170 
00171 // Decompose a double multiplier into a Q0.31 int32 representation of its
00172 // significand, and shift representation of NEGATIVE its exponent ---
00173 // this is intended as a RIGHT-shift.
00174 //
00175 // Restricted to the case where the multiplier < 1 (and non-negative).
00176 void QuantizeMultiplierSmallerThanOneExp(double double_multiplier,
00177                                          int32_t* quantized_multiplier,
00178                                          int* left_shift);
00179 
00180 // Decompose a double multiplier into a Q0.31 int32 representation of its
00181 // significand, and shift representation of its exponent.
00182 //
00183 // Restricted to the case where the multiplier > 1.
00184 void QuantizeMultiplierGreaterThanOne(double double_multiplier,
00185                                       int32_t* quantized_multiplier,
00186                                       int* left_shift);
00187 
00188 // Decompose a double multiplier into a Q0.31 int32 representation of its
00189 // significand, and shift representation of its exponent.
00190 //
00191 // Handles an arbitrary positive multiplier. The 'shift' output-value is
00192 // basically the 'floating-point exponent' of the multiplier:
00193 // Negative for a right-shift (when the multiplier is <1), positive for a
00194 // left-shift (when the multiplier is >1)
00195 void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier,
00196                         int* shift);
00197 
00198 // Splits a double input value into a returned fraction, and a shift value from
00199 // the exponent, using only bitwise and integer operations to support
00200 // microcontrollers and other environments without floating-point support.
00201 //
00202 // This is designed to be a replacement for how std::frexp() is used within the
00203 // QuantizeMultiplier() function, and so has a different signature than the
00204 // standard version, returning a 64-bit integer rather than a double. This
00205 // result has a maximum value of 1<<31, with the fraction expressed as a
00206 // proportion of that maximum.
00207 //
00208 // std::frexp() returns NaNs and infinities unmodified, but since we're
00209 // returning integers that can't represent those values, instead we return
00210 // a shift of std::numeric_limits<int>::max() for all bad numbers, with an int64
00211 // result of 0 for NaNs, std::numeric_limits<int64_t>::max() for +INFINITY, and
00212 // std::numeric_limits<int64_t>::min() for -INFINITY. Denormalized inputs will
00213 // result in return values that end up truncating some bits at the end,
00214 // reflecting the loss of precision inherent in denormalization.
00215 int64_t IntegerFrExp(double input, int* shift);
00216 
00217 // Converts an integer fraction in the format produced by IntegerFrExp (where
00218 // 0x40000000 is 1.0) and an exponent shift (between -1022 and +1022) into an
00219 // IEEE binary64 double format result. The implementation uses only integer and
00220 // bitwise operators, so no floating point hardware support or emulation is
00221 // needed. This is here so quantized operations can run non-time-critical
00222 // preparation calculations on microcontrollers and other platforms without
00223 // float support.
00224 double DoubleFromFractionAndShift(int64_t fraction, int shift);
00225 
00226 // Performs a multiplication of two numbers in double format, using only integer
00227 // and bitwise instructions. This is aimed at supporting housekeeping functions
00228 // for quantized operations on microcontrollers without floating-point hardware.
00229 double IntegerDoubleMultiply(double a, double b);
00230 
00231 // Returns -1 if a is less than b, 0 if a and b are equal, and +1 if a is
00232 // greater than b. It is implemented using only integer and logical instructions
00233 // so that it can be easily run on microcontrollers for quantized operations.
00234 int IntegerDoubleCompare(double a, double b);
00235 
00236 // This first creates a multiplier in a double equivalent of
00237 // Q(input_integer_bits).(31-input_integer_bits) representation, with extra
00238 // precision in the double's fractional bits.  It then splits the result into
00239 // significand and exponent.
00240 void PreprocessSoftmaxScaling(double beta, double input_scale,
00241                               int input_integer_bits,
00242                               int32_t* quantized_multiplier, int* left_shift);
00243 // Like PreprocessSoftmaxScaling, but also calculates the inverse scaling factors.
00244 void PreprocessLogSoftmaxScalingExp(double beta, double input_scale,
00245                                     int input_integer_bits,
00246                                     int32_t* quantized_multiplier,
00247                                     int* left_shift,
00248                                     int32_t* reverse_scaling_divisor,
00249                                     int* reverse_scaling_left_shift);
00250 // Calculate the largest input that will result in a within-bounds intermediate
00251 // result within MultiplyByQuantizedMultiplierGreaterThanOne.  In other words,
00252 // it must not overflow before we reduce the value by multiplication by the
00253 // input multiplier.  The negative radius is used as the minimum difference in
00254 // Softmax.
00255 int CalculateInputRadius(int input_integer_bits, int input_left_shift,
00256                          int total_signed_bits = 31);
00257 
00258 // Nudges a min/max quantization range to ensure zero is zero.
00259 // Gymnastics with nudged zero point is to ensure that real zero maps to
00260 // an integer, which is required for e.g. zero-padding in convolutional layers.
00261 // Outputs nudged_min, nudged_max, nudged_scale.
00262 void NudgeQuantizationRange(const float min, const float max,
00263                             const int quant_min, const int quant_max,
00264                             float* nudged_min, float* nudged_max,
00265                             float* nudged_scale);
00266 
00267 // Fake quantizes (quantizes and dequantizes) input_data using the scale,
00268 // nudged_min, and nudged_max from NudgeQuantizationRange. This matches the code
00269 // in TensorFlow's FakeQuantizeWithMinMaxVarsFunctor.
00270 void FakeQuantizeArray(const float nudged_scale, const float nudged_min,
00271                        const float nudged_max, const float* input_data,
00272                        float* output_data, const float size);
00273 
00274 // If x is approximately a power of two (with any positive or negative
00275 // exponent), stores that exponent (i.e. log2(x)) in *log2_result and
00276 // returns true; otherwise returns false.
00277 bool CheckedLog2(const float x, int* log2_result);
00278 
00279 // Decomposes an array of double multipliers into a Q0.31 int32 representation
00280 // of its significand, and shift representation of its exponent.
00281 //
00282 // Handles an arbitrary multiplier. The 'shift' output-value is
00283 // basically the 'floating-point exponent' of the multiplier:
00284 // Negative for a right-shift (when the multiplier is <1), positive for a
00285 // left-shift (when the multiplier is >1)
00286 void QuantizeMultiplierArray(const double* effective_scales, size_t size,
00287                              int32_t* effective_scale_significand,
00288                              int* effective_shift);
00289 
00290 }  // namespace tflite
00291 
00292 #endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_