Daniel Konegen / MNIST_example

Dependencies:   mbed-os

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers quantization_util.cc Source File

quantization_util.cc

00001 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
00002 
00003 Licensed under the Apache License, Version 2.0 (the "License");
00004 you may not use this file except in compliance with the License.
00005 You may obtain a copy of the License at
00006 
00007     http://www.apache.org/licenses/LICENSE-2.0
00008 
00009 Unless required by applicable law or agreed to in writing, software
00010 distributed under the License is distributed on an "AS IS" BASIS,
00011 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00012 See the License for the specific language governing permissions and
00013 limitations under the License.
00014 ==============================================================================*/
00015 
00016 #include <algorithm>
00017 #include <cmath>
00018 #include <limits>
00019 
00020 #include "tensorflow/lite/kernels/internal/compatibility.h"
00021 #include "tensorflow/lite/kernels/internal/quantization_util.h"
00022 #include "tensorflow/lite/kernels/internal/round.h"
00023 
00024 namespace tflite {
00025 
namespace {
// These constants are used to manipulate the binary representation of doubles.
// Double-precision binary64 floating point format is:
// Bit |  63  |  62-52   |   51-0   |
//     | Sign | Exponent | Fraction |
// To avoid 64-bit integers as much as possible, I break this into high and
// low 32-bit chunks. High is:
// Bit |  31  |  30-20   |      19-0     |
//     | Sign | Exponent | High Fraction |
// Low is:
// Bit |     31-0     |
//     | Low Fraction |
// We then access the components through logical bit-wise operations to
// extract the parts needed, with the positions and masks derived from the
// layout shown above.
//
// NOTE: the mask literals use the ULL suffix. 0x8000000000000000 does not fit
// in a signed long long, so with an LL suffix the literal's type was silently
// promoted to unsigned long long anyway; ULL states that intent explicitly.
constexpr uint64_t kSignMask = 0x8000000000000000ULL;
constexpr uint64_t kExponentMask = 0x7ff0000000000000ULL;
constexpr int32_t kExponentShift = 52;
constexpr int32_t kExponentBias = 1023;
constexpr uint32_t kExponentIsBadNum = 0x7ff;  // All-ones exponent: NaN/Inf.
constexpr uint64_t kFractionMask = 0x000fffffffc00000ULL;  // Top 30 fraction bits.
constexpr uint32_t kFractionShift = 22;
constexpr uint32_t kFractionRoundingMask = 0x003fffff;  // Truncated low 22 bits.
constexpr uint32_t kFractionRoundingThreshold = 0x00200000;  // Half of the above.
}  // namespace
00051 
00052 void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier,
00053                         int* shift) {
00054   if (double_multiplier == 0.) {
00055     *quantized_multiplier = 0;
00056     *shift = 0;
00057     return;
00058   }
00059 #ifdef TFLITE_EMULATE_FLOAT
00060   // If we're trying to avoid the use of floating-point instructions (for
00061   // example on microcontrollers) then use an alternative implementation
00062   // that only requires integer and bitwise operations. To enable this, you
00063   // need to set the define during the build process for your platform.
00064   int64_t q_fixed = IntegerFrExp(double_multiplier, shift);
00065 #else   // TFLITE_EMULATE_FLOAT
00066   const double q = std::frexp(double_multiplier, shift);
00067   auto q_fixed = static_cast<int64_t>(TfLiteRound(q * (1ll << 31)));
00068 #endif  // TFLITE_EMULATE_FLOAT
00069   TFLITE_CHECK(q_fixed <= (1ll << 31));
00070   if (q_fixed == (1ll << 31)) {
00071     q_fixed /= 2;
00072     ++*shift;
00073   }
00074   TFLITE_CHECK_LE(q_fixed, std::numeric_limits<int32_t>::max());
00075   // A shift amount smaller than -31 would cause all bits to be shifted out
00076   // and thus all results would be zero. We implement that instead with
00077   // q_fixed==0, so as to avoid hitting issues with right-shift
00078   // operations with shift amounts greater than 31. Note that this happens
00079   // roughly when abs(double_multiplier) < 2^-31 and the present handling means
00080   // that we're effectively flushing tiny double_multiplier's to zero.
00081   // We could conceivably handle values in the range (roughly) [32, 63]
00082   // as 'denormals' i.e. (shift==0, q_fixed < 2^30). In that point of view
00083   // the present handling is just doing 'flush denormals to zero'. We could
00084   // reconsider and actually generate nonzero denormals if a need arises.
00085   if (*shift < -31) {
00086     *shift = 0;
00087     q_fixed = 0;
00088   }
00089   *quantized_multiplier = static_cast<int32_t>(q_fixed);
00090 }
00091 
00092 void QuantizeMultiplierGreaterThanOne(double double_multiplier,
00093                                       int32_t* quantized_multiplier,
00094                                       int* left_shift) {
00095   TFLITE_CHECK_GT(double_multiplier, 1.);
00096   QuantizeMultiplier(double_multiplier, quantized_multiplier, left_shift);
00097   TFLITE_CHECK_GE(*left_shift, 0);
00098 }
00099 
00100 void QuantizeMultiplierSmallerThanOneExp(double double_multiplier,
00101                                          int32_t* quantized_multiplier,
00102                                          int* left_shift) {
00103   TFLITE_CHECK_LT(double_multiplier, 1.);
00104   TFLITE_CHECK_GT(double_multiplier, 0.);
00105   int shift;
00106   QuantizeMultiplier(double_multiplier, quantized_multiplier, &shift);
00107   TFLITE_CHECK_LE(shift, 0);
00108   *left_shift = shift;
00109 }
00110 
00111 int64_t IntegerFrExp(double input, int* shift) {
00112   // Make sure our assumptions about the double layout hold.
00113   TFLITE_CHECK_EQ(8, sizeof(double));
00114 
00115   // We want to access the bits of the input double value directly, which is
00116   // tricky to do safely, so use a union to handle the casting.
00117   union {
00118     double double_value;
00119     uint64_t double_as_uint;
00120   } cast_union;
00121   cast_union.double_value = input;
00122   const uint64_t u = cast_union.double_as_uint;
00123 
00124   // If the bitfield is all zeros apart from the sign bit, this is a normalized
00125   // zero value, so return standard values for this special case.
00126   if ((u & ~kSignMask) == 0) {
00127     *shift = 0;
00128     return 0;
00129   }
00130 
00131   // Deal with NaNs and Infs, which are always indicated with a fixed pattern in
00132   // the exponent, and distinguished by whether the fractions are zero or
00133   // non-zero.
00134   const uint32_t exponent_part = ((u & kExponentMask) >> kExponentShift);
00135   if (exponent_part == kExponentIsBadNum) {
00136     *shift = std::numeric_limits<int>::max();
00137     if (u & kFractionMask) {
00138       // NaN, so just return zero (with the exponent set to INT_MAX).
00139       return 0;
00140     } else {
00141       // Infinity, so return +/- INT_MAX.
00142       if (u & kSignMask) {
00143         return std::numeric_limits<int64_t>::min();
00144       } else {
00145         return std::numeric_limits<int64_t>::max();
00146       }
00147     }
00148   }
00149 
00150   // The shift is fairly easy to extract from the high bits of the double value,
00151   // just by masking it out and applying a bias. The std::frexp() implementation
00152   // always returns values between 0.5 and 1.0 though, whereas the exponent
00153   // assumes 1.0 to 2.0 is the standard range, so I add on one to match that
00154   // interface.
00155   *shift = (exponent_part - kExponentBias) + 1;
00156 
00157   // There's an implicit high bit in the double format definition, so make sure
00158   // we include that at the top, and then reconstruct the rest of the fractional
00159   // value from the remaining fragments.
00160   int64_t fraction = 0x40000000 + ((u & kFractionMask) >> kFractionShift);
00161 
00162   // We're cutting off some bits at the bottom, so to exactly match the standard
00163   // frexp implementation here we'll apply rounding by adding one to the least
00164   // significant bit of the result if the discarded portion is over half of the
00165   // maximum.
00166   if ((u & kFractionRoundingMask) > kFractionRoundingThreshold) {
00167     fraction += 1;
00168   }
00169   // Negate the fraction if the sign bit was set.
00170   if (u & kSignMask) {
00171     fraction *= -1;
00172   }
00173 
00174   return fraction;
00175 }
00176 
// Inverse of IntegerFrExp: reassembles a double from an integer fraction and
// a base-two exponent by constructing the IEEE-754 binary64 bit pattern
// directly, without using floating-point instructions.
double DoubleFromFractionAndShift(int64_t fraction, int shift) {
  union {
    double double_value;
    uint64_t double_as_uint;
  } result;

  // Detect NaNs and infinities, which IntegerFrExp encodes with an INT_MAX
  // shift (fraction 0 => NaN, positive => +Inf, negative => -Inf).
  if (shift == std::numeric_limits<int>::max()) {
    if (fraction == 0) {
      return NAN;
    } else if (fraction > 0) {
      return INFINITY;
    } else {
      return -INFINITY;
    }
  }

  // Return a normalized zero for a zero fraction.
  if (fraction == 0) {
    result.double_as_uint = 0;
    return result.double_value;
  }

  bool is_negative = (fraction < 0);
  int64_t encoded_fraction = is_negative ? -fraction : fraction;
  // shift - 1 converts from frexp's [0.5, 1.0) convention back to IEEE's
  // implicit-one [1.0, 2.0) convention (mirrors the +1 in IntegerFrExp).
  int64_t encoded_shift = (shift - 1);
  // Normalize the magnitude so the leading one bit sits at position 30,
  // compensating the exponent for every doubling or halving.
  while (encoded_fraction < 0x40000000) {
    encoded_fraction *= 2;
    encoded_shift -= 1;
  }
  while (encoded_fraction > 0x80000000) {
    encoded_fraction /= 2;
    encoded_shift += 1;
  }
  // Drop the implicit leading one; IEEE-754 stores only the bits below it.
  encoded_fraction -= 0x40000000;
  // Clamp out-of-range exponents to the extremes representable in the
  // 11-bit biased exponent field, saturating rather than overflowing.
  if (encoded_shift < -1022) {
    encoded_shift = -1023;
  } else if (encoded_shift > 1022) {
    encoded_shift = 1023;
  }
  encoded_shift += kExponentBias;
  uint64_t encoded_sign = is_negative ? kSignMask : 0;
  // Assemble sign | exponent | top-30-fraction-bits into the final bits.
  result.double_as_uint = encoded_sign | (encoded_shift << kExponentShift) |
                          (encoded_fraction << kFractionShift);
  return result.double_value;
}
00223 
00224 double IntegerDoubleMultiply(double a, double b) {
00225   int a_shift;
00226   const int64_t a_fraction = IntegerFrExp(a, &a_shift);
00227   int b_shift;
00228   const int64_t b_fraction = IntegerFrExp(b, &b_shift);
00229   // Detect NaNs and infinities.
00230   if (a_shift == std::numeric_limits<int>::max() ||
00231       (b_shift == std::numeric_limits<int>::max())) {
00232     return NAN;
00233   }
00234   const int result_shift = a_shift + b_shift + 1;
00235   const int64_t result_fraction = (a_fraction * b_fraction) >> 32;
00236   return DoubleFromFractionAndShift(result_fraction, result_shift);
00237 }
00238 
00239 int IntegerDoubleCompare(double a, double b) {
00240   int a_shift;
00241   const int64_t a_fraction = IntegerFrExp(a, &a_shift);
00242   int b_shift;
00243   const int64_t b_fraction = IntegerFrExp(b, &b_shift);
00244 
00245   // Detect NaNs and infinities.
00246   if (a_shift == std::numeric_limits<int>::max() ||
00247       (b_shift == std::numeric_limits<int>::max())) {
00248     return 1;
00249   }
00250 
00251   if ((a_fraction == 0) && (b_fraction < 0)) {
00252     return 1;
00253   } else if ((a_fraction < 0) && (b_fraction == 0)) {
00254     return -1;
00255   } else if (a_shift < b_shift) {
00256     return -1;
00257   } else if (a_shift > b_shift) {
00258     return 1;
00259   } else if (a_fraction < b_fraction) {
00260     return -1;
00261   } else if (a_fraction > b_fraction) {
00262     return 1;
00263   } else {
00264     return 0;
00265   }
00266 }
00267 
// Folds beta and the input scale into one fixed-point multiplier (plus left
// shift) suitable for the quantized softmax kernels.
void PreprocessSoftmaxScaling(double beta, double input_scale,
                              int input_integer_bits,
                              int32_t* quantized_multiplier, int* left_shift) {
  // If the overall multiplier (input and beta) is large, then exp() of an
  // input difference of 1 scaled by this will be large.  In other words, we
  // can cap the multiplier and know that, when it is used, the output will be
  // (round to) zero wherever the input is not at the maximum value.

  // If the overall scale is less than one, and input_integer_bits=0, then the
  // result is double equivalent of Q0.31 (actually with more precision). Thus
  // this generates a Q(input_integer_bits).(31-input_integer_bits)
  // representation.
#ifdef TFLITE_EMULATE_FLOAT
  // Integer-only emulation of the floating-point arithmetic in the #else
  // branch, for targets built without floating-point support. The two
  // branches must compute the same quantity.
  const double input_beta = IntegerDoubleMultiply(beta, input_scale);
  int shift;
  int64_t fraction = IntegerFrExp(input_beta, &shift);
  // Scaling by 2^(31 - input_integer_bits) is a pure exponent adjustment.
  shift += (31 - input_integer_bits);
  double input_beta_real_multiplier =
      DoubleFromFractionAndShift(fraction, shift);
  // Cap at the largest value representable in 31 fractional bits.
  if (IntegerDoubleCompare(input_beta_real_multiplier, (1ll << 31) - 1.0) > 0) {
    input_beta_real_multiplier = (1ll << 31) - 1.0;
  }
#else   // TFLITE_EMULATE_FLOAT
  const double input_beta_real_multiplier = std::min(
      beta * input_scale * (1 << (31 - input_integer_bits)), (1ll << 31) - 1.0);
#endif  // TFLITE_EMULATE_FLOAT

  QuantizeMultiplierGreaterThanOne(input_beta_real_multiplier,
                                   quantized_multiplier, left_shift);
}
00298 
// Computes the forward softmax scaling plus, for log-softmax, the inverse
// ("reverse") scaling factor expressed as a multiplier smaller than one.
void PreprocessLogSoftmaxScalingExp(double beta, double input_scale,
                                    int input_integer_bits,
                                    int32_t* quantized_multiplier,
                                    int* left_shift,
                                    int32_t* reverse_scaling_divisor,
                                    int* reverse_scaling_left_shift) {
  // Forward scaling is identical to the regular softmax preprocessing.
  PreprocessSoftmaxScaling(beta, input_scale, input_integer_bits,
                           quantized_multiplier, left_shift);

  // Also calculate what amounts to the inverse scaling factor for the input:
  // the reciprocal of (quantized_multiplier * 2^left_shift / 2^31).
  const double real_reverse_scaling_divisor =
      (1 << (31 - *left_shift)) / static_cast<double>(*quantized_multiplier);
  tflite::QuantizeMultiplierSmallerThanOneExp(real_reverse_scaling_divisor,
                                              reverse_scaling_divisor,
                                              reverse_scaling_left_shift);
}
00315 
int CalculateInputRadius(int input_integer_bits, int input_left_shift,
                         int total_signed_bits) {
  // Computes the largest input magnitude, in pre-shift quantized units, that
  // the fixed-point activation math can represent without saturating.
#ifdef TFLITE_EMULATE_FLOAT
  // Integer-only form: the division by a power of two in the #else branch
  // becomes a plain arithmetic right shift.
  int64_t radius = (1 << input_integer_bits) - 1;
  radius <<= (total_signed_bits - input_integer_bits);
  radius >>= input_left_shift;
  return radius;
#else   // TFLITE_EMULATE_FLOAT
  const double max_input_rescaled =
      1.0 * ((1 << input_integer_bits) - 1) *
      (1ll << (total_signed_bits - input_integer_bits)) /
      (1ll << input_left_shift);
  // Tighten the bound using floor: if we used the exact value, scaling the
  // difference could land the result exactly at the maximum, so our value
  // must have strictly lower magnitude.
  return static_cast<int>(std::floor(max_input_rescaled));
#endif  // TFLITE_EMULATE_FLOAT
}
00334 
00335 void NudgeQuantizationRange(const float min, const float max,
00336                             const int quant_min, const int quant_max,
00337                             float* nudged_min, float* nudged_max,
00338                             float* nudged_scale) {
00339   // This code originates from tensorflow/core/kernels/fake_quant_ops_functor.h.
00340   const float quant_min_float = static_cast<float>(quant_min);
00341   const float quant_max_float = static_cast<float>(quant_max);
00342   *nudged_scale = (max - min) / (quant_max_float - quant_min_float);
00343   const float zero_point_from_min = quant_min_float - min / *nudged_scale;
00344   uint16 nudged_zero_point;
00345   if (zero_point_from_min < quant_min_float) {
00346     nudged_zero_point = static_cast<uint16>(quant_min);
00347   } else if (zero_point_from_min > quant_max_float) {
00348     nudged_zero_point = static_cast<uint16>(quant_max);
00349   } else {
00350     nudged_zero_point = static_cast<uint16>(TfLiteRound(zero_point_from_min));
00351   }
00352   *nudged_min = (quant_min_float - nudged_zero_point) * (*nudged_scale);
00353   *nudged_max = (quant_max_float - nudged_zero_point) * (*nudged_scale);
00354 }
00355 
00356 void FakeQuantizeArray(const float nudged_scale, const float nudged_min,
00357                        const float nudged_max, const float* input_data,
00358                        float* output_data, const float size) {
00359   // This code originates from tensorflow/core/kernels/fake_quant_ops_functor.h.
00360   const float inv_nudged_scale = 1.0f / nudged_scale;
00361 
00362   for (int i = 0; i < size; i++) {
00363     const float src_val = input_data[i];
00364     const float clamped = std::min(nudged_max, std::max(nudged_min, src_val));
00365     const float clamped_shifted = clamped - nudged_min;
00366     const float dst_val =
00367         TfLiteRound(clamped_shifted * inv_nudged_scale) * nudged_scale +
00368         nudged_min;
00369     output_data[i] = dst_val;
00370   }
00371 }
00372 
00373 bool CheckedLog2(const float x, int* log2_result) {
00374   // Using TfLiteRound instead of std::round and std::log instead of
00375   // std::log2 to work around these fuctions being missing in a toolchain
00376   // used in some TensorFlow tests as of May 2018.
00377   const float x_log2 = std::log(x) * (1.0f / std::log(2.0f));
00378   const float x_log2_rounded = TfLiteRound(x_log2);
00379   const float x_log2_fracpart = x_log2 - x_log2_rounded;
00380 
00381   *log2_result = static_cast<int>(x_log2_rounded);
00382   return std::abs(x_log2_fracpart) < 1e-3;
00383 }
00384 
00385 void QuantizeMultiplierArray(const double* effective_scales, size_t size,
00386                              int32_t* effective_scale_significand,
00387                              int* effective_shift) {
00388   for (size_t i = 0; i < size; ++i) {
00389     QuantizeMultiplier(effective_scales[i], &effective_scale_significand[i],
00390                        &effective_shift[i]);
00391   }
00392 }
00393 
00394 }  // namespace tflite