Important changes to repositories hosted on mbed.com
Mbed-hosted Mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software download the repository Zip archive or clone locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
quantization_util.cc
00001 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 00002 00003 Licensed under the Apache License, Version 2.0 (the "License"); 00004 you may not use this file except in compliance with the License. 00005 You may obtain a copy of the License at 00006 00007 http://www.apache.org/licenses/LICENSE-2.0 00008 00009 Unless required by applicable law or agreed to in writing, software 00010 distributed under the License is distributed on an "AS IS" BASIS, 00011 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00012 See the License for the specific language governing permissions and 00013 limitations under the License. 00014 ==============================================================================*/ 00015 00016 #include <algorithm> 00017 #include <cmath> 00018 #include <limits> 00019 00020 #include "tensorflow/lite/kernels/internal/compatibility.h" 00021 #include "tensorflow/lite/kernels/internal/quantization_util.h" 00022 #include "tensorflow/lite/kernels/internal/round.h" 00023 00024 namespace tflite { 00025 00026 namespace { 00027 // These constants are used to manipulate the binary representation of doubles. 00028 // Double-precision binary64 floating point format is: 00029 // Bit | 63 | 62-52 | 51-0 | 00030 // | Sign | Exponent | Fraction | 00031 // To avoid 64-bit integers as much as possible, I break this into high and 00032 // low 32-bit chunks. High is: 00033 // Bit | 31 | 30-20 | 19-0 | 00034 // | Sign | Exponent | High Fraction | 00035 // Low is: 00036 // Bit | 31-0 | 00037 // | Low Fraction | 00038 // We then access the components through logical bit-wise operations to 00039 // extract the parts needed, with the positions and masks derived from the 00040 // layout shown above. 
00041 constexpr uint64_t kSignMask = 0x8000000000000000LL; 00042 constexpr uint64_t kExponentMask = 0x7ff0000000000000LL; 00043 constexpr int32_t kExponentShift = 52; 00044 constexpr int32_t kExponentBias = 1023; 00045 constexpr uint32_t kExponentIsBadNum = 0x7ff; 00046 constexpr uint64_t kFractionMask = 0x000fffffffc00000LL; 00047 constexpr uint32_t kFractionShift = 22; 00048 constexpr uint32_t kFractionRoundingMask = 0x003fffff; 00049 constexpr uint32_t kFractionRoundingThreshold = 0x00200000; 00050 } // namespace 00051 00052 void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier, 00053 int* shift) { 00054 if (double_multiplier == 0.) { 00055 *quantized_multiplier = 0; 00056 *shift = 0; 00057 return; 00058 } 00059 #ifdef TFLITE_EMULATE_FLOAT 00060 // If we're trying to avoid the use of floating-point instructions (for 00061 // example on microcontrollers) then use an alternative implementation 00062 // that only requires integer and bitwise operations. To enable this, you 00063 // need to set the define during the build process for your platform. 00064 int64_t q_fixed = IntegerFrExp(double_multiplier, shift); 00065 #else // TFLITE_EMULATE_FLOAT 00066 const double q = std::frexp(double_multiplier, shift); 00067 auto q_fixed = static_cast<int64_t>(TfLiteRound(q * (1ll << 31))); 00068 #endif // TFLITE_EMULATE_FLOAT 00069 TFLITE_CHECK(q_fixed <= (1ll << 31)); 00070 if (q_fixed == (1ll << 31)) { 00071 q_fixed /= 2; 00072 ++*shift; 00073 } 00074 TFLITE_CHECK_LE(q_fixed, std::numeric_limits<int32_t>::max()); 00075 // A shift amount smaller than -31 would cause all bits to be shifted out 00076 // and thus all results would be zero. We implement that instead with 00077 // q_fixed==0, so as to avoid hitting issues with right-shift 00078 // operations with shift amounts greater than 31. 
Note that this happens 00079 // roughly when abs(double_multiplier) < 2^-31 and the present handling means 00080 // that we're effectively flushing tiny double_multiplier's to zero. 00081 // We could conceivably handle values in the range (roughly) [32, 63] 00082 // as 'denormals' i.e. (shift==0, q_fixed < 2^30). In that point of view 00083 // the present handling is just doing 'flush denormals to zero'. We could 00084 // reconsider and actually generate nonzero denormals if a need arises. 00085 if (*shift < -31) { 00086 *shift = 0; 00087 q_fixed = 0; 00088 } 00089 *quantized_multiplier = static_cast<int32_t>(q_fixed); 00090 } 00091 00092 void QuantizeMultiplierGreaterThanOne(double double_multiplier, 00093 int32_t* quantized_multiplier, 00094 int* left_shift) { 00095 TFLITE_CHECK_GT(double_multiplier, 1.); 00096 QuantizeMultiplier(double_multiplier, quantized_multiplier, left_shift); 00097 TFLITE_CHECK_GE(*left_shift, 0); 00098 } 00099 00100 void QuantizeMultiplierSmallerThanOneExp(double double_multiplier, 00101 int32_t* quantized_multiplier, 00102 int* left_shift) { 00103 TFLITE_CHECK_LT(double_multiplier, 1.); 00104 TFLITE_CHECK_GT(double_multiplier, 0.); 00105 int shift; 00106 QuantizeMultiplier(double_multiplier, quantized_multiplier, &shift); 00107 TFLITE_CHECK_LE(shift, 0); 00108 *left_shift = shift; 00109 } 00110 00111 int64_t IntegerFrExp(double input, int* shift) { 00112 // Make sure our assumptions about the double layout hold. 00113 TFLITE_CHECK_EQ(8, sizeof(double)); 00114 00115 // We want to access the bits of the input double value directly, which is 00116 // tricky to do safely, so use a union to handle the casting. 
00117 union { 00118 double double_value; 00119 uint64_t double_as_uint; 00120 } cast_union; 00121 cast_union.double_value = input; 00122 const uint64_t u = cast_union.double_as_uint; 00123 00124 // If the bitfield is all zeros apart from the sign bit, this is a normalized 00125 // zero value, so return standard values for this special case. 00126 if ((u & ~kSignMask) == 0) { 00127 *shift = 0; 00128 return 0; 00129 } 00130 00131 // Deal with NaNs and Infs, which are always indicated with a fixed pattern in 00132 // the exponent, and distinguished by whether the fractions are zero or 00133 // non-zero. 00134 const uint32_t exponent_part = ((u & kExponentMask) >> kExponentShift); 00135 if (exponent_part == kExponentIsBadNum) { 00136 *shift = std::numeric_limits<int>::max(); 00137 if (u & kFractionMask) { 00138 // NaN, so just return zero (with the exponent set to INT_MAX). 00139 return 0; 00140 } else { 00141 // Infinity, so return +/- INT_MAX. 00142 if (u & kSignMask) { 00143 return std::numeric_limits<int64_t>::min(); 00144 } else { 00145 return std::numeric_limits<int64_t>::max(); 00146 } 00147 } 00148 } 00149 00150 // The shift is fairly easy to extract from the high bits of the double value, 00151 // just by masking it out and applying a bias. The std::frexp() implementation 00152 // always returns values between 0.5 and 1.0 though, whereas the exponent 00153 // assumes 1.0 to 2.0 is the standard range, so I add on one to match that 00154 // interface. 00155 *shift = (exponent_part - kExponentBias) + 1; 00156 00157 // There's an implicit high bit in the double format definition, so make sure 00158 // we include that at the top, and then reconstruct the rest of the fractional 00159 // value from the remaining fragments. 
00160 int64_t fraction = 0x40000000 + ((u & kFractionMask) >> kFractionShift); 00161 00162 // We're cutting off some bits at the bottom, so to exactly match the standard 00163 // frexp implementation here we'll apply rounding by adding one to the least 00164 // significant bit of the result if the discarded portion is over half of the 00165 // maximum. 00166 if ((u & kFractionRoundingMask) > kFractionRoundingThreshold) { 00167 fraction += 1; 00168 } 00169 // Negate the fraction if the sign bit was set. 00170 if (u & kSignMask) { 00171 fraction *= -1; 00172 } 00173 00174 return fraction; 00175 } 00176 00177 double DoubleFromFractionAndShift(int64_t fraction, int shift) { 00178 union { 00179 double double_value; 00180 uint64_t double_as_uint; 00181 } result; 00182 00183 // Detect NaNs and infinities. 00184 if (shift == std::numeric_limits<int>::max()) { 00185 if (fraction == 0) { 00186 return NAN; 00187 } else if (fraction > 0) { 00188 return INFINITY; 00189 } else { 00190 return -INFINITY; 00191 } 00192 } 00193 00194 // Return a normalized zero for a zero fraction. 00195 if (fraction == 0) { 00196 result.double_as_uint = 0; 00197 return result.double_value; 00198 } 00199 00200 bool is_negative = (fraction < 0); 00201 int64_t encoded_fraction = is_negative ? -fraction : fraction; 00202 int64_t encoded_shift = (shift - 1); 00203 while (encoded_fraction < 0x40000000) { 00204 encoded_fraction *= 2; 00205 encoded_shift -= 1; 00206 } 00207 while (encoded_fraction > 0x80000000) { 00208 encoded_fraction /= 2; 00209 encoded_shift += 1; 00210 } 00211 encoded_fraction -= 0x40000000; 00212 if (encoded_shift < -1022) { 00213 encoded_shift = -1023; 00214 } else if (encoded_shift > 1022) { 00215 encoded_shift = 1023; 00216 } 00217 encoded_shift += kExponentBias; 00218 uint64_t encoded_sign = is_negative ? 
kSignMask : 0; 00219 result.double_as_uint = encoded_sign | (encoded_shift << kExponentShift) | 00220 (encoded_fraction << kFractionShift); 00221 return result.double_value; 00222 } 00223 00224 double IntegerDoubleMultiply(double a, double b) { 00225 int a_shift; 00226 const int64_t a_fraction = IntegerFrExp(a, &a_shift); 00227 int b_shift; 00228 const int64_t b_fraction = IntegerFrExp(b, &b_shift); 00229 // Detect NaNs and infinities. 00230 if (a_shift == std::numeric_limits<int>::max() || 00231 (b_shift == std::numeric_limits<int>::max())) { 00232 return NAN; 00233 } 00234 const int result_shift = a_shift + b_shift + 1; 00235 const int64_t result_fraction = (a_fraction * b_fraction) >> 32; 00236 return DoubleFromFractionAndShift(result_fraction, result_shift); 00237 } 00238 00239 int IntegerDoubleCompare(double a, double b) { 00240 int a_shift; 00241 const int64_t a_fraction = IntegerFrExp(a, &a_shift); 00242 int b_shift; 00243 const int64_t b_fraction = IntegerFrExp(b, &b_shift); 00244 00245 // Detect NaNs and infinities. 00246 if (a_shift == std::numeric_limits<int>::max() || 00247 (b_shift == std::numeric_limits<int>::max())) { 00248 return 1; 00249 } 00250 00251 if ((a_fraction == 0) && (b_fraction < 0)) { 00252 return 1; 00253 } else if ((a_fraction < 0) && (b_fraction == 0)) { 00254 return -1; 00255 } else if (a_shift < b_shift) { 00256 return -1; 00257 } else if (a_shift > b_shift) { 00258 return 1; 00259 } else if (a_fraction < b_fraction) { 00260 return -1; 00261 } else if (a_fraction > b_fraction) { 00262 return 1; 00263 } else { 00264 return 0; 00265 } 00266 } 00267 00268 void PreprocessSoftmaxScaling(double beta, double input_scale, 00269 int input_integer_bits, 00270 int32_t* quantized_multiplier, int* left_shift) { 00271 // If the overall multiplier (input and beta) is large, then exp() of an 00272 // input difference of 1 scaled by this will be large. 
In other words, we 00273 // can cap the multiplier and know that, when it is used, the output will be 00274 // (round to) zero wherever the input is not at the maximum value. 00275 00276 // If the overall scale is less than one, and input_integer_bits=0, then the 00277 // result is double equivalent of Q0.31 (actually with more precision). Thus 00278 // this generates a Q(input_integer_bits).(31-input_integer_bits) 00279 // representation. 00280 #ifdef TFLITE_EMULATE_FLOAT 00281 const double input_beta = IntegerDoubleMultiply(beta, input_scale); 00282 int shift; 00283 int64_t fraction = IntegerFrExp(input_beta, &shift); 00284 shift += (31 - input_integer_bits); 00285 double input_beta_real_multiplier = 00286 DoubleFromFractionAndShift(fraction, shift); 00287 if (IntegerDoubleCompare(input_beta_real_multiplier, (1ll << 31) - 1.0) > 0) { 00288 input_beta_real_multiplier = (1ll << 31) - 1.0; 00289 } 00290 #else // TFLITE_EMULATE_FLOAT 00291 const double input_beta_real_multiplier = std::min( 00292 beta * input_scale * (1 << (31 - input_integer_bits)), (1ll << 31) - 1.0); 00293 #endif // TFLITE_EMULATE_FLOAT 00294 00295 QuantizeMultiplierGreaterThanOne(input_beta_real_multiplier, 00296 quantized_multiplier, left_shift); 00297 } 00298 00299 void PreprocessLogSoftmaxScalingExp(double beta, double input_scale, 00300 int input_integer_bits, 00301 int32_t* quantized_multiplier, 00302 int* left_shift, 00303 int32_t* reverse_scaling_divisor, 00304 int* reverse_scaling_left_shift) { 00305 PreprocessSoftmaxScaling(beta, input_scale, input_integer_bits, 00306 quantized_multiplier, left_shift); 00307 00308 // Also calculate what amounts to the inverse scaling factor for the input. 
00309 const double real_reverse_scaling_divisor = 00310 (1 << (31 - *left_shift)) / static_cast<double>(*quantized_multiplier); 00311 tflite::QuantizeMultiplierSmallerThanOneExp(real_reverse_scaling_divisor, 00312 reverse_scaling_divisor, 00313 reverse_scaling_left_shift); 00314 } 00315 00316 int CalculateInputRadius(int input_integer_bits, int input_left_shift, 00317 int total_signed_bits) { 00318 #ifdef TFLITE_EMULATE_FLOAT 00319 int64_t result = (1 << input_integer_bits) - 1; 00320 result <<= (total_signed_bits - input_integer_bits); 00321 result >>= input_left_shift; 00322 return result; 00323 #else // TFLITE_EMULATE_FLOAT 00324 const double max_input_rescaled = 00325 1.0 * ((1 << input_integer_bits) - 1) * 00326 (1ll << (total_signed_bits - input_integer_bits)) / 00327 (1ll << input_left_shift); 00328 // Tighten bound using floor. Suppose that we could use the exact value. 00329 // After scaling the difference, the result would be at the maximum. Thus we 00330 // must ensure that our value has lower magnitude. 00331 return static_cast<int>(std::floor(max_input_rescaled)); 00332 #endif // TFLITE_EMULATE_FLOAT 00333 } 00334 00335 void NudgeQuantizationRange(const float min, const float max, 00336 const int quant_min, const int quant_max, 00337 float* nudged_min, float* nudged_max, 00338 float* nudged_scale) { 00339 // This code originates from tensorflow/core/kernels/fake_quant_ops_functor.h. 
00340 const float quant_min_float = static_cast<float>(quant_min); 00341 const float quant_max_float = static_cast<float>(quant_max); 00342 *nudged_scale = (max - min) / (quant_max_float - quant_min_float); 00343 const float zero_point_from_min = quant_min_float - min / *nudged_scale; 00344 uint16 nudged_zero_point; 00345 if (zero_point_from_min < quant_min_float) { 00346 nudged_zero_point = static_cast<uint16>(quant_min); 00347 } else if (zero_point_from_min > quant_max_float) { 00348 nudged_zero_point = static_cast<uint16>(quant_max); 00349 } else { 00350 nudged_zero_point = static_cast<uint16>(TfLiteRound(zero_point_from_min)); 00351 } 00352 *nudged_min = (quant_min_float - nudged_zero_point) * (*nudged_scale); 00353 *nudged_max = (quant_max_float - nudged_zero_point) * (*nudged_scale); 00354 } 00355 00356 void FakeQuantizeArray(const float nudged_scale, const float nudged_min, 00357 const float nudged_max, const float* input_data, 00358 float* output_data, const float size) { 00359 // This code originates from tensorflow/core/kernels/fake_quant_ops_functor.h. 00360 const float inv_nudged_scale = 1.0f / nudged_scale; 00361 00362 for (int i = 0; i < size; i++) { 00363 const float src_val = input_data[i]; 00364 const float clamped = std::min(nudged_max, std::max(nudged_min, src_val)); 00365 const float clamped_shifted = clamped - nudged_min; 00366 const float dst_val = 00367 TfLiteRound(clamped_shifted * inv_nudged_scale) * nudged_scale + 00368 nudged_min; 00369 output_data[i] = dst_val; 00370 } 00371 } 00372 00373 bool CheckedLog2(const float x, int* log2_result) { 00374 // Using TfLiteRound instead of std::round and std::log instead of 00375 // std::log2 to work around these fuctions being missing in a toolchain 00376 // used in some TensorFlow tests as of May 2018. 
00377 const float x_log2 = std::log(x) * (1.0f / std::log(2.0f)); 00378 const float x_log2_rounded = TfLiteRound(x_log2); 00379 const float x_log2_fracpart = x_log2 - x_log2_rounded; 00380 00381 *log2_result = static_cast<int>(x_log2_rounded); 00382 return std::abs(x_log2_fracpart) < 1e-3; 00383 } 00384 00385 void QuantizeMultiplierArray(const double* effective_scales, size_t size, 00386 int32_t* effective_scale_significand, 00387 int* effective_shift) { 00388 for (size_t i = 0; i < size; ++i) { 00389 QuantizeMultiplier(effective_scales[i], &effective_scale_significand[i], 00390 &effective_shift[i]); 00391 } 00392 } 00393 00394 } // namespace tflite
Generated on Wed Jul 13 2022 16:03:35 by Doxygen 1.7.2