Important changes to repositories hosted on mbed.com
Mbed-hosted Mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software, download the repository ZIP archive or clone it locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
quantization_util.h
00001 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 00002 00003 Licensed under the Apache License, Version 2.0 (the "License"); 00004 you may not use this file except in compliance with the License. 00005 You may obtain a copy of the License at 00006 00007 http://www.apache.org/licenses/LICENSE-2.0 00008 00009 Unless required by applicable law or agreed to in writing, software 00010 distributed under the License is distributed on an "AS IS" BASIS, 00011 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00012 See the License for the specific language governing permissions and 00013 limitations under the License. 00014 ==============================================================================*/ 00015 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_ 00016 #define TENSORFLOW_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_ 00017 00018 #include <cmath> 00019 #include <cstdint> 00020 #include <limits> 00021 00022 #include "tensorflow/lite/kernels/internal/compatibility.h" 00023 #include "tensorflow/lite/kernels/internal/round.h" 00024 #include "tensorflow/lite/kernels/internal/types.h" 00025 00026 namespace tflite { 00027 00028 // Given the min and max values of a float array, return 00029 // reasonable quantization parameters to use for this array. 00030 template <typename T> 00031 QuantizationParams ChooseQuantizationParams(double rmin, double rmax, 00032 bool narrow_range) { 00033 const T qmin = std::numeric_limits<T>::min() + (narrow_range ? 1 : 0); 00034 const T qmax = std::numeric_limits<T>::max(); 00035 const double qmin_double = qmin; 00036 const double qmax_double = qmax; 00037 // 0 should always be a representable value. Let's assume that the initial 00038 // min,max range contains 0. 00039 TFLITE_CHECK_LE(rmin, 0.); 00040 TFLITE_CHECK_GE(rmax, 0.); 00041 if (rmin == rmax) { 00042 // Special case where the min,max range is a point. Should be {0}. 
00043 TFLITE_CHECK_EQ(rmin, 0.); 00044 TFLITE_CHECK_EQ(rmax, 0.); 00045 QuantizationParams quantization_params; 00046 quantization_params.zero_point = 0; 00047 quantization_params.scale = 0.; 00048 return quantization_params; 00049 } 00050 00051 // General case. 00052 // 00053 // First determine the scale. 00054 const double scale = (rmax - rmin) / (qmax_double - qmin_double); 00055 00056 // Zero-point computation. 00057 // First the initial floating-point computation. The zero-point can be 00058 // determined from solving an affine equation for any known pair 00059 // (real value, corresponding quantized value). 00060 // We know two such pairs: (rmin, qmin) and (rmax, qmax). 00061 // The arithmetic error on the zero point computed from either pair 00062 // will be roughly machine_epsilon * (sum of absolute values of terms) 00063 // so we want to use the variant that adds the smaller terms. 00064 const double zero_point_from_min = qmin_double - rmin / scale; 00065 const double zero_point_from_max = qmax_double - rmax / scale; 00066 const double zero_point_from_min_error = 00067 std::abs(qmin_double) + std::abs(rmin / scale); 00068 const double zero_point_from_max_error = 00069 std::abs(qmax_double) + std::abs(rmax / scale); 00070 00071 const double zero_point_double = 00072 zero_point_from_min_error < zero_point_from_max_error 00073 ? zero_point_from_min 00074 : zero_point_from_max; 00075 00076 // Now we need to nudge the zero point to be an integer 00077 // (our zero points are integer, and this is motivated by the requirement 00078 // to be able to represent the real value "0" exactly as a quantized value, 00079 // which is required in multiple places, for example in Im2col with SAME 00080 // padding). 
00081 T nudged_zero_point = 0; 00082 if (zero_point_double < qmin_double) { 00083 nudged_zero_point = qmin; 00084 } else if (zero_point_double > qmax_double) { 00085 nudged_zero_point = qmax; 00086 } else { 00087 nudged_zero_point = static_cast<T>(round(zero_point_double)); 00088 } 00089 // The zero point should always be in the range of quantized value, 00090 // [qmin, qmax]. 00091 TFLITE_CHECK_GE(nudged_zero_point, qmin); 00092 TFLITE_CHECK_LE(nudged_zero_point, qmax); 00093 00094 // Finally, store the result nudged quantization params. 00095 QuantizationParams quantization_params; 00096 quantization_params.zero_point = nudged_zero_point; 00097 quantization_params.scale = scale; 00098 return quantization_params; 00099 } 00100 00101 template <typename T> 00102 QuantizationParams ChooseQuantizationParams(double rmin, double rmax) { 00103 return ChooseQuantizationParams<T>(rmin, rmax, false); 00104 } 00105 00106 // Converts a floating-point number to an integer. For all inputs x where 00107 // static_cast<IntOut>(x) is legal according to the C++ standard, the result 00108 // is identical to that cast (i.e. the result is x with its fractional part 00109 // truncated whenever that is representable as IntOut). 00110 // 00111 // static_cast would cause undefined behavior for the following cases, which 00112 // have well-defined behavior for this function: 00113 // 00114 // 1. If x is NaN, the result is zero. 00115 // 00116 // 2. If the truncated form of x is above the representable range of IntOut, 00117 // the result is std::numeric_limits<IntOut>::max(). 00118 // 00119 // 3. If the truncated form of x is below the representable range of IntOut, 00120 // the result is std::numeric_limits<IntOut>::min(). 00121 // 00122 // Note that cases #2 and #3 cover infinities as well as finite numbers. 00123 // 00124 // The range of FloatIn must include the range of IntOut, otherwise 00125 // the results are undefined. 00126 // TODO(sfeuz): Replace by absl::SafeCast once available. 
template <class IntOut, class FloatIn>
IntOut SafeCast(FloatIn x) {
  static_assert(!std::numeric_limits<FloatIn>::is_integer,
                "FloatIn is integer");
  static_assert(std::numeric_limits<IntOut>::is_integer,
                "IntOut is not integer");
  static_assert(std::numeric_limits<IntOut>::radix == 2,
                "IntOut is not base 2");

  // Special case NaN, for which the logic below doesn't work.
  if (std::isnan(x)) {
    return 0;
  }

  // Negative values all clip to zero for unsigned results. This check runs
  // before the infinity check, so -infinity also yields 0 for unsigned IntOut.
  if (!std::numeric_limits<IntOut>::is_signed && x < 0) {
    return 0;
  }

  // Handle infinities.
  if (std::isinf(x)) {
    return x < 0 ? std::numeric_limits<IntOut>::min()
                 : std::numeric_limits<IntOut>::max();
  }

  // Set exp such that x == f * 2^exp for some f with |f| in [0.5, 1.0),
  // unless x is zero in which case exp == 0. Note that this implies that the
  // magnitude of x is strictly less than 2^exp. Only the exponent is needed,
  // so the returned fraction is discarded.
  int exp = 0;
  static_cast<void>(std::frexp(x, &exp));

  // Let N be the number of non-sign bits in the representation of IntOut. If
  // the magnitude of x is strictly less than 2^N, the truncated version of x
  // is representable as IntOut. The only representable integer for which this
  // is not the case is kMin for signed types (i.e. -2^N), but that is covered
  // by the fall-through below.
  if (exp <= std::numeric_limits<IntOut>::digits) {
    return static_cast<IntOut>(x);
  }

  // Handle numbers with magnitude >= 2^N.
  return x < 0 ? std::numeric_limits<IntOut>::min()
               : std::numeric_limits<IntOut>::max();
}

// Decompose a double multiplier into a Q0.31 int32 representation of its
// significand, and shift representation of NEGATIVE its exponent ---
// this is intended as a RIGHT-shift.
//
// Restricted to the case where the multiplier < 1 (and non-negative).
00176 void QuantizeMultiplierSmallerThanOneExp(double double_multiplier, 00177 int32_t* quantized_multiplier, 00178 int* left_shift); 00179 00180 // Decompose a double multiplier into a Q0.31 int32 representation of its 00181 // significand, and shift representation of its exponent. 00182 // 00183 // Restricted to the case where the multiplier > 1. 00184 void QuantizeMultiplierGreaterThanOne(double double_multiplier, 00185 int32_t* quantized_multiplier, 00186 int* left_shift); 00187 00188 // Decompose a double multiplier into a Q0.31 int32 representation of its 00189 // significand, and shift representation of its exponent. 00190 // 00191 // Handles an arbitrary positive multiplier. The 'shift' output-value is 00192 // basically the 'floating-point exponent' of the multiplier: 00193 // Negative for a right-shift (when the multiplier is <1), positive for a 00194 // left-shift (when the multiplier is >1) 00195 void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier, 00196 int* shift); 00197 00198 // Splits a double input value into a returned fraction, and a shift value from 00199 // the exponent, using only bitwise and integer operations to support 00200 // microcontrollers and other environments without floating-point support. 00201 // 00202 // This is designed to be a replacement for how std::frexp() is used within the 00203 // QuantizeMultiplier() function, and so has a different signature than the 00204 // standard version, returning a 64-bit integer rather than a double. This 00205 // result has a maximum value of 1<<31, with the fraction expressed as a 00206 // proportion of that maximum. 
00207 // 00208 // std::frexp() returns NaNs and infinities unmodified, but since we're 00209 // returning integers that can't represent those values, instead we return 00210 // a shift of std::numeric_limits<int>::max() for all bad numbers, with an int64 00211 // result of 0 for NaNs, std:numeric_limits<int64_t>::max() for +INFINITY, and 00212 // std::numeric_limits<int64_t>::min() for -INFINITY. Denormalized inputs will 00213 // result in return values that end up truncating some bits at the end, 00214 // reflecting the loss of precision inherent in denormalization. 00215 int64_t IntegerFrExp(double input, int* shift); 00216 00217 // Converts an integer fraction in the format produced by IntegerFrExp (where 00218 // 0x40000000 is 1.0) and an exponent shift (between -1022 and +1022) into an 00219 // IEEE binary64 double format result. The implementation uses only integer and 00220 // bitwise operators, so no floating point hardware support or emulation is 00221 // needed. This is here so quantized operations can run non-time-critical 00222 // preparation calculations on microcontrollers and other platforms without 00223 // float support. 00224 double DoubleFromFractionAndShift(int64_t fraction, int shift); 00225 00226 // Performs a multiplication of two numbers in double format, using only integer 00227 // and bitwise instructions. This is aimed at supporting housekeeping functions 00228 // for quantized operations on microcontrollers without floating-point hardware. 00229 double IntegerDoubleMultiply(double a, double b); 00230 00231 // Returns -1 if a is less than b, 0 if a and b are equal, and +1 if a is 00232 // greater than b. It is implemented using only integer and logical instructions 00233 // so that it can be easily run on microcontrollers for quantized operations. 
00234 int IntegerDoubleCompare(double a, double b); 00235 00236 // This first creates a multiplier in a double equivalent of 00237 // Q(input_integer_bits).(31-input_integer_bits) representation, with extra 00238 // precision in the double's fractional bits. It then splits the result into 00239 // significand and exponent. 00240 void PreprocessSoftmaxScaling(double beta, double input_scale, 00241 int input_integer_bits, 00242 int32_t* quantized_multiplier, int* left_shift); 00243 // Like PreprocessSoftmaxScaling, but inverse scaling factors also calculated. 00244 void PreprocessLogSoftmaxScalingExp(double beta, double input_scale, 00245 int input_integer_bits, 00246 int32_t* quantized_multiplier, 00247 int* left_shift, 00248 int32_t* reverse_scaling_divisor, 00249 int* reverse_scaling_left_shift); 00250 // Calculate the largest input that will result in a within-bounds intermediate 00251 // result within MultiplyByQuantizedMultiplierGreaterThanOne. In other words, 00252 // it must not overflow before we reduce the value by multiplication by the 00253 // input multiplier. The negative radius is used as the minimum difference in 00254 // Softmax. 00255 int CalculateInputRadius(int input_integer_bits, int input_left_shift, 00256 int total_signed_bits = 31); 00257 00258 // Nudges a min/max quantization range to ensure zero is zero. 00259 // Gymnastics with nudged zero point is to ensure that real zero maps to 00260 // an integer, which is required for e.g. zero-padding in convolutional layers. 00261 // Outputs nudged_min, nudged_max, nudged_scale. 00262 void NudgeQuantizationRange(const float min, const float max, 00263 const int quant_min, const int quant_max, 00264 float* nudged_min, float* nudged_max, 00265 float* nudged_scale); 00266 00267 // Fake quantizes (quantizes and dequantizes) input_data using the scale, 00268 // nudged_min, and nudged_max from NudgeQuantizationRange. This matches the code 00269 // in TensorFlow's FakeQuantizeWithMinMaxVarsFunctor. 
00270 void FakeQuantizeArray(const float nudged_scale, const float nudged_min, 00271 const float nudged_max, const float* input_data, 00272 float* output_data, const float size); 00273 00274 // If x is approximately a power of two (with any positive or negative 00275 // exponent), stores that exponent (i.e. log2(x)) in *log2_result, otherwise 00276 // returns false. 00277 bool CheckedLog2(const float x, int* log2_result); 00278 00279 // Decomposes an array of double multipliers into a Q0.31 int32 representation 00280 // of its significand, and shift representation of its exponent. 00281 // 00282 // Handles an arbitrary multiplier. The 'shift' output-value is 00283 // basically the 'floating-point exponent' of the multiplier: 00284 // Negative for a right-shift (when the multiplier is <1), positive for a 00285 // left-shift (when the multiplier is >1) 00286 void QuantizeMultiplierArray(const double* effective_scales, size_t size, 00287 int32_t* effective_scale_significand, 00288 int* effective_shift); 00289 00290 } // namespace tflite 00291 00292 #endif // TENSORFLOW_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_
Generated on Wed Jul 13 2022 16:03:35 by
