Daniel Konegen / MNIST_example

Dependencies:   mbed-os


fully_connected.h

/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_

#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/round.h"
#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {
namespace reference_ops {

const int kReverseShift = -1;

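// Reference (non-optimized) float fully-connected layer. For each batch b and
// output channel out_c it computes
//   output[b, out_c] = ActivationFunctionWithMinMax(
//       sum_d(input[b, d] * weights[out_c, d]) + bias[out_c],
//       float_activation_min, float_activation_max);
// bias_data may be null, in which case no bias is added.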
inline void FullyConnected(
    const FullyConnectedParams& params, const RuntimeShape& input_shape,
    const float* input_data, const RuntimeShape& weights_shape,
    const float* weights_data, const RuntimeShape& bias_shape,
    const float* bias_data, const RuntimeShape& output_shape,
    float* output_data) {
  const float output_activation_min = params.float_activation_min;
  const float output_activation_max = params.float_activation_max;
  // TODO(benoitjacob): This really should be:
  //     const int batches = ArraySize(output_dims, 1);
  // but the current --variable_batch hack consists in overwriting the 3rd
  // dimension with the runtime batch size, as we don't keep track for each
  // array of which dimension is the batch dimension in it.
  const int output_dims_count = output_shape.DimensionsCount();
  const int weights_dims_count = weights_shape.DimensionsCount();
  const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1);
  const int output_depth = MatchingDim(weights_shape, weights_dims_count - 2,
                                       output_shape, output_dims_count - 1);
  const int accum_depth = weights_shape.Dims(weights_dims_count - 1);
  for (int b = 0; b < batches; ++b) {
    for (int out_c = 0; out_c < output_depth; ++out_c) {
      float total = 0.f;
      for (int d = 0; d < accum_depth; ++d) {
        total += input_data[b * accum_depth + d] *
                 weights_data[out_c * accum_depth + d];
      }
      float bias_value = 0.0f;
      if (bias_data) {
        bias_value = bias_data[out_c];
      }
      output_data[out_c + output_depth * b] = ActivationFunctionWithMinMax(
          total + bias_value, output_activation_min, output_activation_max);
    }
  }
}

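// Reference quantized fully-connected layer (uint8 inputs and outputs).
// It accumulates (input + input_offset) * (filter + filter_offset) in int32,
// adds the int32 bias if present, rescales with MultiplyByQuantizedMultiplier,
// adds output_offset, and clamps the result to
// [quantized_activation_min, quantized_activation_max] before casting to uint8.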
inline void FullyConnected(
    const FullyConnectedParams& params, const RuntimeShape& input_shape,
    const uint8* input_data, const RuntimeShape& filter_shape,
    const uint8* filter_data, const RuntimeShape& bias_shape,
    const int32* bias_data, const RuntimeShape& output_shape,
    uint8* output_data) {
  const int32 input_offset = params.input_offset;
  const int32 filter_offset = params.weights_offset;
  const int32 output_offset = params.output_offset;
  const int32 output_multiplier = params.output_multiplier;
  const int output_shift = params.output_shift;
  const int32 output_activation_min = params.quantized_activation_min;
  const int32 output_activation_max = params.quantized_activation_max;
  TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
  TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);

  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  // TODO(benoitjacob): This really should be:
  //     const int batches = ArraySize(output_dims, 1);
  // but the current --variable_batch hack consists in overwriting the 3rd
  // dimension with the runtime batch size, as we don't keep track for each
  // array of which dimension is the batch dimension in it.
  const int output_dim_count = output_shape.DimensionsCount();
  const int filter_dim_count = filter_shape.DimensionsCount();
  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
  const int output_depth = MatchingDim(filter_shape, filter_dim_count - 2,
                                       output_shape, output_dim_count - 1);
  const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
  for (int b = 0; b < batches; ++b) {
    for (int out_c = 0; out_c < output_depth; ++out_c) {
      int32 acc = 0;
      for (int d = 0; d < accum_depth; ++d) {
        int32 input_val = input_data[b * accum_depth + d];
        int32 filter_val = filter_data[out_c * accum_depth + d];
        acc += (filter_val + filter_offset) * (input_val + input_offset);
      }
      if (bias_data) {
        acc += bias_data[out_c];
      }
      acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
      acc += output_offset;
      acc = std::max(acc, output_activation_min);
      acc = std::min(acc, output_activation_max);
      output_data[out_c + output_depth * b] = static_cast<uint8>(acc);
    }
  }
}

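// Same quantized fully-connected layer, but producing int16 outputs.
// output_offset is expected to be zero (DCHECK'd below), and bias_data must be
// non-null because the accumulator is initialized directly from it.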
inline void FullyConnected(
    const FullyConnectedParams& params, const RuntimeShape& input_shape,
    const uint8* input_data, const RuntimeShape& filter_shape,
    const uint8* filter_data, const RuntimeShape& bias_shape,
    const int32* bias_data, const RuntimeShape& output_shape,
    int16* output_data) {
  const int32 input_offset = params.input_offset;
  const int32 filter_offset = params.weights_offset;
  const int32 output_offset = params.output_offset;
  const int32 output_multiplier = params.output_multiplier;
  const int output_shift = params.output_shift;
  const int32 output_activation_min = params.quantized_activation_min;
  const int32 output_activation_max = params.quantized_activation_max;

  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  TFLITE_DCHECK_EQ(output_offset, 0);
  // TODO(benoitjacob): This really should be:
  //     const int batches = ArraySize(output_dims, 1);
  // but the current --variable_batch hack consists in overwriting the 3rd
  // dimension with the runtime batch size, as we don't keep track for each
  // array of which dimension is the batch dimension in it.
  const int output_dim_count = output_shape.DimensionsCount();
  const int filter_dim_count = filter_shape.DimensionsCount();
  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
  const int output_depth = MatchingDim(filter_shape, filter_dim_count - 2,
                                       output_shape, output_dim_count - 1);
  const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
  for (int b = 0; b < batches; ++b) {
    for (int out_c = 0; out_c < output_depth; ++out_c) {
      // Internal accumulation.
      // Initialize accumulator with the bias-value.
      int32 accum = bias_data[out_c];
      // Accumulation loop.
      for (int d = 0; d < accum_depth; ++d) {
        int16 input_val = input_data[b * accum_depth + d] + input_offset;
        int16 filter_val = filter_data[out_c * accum_depth + d] + filter_offset;
        accum += filter_val * input_val;
      }
      // Down-scale the final int32 accumulator to the scale used by our
      // (16-bit, typically 3 integer bits) fixed-point format. The quantized
      // multiplier and shift here have been pre-computed offline
      // (e.g. by toco).
      accum =
          MultiplyByQuantizedMultiplier(accum, output_multiplier, output_shift);
      // Saturate, cast to int16, and store to output array.
      accum = std::max(accum, output_activation_min - output_offset);
      accum = std::min(accum, output_activation_max - output_offset);
      accum += output_offset;
      output_data[out_c + output_depth * b] = accum;
    }
  }
}

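// Fully-connected layer operating on pre-shuffled weights: the weight matrix
// has been rearranged offline so that it is consumed in blocks of 4 output
// channels x 16 depth values, and pre-XORed with 0x80 so that reinterpreting
// the bytes as int8 implicitly subtracts the zero point of 128. Input
// activations are shuffled/XORed the same way into
// shuffled_input_workspace_data. Only batch sizes of 1 and 4 are supported;
// accum_depth must be a multiple of 16 and output_depth a multiple of 4.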
inline void ShuffledFullyConnected(
    const FullyConnectedParams& params, const RuntimeShape& input_shape,
    const uint8* input_data, const RuntimeShape& weights_shape,
    const uint8* shuffled_weights_data, const RuntimeShape& bias_shape,
    const int32* bias_data, const RuntimeShape& output_shape,
    int16* output_data, uint8* shuffled_input_workspace_data) {
  const int32 output_multiplier = params.output_multiplier;
  const int output_shift = params.output_shift;
  const int32 output_activation_min = params.quantized_activation_min;
  const int32 output_activation_max = params.quantized_activation_max;
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);

  TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1);
  TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
  TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
  // TODO(benoitjacob): This really should be:
  //     const int batches = ArraySize(output_dims, 1);
  // but the current --variable_batch hack consists in overwriting the 3rd
  // dimension with the runtime batch size, as we don't keep track for each
  // array of which dimension is the batch dimension in it.
  const int output_dim_count = output_shape.DimensionsCount();
  const int weights_dim_count = weights_shape.DimensionsCount();
  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
  const int output_depth = MatchingDim(weights_shape, weights_dim_count - 2,
                                       output_shape, output_dim_count - 1);
  const int accum_depth = weights_shape.Dims(weights_dim_count - 1);
  TFLITE_DCHECK((accum_depth % 16) == 0);
  TFLITE_DCHECK((output_depth % 4) == 0);

  // Shuffling and xoring of input activations into the workspace buffer
  uint8* shuffled_input_workspace_ptr = shuffled_input_workspace_data;
  if (batches == 1) {
    for (int i = 0; i < accum_depth; i++) {
      shuffled_input_workspace_data[i] = input_data[i] ^ 0x80;
    }
  } else if (batches == 4) {
    for (int c = 0; c < accum_depth; c += 16) {
      for (int b = 0; b < 4; b++) {
        const uint8* src_data_ptr = input_data + b * accum_depth + c;
        for (int j = 0; j < 16; j++) {
          uint8 src_val = *src_data_ptr++;
          // Flip the sign bit, so that the kernel will only need to
          // reinterpret these uint8 values as int8, getting for free the
          // subtraction of the zero_point value 128.
          uint8 dst_val = src_val ^ 0x80;
          *shuffled_input_workspace_ptr++ = dst_val;
        }
      }
    }
  } else {
    TFLITE_DCHECK(false);
    return;
  }

  // Actual computation
  if (batches == 1) {
    int16* output_ptr = output_data;
    // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
    // so that just reinterpreting them as int8 values is equivalent to
    // subtracting 128 from them, thus implementing for free the subtraction of
    // the zero_point value 128.
    const int8* shuffled_weights_ptr =
        reinterpret_cast<const int8*>(shuffled_weights_data);
    // Likewise, we preshuffled and pre-xored the input data above.
    const int8* shuffled_input_data =
        reinterpret_cast<const int8*>(shuffled_input_workspace_data);
    for (int c = 0; c < output_depth; c += 4) {
      // Internal accumulation.
      // Accumulators start at zero; the bias is added after the loop.
      int32 accum[4] = {0};
      // Accumulation loop.
      for (int d = 0; d < accum_depth; d += 16) {
        for (int i = 0; i < 4; i++) {
          for (int j = 0; j < 16; j++) {
            int8 input_val = shuffled_input_data[d + j];
            int8 weights_val = *shuffled_weights_ptr++;
            accum[i] += weights_val * input_val;
          }
        }
      }
      for (int i = 0; i < 4; i++) {
        // Add bias value
        int32 acc = accum[i] + bias_data[c + i];
        // Down-scale the final int32 accumulator to the scale used by our
        // (16-bit, typically 3 integer bits) fixed-point format. The quantized
        // multiplier and shift here have been pre-computed offline
        // (e.g. by toco).
        acc =
            MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
        // Saturate, cast to int16, and store to output array.
        acc = std::max(acc, output_activation_min);
        acc = std::min(acc, output_activation_max);
        output_ptr[c + i] = acc;
      }
    }
  } else if (batches == 4) {
    int16* output_ptr = output_data;
    // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
    // so that just reinterpreting them as int8 values is equivalent to
    // subtracting 128 from them, thus implementing for free the subtraction of
    // the zero_point value 128.
    const int8* shuffled_weights_ptr =
        reinterpret_cast<const int8*>(shuffled_weights_data);
    // Likewise, we preshuffled and pre-xored the input data above.
    const int8* shuffled_input_data =
        reinterpret_cast<const int8*>(shuffled_input_workspace_data);
    for (int c = 0; c < output_depth; c += 4) {
      const int8* shuffled_input_ptr = shuffled_input_data;
      // Internal accumulation.
      // Accumulators start at zero; the bias is added after the loop.
      int32 accum[4][4];
      for (int i = 0; i < 4; i++) {
        for (int b = 0; b < 4; b++) {
          accum[i][b] = 0;
        }
      }
      // Accumulation loop.
      for (int d = 0; d < accum_depth; d += 16) {
        for (int i = 0; i < 4; i++) {
          for (int b = 0; b < 4; b++) {
            for (int j = 0; j < 16; j++) {
              int8 input_val = shuffled_input_ptr[16 * b + j];
              int8 weights_val = shuffled_weights_ptr[16 * i + j];
              accum[i][b] += weights_val * input_val;
            }
          }
        }
        shuffled_input_ptr += 64;
        shuffled_weights_ptr += 64;
      }
      for (int i = 0; i < 4; i++) {
        for (int b = 0; b < 4; b++) {
          // Add bias value
          int32 acc = accum[i][b] + bias_data[c + i];
          // Down-scale the final int32 accumulator to the scale used by our
          // (16-bit, typically 3 integer bits) fixed-point format. The
          // quantized multiplier and shift here have been pre-computed offline
          // (e.g. by toco).
          acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
                                              output_shift);
          // Saturate, cast to int16, and store to output array.
          acc = std::max(acc, output_activation_min);
          acc = std::min(acc, output_activation_max);
          output_ptr[b * output_depth + c + i] = acc;
        }
      }
    }
  } else {
    TFLITE_DCHECK(false);
    return;
  }
}

}  // namespace reference_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_
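
For context, here is a minimal sketch (not part of fully_connected.h) of how the float FullyConnected kernel above could be called directly. The include path, shapes, data values, and the main() wrapper are illustrative assumptions; only float_activation_min/max are set in FullyConnectedParams because those are the only fields the float kernel reads.

#include <limits>

#include "tensorflow/lite/kernels/internal/reference/fully_connected.h"
#include "tensorflow/lite/kernels/internal/types.h"

int main() {
  // Hypothetical sizes: 1 batch, 3 inputs, 2 outputs.
  const tflite::RuntimeShape input_shape({1, 3});
  const tflite::RuntimeShape weights_shape({2, 3});  // [output_depth, accum_depth]
  const tflite::RuntimeShape bias_shape({2});
  const tflite::RuntimeShape output_shape({1, 2});

  const float input[3] = {1.f, 2.f, 3.f};
  const float weights[6] = {0.1f, 0.2f, 0.3f,   // row for output channel 0
                            0.4f, 0.5f, 0.6f};  // row for output channel 1
  const float bias[2] = {0.f, 1.f};
  float output[2];

  // No activation clamp: allow the full float range.
  tflite::FullyConnectedParams params;
  params.float_activation_min = std::numeric_limits<float>::lowest();
  params.float_activation_max = std::numeric_limits<float>::max();

  tflite::reference_ops::FullyConnected(params, input_shape, input,
                                        weights_shape, weights, bias_shape,
                                        bias, output_shape, output);
  // output[0] = 1*0.1 + 2*0.2 + 3*0.3 + 0 = 1.4
  // output[1] = 1*0.4 + 2*0.5 + 3*0.6 + 1 = 4.2
  return 0;
}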