Mirror of https://github.com/jomjol/AI-on-the-edge-device.git (synced 2025-12-12 14:37:06 +03:00)

Commit: Update tflite
@@ -55,9 +55,12 @@ inline void GetActivationMinMax(FusedActivationFunctionType ac,
}
}

inline float ActivationFunctionWithMinMax(float x, float output_activation_min,
float output_activation_max) {
return std::min(std::max(x, output_activation_min), output_activation_max);
template <typename T>
inline T ActivationFunctionWithMinMax(T x, T output_activation_min,
T output_activation_max) {
using std::max;
using std::min;
return min(max(x, output_activation_min), output_activation_max);
}

// Legacy function, left for compatibility only.
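For orientation, a minimal usage sketch of the templated clamp introduced above; the values are made up for illustration and are not part of the commit:

// Clamp with the float overload and with the new template.
float f = ActivationFunctionWithMinMax(3.7f, 0.0f, 1.0f);           // -> 1.0f
int32_t q = ActivationFunctionWithMinMax<int32_t>(300, -128, 127);  // -> 127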
@@ -135,23 +138,24 @@ inline void BiasAndClamp(float clamp_min, float clamp_max, int bias_size,
#endif
}

inline int32 MultiplyByQuantizedMultiplierSmallerThanOneExp(
int32 x, int32 quantized_multiplier, int left_shift) {
inline int32_t MultiplyByQuantizedMultiplierSmallerThanOneExp(
int32_t x, int32_t quantized_multiplier, int left_shift) {
using gemmlowp::RoundingDivideByPOT;
using gemmlowp::SaturatingRoundingDoublingHighMul;
return RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(x, quantized_multiplier), -left_shift);
}

inline int32 MultiplyByQuantizedMultiplierGreaterThanOne(
int32 x, int32 quantized_multiplier, int left_shift) {
inline int32_t MultiplyByQuantizedMultiplierGreaterThanOne(
int32_t x, int32_t quantized_multiplier, int left_shift) {
using gemmlowp::SaturatingRoundingDoublingHighMul;
return SaturatingRoundingDoublingHighMul(x * (1 << left_shift),
quantized_multiplier);
}

inline int32 MultiplyByQuantizedMultiplier(int32 x, int32 quantized_multiplier,
int shift) {
inline int32_t MultiplyByQuantizedMultiplier(int32_t x,
int32_t quantized_multiplier,
int shift) {
using gemmlowp::RoundingDivideByPOT;
using gemmlowp::SaturatingRoundingDoublingHighMul;
int left_shift = shift > 0 ? shift : 0;
@@ -161,16 +165,16 @@ inline int32 MultiplyByQuantizedMultiplier(int32 x, int32 quantized_multiplier,
right_shift);
}

inline int32 MultiplyByQuantizedMultiplier(int64_t x,
int32 quantized_multiplier,
int shift) {
inline int32_t MultiplyByQuantizedMultiplier(int64_t x,
int32_t quantized_multiplier,
int shift) {
// Inputs:
// - quantized_multiplier has fixed point at bit 31
// - shift is -31 to +7 (negative for right shift)
//
// Assumptions: The following input ranges are assumed
// - quantize_scale>=0 (the usual range is (1<<30) to (1>>31)-1)
// - scaling is chosen so final scaled result fits in int32
// - scaling is chosen so final scaled result fits in int32_t
// - input x is in the range -(1<<47) <= x < (1<<47)
assert(quantized_multiplier >= 0);
assert(shift >= -31 && shift < 8);
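To make the fixed-point comments above concrete, here is a hedged double-precision sketch of the value the saturating integer routines approximate. The helper name ApplyMultiplierSketch is made up for illustration and is not part of the commit or of TFLite:

#include <cmath>
#include <cstdint>

// Illustrative only: the production code stays in 32/64-bit integer arithmetic
// via SaturatingRoundingDoublingHighMul / RoundingDivideByPOT.
int32_t ApplyMultiplierSketch(int32_t x, int32_t quantized_multiplier, int shift) {
  const double real_multiplier =
      static_cast<double>(quantized_multiplier) / (1ll << 31);  // fixed point at bit 31
  return static_cast<int32_t>(
      std::lround(x * real_multiplier * std::pow(2.0, shift)));
}
// Example: quantized_multiplier = 1 << 30 encodes 0.5, so with shift = 0
// ApplyMultiplierSketch(100, 1 << 30, 0) == 50.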
@@ -215,9 +219,9 @@ inline int CountLeadingSignBits(T integer_input) {
using U = typename std::make_unsigned<T>::type;
return integer_input >= 0
? CountLeadingZeros(static_cast<U>(integer_input)) - 1
: integer_input != std::numeric_limits<T>::min()
? CountLeadingZeros(2 * static_cast<U>(-integer_input) - 1)
: 0;
: integer_input != std::numeric_limits<T>::min()
? CountLeadingZeros(2 * static_cast<U>(-integer_input) - 1)
: 0;
#endif
}

@@ -237,8 +241,12 @@ inline Integer FloorLog2(Integer n) {

// generate INT16 LUT for function(), e.g., table exp(x) and 1/(1+x) used in
// softmax
inline void gen_lut(const std::function<double(double)>& func, double min,
double max, int16_t* table, const int num) {
// func - the function to build the LUT for (e.g exp(x))
// min,max - table limits
// table - pointer to buffer
// num - number of elements in the LUT
inline void gen_lut(double (*func)(double), double min, double max,
int16_t* table, const int num) {
// size of table should equal to num + 1
// last element only for slope calculation
double step = (max - min) / (num - 1);
@@ -259,7 +267,35 @@ inline void gen_lut(const std::function<double(double)>& func, double min,
std::min(std::max(TfLiteRound(func(max) * 32768.0), -32768.0), 32767.0);
}

// int16 func table lookup, e.g., lookup exp() and 1/(1+x) used in softmax
// generate INT16 LUT for function(), e.g., table exp(x) and 1/(1+x) used in
// softmax
// func - the function to build the LUT for (e.g exp(x))
// min,max - table limits
// table - pointer to buffer
// num - number of elements in the LUT
inline void gen_lut(float (*func)(float), float min, float max, int16_t* table,
const int num) {
// size of table should equal to num + 1
// last element only for slope calculation
float step = (max - min) / (num - 1);
float half_step = step / 2.0f;
for (int i = 0; i < num - 1; i++) {
float sample_val = TfLiteRound(func(min + i * step) * 32768.0f);
float midpoint_interp_val =
TfLiteRound((func(min + (i + 1) * step) * 32768.0f +
TfLiteRound(func(min + i * step) * 32768.0f)) /
2.0f);
float midpoint_val =
TfLiteRound(func(min + i * step + half_step) * 32768.0f);
float midpoint_err = midpoint_interp_val - midpoint_val;
float bias = TfLiteRound(midpoint_err / 2.0f);
table[i] = std::min(std::max(sample_val - bias, -32768.0f), 32767.0f);
}
table[num - 1] = std::min(
std::max(TfLiteRound(func(max) * 32768.0f), -32768.0f), 32767.0f);
}
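A minimal usage sketch of the new float overload of gen_lut; the table size and input range below are illustrative, and the buffer is sized per the "num + 1" note in the comments above:

// Build a Q0.15 lookup table for exp(x) on [-10, 0].
constexpr int kLutSize = 513;
int16_t exp_lut[kLutSize + 1];  // one spare slot for slope interpolation
gen_lut([](float x) { return std::exp(x); }, -10.0f, 0.0f, exp_lut, kLutSize);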
// int16_t func table lookup, e.g., lookup exp() and 1/(1+x) used in softmax
inline int16_t generic_int16_table_lookup(int16_t value, const int16_t* lut) {
// 512 base value, lut[513] only for calculate slope
uint16_t index = static_cast<uint16_t>(256 + (value >> 7));
@@ -410,6 +446,23 @@ SaturatingRoundingMultiplyByPOTParam(
SaturatingRoundingMultiplyByPOTParam(a.raw(), exponent));
}

// Convert int32_t multiplier to int16_t with rounding.
inline void DownScaleInt32ToInt16Multiplier(int32_t multiplier_int32_t,
int16_t* multiplier_int16_t) {
TFLITE_DCHECK_GE(multiplier_int32_t, 0);
static constexpr int32_t kRoundingOffset = 1 << 15;
if (multiplier_int32_t >=
std::numeric_limits<int32_t>::max() - kRoundingOffset) {
*multiplier_int16_t = std::numeric_limits<int16_t>::max();
return;
}
const int32_t result = (multiplier_int32_t + kRoundingOffset) >> 16;
TFLITE_DCHECK_LE(result << 16, multiplier_int32_t + kRoundingOffset);
TFLITE_DCHECK_GT(result << 16, multiplier_int32_t - kRoundingOffset);
*multiplier_int16_t = result;
TFLITE_DCHECK_EQ(*multiplier_int16_t, result);
}
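A numeric usage sketch of the new DownScaleInt32ToInt16Multiplier helper; the input value is chosen only for illustration:

// (1 << 30) is 0.5 as a Q31 multiplier; after the rounding >> 16 it becomes
// 16384 (0x4000), i.e. 0.5 as a Q15 multiplier.
int16_t multiplier_q15 = 0;
DownScaleInt32ToInt16Multiplier(1 << 30, &multiplier_q15);  // multiplier_q15 == 16384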
// Minimum output bits to accommodate log of maximum input range. It actually
// does not matter if one considers, say, [-64,64] or [-64,64).
//
@@ -418,15 +471,13 @@ SaturatingRoundingMultiplyByPOTParam(
// ceil(log(abs( log(2.^(0:127))+1 ))/log(2)); ...
// ceil(log(abs( log(2.^(0:127))+1 ))/log(2))]
constexpr int min_log_x_output_bits(int input_bits) {
return input_bits > 90
? 7
: input_bits > 44
? 6
: input_bits > 21
? 5
: input_bits > 10
? 4
: input_bits > 4 ? 3 : input_bits > 1 ? 2 : 1;
return input_bits > 90 ? 7
: input_bits > 44 ? 6
: input_bits > 21 ? 5
: input_bits > 10 ? 4
: input_bits > 4 ? 3
: input_bits > 1 ? 2
: 1;
}

// Although currently the name of this function says that it cannot handle
@@ -434,17 +485,17 @@ constexpr int min_log_x_output_bits(int input_bits) {
// x_max is the largest representable input. In other words, the output range
// is symmetric.
template <int OutputIntegerBits, int InputIntegerBits>
inline gemmlowp::FixedPoint<int32, OutputIntegerBits>
inline gemmlowp::FixedPoint<int32_t, OutputIntegerBits>
log_x_for_x_greater_than_or_equal_to_1_impl(
gemmlowp::FixedPoint<int32, InputIntegerBits> input_val) {
// assert(__builtin_clz(0u) >= std::numeric_limits<uint32>::digits - 1);
// assert(__builtin_clz(0u) <= std::numeric_limits<uint32>::digits);
using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
gemmlowp::FixedPoint<int32_t, InputIntegerBits> input_val) {
// assert(__builtin_clz(0u) >= std::numeric_limits<uint32_t>::digits - 1);
// assert(__builtin_clz(0u) <= std::numeric_limits<uint32_t>::digits);
using FixedPoint0 = gemmlowp::FixedPoint<int32_t, 0>;
// The reason for accumulating the result with an extra bit of headroom is
// that z_pow_2_adj * log_2 might be saturated, and adding num_scaled *
// recip_denom will otherwise introduce an error.
static constexpr int kAccumIntegerBits = OutputIntegerBits + 1;
using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumIntegerBits>;
using FixedPointAccum = gemmlowp::FixedPoint<int32_t, kAccumIntegerBits>;

const FixedPoint0 log_2 = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
FixedPoint0, 1488522236, std::log(2.0));
@@ -472,10 +523,10 @@ log_x_for_x_greater_than_or_equal_to_1_impl(
// required shift "ourselves" instead of using, say, Rescale.
FixedPoint0 z_a = FixedPoint0::FromRaw(input_val.raw());
// z_a_pow_2 = input_integer_bits - z_a_headroom;
int z_a_headroom_plus_1 = CountLeadingZeros(static_cast<uint32>(z_a.raw()));
int z_a_headroom_plus_1 = CountLeadingZeros(static_cast<uint32_t>(z_a.raw()));
FixedPoint0 r_a_tmp =
SaturatingRoundingMultiplyByPOTParam(z_a, (z_a_headroom_plus_1 - 1));
const int32 r_a_raw =
const int32_t r_a_raw =
SaturatingRoundingMultiplyByPOTParam((r_a_tmp * sqrt_half).raw(), 1);
// z_pow_2_adj = max(z_pow_2_a - 0.75, z_pow_2_b - 0.25);
// z_pow_2_adj = max(InputIntegerBits - z_a_headroom_plus_1 + 0.25,
@@ -487,8 +538,8 @@ log_x_for_x_greater_than_or_equal_to_1_impl(

// z_b is treated like z_a, but premultiplying by sqrt(0.5).
FixedPoint0 z_b = z_a * sqrt_half;
int z_b_headroom = CountLeadingZeros(static_cast<uint32>(z_b.raw())) - 1;
const int32 r_b_raw =
int z_b_headroom = CountLeadingZeros(static_cast<uint32_t>(z_b.raw())) - 1;
const int32_t r_b_raw =
SaturatingRoundingMultiplyByPOTParam(z_a.raw(), z_b_headroom);
const FixedPointAccum z_b_pow_2_adj = SaturatingSub(
FixedPointAccum::FromRaw(SaturatingRoundingMultiplyByPOTParam(
@@ -516,9 +567,9 @@ log_x_for_x_greater_than_or_equal_to_1_impl(
}

template <int OutputIntegerBits, int InputIntegerBits>
inline gemmlowp::FixedPoint<int32, OutputIntegerBits>
inline gemmlowp::FixedPoint<int32_t, OutputIntegerBits>
log_x_for_x_greater_than_or_equal_to_1(
gemmlowp::FixedPoint<int32, InputIntegerBits> input_val) {
gemmlowp::FixedPoint<int32_t, InputIntegerBits> input_val) {
static_assert(
OutputIntegerBits >= min_log_x_output_bits(InputIntegerBits),
"Output integer bits must be sufficient to accommodate logs of inputs.");
@@ -527,25 +578,25 @@ log_x_for_x_greater_than_or_equal_to_1(
input_val);
}

inline int32 GetReciprocal(int32 x, int x_integer_digits,
int* num_bits_over_unit) {
int headroom_plus_one = CountLeadingZeros(static_cast<uint32>(x));
inline int32_t GetReciprocal(int32_t x, int x_integer_digits,
int* num_bits_over_unit) {
int headroom_plus_one = CountLeadingZeros(static_cast<uint32_t>(x));
// This is the number of bits to the left of the binary point above 1.0.
// Consider x=1.25. In that case shifted_scale=0.8 and
// no later adjustment will be needed.
*num_bits_over_unit = x_integer_digits - headroom_plus_one;
const int32 shifted_sum_minus_one =
static_cast<int32>((static_cast<uint32>(x) << headroom_plus_one) -
(static_cast<uint32>(1) << 31));
const int32_t shifted_sum_minus_one =
static_cast<int32_t>((static_cast<uint32_t>(x) << headroom_plus_one) -
(static_cast<uint32_t>(1) << 31));

gemmlowp::FixedPoint<int32, 0> shifted_scale =
gemmlowp::FixedPoint<int32_t, 0> shifted_scale =
gemmlowp::one_over_one_plus_x_for_x_in_0_1(
gemmlowp::FixedPoint<int32, 0>::FromRaw(shifted_sum_minus_one));
gemmlowp::FixedPoint<int32_t, 0>::FromRaw(shifted_sum_minus_one));
return shifted_scale.raw();
}

inline void GetInvSqrtQuantizedMultiplierExp(int32 input, int reverse_shift,
int32* output_inv_sqrt,
inline void GetInvSqrtQuantizedMultiplierExp(int32_t input, int reverse_shift,
int32_t* output_inv_sqrt,
int* output_shift) {
TFLITE_DCHECK_GE(input, 0);
if (input <= 1) {
@@ -565,7 +616,7 @@ inline void GetInvSqrtQuantizedMultiplierExp(int32 input, int reverse_shift,
++*output_shift;
}
const unsigned max_left_shift_bits =
CountLeadingZeros(static_cast<uint32>(input)) - 1;
CountLeadingZeros(static_cast<uint32_t>(input)) - 1;
const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2;
const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1;
*output_shift -= left_shift_bit_pairs;
@@ -577,8 +628,8 @@ inline void GetInvSqrtQuantizedMultiplierExp(int32 input, int reverse_shift,
using gemmlowp::SaturatingRoundingMultiplyByPOT;
// Using 3 integer bits gives us enough room for the internal arithmetic in
// this Newton-Raphson iteration.
using F3 = FixedPoint<int32, 3>;
using F0 = FixedPoint<int32, 0>;
using F3 = FixedPoint<int32_t, 3>;
using F0 = FixedPoint<int32_t, 0>;
const F3 fixedpoint_input = F3::FromRaw(input >> 1);
const F3 fixedpoint_half_input =
SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input);
@@ -645,6 +696,13 @@ inline int SubscriptToIndex(const NdArrayDesc<5>& desc, int indexes[5]) {
indexes[4] * desc.strides[4];
}

inline int SubscriptToIndex(const NdArrayDesc<8>& desc, int indexes[8]) {
return indexes[0] * desc.strides[0] + indexes[1] * desc.strides[1] +
indexes[2] * desc.strides[2] + indexes[3] * desc.strides[3] +
indexes[4] * desc.strides[4] + indexes[5] * desc.strides[5] +
indexes[6] * desc.strides[6] + indexes[7] * desc.strides[7];
}
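As a plain illustration of the stride arithmetic the new 8-D SubscriptToIndex overload performs, here is a hand-worked 4-D case; the shape and index values are made up and not taken from the commit:

// Hypothetical 2x4x4x3 NHWC tensor: each stride is the product of the trailing dims.
const int strides[4] = {4 * 4 * 3, 4 * 3, 3, 1};
const int indexes[4] = {1, 2, 3, 2};
const int flat_index = indexes[0] * strides[0] + indexes[1] * strides[1] +
                       indexes[2] * strides[2] + indexes[3] * strides[3];  // 48 + 24 + 9 + 2 = 83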
// Given the dimensions of the operands for an element-wise binary broadcast,
// adjusts them so that they can be directly iterated over with simple loops.
// Returns the adjusted dims as instances of NdArrayDesc in 'desc0_out' and

@@ -76,13 +76,15 @@ limitations under the License.
#define TFLITE_CHECK_LT(x, y) ((x) < (y)) ? (void)0 : TFLITE_ABORT
#endif

// TODO(ahentz): Clean up.
#ifndef TF_LITE_STATIC_MEMORY
// TODO(b/162019032): Consider removing these type-aliases.
using int8 = std::int8_t;
using uint8 = std::uint8_t;
using int16 = std::int16_t;
using uint16 = std::uint16_t;
using int32 = std::int32_t;
using uint32 = std::uint32_t;
#endif  // !defined(TF_LITE_STATIC_MEMORY)

// TFLITE_DEPRECATED()
//

@@ -19,8 +19,9 @@ limitations under the License.

namespace tflite {

#if defined(TF_LITE_USE_GLOBAL_CMATH_FUNCTIONS) || \
(defined(__ANDROID__) && !defined(__NDK_MAJOR__)) || defined(ARDUINO)
#if defined(TF_LITE_USE_GLOBAL_CMATH_FUNCTIONS) || \
(defined(__ANDROID__) && !defined(__NDK_MAJOR__)) || defined(ARDUINO) || \
defined(__ZEPHYR__)
#define TF_LITE_GLOBAL_STD_PREFIX
#else
#define TF_LITE_GLOBAL_STD_PREFIX std
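The hunk above adds __ZEPHYR__ to the platforms that use the global C math functions instead of the std:: ones. As a hedged sketch of how such a prefix macro is typically consumed (the wrapper name here is illustrative, not necessarily the exact code in this file):

// With TF_LITE_GLOBAL_STD_PREFIX empty this expands to ::round(x);
// with it defined as std it expands to std::round(x).
template <class T>
inline T TfLiteRoundSketch(const T x) {
  return TF_LITE_GLOBAL_STD_PREFIX::round(x);
}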
code/lib/tfmicro/tensorflow/lite/kernels/internal/max.h (new file, 35 lines)
@@ -0,0 +1,35 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_MAX_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_MAX_H_

#include <cmath>

namespace tflite {

#if defined(TF_LITE_USE_GLOBAL_MAX) || defined(__ZEPHYR__)
inline float TfLiteMax(const float& x, const float& y) {
return std::max(x, y);
}
#else
template <class T>
inline T TfLiteMax(const T& x, const T& y) {
return std::fmax(x, y);
}
#endif

}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_MAX_H_
code/lib/tfmicro/tensorflow/lite/kernels/internal/min.h (new file, 35 lines)
@@ -0,0 +1,35 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_MIN_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_MIN_H_

#include <cmath>

namespace tflite {

#if defined(TF_LITE_USE_GLOBAL_MIN) || defined(__ZEPHYR__)
inline float TfLiteMin(const float& x, const float& y) {
return std::min(x, y);
}
#else
template <class T>
inline T TfLiteMin(const T& x, const T& y) {
return std::fmin(x, y);
}
#endif

}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_MIN_H_
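For context, a small usage sketch of the two new headers; the variable names are illustrative. On Zephyr (or with the TF_LITE_USE_GLOBAL_* switches) the float overloads backed by std::max/std::min are used, otherwise the templated std::fmax/std::fmin versions apply:

// Clamp a value to [activation_min, activation_max].
float clamped =
    tflite::TfLiteMin(tflite::TfLiteMax(value, activation_min), activation_max);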
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_H_
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_PORTABLE_TENSOR_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_PORTABLE_TENSOR_H_

#include <complex>
#include <vector>
@@ -21,7 +21,6 @@ limitations under the License.
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/internal/types.h"
#include "tensorflow/lite/string_util.h"

namespace tflite {

@@ -76,12 +75,12 @@ class VectorOfTensors {

// A list of quantized tensors in a format that can be used by kernels like
// split and concatenation.
class VectorOfQuantizedTensors : public VectorOfTensors<uint8> {
class VectorOfQuantizedTensors : public VectorOfTensors<uint8_t> {
public:
// Build with the tensors in 'tensor_list'.
VectorOfQuantizedTensors(const TfLiteContext& context,
const TfLiteIntArray& tensor_list)
: VectorOfTensors<uint8>(context, tensor_list) {
: VectorOfTensors<uint8_t>(context, tensor_list) {
for (int i = 0; i < tensor_list.size; ++i) {
TfLiteTensor* t = &context.tensors[tensor_list.data[i]];
zero_point_.push_back(t->params.zero_point);
@@ -90,10 +89,10 @@ class VectorOfQuantizedTensors : public VectorOfTensors<uint8> {
}

const float* scale() const { return scale_.data(); }
const int32* zero_point() const { return zero_point_.data(); }
const int32_t* zero_point() const { return zero_point_.data(); }

private:
std::vector<int32> zero_point_;
std::vector<int32_t> zero_point_;
std::vector<float> scale_;
};

@@ -119,26 +118,6 @@ class SequentialTensorWriter {
T* output_ptr_;
};

template <>
class SequentialTensorWriter<string> {
public:
SequentialTensorWriter(const TfLiteTensor* input, TfLiteTensor* output)
: input_(input), output_(output) {}
~SequentialTensorWriter() { buffer_.WriteToTensor(output_, nullptr); }

void Write(int position) { this->WriteN(position, 1); }
void WriteN(int position, int len) {
for (int i = 0; i < len; i++) {
buffer_.AddString(GetString(input_, position + i));
}
}

private:
const TfLiteTensor* input_;
TfLiteTensor* output_;
DynamicBuffer buffer_;
};

} // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_H_
#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_PORTABLE_TENSOR_H_
@@ -342,13 +342,13 @@ void NudgeQuantizationRange(const float min, const float max,
const float quant_max_float = static_cast<float>(quant_max);
*nudged_scale = (max - min) / (quant_max_float - quant_min_float);
const float zero_point_from_min = quant_min_float - min / *nudged_scale;
uint16 nudged_zero_point;
uint16_t nudged_zero_point;
if (zero_point_from_min < quant_min_float) {
nudged_zero_point = static_cast<uint16>(quant_min);
nudged_zero_point = static_cast<uint16_t>(quant_min);
} else if (zero_point_from_min > quant_max_float) {
nudged_zero_point = static_cast<uint16>(quant_max);
nudged_zero_point = static_cast<uint16_t>(quant_max);
} else {
nudged_zero_point = static_cast<uint16>(TfLiteRound(zero_point_from_min));
nudged_zero_point = static_cast<uint16_t>(TfLiteRound(zero_point_from_min));
}
*nudged_min = (quant_min_float - nudged_zero_point) * (*nudged_scale);
*nudged_max = (quant_max_float - nudged_zero_point) * (*nudged_scale);
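To make the nudging arithmetic above concrete, a hand-worked example with made-up numbers (not from the commit):

// min = -1.0f, max = 2.0f, quant_min = 0, quant_max = 255:
//   *nudged_scale       = (2.0 - (-1.0)) / (255 - 0) = 3.0 / 255 ≈ 0.011765
//   zero_point_from_min = 0 - (-1.0 / 0.011765)      = 85.0, inside [0, 255]
//   nudged_zero_point   = round(85.0) = 85
//   *nudged_min         = (0 - 85) * scale   ≈ -1.0
//   *nudged_max         = (255 - 85) * scale ≈  2.0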
@@ -51,34 +51,39 @@ inline void Add(const ArithmeticParams& params,

// Element-wise add that can often be used for inner loop of broadcast add as
// well as the non-broadcast add.

// This function is used for 8-bit as well as for 16-bit, but the accumulator
// is 32-bit for both cases. The overflow does not happen due to the
// choice of the shift (20 or 15, accordingly - see add.cc for more comments).
template <typename T>
inline void AddElementwise(int size, const ArithmeticParams& params,
const uint8* input1_data, const uint8* input2_data,
uint8* output_data) {
TFLITE_DCHECK_GT(params.input1_offset, -256);
TFLITE_DCHECK_GT(params.input2_offset, -256);
TFLITE_DCHECK_LT(params.input1_offset, 256);
TFLITE_DCHECK_LT(params.input2_offset, 256);
const T* input1_data, const T* input2_data,
T* output_data) {
TFLITE_DCHECK_GT(params.input1_offset, -std::numeric_limits<T>::max());
TFLITE_DCHECK_GT(params.input2_offset, -std::numeric_limits<T>::max());
TFLITE_DCHECK_LT(params.input1_offset, std::numeric_limits<T>::max());
TFLITE_DCHECK_LT(params.input2_offset, std::numeric_limits<T>::max());

for (int i = 0; i < size; ++i) {
const int32 input1_val = params.input1_offset + input1_data[i];
const int32 input2_val = params.input2_offset + input2_data[i];
const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
const int32 scaled_input1_val =
const int32_t input1_val = params.input1_offset + input1_data[i];
const int32_t input2_val = params.input2_offset + input2_data[i];
const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
const int32_t scaled_input1_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input1_val, params.input1_multiplier, params.input1_shift);
const int32 scaled_input2_val =
const int32_t scaled_input2_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input2_val, params.input2_multiplier, params.input2_shift);
const int32 raw_sum = scaled_input1_val + scaled_input2_val;
const int32 raw_output =
const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
const int32_t raw_output =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
raw_sum, params.output_multiplier, params.output_shift) +
params.output_offset;
const int32 clamped_output =
const int32_t clamped_output =
std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, raw_output));
output_data[i] = static_cast<uint8>(clamped_output);
output_data[i] = static_cast<T>(clamped_output);
}
}
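A plain-float sketch of the dequantize-add-requantize pipeline that the templated AddElementwise implements in fixed point; every scale, zero point and the helper name below are made-up assumptions for illustration, not values from the commit:

#include <algorithm>
#include <cmath>
#include <cstdint>

int32_t AddOneElementSketch(int32_t q1, int32_t q2) {
  const float scale1 = 0.05f, scale2 = 0.1f, scale_out = 0.2f;  // hypothetical scales
  const int32_t zp1 = 128, zp2 = 120, zp_out = 127;             // hypothetical zero points
  const float real_sum = scale1 * (q1 - zp1) + scale2 * (q2 - zp2);  // dequantize + add
  const int32_t q_out =
      zp_out + static_cast<int32_t>(std::lround(real_sum / scale_out));  // requantize
  return std::min<int32_t>(255, std::max<int32_t>(0, q_out));  // clamp to uint8 range
}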
@@ -86,40 +91,40 @@ inline void AddElementwise(int size, const ArithmeticParams& params,
// broadcast add, so that, for example, scalar-broadcast with batch will still
// be fast.
inline void AddScalarBroadcast(int size, const ArithmeticParams& params,
uint8 input1_data, const uint8* input2_data,
uint8* output_data) {
uint8_t input1_data, const uint8_t* input2_data,
uint8_t* output_data) {
TFLITE_DCHECK_GT(params.input1_offset, -256);
TFLITE_DCHECK_GT(params.input2_offset, -256);
TFLITE_DCHECK_LT(params.input1_offset, 256);
TFLITE_DCHECK_LT(params.input2_offset, 256);

const int32 input1_val = params.input1_offset + input1_data;
const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
const int32 scaled_input1_val =
const int32_t input1_val = params.input1_offset + input1_data;
const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
const int32_t scaled_input1_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input1_val, params.input1_multiplier, params.input1_shift);
for (int i = 0; i < size; ++i) {
const int32 input2_val = params.input2_offset + input2_data[i];
const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
const int32 scaled_input2_val =
const int32_t input2_val = params.input2_offset + input2_data[i];
const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
const int32_t scaled_input2_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input2_val, params.input2_multiplier, params.input2_shift);
const int32 raw_sum = scaled_input1_val + scaled_input2_val;
const int32 raw_output =
const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
const int32_t raw_output =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
raw_sum, params.output_multiplier, params.output_shift) +
params.output_offset;
const int32 clamped_output =
const int32_t clamped_output =
std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, raw_output));
output_data[i] = static_cast<uint8>(clamped_output);
output_data[i] = static_cast<uint8_t>(clamped_output);
}
}

inline void Add(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const uint8* input1_data,
const RuntimeShape& input2_shape, const uint8* input2_data,
const RuntimeShape& output_shape, uint8* output_data) {
const RuntimeShape& input1_shape, const uint8_t* input1_data,
const RuntimeShape& input2_shape, const uint8_t* input2_data,
const RuntimeShape& output_shape, uint8_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
const int flat_size =
@@ -132,24 +137,53 @@ inline void Add(const ArithmeticParams& params,
AddElementwise(flat_size, params, input1_data, input2_data, output_data);
}

inline void AddGeneralParamScale(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const int16_t* input1_data,
const RuntimeShape& input2_shape,
const int16_t* input2_data,
const RuntimeShape& output_shape,
int16_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);

int max_value = std::numeric_limits<int16_t>::max();

TFLITE_DCHECK_GT(params.input1_offset, -max_value);
TFLITE_DCHECK_GT(params.input2_offset, -max_value);
TFLITE_DCHECK_LT(params.input1_offset, max_value);
TFLITE_DCHECK_LT(params.input2_offset, max_value);
AddElementwise(flat_size, params, input1_data, input2_data, output_data);
}

inline void Add(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const int16* input1_data,
const RuntimeShape& input2_shape, const int16* input2_data,
const RuntimeShape& output_shape, int16* output_data) {
const RuntimeShape& input1_shape, const int16_t* input1_data,
const RuntimeShape& input2_shape, const int16_t* input2_data,
const RuntimeShape& output_shape, int16_t* output_data,
bool pot_scale = true) {
if (!pot_scale) {
AddGeneralParamScale(params, input1_shape, input1_data, input2_shape,
input2_data, output_shape, output_data);
return;
}

TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);

const int input1_shift = params.input1_shift;
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
const int16 output_activation_min = params.quantized_activation_min;
const int16 output_activation_max = params.quantized_activation_max;
const int16_t output_activation_min = params.quantized_activation_min;
const int16_t output_activation_max = params.quantized_activation_max;

TFLITE_DCHECK(input1_shift == 0 || params.input2_shift == 0);
TFLITE_DCHECK_LE(input1_shift, 0);
TFLITE_DCHECK_LE(params.input2_shift, 0);
const int16* not_shift_input = input1_shift == 0 ? input1_data : input2_data;
const int16* shift_input = input1_shift == 0 ? input2_data : input1_data;
const int16_t* not_shift_input =
input1_shift == 0 ? input1_data : input2_data;
const int16_t* shift_input = input1_shift == 0 ? input2_data : input1_data;
const int input_right_shift =
input1_shift == 0 ? -params.input2_shift : -input1_shift;

@@ -161,8 +195,8 @@ inline void Add(const ArithmeticParams& params,
F0 scaled_input = F0::FromRaw(
gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift));
F0 result = gemmlowp::SaturatingAdd(scaled_input, input_ready_scaled);
const int16 raw_output = result.raw();
const int16 clamped_output = std::min(
const int16_t raw_output = result.raw();
const int16_t clamped_output = std::min(
output_activation_max, std::max(output_activation_min, raw_output));
output_data[i] = clamped_output;
}
@@ -218,11 +252,11 @@ inline void BroadcastAdd4DSlow(const ArithmeticParams& params,

inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const int32* input1_data,
const int32_t* input1_data,
const RuntimeShape& input2_shape,
const int32* input2_data,
const int32_t* input2_data,
const RuntimeShape& output_shape,
int32* output_data) {
int32_t* output_data) {
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
@@ -257,13 +291,14 @@ inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
}
}

inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const uint8* input1_data,
const RuntimeShape& input2_shape,
const uint8* input2_data,
const RuntimeShape& output_shape,
uint8* output_data) {
// This function is used for 8-bit as well as for 16-bit, but the accumulator
// is 32-bit for both cases. The overflow does not happen due to the
// choice of the shift (20 or 15, accordingly - see add.cc for more comments).
template <typename T>
inline void BroadcastAdd4DSlow(
const ArithmeticParams& params, const RuntimeShape& input1_shape,
const T* input1_data, const RuntimeShape& input2_shape,
const T* input2_data, const RuntimeShape& output_shape, T* output_data) {
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
@@ -286,34 +321,34 @@ inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
const int32 input1_val =
const int32_t input1_val =
params.input1_offset +
input1_data[SubscriptToIndex(desc1, b, y, x, c)];
const int32 input2_val =
const int32_t input2_val =
params.input2_offset +
input2_data[SubscriptToIndex(desc2, b, y, x, c)];
const int32 shifted_input1_val =
const int32_t shifted_input1_val =
input1_val * (1 << params.left_shift);
const int32 shifted_input2_val =
const int32_t shifted_input2_val =
input2_val * (1 << params.left_shift);
const int32 scaled_input1_val =
const int32_t scaled_input1_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input1_val, params.input1_multiplier,
params.input1_shift);
const int32 scaled_input2_val =
const int32_t scaled_input2_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input2_val, params.input2_multiplier,
params.input2_shift);
const int32 raw_sum = scaled_input1_val + scaled_input2_val;
const int32 raw_output =
const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
const int32_t raw_output =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
raw_sum, params.output_multiplier, params.output_shift) +
params.output_offset;
const int32 clamped_output =
const int32_t clamped_output =
std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, raw_output));
output_data[Offset(extended_output_shape, b, y, x, c)] =
static_cast<uint8>(clamped_output);
static_cast<T>(clamped_output);
}
}
}
@@ -322,11 +357,11 @@ inline void BroadcastAdd4DSlow(const ArithmeticParams& params,

inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
const RuntimeShape& unswitched_input1_shape,
const uint8* unswitched_input1_data,
const uint8_t* unswitched_input1_data,
const RuntimeShape& unswitched_input2_shape,
const uint8* unswitched_input2_data,
const uint8_t* unswitched_input2_data,
const RuntimeShape& output_shape,
uint8* output_data) {
uint8_t* output_data) {
ArithmeticParams switched_params = unswitched_params;
switched_params.input1_offset = unswitched_params.input2_offset;
switched_params.input1_multiplier = unswitched_params.input2_multiplier;
@@ -341,18 +376,18 @@ inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,

const ArithmeticParams& params =
use_unswitched ? unswitched_params : switched_params;
const uint8* input1_data =
const uint8_t* input1_data =
use_unswitched ? unswitched_input1_data : unswitched_input2_data;
const uint8* input2_data =
const uint8_t* input2_data =
use_unswitched ? unswitched_input2_data : unswitched_input1_data;

// Fivefold nested loops. The second input resets its position for each
// iteration of the second loop. The first input resets its position at the
// beginning of the fourth loop. The innermost loop is an elementwise add of
// sections of the arrays.
uint8* output_data_ptr = output_data;
const uint8* input1_data_ptr = input1_data;
const uint8* input2_data_reset = input2_data;
uint8_t* output_data_ptr = output_data;
const uint8_t* input1_data_ptr = input1_data;
const uint8_t* input2_data_reset = input2_data;
// In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared
// between input shapes. y3 for input 1 is always broadcast, and so the
// dimension there is 1, whereas optionally y1 might be broadcast for input 2.
@@ -368,7 +403,7 @@ inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
// General fivefold pattern, with y4 > 1 so there is a non-broadcast inner
// dimension.
for (int i0 = 0; i0 < y0; ++i0) {
const uint8* input2_data_ptr;
const uint8_t* input2_data_ptr;
for (int i1 = 0; i1 < y1; ++i1) {
input2_data_ptr = input2_data_reset;
for (int i2 = 0; i2 < y2; ++i2) {
@@ -397,7 +432,7 @@ inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
// for y4 == 1 and the loop over y3 is contained within the
// AddScalarBroadcast function.
for (int i0 = 0; i0 < y0; ++i0) {
const uint8* input2_data_ptr;
const uint8_t* input2_data_ptr;
for (int i1 = 0; i1 < y1; ++i1) {
input2_data_ptr = input2_data_reset;
for (int i2 = 0; i2 < y2; ++i2) {
@@ -18,7 +18,6 @@ limitations under the License.
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/types.h"
#include "tensorflow/lite/string_util.h"

namespace tflite {

@@ -51,18 +50,6 @@ inline bool LessEqualFn(T lhs, T rhs) {
return lhs <= rhs;
}

inline bool StringRefEqualFn(const StringRef& lhs, const StringRef& rhs) {
if (lhs.len != rhs.len) return false;
for (int i = 0; i < lhs.len; ++i) {
if (lhs.str[i] != rhs.str[i]) return false;
}
return true;
}

inline bool StringRefNotEqualFn(const StringRef& lhs, const StringRef& rhs) {
return !StringRefEqualFn(lhs, rhs);
}

template <typename T>
using ComparisonFn = bool (*)(T, T);

@@ -78,22 +65,6 @@ inline void ComparisonImpl(
}
}

template <bool (*F)(const StringRef&, const StringRef&)>
inline void ComparisonStringImpl(const RuntimeShape& input1_shape,
const TfLiteTensor* input1,
const RuntimeShape& input2_shape,
const TfLiteTensor* input2,
const RuntimeShape& output_shape,
bool* output_data) {
const int64_t flatsize =
MatchingFlatSize(input1_shape, input2_shape, output_shape);
for (int64_t i = 0; i < flatsize; ++i) {
const auto lhs = GetString(input1, i);
const auto rhs = GetString(input2, i);
output_data[i] = F(lhs, rhs);
}
}

template <ComparisonFn<float> F>
inline void Comparison(const ComparisonParams& op_params,
const RuntimeShape& input1_shape,
@@ -105,30 +76,30 @@ inline void Comparison(const ComparisonParams& op_params,
input2_data, output_shape, output_data);
}

template <typename T, ComparisonFn<int32> F>
template <typename T, ComparisonFn<int32_t> F>
inline void ComparisonWithScaling(
const ComparisonParams& op_params, const RuntimeShape& input1_shape,
const T* input1_data, const RuntimeShape& input2_shape,
const T* input2_data, const RuntimeShape& output_shape, bool* output_data) {
int left_shift = op_params.left_shift;
int32 input1_offset = op_params.input1_offset;
int32 input1_multiplier = op_params.input1_multiplier;
int32_t input1_offset = op_params.input1_offset;
int32_t input1_multiplier = op_params.input1_multiplier;
int input1_shift = op_params.input1_shift;
int32 input2_offset = op_params.input2_offset;
int32 input2_multiplier = op_params.input2_multiplier;
int32_t input2_offset = op_params.input2_offset;
int32_t input2_multiplier = op_params.input2_multiplier;
int input2_shift = op_params.input2_shift;

const int64_t flatsize =
MatchingFlatSize(input1_shape, input2_shape, output_shape);
for (int64_t i = 0; i < flatsize; ++i) {
const int32 input1_val = input1_offset + input1_data[i];
const int32 input2_val = input2_offset + input2_data[i];
const int32 shifted_input1_val = input1_val * (1 << left_shift);
const int32 shifted_input2_val = input2_val * (1 << left_shift);
const int32 scaled_input1_val =
const int32_t input1_val = input1_offset + input1_data[i];
const int32_t input2_val = input2_offset + input2_data[i];
const int32_t shifted_input1_val = input1_val * (1 << left_shift);
const int32_t shifted_input2_val = input2_val * (1 << left_shift);
const int32_t scaled_input1_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input1_val, input1_multiplier, input1_shift);
const int32 scaled_input2_val =
const int32_t scaled_input2_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input2_val, input2_multiplier, input2_shift);
output_data[i] = F(scaled_input1_val, scaled_input2_val);
@@ -180,31 +151,6 @@ inline void BroadcastComparison4DSlowImpl(
}
}

template <bool (*F)(const StringRef&, const StringRef&)>
inline void BroadcastComparison4DSlowStringImpl(
const RuntimeShape& unextended_input1_shape, const TfLiteTensor* input1,
const RuntimeShape& unextended_input2_shape, const TfLiteTensor* input2,
const RuntimeShape& unextended_output_shape, bool* output_data) {
const BroadcastComparison4DSlowCommon dims =
BroadcastComparison4DSlowPreprocess(unextended_input1_shape,
unextended_input2_shape,
unextended_output_shape);

for (int b = 0; b < dims.output_shape.Dims(0); ++b) {
for (int y = 0; y < dims.output_shape.Dims(1); ++y) {
for (int x = 0; x < dims.output_shape.Dims(2); ++x) {
for (int c = 0; c < dims.output_shape.Dims(3); ++c) {
const auto lhs =
GetString(input1, SubscriptToIndex(dims.desc1, b, y, x, c));
const auto rhs =
GetString(input2, SubscriptToIndex(dims.desc2, b, y, x, c));
output_data[Offset(dims.output_shape, b, y, x, c)] = F(lhs, rhs);
}
}
}
}
}

template <ComparisonFn<float> F>
inline void BroadcastComparison4DSlow(const ComparisonParams& op_params,
const RuntimeShape& input1_shape,
@@ -218,7 +164,7 @@ inline void BroadcastComparison4DSlow(const ComparisonParams& op_params,
output_shape, output_data);
}

template <typename T, ComparisonFn<int32> F>
template <typename T, ComparisonFn<int32_t> F>
inline void BroadcastComparison4DSlowWithScaling(
const ComparisonParams& op_params,
const RuntimeShape& unextended_input1_shape, const T* input1_data,
@@ -230,29 +176,29 @@ inline void BroadcastComparison4DSlowWithScaling(
unextended_output_shape);

int left_shift = op_params.left_shift;
int32 input1_offset = op_params.input1_offset;
int32 input1_multiplier = op_params.input1_multiplier;
int32_t input1_offset = op_params.input1_offset;
int32_t input1_multiplier = op_params.input1_multiplier;
int input1_shift = op_params.input1_shift;
int32 input2_offset = op_params.input2_offset;
int32 input2_multiplier = op_params.input2_multiplier;
int32_t input2_offset = op_params.input2_offset;
int32_t input2_multiplier = op_params.input2_multiplier;
int input2_shift = op_params.input2_shift;

for (int b = 0; b < dims.output_shape.Dims(0); ++b) {
for (int y = 0; y < dims.output_shape.Dims(1); ++y) {
for (int x = 0; x < dims.output_shape.Dims(2); ++x) {
for (int c = 0; c < dims.output_shape.Dims(3); ++c) {
const int32 input1_val =
const int32_t input1_val =
input1_offset +
input1_data[SubscriptToIndex(dims.desc1, b, y, x, c)];
const int32 input2_val =
const int32_t input2_val =
input2_offset +
input2_data[SubscriptToIndex(dims.desc2, b, y, x, c)];
const int32 shifted_input1_val = input1_val * (1 << left_shift);
const int32 shifted_input2_val = input2_val * (1 << left_shift);
const int32 scaled_input1_val =
const int32_t shifted_input1_val = input1_val * (1 << left_shift);
const int32_t shifted_input2_val = input2_val * (1 << left_shift);
const int32_t scaled_input1_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input1_val, input1_multiplier, input1_shift);
const int32 scaled_input2_val =
const int32_t scaled_input2_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input2_val, input2_multiplier, input2_shift);
output_data[Offset(dims.output_shape, b, y, x, c)] =
@@ -74,14 +74,14 @@ inline void Concatenation(const ConcatenationParams& params,
// when optimizng this routine further.
inline void ConcatenationWithScaling(const ConcatenationParams& params,
const RuntimeShape* const* input_shapes,
const uint8* const* input_data,
const uint8_t* const* input_data,
const RuntimeShape& output_shape,
uint8* output_data) {
uint8_t* output_data) {
int axis = params.axis;
const int32* input_zeropoint = params.input_zeropoint;
const int32_t* input_zeropoint = params.input_zeropoint;
const float* input_scale = params.input_scale;
int inputs_count = params.inputs_count;
const int32 output_zeropoint = params.output_zeropoint;
const int32_t output_zeropoint = params.output_zeropoint;
const float output_scale = params.output_scale;

const int concat_dimensions = output_shape.DimensionsCount();
@@ -110,11 +110,11 @@ inline void ConcatenationWithScaling(const ConcatenationParams& params,
}

const float inverse_output_scale = 1.f / output_scale;
uint8* output_ptr = output_data;
uint8_t* output_ptr = output_data;
for (int k = 0; k < outer_size; k++) {
for (int i = 0; i < inputs_count; ++i) {
const int copy_size = input_shapes[i]->Dims(axis) * base_inner_size;
const uint8* input_ptr = input_data[i] + k * copy_size;
const uint8_t* input_ptr = input_data[i] + k * copy_size;
if (input_zeropoint[i] == output_zeropoint &&
input_scale[i] == output_scale) {
memcpy(output_ptr, input_ptr, copy_size);
@@ -59,28 +59,31 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
const int output_width = output_shape.Dims(2);
for (int batch = 0; batch < batches; ++batch) {
for (int out_y = 0; out_y < output_height; ++out_y) {
const int in_y_origin = (out_y * stride_height) - pad_height;
for (int out_x = 0; out_x < output_width; ++out_x) {
const int in_x_origin = (out_x * stride_width) - pad_width;
for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
const int in_x_origin = (out_x * stride_width) - pad_width;
const int in_y_origin = (out_y * stride_height) - pad_height;
float total = 0.f;
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
const int in_y = in_y_origin + dilation_height_factor * filter_y;
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
const int in_x = in_x_origin + dilation_width_factor * filter_x;

// Zero padding by omitting the areas outside the image.
const bool is_point_inside_image =
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
(in_y < input_height);

if (!is_point_inside_image) {
continue;
}

for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
const int in_x = in_x_origin + dilation_width_factor * filter_x;
const int in_y =
in_y_origin + dilation_height_factor * filter_y;
// If the location is outside the bounds of the input image,
// use zero as a default value.
if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
(in_y < input_height)) {
float input_value = input_data[Offset(
input_shape, batch, in_y, in_x, in_channel)];
float filter_value =
filter_data[Offset(filter_shape, out_channel, filter_y,
filter_x, in_channel)];
total += (input_value * filter_value);
}
float input_value = input_data[Offset(input_shape, batch, in_y,
in_x, in_channel)];
float filter_value = filter_data[Offset(
filter_shape, out_channel, filter_y, filter_x, in_channel)];
total += (input_value * filter_value);
}
}
}
@@ -99,11 +102,11 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
}

inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
const uint8* input_data, const RuntimeShape& filter_shape,
const uint8* filter_data, const RuntimeShape& bias_shape,
const int32* bias_data, const RuntimeShape& output_shape,
uint8* output_data, const RuntimeShape& im2col_shape,
uint8* im2col_data, void* cpu_backend_context) {
const uint8_t* input_data, const RuntimeShape& filter_shape,
const uint8_t* filter_data, const RuntimeShape& bias_shape,
const int32_t* bias_data, const RuntimeShape& output_shape,
uint8_t* output_data, const RuntimeShape& im2col_shape,
uint8_t* im2col_data, void* cpu_backend_context) {
(void)cpu_backend_context; // only used in optimized code.
(void)im2col_data; // only used in optimized code.
(void)im2col_shape; // only used in optimized code.
@@ -113,13 +116,13 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
const int dilation_height_factor = params.dilation_height_factor;
const int pad_width = params.padding_values.width;
const int pad_height = params.padding_values.height;
const int32 input_offset = params.input_offset;
const int32 filter_offset = params.weights_offset;
const int32 output_offset = params.output_offset;
const int32 output_multiplier = params.output_multiplier;
const int32_t input_offset = params.input_offset;
const int32_t filter_offset = params.weights_offset;
const int32_t output_offset = params.output_offset;
const int32_t output_multiplier = params.output_multiplier;
const int output_shift = params.output_shift;
const int32 output_activation_min = params.quantized_activation_min;
const int32 output_activation_max = params.quantized_activation_max;
const int32_t output_activation_min = params.quantized_activation_min;
const int32_t output_activation_max = params.quantized_activation_max;
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);

TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
@@ -139,29 +142,32 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
const int output_width = output_shape.Dims(2);
for (int batch = 0; batch < batches; ++batch) {
for (int out_y = 0; out_y < output_height; ++out_y) {
const int in_y_origin = (out_y * stride_height) - pad_height;
for (int out_x = 0; out_x < output_width; ++out_x) {
const int in_x_origin = (out_x * stride_width) - pad_width;
for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
const int in_x_origin = (out_x * stride_width) - pad_width;
const int in_y_origin = (out_y * stride_height) - pad_height;
int32 acc = 0;
int32_t acc = 0;
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
const int in_y = in_y_origin + dilation_height_factor * filter_y;
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
const int in_x = in_x_origin + dilation_width_factor * filter_x;

// Zero padding by omitting the areas outside the image.
const bool is_point_inside_image =
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
(in_y < input_height);

if (!is_point_inside_image) {
continue;
}

for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
const int in_x = in_x_origin + dilation_width_factor * filter_x;
const int in_y =
in_y_origin + dilation_height_factor * filter_y;
// If the location is outside the bounds of the input image,
// use zero as a default value.
if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
(in_y < input_height)) {
int32 input_val = input_data[Offset(input_shape, batch, in_y,
int32_t input_val = input_data[Offset(input_shape, batch, in_y,
in_x, in_channel)];
int32 filter_val =
filter_data[Offset(filter_shape, out_channel, filter_y,
filter_x, in_channel)];
acc +=
(filter_val + filter_offset) * (input_val + input_offset);
}
int32_t filter_val = filter_data[Offset(
filter_shape, out_channel, filter_y, filter_x, in_channel)];
acc +=
(filter_val + filter_offset) * (input_val + input_offset);
}
}
}
@@ -174,7 +180,7 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
acc = std::max(acc, output_activation_min);
acc = std::min(acc, output_activation_max);
output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
static_cast<uint8>(acc);
static_cast<uint8_t>(acc);
}
}
}
@@ -220,7 +226,7 @@ inline void HybridConvPerChannel(
for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
const int in_x_origin = (out_x * stride_width) - pad_width;
const int in_y_origin = (out_y * stride_height) - pad_height;
int32 acc = 0;
int32_t acc = 0;
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
@@ -231,9 +237,9 @@ inline void HybridConvPerChannel(
// use zero as a default value.
if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
(in_y < input_height)) {
int32 input_val = input_data[Offset(input_shape, batch, in_y,
in_x, in_channel)];
int32 filter_val =
int32_t input_val = input_data[Offset(
input_shape, batch, in_y, in_x, in_channel)];
int32_t filter_val =
filter_data[Offset(filter_shape, out_channel, filter_y,
filter_x, in_channel)];
acc += filter_val * (input_val - input_offset[batch]);
@@ -258,5 +264,4 @@ inline void HybridConvPerChannel(
} // namespace reference_ops
} // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CONV_H_
@@ -62,21 +62,21 @@ namespace reference_ops {
|
||||
namespace depthwise_conv {
|
||||
|
||||
template <DepthwiseConvOutputRounding output_rounding>
|
||||
inline int32 DepthwiseConvRound(int32 x, int32 quantized_multiplier,
|
||||
int shift) {
|
||||
inline int32_t DepthwiseConvRound(int32_t x, int32_t quantized_multiplier,
|
||||
int shift) {
|
||||
TFLITE_DCHECK_NE(output_rounding, DepthwiseConvOutputRounding::kNone);
|
||||
return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline int32 DepthwiseConvRound<DepthwiseConvOutputRounding::kAwayFromZero>(
|
||||
int32 x, int32 quantized_multiplier, int shift) {
|
||||
inline int32_t DepthwiseConvRound<DepthwiseConvOutputRounding::kAwayFromZero>(
|
||||
int32_t x, int32_t quantized_multiplier, int shift) {
|
||||
return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline int32 DepthwiseConvRound<DepthwiseConvOutputRounding::kUpward>(
|
||||
int32 x, int32 quantized_multiplier, int shift) {
|
||||
inline int32_t DepthwiseConvRound<DepthwiseConvOutputRounding::kUpward>(
|
||||
int32_t x, int32_t quantized_multiplier, int shift) {
|
||||
using gemmlowp::SaturatingRoundingDoublingHighMul;
|
||||
const int left_shift = shift > 0 ? shift : 0;
|
||||
const int right_shift = shift > 0 ? 0 : -shift;
|
||||
@@ -89,13 +89,12 @@ inline int32 DepthwiseConvRound<DepthwiseConvOutputRounding::kUpward>(
|
||||
|
||||
template <DepthwiseConvOutputRounding output_rounding>
|
||||
struct DepthwiseConvBasicKernel {
|
||||
static inline void Run(const DepthwiseParams& params,
|
||||
const RuntimeShape& input_shape,
|
||||
const uint8* input_data,
|
||||
const RuntimeShape& filter_shape,
|
||||
const uint8* filter_data,
|
||||
const RuntimeShape& bias_shape, const int32* bias_data,
|
||||
const RuntimeShape& output_shape, uint8* output_data) {
|
||||
static inline void Run(
|
||||
const DepthwiseParams& params, const RuntimeShape& input_shape,
|
||||
const uint8_t* input_data, const RuntimeShape& filter_shape,
|
||||
const uint8_t* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32_t* bias_data, const RuntimeShape& output_shape,
|
||||
uint8_t* output_data) {
|
||||
const int stride_width = params.stride_width;
|
||||
const int stride_height = params.stride_height;
|
||||
const int dilation_width_factor = params.dilation_width_factor;
|
||||
@@ -103,12 +102,12 @@ struct DepthwiseConvBasicKernel {
|
||||
const int pad_width = params.padding_values.width;
|
||||
const int pad_height = params.padding_values.height;
|
||||
const int depth_multiplier = params.depth_multiplier;
|
||||
const int32 output_activation_min = params.quantized_activation_min;
|
||||
const int32 output_activation_max = params.quantized_activation_max;
|
||||
const int32 input_offset = params.input_offset;
|
||||
const int32 filter_offset = params.weights_offset;
|
||||
const int32 output_offset = params.output_offset;
|
||||
const int32 output_multiplier = params.output_multiplier;
|
||||
const int32_t output_activation_min = params.quantized_activation_min;
|
||||
const int32_t output_activation_max = params.quantized_activation_max;
|
||||
const int32_t input_offset = params.input_offset;
|
||||
const int32_t filter_offset = params.weights_offset;
|
||||
const int32_t output_offset = params.output_offset;
|
||||
const int32_t output_multiplier = params.output_multiplier;
|
||||
const int output_shift = params.output_shift;
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
|
||||
@@ -135,7 +134,7 @@ struct DepthwiseConvBasicKernel {
|
||||
const int oc = m + ic * depth_multiplier;
|
||||
const int in_x_origin = (out_x * stride_width) - pad_width;
|
||||
const int in_y_origin = (out_y * stride_height) - pad_height;
|
||||
int32 acc = 0;
|
||||
int32_t acc = 0;
|
||||
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
|
||||
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
|
||||
const int in_x =
|
||||
@@ -146,9 +145,9 @@ struct DepthwiseConvBasicKernel {
|
||||
// use zero as a default value.
|
||||
if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
|
||||
(in_y < input_height)) {
|
||||
int32 input_val =
|
||||
int32_t input_val =
|
||||
input_data[Offset(input_shape, b, in_y, in_x, ic)];
|
||||
int32 filter_val = filter_data[Offset(
|
||||
int32_t filter_val = filter_data[Offset(
|
||||
filter_shape, 0, filter_y, filter_x, oc)];
|
||||
acc += (filter_val + filter_offset) *
|
||||
(input_val + input_offset);
|
||||
@@ -164,7 +163,7 @@ struct DepthwiseConvBasicKernel {
|
||||
acc = std::max(acc, output_activation_min);
|
||||
acc = std::min(acc, output_activation_max);
|
||||
output_data[Offset(output_shape, b, out_y, out_x, oc)] =
|
||||
static_cast<uint8>(acc);
|
||||
static_cast<uint8_t>(acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -176,10 +175,10 @@ struct DepthwiseConvBasicKernel {
|
||||
// MultiplyByQuantizedMultiplier or DepthwiseConvRound function.
|
||||
static inline void RunPerChannel(
|
||||
const DepthwiseParams& params, const RuntimeShape& input_shape,
|
||||
const int8* input_data, const RuntimeShape& filter_shape,
|
||||
const int8* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32* bias_data, const RuntimeShape& output_shape,
|
||||
int8* output_data) {
|
||||
const int8_t* input_data, const RuntimeShape& filter_shape,
|
||||
const int8_t* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32_t* bias_data, const RuntimeShape& output_shape,
|
||||
int8_t* output_data) {
|
||||
// Get parameters.
|
||||
// TODO(b/141565753): Re-introduce ScopedProfilingLabel on Micro.
|
||||
const int stride_width = params.stride_width;
|
||||
@@ -189,12 +188,12 @@ struct DepthwiseConvBasicKernel {
|
||||
const int pad_width = params.padding_values.width;
|
||||
const int pad_height = params.padding_values.height;
|
||||
const int depth_multiplier = params.depth_multiplier;
|
||||
const int32 input_offset = params.input_offset;
|
||||
const int32 output_offset = params.output_offset;
|
||||
const int32 output_activation_min = params.quantized_activation_min;
|
||||
const int32 output_activation_max = params.quantized_activation_max;
|
||||
const int32* output_multiplier = params.output_multiplier_per_channel;
|
||||
const int32* output_shift = params.output_shift_per_channel;
|
||||
const int32_t input_offset = params.input_offset;
|
||||
const int32_t output_offset = params.output_offset;
|
||||
const int32_t output_activation_min = params.quantized_activation_min;
|
||||
const int32_t output_activation_max = params.quantized_activation_max;
|
||||
const int32_t* output_multiplier = params.output_multiplier_per_channel;
|
||||
const int32_t* output_shift = params.output_shift_per_channel;
|
||||
|
||||
// Check dimensions of the tensors.
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
@@ -222,7 +221,7 @@ struct DepthwiseConvBasicKernel {
|
||||
const int output_channel = m + in_channel * depth_multiplier;
|
||||
const int in_x_origin = (out_x * stride_width) - pad_width;
|
||||
const int in_y_origin = (out_y * stride_height) - pad_height;
|
||||
int32 acc = 0;
|
||||
int32_t acc = 0;
|
||||
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
|
||||
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
|
||||
const int in_x =
|
||||
@@ -234,17 +233,18 @@ struct DepthwiseConvBasicKernel {
|
||||
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
|
||||
(in_y < input_height);
|
||||
if (is_point_inside_image) {
|
||||
int32 input_val = input_data[Offset(
|
||||
int32_t input_val = input_data[Offset(
|
||||
input_shape, batch, in_y, in_x, in_channel)];
|
||||
int32 filter_val = filter_data[Offset(
|
||||
int32_t filter_val = filter_data[Offset(
|
||||
filter_shape, 0, filter_y, filter_x, output_channel)];
|
||||
// Accumulate with 32 bits accumulator.
|
||||
// In the nudging process during model quantization, we
|
||||
// force real value of 0.0 be represented by a quantized
|
||||
// value. This guarantees that the input_offset is a int8,
|
||||
// even though it is represented using int32. int32 += int8
|
||||
// * (int8 - int8) so the highest value we can get from each
|
||||
// accumulation is [-127, 127] * ([-128, 127] -
|
||||
// value. This guarantees that the input_offset is a int8_t,
|
||||
// even though it is represented using int32_t. int32_t +=
|
||||
// int8_t
|
||||
// * (int8_t - int8_t) so the highest value we can get from
|
||||
// each accumulation is [-127, 127] * ([-128, 127] -
|
||||
// [-128, 127]), which is [-32512, 32512]. log2(32512)
|
||||
// = 14.98, which means we can accumulate at least 2^16
|
||||
// multiplications without overflow. The accumulator is
|
||||
@@ -279,10 +279,10 @@ struct DepthwiseConvBasicKernel {
|
||||
|
||||
inline void DepthwiseConv(
|
||||
const DepthwiseParams& params, const RuntimeShape& input_shape,
|
||||
const uint8* input_data, const RuntimeShape& filter_shape,
|
||||
const uint8* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32* bias_data, const RuntimeShape& output_shape,
|
||||
uint8* output_data) {
|
||||
const uint8_t* input_data, const RuntimeShape& filter_shape,
|
||||
const uint8_t* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32_t* bias_data, const RuntimeShape& output_shape,
|
||||
uint8_t* output_data) {
|
||||
return depthwise_conv::DepthwiseConvBasicKernel<
|
||||
DepthwiseConvOutputRounding::kAwayFromZero>::Run(params, input_shape,
|
||||
input_data, filter_shape,
|
||||
|
||||
@@ -32,12 +32,12 @@ inline void Dequantize(const tflite::DequantizationParams& op_params,
|
||||
const RuntimeShape& input_shape,
|
||||
const InputT* input_data,
|
||||
const RuntimeShape& output_shape, OutputT* output_data) {
|
||||
int32 zero_point = op_params.zero_point;
|
||||
int32_t zero_point = op_params.zero_point;
|
||||
const double scale = op_params.scale;
|
||||
const int flat_size = MatchingFlatSize(input_shape, output_shape);
|
||||
|
||||
for (int i = 0; i < flat_size; i++) {
|
||||
const int32 val = input_data[i];
|
||||
const int32_t val = input_data[i];
|
||||
const OutputT result = static_cast<OutputT>(scale * (val - zero_point));
|
||||
output_data[i] = result;
|
||||
}
|
||||
@@ -52,11 +52,11 @@ inline void PerChannelDequantize(
|
||||
// Ensure flat size is same.
|
||||
MatchingFlatSize(input_shape, output_shape);
|
||||
|
||||
const int32* zero_point = op_params.zero_point;
|
||||
const int32_t* zero_point = op_params.zero_point;
|
||||
const float* scale = op_params.scale;
|
||||
const int32 quantized_dimension = op_params.quantized_dimension;
|
||||
const int32 num_dims = input_shape.DimensionsCount();
|
||||
const int32* dims_data = input_shape.DimsData();
|
||||
const int32_t quantized_dimension = op_params.quantized_dimension;
|
||||
const int32_t num_dims = input_shape.DimensionsCount();
|
||||
const int32_t* dims_data = input_shape.DimsData();
|
||||
std::vector<int> current_dim(num_dims, 0);
|
||||
|
||||
do {
|
||||
@@ -64,7 +64,7 @@ inline void PerChannelDequantize(
|
||||
ReducedOutputOffset(num_dims, reinterpret_cast<const int*>(dims_data),
|
||||
current_dim.data(), 0, nullptr);
|
||||
const int channel = current_dim[quantized_dimension];
|
||||
const int32 val = input_data[offset];
|
||||
const int32_t val = input_data[offset];
|
||||
const float result =
|
||||
static_cast<float>(scale[channel] * (val - zero_point[channel]));
|
||||
output_data[offset] = result;
|
||||
|
||||
@@ -61,17 +61,17 @@ inline void FullyConnected(
|
||||
|
||||
inline void FullyConnected(
|
||||
const FullyConnectedParams& params, const RuntimeShape& input_shape,
|
||||
const uint8* input_data, const RuntimeShape& filter_shape,
|
||||
const uint8* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32* bias_data, const RuntimeShape& output_shape,
|
||||
uint8* output_data) {
|
||||
const int32 input_offset = params.input_offset;
|
||||
const int32 filter_offset = params.weights_offset;
|
||||
const int32 output_offset = params.output_offset;
|
||||
const int32 output_multiplier = params.output_multiplier;
|
||||
const uint8_t* input_data, const RuntimeShape& filter_shape,
|
||||
const uint8_t* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32_t* bias_data, const RuntimeShape& output_shape,
|
||||
uint8_t* output_data) {
|
||||
const int32_t input_offset = params.input_offset;
|
||||
const int32_t filter_offset = params.weights_offset;
|
||||
const int32_t output_offset = params.output_offset;
|
||||
const int32_t output_multiplier = params.output_multiplier;
|
||||
const int output_shift = params.output_shift;
|
||||
const int32 output_activation_min = params.quantized_activation_min;
|
||||
const int32 output_activation_max = params.quantized_activation_max;
|
||||
const int32_t output_activation_min = params.quantized_activation_min;
|
||||
const int32_t output_activation_max = params.quantized_activation_max;
|
||||
TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
|
||||
TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
|
||||
|
||||
@@ -89,10 +89,10 @@ inline void FullyConnected(
|
||||
const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
|
||||
for (int b = 0; b < batches; ++b) {
|
||||
for (int out_c = 0; out_c < output_depth; ++out_c) {
|
||||
int32 acc = 0;
|
||||
int32_t acc = 0;
|
||||
for (int d = 0; d < accum_depth; ++d) {
|
||||
int32 input_val = input_data[b * accum_depth + d];
|
||||
int32 filter_val = filter_data[out_c * accum_depth + d];
|
||||
int32_t input_val = input_data[b * accum_depth + d];
|
||||
int32_t filter_val = filter_data[out_c * accum_depth + d];
|
||||
acc += (filter_val + filter_offset) * (input_val + input_offset);
|
||||
}
|
||||
if (bias_data) {
|
||||
@@ -102,24 +102,24 @@ inline void FullyConnected(
|
||||
acc += output_offset;
|
||||
acc = std::max(acc, output_activation_min);
|
||||
acc = std::min(acc, output_activation_max);
|
||||
output_data[out_c + output_depth * b] = static_cast<uint8>(acc);
|
||||
output_data[out_c + output_depth * b] = static_cast<uint8_t>(acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void FullyConnected(
|
||||
const FullyConnectedParams& params, const RuntimeShape& input_shape,
|
||||
const uint8* input_data, const RuntimeShape& filter_shape,
|
||||
const uint8* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32* bias_data, const RuntimeShape& output_shape,
|
||||
int16* output_data) {
|
||||
const int32 input_offset = params.input_offset;
|
||||
const int32 filter_offset = params.weights_offset;
|
||||
const int32 output_offset = params.output_offset;
|
||||
const int32 output_multiplier = params.output_multiplier;
|
||||
const uint8_t* input_data, const RuntimeShape& filter_shape,
|
||||
const uint8_t* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32_t* bias_data, const RuntimeShape& output_shape,
|
||||
int16_t* output_data) {
|
||||
const int32_t input_offset = params.input_offset;
|
||||
const int32_t filter_offset = params.weights_offset;
|
||||
const int32_t output_offset = params.output_offset;
|
||||
const int32_t output_multiplier = params.output_multiplier;
|
||||
const int output_shift = params.output_shift;
|
||||
const int32 output_activation_min = params.quantized_activation_min;
|
||||
const int32 output_activation_max = params.quantized_activation_max;
|
||||
const int32_t output_activation_min = params.quantized_activation_min;
|
||||
const int32_t output_activation_max = params.quantized_activation_max;
|
||||
|
||||
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
|
||||
TFLITE_DCHECK_EQ(output_offset, 0);
|
||||
@@ -138,20 +138,21 @@ inline void FullyConnected(
|
||||
for (int out_c = 0; out_c < output_depth; ++out_c) {
|
||||
// Internal accumulation.
|
||||
// Initialize accumulator with the bias-value.
|
||||
int32 accum = bias_data[out_c];
|
||||
int32_t accum = bias_data[out_c];
|
||||
// Accumulation loop.
|
||||
for (int d = 0; d < accum_depth; ++d) {
|
||||
int16 input_val = input_data[b * accum_depth + d] + input_offset;
|
||||
int16 filter_val = filter_data[out_c * accum_depth + d] + filter_offset;
|
||||
int16_t input_val = input_data[b * accum_depth + d] + input_offset;
|
||||
int16_t filter_val =
|
||||
filter_data[out_c * accum_depth + d] + filter_offset;
|
||||
accum += filter_val * input_val;
|
||||
}
|
||||
// Down-scale the final int32 accumulator to the scale used by our
|
||||
// Down-scale the final int32_t accumulator to the scale used by our
|
||||
// (16-bit, typically 3 integer bits) fixed-point format. The quantized
|
||||
// multiplier and shift here have been pre-computed offline
|
||||
// (e.g. by toco).
|
||||
accum =
|
||||
MultiplyByQuantizedMultiplier(accum, output_multiplier, output_shift);
|
||||
// Saturate, cast to int16, and store to output array.
|
||||
// Saturate, cast to int16_t, and store to output array.
|
||||
accum = std::max(accum, output_activation_min - output_offset);
|
||||
accum = std::min(accum, output_activation_max - output_offset);
|
||||
accum += output_offset;
|
||||
@@ -162,14 +163,14 @@ inline void FullyConnected(
|
||||
|
||||
inline void ShuffledFullyConnected(
|
||||
const FullyConnectedParams& params, const RuntimeShape& input_shape,
|
||||
const uint8* input_data, const RuntimeShape& weights_shape,
|
||||
const uint8* shuffled_weights_data, const RuntimeShape& bias_shape,
|
||||
const int32* bias_data, const RuntimeShape& output_shape,
|
||||
int16* output_data, uint8* shuffled_input_workspace_data) {
|
||||
const int32 output_multiplier = params.output_multiplier;
|
||||
const uint8_t* input_data, const RuntimeShape& weights_shape,
|
||||
const uint8_t* shuffled_weights_data, const RuntimeShape& bias_shape,
|
||||
const int32_t* bias_data, const RuntimeShape& output_shape,
|
||||
int16_t* output_data, uint8_t* shuffled_input_workspace_data) {
|
||||
const int32_t output_multiplier = params.output_multiplier;
|
||||
const int output_shift = params.output_shift;
|
||||
const int32 output_activation_min = params.quantized_activation_min;
|
||||
const int32 output_activation_max = params.quantized_activation_max;
|
||||
const int32_t output_activation_min = params.quantized_activation_min;
|
||||
const int32_t output_activation_max = params.quantized_activation_max;
|
||||
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
|
||||
|
||||
TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1);
|
||||
@@ -190,7 +191,7 @@ inline void ShuffledFullyConnected(
|
||||
TFLITE_DCHECK((output_depth % 4) == 0);
|
||||
|
||||
// Shuffling and xoring of input activations into the workspace buffer
|
||||
uint8* shuffled_input_workspace_ptr = shuffled_input_workspace_data;
|
||||
uint8_t* shuffled_input_workspace_ptr = shuffled_input_workspace_data;
|
||||
if (batches == 1) {
|
||||
for (int i = 0; i < accum_depth; i++) {
|
||||
shuffled_input_workspace_data[i] = input_data[i] ^ 0x80;
|
||||
@@ -198,13 +199,13 @@ inline void ShuffledFullyConnected(
|
||||
} else if (batches == 4) {
|
||||
for (int c = 0; c < accum_depth; c += 16) {
|
||||
for (int b = 0; b < 4; b++) {
|
||||
const uint8* src_data_ptr = input_data + b * accum_depth + c;
|
||||
const uint8_t* src_data_ptr = input_data + b * accum_depth + c;
|
||||
for (int j = 0; j < 16; j++) {
|
||||
uint8 src_val = *src_data_ptr++;
|
||||
uint8_t src_val = *src_data_ptr++;
|
||||
// Flip the sign bit, so that the kernel will only need to
|
||||
// reinterpret these uint8 values as int8, getting for free the
|
||||
// reinterpret these uint8_t values as int8_t, getting for free the
|
||||
// subtraction of the zero_point value 128.
|
||||
uint8 dst_val = src_val ^ 0x80;
|
||||
uint8_t dst_val = src_val ^ 0x80;
|
||||
*shuffled_input_workspace_ptr++ = dst_val;
|
||||
}
|
||||
}
|
||||
@@ -216,62 +217,62 @@ inline void ShuffledFullyConnected(
|
||||
|
||||
// Actual computation
|
||||
if (batches == 1) {
|
||||
int16* output_ptr = output_data;
|
||||
int16_t* output_ptr = output_data;
|
||||
// Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
|
||||
// so that just reinterpreting them as int8 values is equivalent to
|
||||
// so that just reinterpreting them as int8_t values is equivalent to
|
||||
// subtracting 128 from them, thus implementing for free the subtraction of
|
||||
// the zero_point value 128.
|
||||
const int8* shuffled_weights_ptr =
|
||||
reinterpret_cast<const int8*>(shuffled_weights_data);
|
||||
const int8_t* shuffled_weights_ptr =
|
||||
reinterpret_cast<const int8_t*>(shuffled_weights_data);
|
||||
// Likewise, we preshuffled and pre-xored the input data above.
|
||||
const int8* shuffled_input_data =
|
||||
reinterpret_cast<const int8*>(shuffled_input_workspace_data);
|
||||
const int8_t* shuffled_input_data =
|
||||
reinterpret_cast<const int8_t*>(shuffled_input_workspace_data);
|
||||
for (int c = 0; c < output_depth; c += 4) {
|
||||
// Internal accumulation.
|
||||
// Initialize accumulator with the bias-value.
|
||||
int32 accum[4] = {0};
|
||||
int32_t accum[4] = {0};
|
||||
// Accumulation loop.
|
||||
for (int d = 0; d < accum_depth; d += 16) {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
for (int j = 0; j < 16; j++) {
|
||||
int8 input_val = shuffled_input_data[d + j];
|
||||
int8 weights_val = *shuffled_weights_ptr++;
|
||||
int8_t input_val = shuffled_input_data[d + j];
|
||||
int8_t weights_val = *shuffled_weights_ptr++;
|
||||
accum[i] += weights_val * input_val;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < 4; i++) {
|
||||
// Add bias value
|
||||
int32 acc = accum[i] + bias_data[c + i];
|
||||
// Down-scale the final int32 accumulator to the scale used by our
|
||||
int32_t acc = accum[i] + bias_data[c + i];
|
||||
// Down-scale the final int32_t accumulator to the scale used by our
|
||||
// (16-bit, typically 3 integer bits) fixed-point format. The quantized
|
||||
// multiplier and shift here have been pre-computed offline
|
||||
// (e.g. by toco).
|
||||
acc =
|
||||
MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
|
||||
// Saturate, cast to int16, and store to output array.
|
||||
// Saturate, cast to int16_t, and store to output array.
|
||||
acc = std::max(acc, output_activation_min);
|
||||
acc = std::min(acc, output_activation_max);
|
||||
output_ptr[c + i] = acc;
|
||||
}
|
||||
}
|
||||
} else if (batches == 4) {
|
||||
int16* output_ptr = output_data;
|
||||
int16_t* output_ptr = output_data;
|
||||
// Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
|
||||
// so that just reinterpreting them as int8 values is equivalent to
|
||||
// so that just reinterpreting them as int8_t values is equivalent to
|
||||
// subtracting 128 from them, thus implementing for free the subtraction of
|
||||
// the zero_point value 128.
|
||||
const int8* shuffled_weights_ptr =
|
||||
reinterpret_cast<const int8*>(shuffled_weights_data);
|
||||
const int8_t* shuffled_weights_ptr =
|
||||
reinterpret_cast<const int8_t*>(shuffled_weights_data);
|
||||
// Likewise, we preshuffled and pre-xored the input data above.
|
||||
const int8* shuffled_input_data =
|
||||
reinterpret_cast<const int8*>(shuffled_input_workspace_data);
|
||||
const int8_t* shuffled_input_data =
|
||||
reinterpret_cast<const int8_t*>(shuffled_input_workspace_data);
|
||||
for (int c = 0; c < output_depth; c += 4) {
|
||||
const int8* shuffled_input_ptr = shuffled_input_data;
|
||||
const int8_t* shuffled_input_ptr = shuffled_input_data;
|
||||
// Accumulation loop.
|
||||
// Internal accumulation.
|
||||
// Initialize accumulator with the bias-value.
|
||||
int32 accum[4][4];
|
||||
int32_t accum[4][4];
|
||||
for (int i = 0; i < 4; i++) {
|
||||
for (int b = 0; b < 4; b++) {
|
||||
accum[i][b] = 0;
|
||||
@@ -281,8 +282,8 @@ inline void ShuffledFullyConnected(
|
||||
for (int i = 0; i < 4; i++) {
|
||||
for (int b = 0; b < 4; b++) {
|
||||
for (int j = 0; j < 16; j++) {
|
||||
int8 input_val = shuffled_input_ptr[16 * b + j];
|
||||
int8 weights_val = shuffled_weights_ptr[16 * i + j];
|
||||
int8_t input_val = shuffled_input_ptr[16 * b + j];
|
||||
int8_t weights_val = shuffled_weights_ptr[16 * i + j];
|
||||
accum[i][b] += weights_val * input_val;
|
||||
}
|
||||
}
|
||||
@@ -293,14 +294,14 @@ inline void ShuffledFullyConnected(
|
||||
for (int i = 0; i < 4; i++) {
|
||||
for (int b = 0; b < 4; b++) {
|
||||
// Add bias value
|
||||
int32 acc = accum[i][b] + bias_data[c + i];
|
||||
// Down-scale the final int32 accumulator to the scale used by our
|
||||
int32_t acc = accum[i][b] + bias_data[c + i];
|
||||
// Down-scale the final int32_t accumulator to the scale used by our
|
||||
// (16-bit, typically 3 integer bits) fixed-point format. The
|
||||
// quantized multiplier and shift here have been pre-computed offline
|
||||
// (e.g. by toco).
|
||||
acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
|
||||
output_shift);
|
||||
// Saturate, cast to int16, and store to output array.
|
||||
// Saturate, cast to int16_t, and store to output array.
|
||||
acc = std::max(acc, output_activation_min);
|
||||
acc = std::min(acc, output_activation_max);
|
||||
output_ptr[b * output_depth + c + i] = acc;
|
||||
|
||||
@@ -0,0 +1,166 @@
|
||||
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ACTIVATIONS_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ACTIVATIONS_H_
|
||||
|
||||
#include "ruy/profiler/instrumentation.h" // from @ruy
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_ops {
|
||||
|
||||
inline int16_t SaturatingLeftShift(int16_t value, int amount) {
|
||||
int32_t result = static_cast<int32_t>(value) * (1 << amount);
|
||||
result = std::min<int32_t>(result, std::numeric_limits<int16_t>::max());
|
||||
result = std::max<int32_t>(result, std::numeric_limits<int16_t>::min());
|
||||
return result;
|
||||
}
|
||||
|
||||
// Similar to ARM instruction SQDMULH.
|
||||
// Similar to gemmlowp::SaturatingRoundingDoublingHighMul except
|
||||
// rounding to zero instead of to nearest (SQRDMULH).
|
||||
inline std::int16_t SaturatingDoublingHighMul(std::int16_t a, std::int16_t b) {
|
||||
bool overflow = a == b && a == std::numeric_limits<std::int16_t>::min();
|
||||
std::int32_t a_32(a);
|
||||
std::int32_t b_32(b);
|
||||
std::int32_t ab_32 = a_32 * b_32;
|
||||
std::int16_t ab_x2_high16 = static_cast<std::int16_t>((ab_32) / (1 << 15));
|
||||
return overflow ? std::numeric_limits<std::int16_t>::max() : ab_x2_high16;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void HardSwish(const RuntimeShape& input_shape, const T* input_data,
|
||||
const RuntimeShape& output_shape, T* output_data) {
|
||||
ruy::profiler::ScopeLabel label("ReferenceHardSwish/Float");
|
||||
auto matching_size = MatchingFlatSize(input_shape, output_shape);
|
||||
const T* in_end = input_data + matching_size;
|
||||
for (; input_data < in_end; input_data++, output_data++) {
|
||||
const float in = *input_data;
|
||||
*output_data =
|
||||
in * std::min(static_cast<T>(6), std::max(static_cast<T>(0), in + 3)) /
|
||||
6;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void HardSwish(const HardSwishParams& params,
|
||||
const RuntimeShape& input_shape, const T* input_data,
|
||||
const RuntimeShape& output_shape, T* output_data) {
|
||||
ruy::profiler::ScopeLabel label("ReferenceHardSwish/Quantized");
|
||||
|
||||
const int flat_size = MatchingFlatSize(input_shape, output_shape);
|
||||
|
||||
for (int i = 0; i < flat_size; i++) {
|
||||
const int16_t input_value = input_data[i] - params.input_zero_point;
|
||||
// Left-shift as much as we can without overflow/saturation to put
|
||||
// significant bits in the high bits of our 16-bit fixedpoint values, so
|
||||
// that fixed-point approximate computations below are as accurate as
|
||||
// possible.
|
||||
const int16_t input_value_on_hires_input_scale = input_value * (1 << 7);
|
||||
// Compute the input value on essentially the output scale, just not
|
||||
// right-shifted yet. This is the value that we'll use in the (x >= +3)
|
||||
// case, and that in the general case we'll multiply against the "relu-ish"
|
||||
// fixed-point multiplier in [0, 1].
|
||||
const int16_t input_value_on_preshift_output_scale =
|
||||
gemmlowp::SaturatingRoundingDoublingHighMul(
|
||||
input_value_on_hires_input_scale,
|
||||
params.output_multiplier_fixedpoint_int16);
|
||||
// Now compute the "relu-ish multiplier". In the (-3 <= x <= +3) case, that
|
||||
// is just an affine rescaling of x from [-3, 3] to [0, 1]. In the general
|
||||
// case, it is just that plus saturation at the boundaries of [-3, 3].
|
||||
// First, we rescale from [-3, 3] to [-1, 1], saturating.
|
||||
// That is done by rescaling the input value with a fixed-point multiplier
|
||||
// (reluish_multiplier_fixedpoint) and bit-shift such that we represent
|
||||
// that input value on the scale where the real value 3.0f is represented
|
||||
// by the quantized value 32768. (+32768 is actually not representable as
|
||||
// int16_t, so this saturates at +32767, and that is seen empirically to be
|
||||
// a negligible contribution to numerical error/bias).
|
||||
//
|
||||
// This code is careful to correctly implement any magnitude of multiplier,
|
||||
// involving either a right shift or a left shift, with correct saturation
|
||||
// behavior in the left-shift case. This forces this code to be more
|
||||
// complicated, but is necessary for real applications: a partially
|
||||
// trained quantized MobileNet v3-small model that motivated this code
|
||||
// exhibits some large [min, max] range boundaries, of the order of
|
||||
// magnitude of 10 or 100 depending on layers.
|
||||
//
|
||||
// The next few lines are basically just an ordinary
|
||||
// MultiplyByQuantizedMultiplier, except that we are more careful here
|
||||
// about the fine details of saturation when left-shifting, because here
|
||||
// overflow in left-shift is a common case, not an anomaly as
|
||||
// MultiplyByQuantizedMultiplier assumes.
|
||||
int16_t reluish_value = input_value_on_hires_input_scale;
|
||||
// Shift left, saturating, as much as we can while ensuring that this
|
||||
// saturation will not contribute to the result. That is, left shift amount
|
||||
// reduced by 1.
|
||||
if (params.reluish_multiplier_exponent > 0) {
|
||||
reluish_value = SaturatingLeftShift(
|
||||
reluish_value, params.reluish_multiplier_exponent - 1);
|
||||
}
|
||||
// Apply the fixed-point multiplier, dividing the value by a divisor
|
||||
// ranging in [1, 2].
|
||||
reluish_value = gemmlowp::SaturatingRoundingDoublingHighMul(
|
||||
reluish_value, params.reluish_multiplier_fixedpoint_int16);
|
||||
// Apply the last bit of left-shift. Thus, in the left-shifting case, if
|
||||
// any saturation affects the result, it is happening here --- any
|
||||
// saturation having occurred above is overwritten here, not affecting the
|
||||
// result.
|
||||
if (params.reluish_multiplier_exponent > 0) {
|
||||
reluish_value = SaturatingLeftShift(reluish_value, 1);
|
||||
}
|
||||
// Shift right, in the right-shifting case.
|
||||
if (params.reluish_multiplier_exponent < 0) {
|
||||
reluish_value = gemmlowp::RoundingDivideByPOT(
|
||||
reluish_value, -params.reluish_multiplier_exponent);
|
||||
}
|
||||
// At this point we have rescaled the value into a 16bit fixedpoint
|
||||
// reluish_value in [-1, 1].
|
||||
// We now convert that to a 16bit fixedpoint value in [0, 1].
|
||||
reluish_value = (reluish_value + (1 << 15)) >> 1;
|
||||
// Use of SaturatingDoublingHighMul here is important to cancel the biases
|
||||
// from the above SaturatingRoundingDoublingHighMul.
|
||||
//
|
||||
// On a partially trained MobileNet-v3-small,
|
||||
//
|
||||
// | bias on | ImageNet
|
||||
// | quantized | Top-1
|
||||
// Operation used here | values | accuracy (50k)
|
||||
// --------------------------------------+------------+-----------
|
||||
// SaturatingDoublingHighMul | -0.0024 | 58.920
|
||||
// SaturatingRoundingDoublingHighMul | -0.0067 | 58.064
|
||||
//
|
||||
// In activations_test, this is covered by this testcase:
|
||||
// QuantizedActivationsOpTest.HardSwishBias
|
||||
//
|
||||
const int16_t preshift_output_value = SaturatingDoublingHighMul(
|
||||
reluish_value, input_value_on_preshift_output_scale);
|
||||
// We were so far operating on the pre-shift output scale. Now we finally
|
||||
// apply that output shift, arriving at the final output scale.
|
||||
int16_t output_value = gemmlowp::RoundingDivideByPOT(
|
||||
preshift_output_value, -params.output_multiplier_exponent);
|
||||
output_value += params.output_zero_point;
|
||||
output_value =
|
||||
std::min<int16_t>(output_value, std::numeric_limits<T>::max());
|
||||
output_value =
|
||||
std::max<int16_t>(output_value, std::numeric_limits<T>::min());
|
||||
output_data[i] = output_value;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CONV_H_
|
||||
@@ -23,34 +23,41 @@ limitations under the License.
|
||||
namespace tflite {
|
||||
namespace reference_integer_ops {
|
||||
|
||||
inline void CheckArithmeticParams(const ArithmeticParams& params) {
|
||||
TFLITE_DCHECK_LE(params.quantized_activation_min,
|
||||
params.quantized_activation_max);
|
||||
// Input offset is negative input zero point. Activation tensors are
|
||||
// asymmetric quantized so they span the full int8 range.
|
||||
TFLITE_DCHECK_GE(-params.input1_offset, std::numeric_limits<int8_t>::min());
|
||||
TFLITE_DCHECK_GE(-params.input2_offset, std::numeric_limits<int8_t>::min());
|
||||
TFLITE_DCHECK_LE(-params.input1_offset, std::numeric_limits<int8_t>::max());
|
||||
TFLITE_DCHECK_LE(-params.input2_offset, std::numeric_limits<int8_t>::max());
|
||||
}
|
||||
|
||||
// Element-wise add that can often be used for inner loop of broadcast add as
|
||||
// well as the non-broadcast add.
|
||||
inline void AddElementwise(int size, const ArithmeticParams& params,
|
||||
const int8_t* input1_data, const int8_t* input2_data,
|
||||
int8_t* output_data) {
|
||||
const int32_t int8_max_value = std::numeric_limits<int8_t>::max();
|
||||
TFLITE_DCHECK_GE(params.input1_offset, -1 * int8_max_value);
|
||||
TFLITE_DCHECK_GE(params.input2_offset, -1 * int8_max_value);
|
||||
TFLITE_DCHECK_LE(params.input1_offset, int8_max_value);
|
||||
TFLITE_DCHECK_LE(params.input2_offset, int8_max_value);
|
||||
CheckArithmeticParams(params);
|
||||
|
||||
for (int i = 0; i < size; ++i) {
|
||||
const int32 input1_val = params.input1_offset + input1_data[i];
|
||||
const int32 input2_val = params.input2_offset + input2_data[i];
|
||||
const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
|
||||
const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
|
||||
const int32 scaled_input1_val =
|
||||
const int32_t input1_val = params.input1_offset + input1_data[i];
|
||||
const int32_t input2_val = params.input2_offset + input2_data[i];
|
||||
const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
|
||||
const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
|
||||
const int32_t scaled_input1_val =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_input1_val, params.input1_multiplier, params.input1_shift);
|
||||
const int32 scaled_input2_val =
|
||||
const int32_t scaled_input2_val =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_input2_val, params.input2_multiplier, params.input2_shift);
|
||||
const int32 raw_sum = scaled_input1_val + scaled_input2_val;
|
||||
const int32 raw_output =
|
||||
const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
|
||||
const int32_t raw_output =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
raw_sum, params.output_multiplier, params.output_shift) +
|
||||
params.output_offset;
|
||||
const int32 clamped_output =
|
||||
const int32_t clamped_output =
|
||||
std::min(params.quantized_activation_max,
|
||||
std::max(params.quantized_activation_min, raw_output));
|
||||
output_data[i] = static_cast<int8_t>(clamped_output);
|
||||
@@ -61,16 +68,11 @@ inline void Add(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape, const int8_t* input1_data,
|
||||
const RuntimeShape& input2_shape, const int8_t* input2_data,
|
||||
const RuntimeShape& output_shape, int8_t* output_data) {
|
||||
TFLITE_DCHECK_LE(params.quantized_activation_min,
|
||||
params.quantized_activation_max);
|
||||
CheckArithmeticParams(params);
|
||||
|
||||
const int flat_size =
|
||||
MatchingElementsSize(input1_shape, input2_shape, output_shape);
|
||||
|
||||
const int32_t int8_max_value = std::numeric_limits<int8_t>::max();
|
||||
TFLITE_DCHECK_GE(params.input1_offset, -1 * int8_max_value);
|
||||
TFLITE_DCHECK_GE(params.input2_offset, -1 * int8_max_value);
|
||||
TFLITE_DCHECK_LE(params.input1_offset, int8_max_value);
|
||||
TFLITE_DCHECK_LE(params.input2_offset, int8_max_value);
|
||||
AddElementwise(flat_size, params, input1_data, input2_data, output_data);
|
||||
}
|
||||
|
||||
|
||||
@@ -22,27 +22,27 @@ namespace reference_integer_ops {
|
||||
|
||||
// Fixed-point per-channel-quantization convolution reference kernel.
|
||||
inline void ConvPerChannel(
|
||||
const ConvParams& params, const int32* output_multiplier,
|
||||
const int32* output_shift, const RuntimeShape& input_shape,
|
||||
const int8* input_data, const RuntimeShape& filter_shape,
|
||||
const int8* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32* bias_data, const RuntimeShape& output_shape,
|
||||
int8* output_data) {
|
||||
const ConvParams& params, const int32_t* output_multiplier,
|
||||
const int32_t* output_shift, const RuntimeShape& input_shape,
|
||||
const int8_t* input_data, const RuntimeShape& filter_shape,
|
||||
const int8_t* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32_t* bias_data, const RuntimeShape& output_shape,
|
||||
int8_t* output_data) {
|
||||
// Get parameters.
|
||||
const int32 input_offset = params.input_offset; // r = s(q - Z)
|
||||
const int32_t input_offset = params.input_offset; // r = s(q - Z)
|
||||
const int stride_width = params.stride_width;
|
||||
const int stride_height = params.stride_height;
|
||||
const int dilation_width_factor = params.dilation_width_factor;
|
||||
const int dilation_height_factor = params.dilation_height_factor;
|
||||
const int pad_width = params.padding_values.width;
|
||||
const int pad_height = params.padding_values.height;
|
||||
const int32 output_offset = params.output_offset;
|
||||
const int32_t output_offset = params.output_offset;
|
||||
|
||||
// Set min and max value of the output.
|
||||
const int32 output_activation_min = params.quantized_activation_min;
|
||||
const int32 output_activation_max = params.quantized_activation_max;
|
||||
const int32_t output_activation_min = params.quantized_activation_min;
|
||||
const int32_t output_activation_max = params.quantized_activation_max;
|
||||
|
||||
// Sanity check.
|
||||
// Consistency check.
|
||||
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
|
||||
@@ -63,45 +63,47 @@ inline void ConvPerChannel(
|
||||
const int output_width = output_shape.Dims(2);
|
||||
for (int batch = 0; batch < batches; ++batch) {
|
||||
for (int out_y = 0; out_y < output_height; ++out_y) {
|
||||
const int in_y_origin = (out_y * stride_height) - pad_height;
|
||||
for (int out_x = 0; out_x < output_width; ++out_x) {
|
||||
const int in_x_origin = (out_x * stride_width) - pad_width;
|
||||
for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
|
||||
const int in_x_origin = (out_x * stride_width) - pad_width;
|
||||
const int in_y_origin = (out_y * stride_height) - pad_height;
|
||||
int32 acc = 0;
|
||||
int32_t acc = 0;
|
||||
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
|
||||
const int in_y = in_y_origin + dilation_height_factor * filter_y;
|
||||
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
|
||||
const int in_x = in_x_origin + dilation_width_factor * filter_x;
|
||||
|
||||
// Zero padding by omitting the areas outside the image.
|
||||
const bool is_point_inside_image =
|
||||
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
|
||||
(in_y < input_height);
|
||||
|
||||
if (!is_point_inside_image) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
|
||||
const int in_x = in_x_origin + dilation_width_factor * filter_x;
|
||||
const int in_y =
|
||||
in_y_origin + dilation_height_factor * filter_y;
|
||||
// Zero padding by omitting the areas outside the image.
|
||||
const bool is_point_inside_image =
|
||||
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
|
||||
(in_y < input_height);
|
||||
if (is_point_inside_image) {
|
||||
int32 input_val = input_data[Offset(input_shape, batch, in_y,
|
||||
int32_t input_val = input_data[Offset(input_shape, batch, in_y,
|
||||
in_x, in_channel)];
|
||||
int32 filter_val =
|
||||
filter_data[Offset(filter_shape, out_channel, filter_y,
|
||||
filter_x, in_channel)];
|
||||
// Accumulate with 32 bits accumulator.
|
||||
// In the nudging process during model quantization, we force
|
||||
// real value of 0.0 be represented by a quantized value. This
|
||||
// guarantees that the input_offset is a int8, even though it
|
||||
// is represented using int32.
|
||||
// int32 += int8 * (int8 - int8) so the highest value we can
|
||||
// get from each accumulation is [-127, 127] * ([-128, 127] -
|
||||
// [-128, 127]), which is [-32512, 32512]. log2(32512)
|
||||
// = 14.98, which means we can accumulate at least 2^16
|
||||
// multiplications without overflow. The accumulator is
|
||||
// applied to a filter so the accumulation logic will hold as
|
||||
// long as the filter size (filter_y * filter_x * in_channel)
|
||||
// does not exceed 2^16, which is the case in all the models
|
||||
// we have seen so far.
|
||||
// TODO(jianlijianli): Add a check to make sure the
|
||||
// accumulator depth is smaller than 2^16.
|
||||
acc += filter_val * (input_val + input_offset);
|
||||
}
|
||||
int32_t filter_val = filter_data[Offset(
|
||||
filter_shape, out_channel, filter_y, filter_x, in_channel)];
|
||||
// Accumulate with 32 bits accumulator.
|
||||
// In the nudging process during model quantization, we force
|
||||
// real value of 0.0 be represented by a quantized value. This
|
||||
// guarantees that the input_offset is a int8_t, even though
|
||||
// it is represented using int32_t. int32_t += int8_t *
|
||||
// (int8_t - int8_t) so the highest value we can get from each
|
||||
// accumulation is [-127, 127] * ([-128, 127] -
|
||||
// [-128, 127]), which is [-32512, 32512]. log2(32512)
|
||||
// = 14.98, which means we can accumulate at least 2^16
|
||||
// multiplications without overflow. The accumulator is
|
||||
// applied to a filter so the accumulation logic will hold as
|
||||
// long as the filter size (filter_y * filter_x * in_channel)
|
||||
// does not exceed 2^16, which is the case in all the models
|
||||
// we have seen so far.
|
||||
// TODO(jianlijianli): Add a check to make sure the
|
||||
// accumulator depth is smaller than 2^16.
|
||||
acc += filter_val * (input_val + input_offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -125,12 +127,12 @@ inline void ConvPerChannel(
|
||||
// Fixed-point per-channel-quantization convolution reference kernel.
|
||||
// 16-bit data and 8-bit filter
|
||||
inline void ConvPerChannel(
|
||||
const ConvParams& params, const int32* output_multiplier,
|
||||
const int32* output_shift, const RuntimeShape& input_shape,
|
||||
const int16* input_data, const RuntimeShape& filter_shape,
|
||||
const int8* filter_data, const RuntimeShape& bias_shape,
|
||||
const ConvParams& params, const int32_t* output_multiplier,
|
||||
const int32_t* output_shift, const RuntimeShape& input_shape,
|
||||
const int16_t* input_data, const RuntimeShape& filter_shape,
|
||||
const int8_t* filter_data, const RuntimeShape& bias_shape,
|
||||
const std::int64_t* bias_data, const RuntimeShape& output_shape,
|
||||
int16* output_data) {
|
||||
int16_t* output_data) {
|
||||
// Get parameters.
|
||||
const int stride_width = params.stride_width;
|
||||
const int stride_height = params.stride_height;
|
||||
@@ -140,10 +142,10 @@ inline void ConvPerChannel(
|
||||
const int pad_height = params.padding_values.height;
|
||||
|
||||
// Set min and max value of the output.
|
||||
const int32 output_activation_min = params.quantized_activation_min;
|
||||
const int32 output_activation_max = params.quantized_activation_max;
|
||||
const int32_t output_activation_min = params.quantized_activation_min;
|
||||
const int32_t output_activation_max = params.quantized_activation_max;
|
||||
|
||||
// Sanity check.
|
||||
// Consistency check.
|
||||
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
|
||||
@@ -164,35 +166,37 @@ inline void ConvPerChannel(
|
||||
const int output_width = output_shape.Dims(2);
|
||||
for (int batch = 0; batch < batches; ++batch) {
|
||||
for (int out_y = 0; out_y < output_height; ++out_y) {
|
||||
const int in_y_origin = (out_y * stride_height) - pad_height;
|
||||
for (int out_x = 0; out_x < output_width; ++out_x) {
|
||||
const int in_x_origin = (out_x * stride_width) - pad_width;
|
||||
for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
|
||||
const int in_x_origin = (out_x * stride_width) - pad_width;
|
||||
const int in_y_origin = (out_y * stride_height) - pad_height;
|
||||
std::int64_t acc = 0;
|
||||
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
|
||||
const int in_y = in_y_origin + dilation_height_factor * filter_y;
|
||||
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
|
||||
const int in_x = in_x_origin + dilation_width_factor * filter_x;
|
||||
|
||||
// Zero padding by omitting the areas outside the image.
|
||||
const bool is_point_inside_image =
|
||||
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
|
||||
(in_y < input_height);
|
||||
|
||||
if (!is_point_inside_image) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
|
||||
const int in_x = in_x_origin + dilation_width_factor * filter_x;
|
||||
const int in_y =
|
||||
in_y_origin + dilation_height_factor * filter_y;
|
||||
// Zero padding by omitting the areas outside the image.
|
||||
const bool is_point_inside_image =
|
||||
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
|
||||
(in_y < input_height);
|
||||
if (is_point_inside_image) {
|
||||
int32 input_val = input_data[Offset(input_shape, batch, in_y,
|
||||
int32_t input_val = input_data[Offset(input_shape, batch, in_y,
|
||||
in_x, in_channel)];
|
||||
int32 filter_val =
|
||||
filter_data[Offset(filter_shape, out_channel, filter_y,
|
||||
filter_x, in_channel)];
|
||||
// Accumulate with 64 bits accumulator.
|
||||
// int64 += int8 * int16 so the highest value we can
|
||||
// get from each accumulation is [-127, 127] * ([-32768,
|
||||
// 32767] -
|
||||
// [-32768, 32767]), which is [-8322945, 8322945].
|
||||
// log2(8322945) = 22.99.
|
||||
acc += filter_val * input_val;
|
||||
}
|
||||
int32_t filter_val = filter_data[Offset(
|
||||
filter_shape, out_channel, filter_y, filter_x, in_channel)];
|
||||
// Accumulate with 64 bits accumulator.
|
||||
// int64_t += int8_t * int16_t so the highest value we can
|
||||
// get from each accumulation is [-127, 127] * ([-32768,
|
||||
// 32767] -
|
||||
// [-32768, 32767]), which is [-8322945, 8322945].
|
||||
// log2(8322945) = 22.99.
|
||||
acc += filter_val * input_val;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -20,12 +20,12 @@ limitations under the License.
|
||||
namespace tflite {
|
||||
namespace reference_integer_ops {
|
||||
inline void DepthwiseConvPerChannel(
|
||||
const DepthwiseParams& params, const int32* output_multiplier,
|
||||
const int32* output_shift, const RuntimeShape& input_shape,
|
||||
const int8* input_data, const RuntimeShape& filter_shape,
|
||||
const int8* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32* bias_data, const RuntimeShape& output_shape,
|
||||
int8* output_data) {
|
||||
const DepthwiseParams& params, const int32_t* output_multiplier,
|
||||
const int32_t* output_shift, const RuntimeShape& input_shape,
|
||||
const int8_t* input_data, const RuntimeShape& filter_shape,
|
||||
const int8_t* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32_t* bias_data, const RuntimeShape& output_shape,
|
||||
int8_t* output_data) {
|
||||
// Get parameters.
|
||||
// TODO(b/141565753): Re-introduce ScopedProfilingLabel on Micro.
|
||||
const int stride_width = params.stride_width;
|
||||
@@ -35,10 +35,10 @@ inline void DepthwiseConvPerChannel(
|
||||
const int pad_width = params.padding_values.width;
|
||||
const int pad_height = params.padding_values.height;
|
||||
const int depth_multiplier = params.depth_multiplier;
|
||||
const int32 input_offset = params.input_offset;
|
||||
const int32 output_offset = params.output_offset;
|
||||
const int32 output_activation_min = params.quantized_activation_min;
|
||||
const int32 output_activation_max = params.quantized_activation_max;
|
||||
const int32_t input_offset = params.input_offset;
|
||||
const int32_t output_offset = params.output_offset;
|
||||
const int32_t output_activation_min = params.quantized_activation_min;
|
||||
const int32_t output_activation_max = params.quantized_activation_max;
|
||||
|
||||
// Check dimensions of the tensors.
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
@@ -66,7 +66,7 @@ inline void DepthwiseConvPerChannel(
|
||||
const int output_channel = m + in_channel * depth_multiplier;
|
||||
const int in_x_origin = (out_x * stride_width) - pad_width;
|
||||
const int in_y_origin = (out_y * stride_height) - pad_height;
|
||||
int32 acc = 0;
|
||||
int32_t acc = 0;
|
||||
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
|
||||
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
|
||||
const int in_x = in_x_origin + dilation_width_factor * filter_x;
|
||||
@@ -77,17 +77,17 @@ inline void DepthwiseConvPerChannel(
|
||||
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
|
||||
(in_y < input_height);
|
||||
if (is_point_inside_image) {
|
||||
int32 input_val = input_data[Offset(input_shape, batch, in_y,
|
||||
in_x, in_channel)];
|
||||
int32 filter_val = filter_data[Offset(
|
||||
int32_t input_val = input_data[Offset(
|
||||
input_shape, batch, in_y, in_x, in_channel)];
|
||||
int32_t filter_val = filter_data[Offset(
|
||||
filter_shape, 0, filter_y, filter_x, output_channel)];
|
||||
// Accumulate with 32 bits accumulator.
|
||||
// In the nudging process during model quantization, we force
|
||||
// real value of 0.0 be represented by a quantized value. This
|
||||
// guarantees that the input_offset is a int8, even though it
|
||||
// is represented using int32.
|
||||
// int32 += int8 * (int8 - int8) so the highest value we can
|
||||
// get from each accumulation is [-127, 127] * ([-128, 127] -
|
||||
// guarantees that the input_offset is a int8_t, even though
|
||||
// it is represented using int32_t. int32_t += int8_t *
|
||||
// (int8_t - int8_t) so the highest value we can get from each
|
||||
// accumulation is [-127, 127] * ([-128, 127] -
|
||||
// [-128, 127]), which is [-32512, 32512]. log2(32512)
|
||||
// = 14.98, which means we can accumulate at least 2^16
|
||||
// multiplications without overflow. The accumulator is
|
||||
@@ -120,12 +120,12 @@ inline void DepthwiseConvPerChannel(
|
||||
}
|
||||
|
||||
inline void DepthwiseConvPerChannel(
|
||||
const DepthwiseParams& params, const int32* output_multiplier,
|
||||
const int32* output_shift, const RuntimeShape& input_shape,
|
||||
const int16* input_data, const RuntimeShape& filter_shape,
|
||||
const int8* filter_data, const RuntimeShape& bias_shape,
|
||||
const DepthwiseParams& params, const int32_t* output_multiplier,
|
||||
const int32_t* output_shift, const RuntimeShape& input_shape,
|
||||
const int16_t* input_data, const RuntimeShape& filter_shape,
|
||||
const int8_t* filter_data, const RuntimeShape& bias_shape,
|
||||
const std::int64_t* bias_data, const RuntimeShape& output_shape,
|
||||
int16* output_data) {
|
||||
int16_t* output_data) {
|
||||
// Get parameters.
|
||||
const int stride_width = params.stride_width;
|
||||
const int stride_height = params.stride_height;
|
||||
@@ -134,8 +134,8 @@ inline void DepthwiseConvPerChannel(
|
||||
const int pad_width = params.padding_values.width;
|
||||
const int pad_height = params.padding_values.height;
|
||||
const int depth_multiplier = params.depth_multiplier;
|
||||
const int32 output_activation_min = params.quantized_activation_min;
|
||||
const int32 output_activation_max = params.quantized_activation_max;
|
||||
const int32_t output_activation_min = params.quantized_activation_min;
|
||||
const int32_t output_activation_max = params.quantized_activation_max;
|
||||
|
||||
// Check dimensions of the tensors.
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
@@ -174,9 +174,9 @@ inline void DepthwiseConvPerChannel(
|
||||
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
|
||||
(in_y < input_height);
|
||||
if (is_point_inside_image) {
|
||||
int32 input_val = input_data[Offset(input_shape, batch, in_y,
|
||||
in_x, in_channel)];
|
||||
int32 filter_val = filter_data[Offset(
|
||||
int32_t input_val = input_data[Offset(
|
||||
input_shape, batch, in_y, in_x, in_channel)];
|
||||
int32_t filter_val = filter_data[Offset(
|
||||
filter_shape, 0, filter_y, filter_x, output_channel)];
|
||||
// Accumulate with 64 bits accumulator.
|
||||
// We assume maximum of 2^16 accumulations as with the 8-bit
|
||||
@@ -190,7 +190,7 @@ inline void DepthwiseConvPerChannel(
|
||||
if (bias_data) {
|
||||
acc += bias_data[output_channel];
|
||||
}
|
||||
int32 scaled_acc = MultiplyByQuantizedMultiplier(
|
||||
int32_t scaled_acc = MultiplyByQuantizedMultiplier(
|
||||
acc, output_multiplier[output_channel],
|
||||
output_shift[output_channel]);
|
||||
scaled_acc = std::max(scaled_acc, output_activation_min);
|
||||
@@ -207,8 +207,8 @@ inline void DepthwiseConvHybridPerChannel(

inline void DepthwiseConvHybridPerChannel(
const DepthwiseParams& params, float* scaling_factors_ptr,
const RuntimeShape& input_shape, const int8* input_data,
const RuntimeShape& filter_shape, const int8* filter_data,
const RuntimeShape& input_shape, const int8_t* input_data,
const RuntimeShape& filter_shape, const int8_t* filter_data,
const RuntimeShape& bias_shape, const float* bias_data,
const RuntimeShape& output_shape, float* output_data,
const float* per_channel_scale, int32_t* input_offset) {
@@ -247,7 +247,7 @@ inline void DepthwiseConvHybridPerChannel(
const int output_channel = m + in_channel * depth_multiplier;
const int in_x_origin = (out_x * stride_width) - pad_width;
const int in_y_origin = (out_y * stride_height) - pad_height;
int32 acc = 0;
int32_t acc = 0;
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
const int in_x = in_x_origin + dilation_width_factor * filter_x;
@@ -258,9 +258,9 @@ inline void DepthwiseConvHybridPerChannel(
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
(in_y < input_height);
if (is_point_inside_image) {
int32 input_val = input_data[Offset(input_shape, batch, in_y,
in_x, in_channel)];
int32 filter_val = filter_data[Offset(
int32_t input_val = input_data[Offset(
input_shape, batch, in_y, in_x, in_channel)];
int32_t filter_val = filter_data[Offset(
filter_shape, 0, filter_y, filter_x, output_channel)];
acc += filter_val * (input_val - input_offset[batch]);
}

@@ -24,15 +24,15 @@ inline void FullyConnected(
const FullyConnectedParams& params, const RuntimeShape& input_shape,
const int8_t* input_data, const RuntimeShape& filter_shape,
const int8_t* filter_data, const RuntimeShape& bias_shape,
const int32* bias_data, const RuntimeShape& output_shape,
const int32_t* bias_data, const RuntimeShape& output_shape,
int8_t* output_data) {
const int32 input_offset = params.input_offset;
const int32 filter_offset = params.weights_offset;
const int32 output_offset = params.output_offset;
const int32 output_multiplier = params.output_multiplier;
const int32_t input_offset = params.input_offset;
const int32_t filter_offset = params.weights_offset;
const int32_t output_offset = params.output_offset;
const int32_t output_multiplier = params.output_multiplier;
const int output_shift = params.output_shift;
const int32 output_activation_min = params.quantized_activation_min;
const int32 output_activation_max = params.quantized_activation_max;
const int32_t output_activation_min = params.quantized_activation_min;
const int32_t output_activation_max = params.quantized_activation_max;
TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 2);

@@ -44,10 +44,10 @@ inline void FullyConnected(
const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
for (int b = 0; b < batches; ++b) {
for (int out_c = 0; out_c < output_depth; ++out_c) {
int32 acc = 0;
int32_t acc = 0;
for (int d = 0; d < accum_depth; ++d) {
int32 input_val = input_data[b * accum_depth + d];
int32 filter_val = filter_data[out_c * accum_depth + d];
int32_t input_val = input_data[b * accum_depth + d];
int32_t filter_val = filter_data[out_c * accum_depth + d];
acc += (filter_val + filter_offset) * (input_val + input_offset);
}
if (bias_data) {
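For reference, a minimal standalone sketch (names and container types are illustrative, not from the diff) of the offset-corrected accumulation pattern used in the int8_t FullyConnected inner loop above.

#include <cstdint>
#include <vector>

// Accumulate (filter + filter_offset) * (input + input_offset) in 32 bits,
// mirroring the per-output-channel inner loop above; assumes input and filter
// have the same length. Bias, requantization and clamping would follow.
int32_t QuantizedDot(const std::vector<int8_t>& input, int32_t input_offset,
                     const std::vector<int8_t>& filter, int32_t filter_offset) {
  int32_t acc = 0;
  for (size_t d = 0; d < input.size(); ++d) {
    acc += (filter[d] + filter_offset) * (input[d] + input_offset);
  }
  return acc;
}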
@@ -68,11 +68,11 @@ inline void FullyConnected(
const int8_t* filter_data, const RuntimeShape& bias_shape,
const int64_t* bias_data, const RuntimeShape& output_shape,
int16_t* output_data) {
const int32 filter_offset = params.weights_offset;
const int32 output_multiplier = params.output_multiplier;
const int32_t filter_offset = params.weights_offset;
const int32_t output_multiplier = params.output_multiplier;
const int output_shift = params.output_shift;
const int32 output_activation_min = params.quantized_activation_min;
const int32 output_activation_max = params.quantized_activation_max;
const int32_t output_activation_min = params.quantized_activation_min;
const int32_t output_activation_max = params.quantized_activation_max;
TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 2);

@@ -86,8 +86,8 @@ inline void FullyConnected(
for (int out_c = 0; out_c < output_depth; ++out_c) {
int64_t acc = 0;
for (int d = 0; d < accum_depth; ++d) {
int32 input_val = input_data[b * accum_depth + d];
int32 filter_val = filter_data[out_c * accum_depth + d];
int32_t input_val = input_data[b * accum_depth + d];
int32_t filter_val = filter_data[out_c * accum_depth + d];
acc += (filter_val + filter_offset) * input_val;
}
if (bias_data) {

@@ -21,8 +21,8 @@ namespace tflite {
namespace reference_integer_ops {

inline void L2Normalization(int32_t input_zero_point, int32_t outer_size,
int32_t depth, const int8* input_data,
int8* output_data) {
int32_t depth, const int8_t* input_data,
int8_t* output_data) {
static constexpr int8_t kMinInt8 = std::numeric_limits<int8_t>::min();
static constexpr int8_t kMaxInt8 = std::numeric_limits<int8_t>::max();
// The output scale must be in sync with Prepare().
@@ -30,7 +30,7 @@ inline void L2Normalization(int32_t input_zero_point, int32_t outer_size,
// to [-1, 127/128].
static constexpr int32_t kOutputScale = 7;
for (int outer_index = 0; outer_index < outer_size; ++outer_index) {
// int32 = (int8 - int8) ^ 2.
// int32_t = (int8_t - int8_t) ^ 2.
// ([-128, 127] - [-128, 127]) ^ 2 = [0, (2^8 - 1)^2] so the accumulator is
// safe from overflowing in at least 2^16 steps.
int32_t acc = 0;
@@ -55,7 +55,7 @@ inline void L2Normalization(int32_t input_zero_point, int32_t outer_size,
std::min(static_cast<int32_t>(kMaxInt8),
std::max(static_cast<int32_t>(kMinInt8), output_in_q24));
output_data[depth * outer_index + inner_index] =
static_cast<int8>(output_in_q24);
static_cast<int8_t>(output_in_q24);
}
}
}

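A float reference (a sketch, not the kernel itself) of what the integer L2Normalization above approximates before the result is expressed with 7 fractional bits and clamped to int8_t.

#include <cmath>
#include <cstddef>
#include <vector>

// Divide each lane of one row by the L2 norm of that row.
std::vector<float> L2NormalizeRef(const std::vector<float>& row) {
  float sum_sq = 0.f;
  for (float v : row) sum_sq += v * v;
  if (sum_sq == 0.f) return std::vector<float>(row.size(), 0.f);
  const float inv_norm = 1.f / std::sqrt(sum_sq);
  std::vector<float> out(row.size());
  for (size_t i = 0; i < row.size(); ++i) out[i] = row[i] * inv_norm;
  return out;
}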
@@ -58,12 +58,15 @@ inline void Logistic(int32_t input_zero_point, int32_t input_range_radius,
}
}

inline void Logistic(int32_t input_size, const int16_t* ptr_input_data,
int16_t* ptr_output_data) {
inline void Logistic(int32_t input_multiplier, int32_t input_size,
const int16_t* ptr_input_data, int16_t* ptr_output_data) {
// We use the LUT for sigmoid and take into account, that
// tanh(x) = 2*sigmoid(2*x) - 1

int32_t input_data_mul = (input_multiplier > 0) ? input_multiplier : 1;

for (int i = 0; i < input_size; ++i, ptr_input_data++, ptr_output_data++) {
int32_t input_data = *ptr_input_data;
int32_t input_data = (*ptr_input_data) * input_data_mul;

// Scale by 3/4 to expand range [-8,8]->[-10.7,10.7] and
// we do interpolation on unsigned values.
@@ -72,13 +75,20 @@ inline void Logistic(int32_t input_size, const int16_t* ptr_input_data,
// We divide by 2 power of 9, because
// we need to divide by 2 in power of 7 for
// the input conversion + 1/4 from the scale above.
uint8_t uh = abs_input_data >> 9;
uint32_t ua = sigmoid_table_uint16[uh];
uint32_t ub = sigmoid_table_uint16[uh + 1];
uint32_t ut = abs_input_data & 0x1ff;
// Define uh as uint32_t type not to make this function overflow.
uint32_t uh = abs_input_data >> 9;
uint32_t result;

// Interpolation is done using the fractional bit.
uint32_t result = (ua << 9) + ut * (ub - ua);
if (uh >= 255) {
// Saturate to maximum.
result = 0x7FFF << 10;
} else {
uint32_t ua = sigmoid_table_uint16[uh];
uint32_t ub = sigmoid_table_uint16[uh + 1];
uint32_t ut = abs_input_data & 0x1ff;
// Interpolation is done using the fractional bit.
result = (ua << 9) + ut * (ub - ua);
}

result = (input_data >= 0) ? (result + (1 << 9))
: ((1 << (16 + 9)) - result + (1 << 9) - 1);

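The int16_t Logistic above indexes sigmoid_table_uint16 with the high bits and linearly interpolates with the low 9 bits. A generic sketch of that pattern; the table is assumed to hold 257 monotonically increasing uint16_t samples, as in the kernel.

#include <cstdint>

// Look up table[index] and table[index + 1] and blend them with the 9
// fractional bits; the result carries 9 extra bits of precision, like
// (ua << 9) + ut * (ub - ua) in the kernel above.
uint32_t InterpolateQ9(const uint16_t* table, uint32_t scaled_abs_input) {
  const uint32_t index = scaled_abs_input >> 9;    // integer part, 0..255 expected
  const uint32_t frac = scaled_abs_input & 0x1ff;  // 9 fractional bits
  const uint32_t a = table[index];
  const uint32_t b = table[index + 1];
  return (a << 9) + frac * (b - a);
}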
@@ -0,0 +1,77 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MEAN_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MEAN_H_

#include "tensorflow/lite/kernels/internal/common.h"

namespace tflite {
namespace reference_integer_ops {

template <typename integer_type>
inline void Mean(const tflite::MeanParams& op_params, int32_t multiplier,
int32_t shift, const RuntimeShape& unextended_input_shape,
const integer_type* input_data, int32_t input_zero_point,
const RuntimeShape& unextended_output_shape,
integer_type* output_data, int32_t output_zero_point) {
// Current implementation only supports dimension equals 4 and simultaneous
// reduction over width and height.
TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4);
TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4);
const RuntimeShape input_shape =
RuntimeShape::ExtendedShape(4, unextended_input_shape);
const RuntimeShape output_shape =
RuntimeShape::ExtendedShape(4, unextended_output_shape);
const int output_batch = output_shape.Dims(0);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
const int output_depth = output_shape.Dims(3);
const int input_height = input_shape.Dims(1);
const int input_width = input_shape.Dims(2);
const int num_elements_in_axis = input_width * input_height;

TFLITE_CHECK_EQ(op_params.axis_count, 2);
TFLITE_CHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
(op_params.axis[0] == 2 && op_params.axis[1] == 1));
TFLITE_CHECK_EQ(output_height, 1);
TFLITE_CHECK_EQ(output_width, 1);

static constexpr int32_t kMinInt = std::numeric_limits<integer_type>::min();
static constexpr int32_t kMaxInt = std::numeric_limits<integer_type>::max();

for (int out_b = 0; out_b < output_batch; ++out_b) {
for (int out_d = 0; out_d < output_depth; ++out_d) {
int32_t acc = 0;
for (int in_h = 0; in_h < input_height; ++in_h) {
for (int in_w = 0; in_w < input_width; ++in_w) {
acc += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)] -
input_zero_point;
}
}
acc = MultiplyByQuantizedMultiplier(acc, multiplier, shift);
acc = acc > 0 ? (acc + num_elements_in_axis / 2) / num_elements_in_axis
: (acc - num_elements_in_axis / 2) / num_elements_in_axis;
acc += output_zero_point;
acc = std::min(std::max(acc, kMinInt), kMaxInt);
output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
static_cast<integer_type>(acc);
}
}
}

} // namespace reference_integer_ops
} // namespace tflite

#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MEAN_H_
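The division in Mean above rounds half away from zero rather than truncating. A tiny sketch of that step in isolation:

#include <cstdint>

// Divide by the number of averaged elements, rounding half away from zero,
// as in the acc > 0 ? ... : ... expression in Mean above.
int32_t RoundedDiv(int32_t acc, int32_t n) {
  return acc > 0 ? (acc + n / 2) / n : (acc - n / 2) / n;
}
// RoundedDiv(7, 4) == 2, RoundedDiv(-7, 4) == -2, RoundedDiv(6, 4) == 2.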
@@ -27,14 +27,14 @@ inline void MulElementwise(int size, const ArithmeticParams& params,
const T* input1_data, const T* input2_data,
T* output_data) {
for (int i = 0; i < size; ++i) {
const int32 input1_val = params.input1_offset + input1_data[i];
const int32 input2_val = params.input2_offset + input2_data[i];
const int32 unclamped_result =
const int32_t input1_val = params.input1_offset + input1_data[i];
const int32_t input2_val = params.input2_offset + input2_data[i];
const int32_t unclamped_result =
params.output_offset +
MultiplyByQuantizedMultiplier(input1_val * input2_val,
params.output_multiplier,
params.output_shift);
const int32 clamped_output =
const int32_t clamped_output =
std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, unclamped_result));
output_data[i] = static_cast<T>(clamped_output);
@@ -57,13 +57,13 @@ inline void Mul(const ArithmeticParams& params,

// Mul with 16 bit inputs and int8_t outputs.
inline void Mul(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const int16* input1_data,
const RuntimeShape& input2_shape, const int16* input2_data,
const RuntimeShape& input1_shape, const int16_t* input1_data,
const RuntimeShape& input2_shape, const int16_t* input2_data,
const RuntimeShape& output_shape, int8_t* output_data) {
ruy::profiler::ScopeLabel label("Mul/Int16Int8");
int32 output_offset = params.output_offset;
int32 output_activation_min = params.quantized_activation_min;
int32 output_activation_max = params.quantized_activation_max;
int32_t output_offset = params.output_offset;
int32_t output_activation_min = params.quantized_activation_min;
int32_t output_activation_max = params.quantized_activation_max;
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);

const int flat_size =
@@ -75,12 +75,12 @@ inline void Mul(const ArithmeticParams& params,

F0 unclamped_result =
F0::FromRaw(input1_data[i]) * F0::FromRaw(input2_data[i]);
int16 rescaled_result =
int16_t rescaled_result =
gemmlowp::RoundingDivideByPOT(unclamped_result.raw(), 8);
int16 clamped_result =
std::min<int16>(output_activation_max - output_offset, rescaled_result);
clamped_result =
std::max<int16>(output_activation_min - output_offset, clamped_result);
int16_t clamped_result = std::min<int16_t>(
output_activation_max - output_offset, rescaled_result);
clamped_result = std::max<int16_t>(output_activation_min - output_offset,
clamped_result);
output_data[i] = output_offset + clamped_result;
}
}
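A rough scalar approximation (not the gemmlowp fixed-point path above, which uses a saturating rounding doubling multiply) of the 16-bit Mul: two Q0.15 values are multiplied and the product is rescaled by 2^-8 toward the int8_t output grid before the output offset is added.

#include <cstdint>

// Plain rounding variant of the Q0.15 multiply followed by
// RoundingDivideByPOT(., 8) used above; ignores saturation.
int16_t MulQ015ThenShift8(int16_t a, int16_t b) {
  const int32_t product = static_cast<int32_t>(a) * b;  // Q0.30
  const int32_t q15 = (product + (1 << 14)) >> 15;      // back to Q0.15, rounded
  return static_cast<int16_t>((q15 + (1 << 7)) >> 8);   // divide by 2^8, rounded
}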
@@ -104,18 +104,18 @@ inline void BroadcastMul4DSlow(
for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
const int32 input1_val =
const int32_t input1_val =
params.input1_offset +
input1_data[SubscriptToIndex(desc1, b, y, x, c)];
const int32 input2_val =
const int32_t input2_val =
params.input2_offset +
input2_data[SubscriptToIndex(desc2, b, y, x, c)];
const int32 unclamped_result =
const int32_t unclamped_result =
params.output_offset +
MultiplyByQuantizedMultiplier(input1_val * input2_val,
params.output_multiplier,
params.output_shift);
const int32 clamped_output = std::min(
const int32_t clamped_output = std::min(
params.quantized_activation_max,
std::max(params.quantized_activation_min, unclamped_result));
output_data[Offset(extended_output_shape, b, y, x, c)] =

@@ -22,8 +22,9 @@ namespace tflite {
namespace reference_integer_ops {

inline void AveragePool(const PoolParams& params,
const RuntimeShape& input_shape, const int8* input_data,
const RuntimeShape& output_shape, int8* output_data) {
const RuntimeShape& input_shape,
const int8_t* input_data,
const RuntimeShape& output_shape, int8_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
@@ -52,7 +53,7 @@ inline void AveragePool(const PoolParams& params,
const int filter_y_start = std::max(0, -in_y_origin);
const int filter_y_end =
std::min(params.filter_height, input_height - in_y_origin);
int32 acc = 0;
int32_t acc = 0;
int filter_count = 0;
for (int filter_y = filter_y_start; filter_y < filter_y_end;
++filter_y) {
@@ -71,7 +72,7 @@ inline void AveragePool(const PoolParams& params,
acc = std::max(acc, params.quantized_activation_min);
acc = std::min(acc, params.quantized_activation_max);
output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
static_cast<int8>(acc);
static_cast<int8_t>(acc);
}
}
}
@@ -79,8 +80,8 @@ inline void AveragePool(const PoolParams& params,
}

inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
const int8* input_data, const RuntimeShape& output_shape,
int8* output_data) {
const int8_t* input_data, const RuntimeShape& output_shape,
int8_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
TFLITE_DCHECK_GE(params.quantized_activation_min,
@@ -137,8 +138,9 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,

inline void AveragePool(const PoolParams& params,
const RuntimeShape& input_shape,
const int16* input_data,
const RuntimeShape& output_shape, int16* output_data) {
const int16_t* input_data,
const RuntimeShape& output_shape,
int16_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
@@ -167,7 +169,7 @@ inline void AveragePool(const PoolParams& params,
const int filter_y_start = std::max(0, -in_y_origin);
const int filter_y_end =
std::min(params.filter_height, input_height - in_y_origin);
int32 acc = 0;
int32_t acc = 0;
int filter_count = 0;
for (int filter_y = filter_y_start; filter_y < filter_y_end;
++filter_y) {
@@ -186,7 +188,7 @@ inline void AveragePool(const PoolParams& params,
acc = std::max(acc, params.quantized_activation_min);
acc = std::min(acc, params.quantized_activation_max);
output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
static_cast<int16>(acc);
static_cast<int16_t>(acc);
}
}
}
@@ -194,8 +196,8 @@ inline void AveragePool(const PoolParams& params,
}

inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
const int16* input_data, const RuntimeShape& output_shape,
int16* output_data) {
const int16_t* input_data, const RuntimeShape& output_shape,
int16_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
TFLITE_DCHECK_GE(params.quantized_activation_min,

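A float reference sketch of the pooling pattern the integer AveragePool kernels above implement: sum the in-bounds taps of one window, divide by the number of taps actually visited, then clamp (clamping omitted here). The layout and parameter names are illustrative.

// Average the valid samples of one pooling window; the start/end arguments are
// the already-clipped filter bounds, mirroring filter_x_start/filter_y_end
// above, and the input is a single channel in row-major order.
float AverageWindow(const float* row_major_input, int input_width,
                    int in_y_origin, int in_x_origin, int filter_y_start,
                    int filter_y_end, int filter_x_start, int filter_x_end) {
  float sum = 0.f;
  int count = 0;
  for (int fy = filter_y_start; fy < filter_y_end; ++fy) {
    for (int fx = filter_x_start; fx < filter_x_end; ++fx) {
      const int y = in_y_origin + fy;
      const int x = in_x_origin + fx;
      sum += row_major_input[y * input_width + x];
      ++count;
    }
  }
  return count > 0 ? sum / count : 0.f;
}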
@@ -0,0 +1,110 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TANH_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TANH_H_

#include <limits>

#include "fixedpoint/fixedpoint.h"
#include "tensorflow/lite/kernels/internal/common.h"

namespace tflite {
namespace reference_integer_ops {

inline void Tanh(int32_t input_zero_point, int32_t input_range_radius,
int32_t input_multiplier, int32_t input_shift,
const RuntimeShape& input_shape, const int8_t* input_data,
const RuntimeShape& output_shape, int8_t* output_data) {
// Integer bits must be in sync with Prepare() function.
static constexpr int32_t kInputIntegerBits = 4;
static constexpr int32_t kOutputScale = 7;
static constexpr int32_t kMinInt8 = std::numeric_limits<int8_t>::min();
static constexpr int32_t kMaxInt8 = std::numeric_limits<int8_t>::max();
using F4 = gemmlowp::FixedPoint<int32_t, kInputIntegerBits>;

const int flat_size = MatchingFlatSize(input_shape, output_shape);

for (int i = 0; i < flat_size; ++i) {
const int32_t input =
static_cast<int32_t>(input_data[i]) - input_zero_point;
if (input <= -input_range_radius) {
output_data[i] = kMinInt8;
} else if (input >= input_range_radius) {
output_data[i] = kMaxInt8;
} else {
const int32_t input_in_q4 =
MultiplyByQuantizedMultiplier(input, input_multiplier, input_shift);
const int32_t output_in_q0 =
gemmlowp::tanh(F4::FromRaw(input_in_q4)).raw();

// Rescale and downcast.
using gemmlowp::RoundingDivideByPOT;
int32_t output_in_q24 =
RoundingDivideByPOT(output_in_q0, 31 - kOutputScale);
output_in_q24 = std::min(std::max(output_in_q24, kMinInt8), kMaxInt8);
output_data[i] = static_cast<int8_t>(output_in_q24);
}
}
}

inline void Tanh(int32_t input_multiplier, int32_t input_left_shift,
const RuntimeShape& input_shape, const int16_t* ptr_input_data,
const RuntimeShape& output_shape, int16_t* ptr_output_data) {
// We use the LUT for sigmoid and take into account, that
// tanh(x) = 2*sigmoid(2*x) - 1

int32_t input_data_mul = (input_multiplier > 0) ? input_multiplier : 1;

int flat_size = MatchingFlatSize(input_shape, output_shape);

for (int i = 0; i < flat_size; ++i, ptr_input_data++, ptr_output_data++) {
int32_t input_data = (*ptr_input_data) * input_data_mul;

if (input_left_shift == 1) {
input_data <<= 1;
}

// Scale by 3/4 to expand range [-8,8]->[-10.7,10.7].
uint32_t abs_input_data = 3 * abs(input_data);
uint32_t uh = abs_input_data >> 8;
int32_t result;

if (uh >= 255) {
// Saturate to maximum.
result = 0xFFFF << 8;
} else {
uint32_t ua = sigmoid_table_uint16[uh];
uint32_t ub = sigmoid_table_uint16[uh + 1];

uint8_t ut = abs_input_data & 0xFF;

result = (ua << 8) + ut * (ub - ua);
}

result = (input_data >= 0)
? (result - (1 << (14 + 9)) + (1 << (9 - 2)))
: (-result + (1 << (14 + 9)) + (1 << (9 - 2)) - 1);

// Convert back to 16-bit.
result >>= (9 - 1);

*ptr_output_data = result;
}
}

} // namespace reference_integer_ops
} // namespace tflite

#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TANH_H_
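Both LUT kernels above lean on the identity tanh(x) = 2*sigmoid(2*x) - 1. A float sketch stating the relation directly, useful when sanity-checking the fixed-point plumbing:

#include <cmath>

// tanh expressed through the logistic sigmoid, the identity quoted in the
// comments above.
float TanhViaSigmoid(float x) {
  const float sigmoid_2x = 1.f / (1.f + std::exp(-2.f * x));
  return 2.f * sigmoid_2x - 1.f;
}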
@@ -52,40 +52,39 @@ inline void L2Normalization(const tflite::L2NormalizationParams& op_params,

inline void L2Normalization(const tflite::L2NormalizationParams& op_params,
const RuntimeShape& input_shape,
const uint8* input_data,
const uint8_t* input_data,
const RuntimeShape& output_shape,
uint8* output_data) {
uint8_t* output_data) {
const int trailing_dim = input_shape.DimensionsCount() - 1;
const int depth =
MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
const int outer_size =
MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
const int32 input_zero_point = op_params.input_zero_point;
const int32_t input_zero_point = op_params.input_zero_point;

for (int i = 0; i < outer_size; ++i) {
int32 square_l2_norm = 0;
int32_t square_l2_norm = 0;
for (int c = 0; c < depth; c++) {
int32 diff = input_data[depth * i + c] - input_zero_point;
int32_t diff = input_data[depth * i + c] - input_zero_point;
square_l2_norm += diff * diff;
}
int32 inv_l2norm_multiplier;
int32_t inv_l2norm_multiplier;
int inv_l2norm_shift;
GetInvSqrtQuantizedMultiplierExp(square_l2_norm, kReverseShift,
&inv_l2norm_multiplier, &inv_l2norm_shift);
for (int c = 0; c < depth; c++) {
int32 diff = input_data[depth * i + c] - input_zero_point;
int32 rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp(
int32_t diff = input_data[depth * i + c] - input_zero_point;
int32_t rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp(
128 * diff, inv_l2norm_multiplier, inv_l2norm_shift);
int32 unclamped_output_val = 128 + rescaled_diff;
int32 output_val =
std::min(static_cast<int32>(255),
std::max(static_cast<int32>(0), unclamped_output_val));
output_data[depth * i + c] = static_cast<uint8>(output_val);
int32_t unclamped_output_val = 128 + rescaled_diff;
int32_t output_val =
std::min(static_cast<int32_t>(255),
std::max(static_cast<int32_t>(0), unclamped_output_val));
output_data[depth * i + c] = static_cast<uint8_t>(output_val);
}
}
}


} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_L2NORMALIZATION_H_

@@ -66,8 +66,8 @@ inline void Logistic(const LogisticParams&, const RuntimeShape& input_shape,
}

inline void Logistic(const LogisticParams& params,
const RuntimeShape& input_shape, const int16* input_data,
const RuntimeShape& output_shape, int16* output_data) {
const RuntimeShape& input_shape, const int16_t* input_data,
const RuntimeShape& output_shape, int16_t* output_data) {
const int flat_size = MatchingFlatSize(input_shape, output_shape);

for (int i = 0; i < flat_size; i++) {
@@ -84,12 +84,12 @@ inline void Logistic(const LogisticParams& params,
}
}

// Quantized int8 logistic activation. Cheats by dequantizing and requantizing
// around the floating point logistic method. This implementation is slow on
// platforms without a floating point unit.
// Quantized int8_t logistic activation. Cheats by dequantizing and
// requantizing around the floating point logistic method. This implementation
// is slow on platforms without a floating point unit.

// TODO(b/141211002): Delete this int8 implementation once we can reuse the
// approach used in TFLite for int8 Logistic.
// TODO(b/141211002): Delete this int8_t implementation once we can reuse the
// approach used in TFLite for int8_t Logistic.
inline void Logistic(const RuntimeShape& input_shape, const int8_t* input_data,
float input_scale, int input_zero_point,
const RuntimeShape& output_shape, int8_t* output_data,

@@ -24,20 +24,20 @@ namespace reference_ops {
// Element-wise mul that can often be used for inner loop of broadcast Mul as
// well as the non-broadcast Mul.
inline void MulElementwise(int size, const ArithmeticParams& params,
const uint8* input1_data, const uint8* input2_data,
uint8* output_data) {
const uint8_t* input1_data,
const uint8_t* input2_data, uint8_t* output_data) {
for (int i = 0; i < size; ++i) {
const int32 input1_val = params.input1_offset + input1_data[i];
const int32 input2_val = params.input2_offset + input2_data[i];
const int32 unclamped_result =
const int32_t input1_val = params.input1_offset + input1_data[i];
const int32_t input2_val = params.input2_offset + input2_data[i];
const int32_t unclamped_result =
params.output_offset +
MultiplyByQuantizedMultiplier(input1_val * input2_val,
params.output_multiplier,
params.output_shift);
const int32 clamped_output =
const int32_t clamped_output =
std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, unclamped_result));
output_data[i] = static_cast<uint8>(clamped_output);
output_data[i] = static_cast<uint8_t>(clamped_output);
}
}

@@ -60,9 +60,9 @@ inline void Mul(const ArithmeticParams& params,
}

inline void Mul(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const uint8* input1_data,
const RuntimeShape& input2_shape, const uint8* input2_data,
const RuntimeShape& output_shape, uint8* output_data) {
const RuntimeShape& input1_shape, const uint8_t* input1_data,
const RuntimeShape& input2_shape, const uint8_t* input2_data,
const RuntimeShape& output_shape, uint8_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
const int flat_size =
@@ -73,11 +73,11 @@ inline void Mul(const ArithmeticParams& params,

inline void BroadcastMul4DSlow(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const uint8* input1_data,
const uint8_t* input1_data,
const RuntimeShape& input2_shape,
const uint8* input2_data,
const uint8_t* input2_data,
const RuntimeShape& output_shape,
uint8* output_data) {
uint8_t* output_data) {
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
@@ -89,22 +89,22 @@ inline void BroadcastMul4DSlow(const ArithmeticParams& params,
for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
const int32 input1_val =
const int32_t input1_val =
params.input1_offset +
input1_data[SubscriptToIndex(desc1, b, y, x, c)];
const int32 input2_val =
const int32_t input2_val =
params.input2_offset +
input2_data[SubscriptToIndex(desc2, b, y, x, c)];
const int32 unclamped_result =
const int32_t unclamped_result =
params.output_offset +
MultiplyByQuantizedMultiplier(input1_val * input2_val,
params.output_multiplier,
params.output_shift);
const int32 clamped_output = std::min(
const int32_t clamped_output = std::min(
params.quantized_activation_max,
std::max(params.quantized_activation_min, unclamped_result));
output_data[Offset(extended_output_shape, b, y, x, c)] =
static_cast<uint8>(clamped_output);
static_cast<uint8_t>(clamped_output);
}
}
}

@@ -32,8 +32,8 @@ constexpr int PadKernelMaxDimensionCount() { return 4; }
// equivalent to a simple input1_data. For Pad, it should point to a zero
// value.
//
// Note that two typenames are required, so that T=P=int32 is considered a
// specialization distinct from P=int32.
// Note that two typenames are required, so that T=P=int32_t is considered a
// specialization distinct from P=int32_t.
template <typename T, typename P>
inline void PadImpl(const tflite::PadParams& op_params,
const RuntimeShape& input_shape, const T* input_data,
@@ -116,11 +116,11 @@ inline void Pad(const tflite::PadParams& op_params,
output_data);
}

// The second (pad-value) input can be int32 when, say, the first is uint8.
// The second (pad-value) input can be int32_t when, say, the first is uint8_t.
template <typename T>
inline void Pad(const tflite::PadParams& op_params,
const RuntimeShape& input_shape, const T* input_data,
const int32* pad_value_ptr, const RuntimeShape& output_shape,
const int32_t* pad_value_ptr, const RuntimeShape& output_shape,
T* output_data) {
const T converted_pad_value = static_cast<T>(*pad_value_ptr);
PadImpl(op_params, input_shape, input_data, &converted_pad_value,
@@ -130,40 +130,18 @@ inline void Pad(const tflite::PadParams& op_params,
// This version avoids conflicting template matching.
template <>
inline void Pad(const tflite::PadParams& op_params,
const RuntimeShape& input_shape, const int32* input_data,
const int32* pad_value_ptr, const RuntimeShape& output_shape,
int32* output_data) {
const RuntimeShape& input_shape, const int32_t* input_data,
const int32_t* pad_value_ptr, const RuntimeShape& output_shape,
int32_t* output_data) {
PadImpl(op_params, input_shape, input_data, pad_value_ptr, output_shape,
output_data);
}

// One could make all PadImageStyle calls simply delegate the work to the
// ordinary Pad. However, it is better that the reference code asserts false in
// similar cases.
template <typename T, typename P>
inline void PadImageStyle(const tflite::PadParams& op_params,
const RuntimeShape& input_shape, const T* input_data,
const P* pad_value_ptr,
const RuntimeShape& output_shape, T* output_data) {
TFLITE_ASSERT_FALSE;
}

template <typename P>
inline void PadImageStyle(const tflite::PadParams& op_params,
const RuntimeShape& input_shape,
const uint8* input_data, const P* pad_value_ptr,
const RuntimeShape& output_shape,
uint8* output_data) {
Pad(op_params, input_shape, input_data, pad_value_ptr, output_shape,
output_data);
}

template <typename P>
inline void PadImageStyle(const tflite::PadParams& op_params,
const RuntimeShape& input_shape,
const int8_t* input_data, const P* pad_value_ptr,
const RuntimeShape& output_shape,
int8_t* output_data) {
Pad(op_params, input_shape, input_data, pad_value_ptr, output_shape,
output_data);
}

@@ -78,8 +78,9 @@ inline void AveragePool(const PoolParams& params,

inline void AveragePool(const PoolParams& params,
const RuntimeShape& input_shape,
const uint8* input_data,
const RuntimeShape& output_shape, uint8* output_data) {
const uint8_t* input_data,
const RuntimeShape& output_shape,
uint8_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
@@ -108,7 +109,7 @@ inline void AveragePool(const PoolParams& params,
const int filter_y_start = std::max(0, -in_y_origin);
const int filter_y_end =
std::min(params.filter_height, input_height - in_y_origin);
int32 acc = 0;
int32_t acc = 0;
int filter_count = 0;
for (int filter_y = filter_y_start; filter_y < filter_y_end;
++filter_y) {
@@ -125,7 +126,7 @@ inline void AveragePool(const PoolParams& params,
acc = std::max(acc, params.quantized_activation_min);
acc = std::min(acc, params.quantized_activation_max);
output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
static_cast<uint8>(acc);
static_cast<uint8_t>(acc);
}
}
}
@@ -237,8 +238,8 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
}

inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
const uint8* input_data, const RuntimeShape& output_shape,
uint8* output_data) {
const uint8_t* input_data, const RuntimeShape& output_shape,
uint8_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
TFLITE_DCHECK_GE(params.quantized_activation_min, 0);
@@ -269,7 +270,7 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
const int filter_y_start = std::max(0, -in_y_origin);
const int filter_y_end =
std::min(params.filter_height, input_height - in_y_origin);
uint8 max = 0;
uint8_t max = 0;
for (int filter_y = filter_y_start; filter_y < filter_y_end;
++filter_y) {
for (int filter_x = filter_x_start; filter_x < filter_x_end;
@@ -281,10 +282,10 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
input_data[Offset(input_shape, batch, in_y, in_x, channel)]);
}
}
max = std::max<uint8>(max, params.quantized_activation_min);
max = std::min<uint8>(max, params.quantized_activation_max);
max = std::max<uint8_t>(max, params.quantized_activation_min);
max = std::min<uint8_t>(max, params.quantized_activation_max);
output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
static_cast<uint8>(max);
static_cast<uint8_t>(max);
}
}
}

@@ -23,7 +23,7 @@ namespace tflite {

namespace reference_ops {

// Broadcast prelu to output_shape for quantized uint8/int8 data.
// Broadcast prelu to output_shape for quantized uint8_t/int8_t data.
template <typename T>
inline void BroadcastPrelu4DSlow(
const PreluParams& params, const RuntimeShape& input_shape,
@@ -44,24 +44,26 @@ inline void BroadcastPrelu4DSlow(
for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
int output_index = Offset(extended_output_shape, b, y, x, c);
int input_index = SubscriptToIndex(desc1, b, y, x, c);
const int32 input_value =
const int32_t input_value =
params.input_offset + input_data[input_index];
int32 output_value;
int32_t output_value;
if (input_value >= 0) {
output_value = input_value;
output_value = MultiplyByQuantizedMultiplier(
input_value, params.output_multiplier_1, params.output_shift_1);
} else {
auto alpha_index = SubscriptToIndex(desc2, b, y, x, c);
const int32 alpha_value =
const int32_t alpha_value =
params.alpha_offset + alpha_data[alpha_index];

output_value = MultiplyByQuantizedMultiplier(
input_value * alpha_value, params.output_multiplier,
params.output_shift);
input_value * alpha_value, params.output_multiplier_2,
params.output_shift_2);
}
output_value += params.output_offset;

const int32 quantized_min = std::numeric_limits<T>::min();
const int32 quantized_max = std::numeric_limits<T>::max();
const int32 clamped_output =
const int32_t quantized_min = std::numeric_limits<T>::min();
const int32_t quantized_max = std::numeric_limits<T>::max();
const int32_t clamped_output =
std::min(quantized_max, std::max(quantized_min, output_value));
output_data[output_index] = static_cast<T>(clamped_output);
}
@@ -70,6 +72,37 @@ inline void BroadcastPrelu4DSlow(
}
}

template <typename T>
inline void Prelu(const PreluParams& params, const RuntimeShape& input_shape,
const T* input_data, const RuntimeShape& alpha_shape,
const T* alpha_data, const RuntimeShape& output_shape,
T* output_data) {
const int32_t quantized_min = std::numeric_limits<T>::min();
const int32_t quantized_max = std::numeric_limits<T>::max();

const int flat_size =
MatchingElementsSize(input_shape, alpha_shape, output_shape);
for (int i = 0; i < flat_size; ++i) {
const int32_t input_value = params.input_offset + input_data[i];
int32_t output_value;
if (input_value >= 0) {
output_value = MultiplyByQuantizedMultiplier(
input_value, params.output_multiplier_1, params.output_shift_1);
} else {
const int32_t alpha_value = params.alpha_offset + alpha_data[i];

output_value = MultiplyByQuantizedMultiplier(input_value * alpha_value,
params.output_multiplier_2,
params.output_shift_2);
}
output_value += params.output_offset;

const int32_t clamped_output =
std::min(quantized_max, std::max(quantized_min, output_value));
output_data[i] = static_cast<T>(clamped_output);
}
}

} // namespace reference_ops
} // namespace tflite


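A float reference (a sketch) of the activation that the quantized Prelu and BroadcastPrelu4DSlow kernels above realise with two multiplier/shift pairs: an identity branch for non-negative inputs and an alpha branch for negative ones.

// PReLU in float: positive inputs pass through, negative inputs are scaled by
// the per-element alpha; the quantized kernels above additionally fold in the
// input/alpha/output offsets and requantize each branch separately.
inline float PreluRef(float x, float alpha) {
  return x >= 0.f ? x : alpha * x;
}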
@@ -76,6 +76,10 @@ inline bool ProcessBroadcastShapes(const RuntimeShape& shape0,
BroadcastableOpCategory::kFirstInputBroadcastsFast &&
params->broadcast_category !=
BroadcastableOpCategory::kSecondInputBroadcastsFast) {
// This is unreachable because at least one else clause in the above loop
// must be reached.
TFLITE_DCHECK(false);
params->broadcast_category = BroadcastableOpCategory::kNonBroadcast;
return false;
}


@@ -15,7 +15,11 @@ limitations under the License.
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_QUANTIZE_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_QUANTIZE_H_

#include <algorithm>
#include <limits>

#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/types.h"

@@ -29,18 +33,18 @@ inline void AffineQuantize(const tflite::QuantizationParams& op_params,
const InputT* input_data,
const RuntimeShape& output_shape,
OutputT* output_data) {
const int32 zero_point = op_params.zero_point;
const int32_t zero_point = op_params.zero_point;
const double scale = op_params.scale;
const int flat_size = MatchingFlatSize(input_shape, output_shape);
static constexpr int32 min_val = std::numeric_limits<OutputT>::min();
static constexpr int32 max_val = std::numeric_limits<OutputT>::max();
static constexpr int32_t min_val = std::numeric_limits<OutputT>::min();
static constexpr int32_t max_val = std::numeric_limits<OutputT>::max();

for (int i = 0; i < flat_size; i++) {
const InputT val = input_data[i];
int32 unclamped =
static_cast<int32>(TfLiteRound(val / static_cast<float>(scale))) +
int32_t unclamped =
static_cast<int32_t>(TfLiteRound(val / static_cast<float>(scale))) +
zero_point;
int32 clamped = std::min(std::max(unclamped, min_val), max_val);
int32_t clamped = std::min(std::max(unclamped, min_val), max_val);
output_data[i] = clamped;
}
}

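A minimal standalone sketch of the affine quantization rule applied per element in AffineQuantize above, specialised here to an int8_t output; the example values are illustrative.

#include <algorithm>
#include <cmath>
#include <cstdint>

// q = round(value / scale) + zero_point, clamped to the output type's range.
int8_t QuantizeOne(float value, float scale, int32_t zero_point) {
  const int32_t unclamped =
      static_cast<int32_t>(std::round(value / scale)) + zero_point;
  const int32_t clamped =
      std::min<int32_t>(std::max<int32_t>(unclamped, -128), 127);
  return static_cast<int8_t>(clamped);
}
// Example: QuantizeOne(3.2f, 0.5f, 10) == 16, since round(3.2 / 0.5) + 10 = 16.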
@@ -18,6 +18,8 @@ limitations under the License.
#include "ruy/profiler/instrumentation.h" // from @ruy
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/max.h"
#include "tensorflow/lite/kernels/internal/min.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/types.h"

@@ -68,6 +70,9 @@ inline bool ResolveAxis(const int num_dims, const int* axis,
// eg: For num_dims=3, [0, 1, 2] is the same as [-3, -2, -1] */
int current = axis[idx] < 0 ? (axis[idx] + num_dims) : axis[idx];
TFLITE_DCHECK(current >= 0 && current < num_dims);
if (current < 0 || current >= num_dims) {
return false;
}
bool is_dup = false;
for (int j = 0; j < *out_num_axis; ++j) {
if (out_axis[j] == current) {
@@ -127,6 +132,11 @@ inline bool ReduceGeneric(const T* input_data, const int* input_dims,
bool keep_dims, int* temp_index, int* resolved_axis,
T init_value,
T reducer(const T current, const T in)) {
// Return early when input shape has zero dim.
for (int i = 0; i < input_num_dims; ++i) {
if (input_dims[i] == 0) return true;
}

// Reset output data.
if (!InitTensorDataForReduce(output_dims, output_num_dims, init_value,
output_data)) {
@@ -184,11 +194,11 @@ inline bool Mean(const T* input_data, const int* input_dims,
}

// Calculate mean by dividing output_data by num of aggregated element.
U num_elements_in_axis = 1;
size_t num_elements_in_axis = 1;
for (int idx = 0; idx < num_resolved_axis; ++idx) {
size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]);
// Overflow prevention.
if (current > (std::numeric_limits<U>::max() / num_elements_in_axis)) {
if (current > (std::numeric_limits<size_t>::max() / num_elements_in_axis)) {
return false;
}
num_elements_in_axis *= current;
@@ -249,9 +259,9 @@ inline void Mean(const tflite::MeanParams& op_params,

inline void Mean(const tflite::MeanParams& op_params,
const RuntimeShape& unextended_input_shape,
const uint8_t* input_data, int32 input_zero_point,
const uint8_t* input_data, int32_t input_zero_point,
float input_scale, const RuntimeShape& unextended_output_shape,
uint8_t* output_data, int32 output_zero_point,
uint8_t* output_data, int32_t output_zero_point,
float output_scale) {
ruy::profiler::ScopeLabel label("Mean4D/Uint8");

@@ -280,9 +290,9 @@ inline void Mean(const tflite::MeanParams& op_params,
constexpr int32_t kMinValue = std::numeric_limits<uint8_t>::min();
constexpr int32_t kMaxValue = std::numeric_limits<uint8_t>::max();

int32 bias =
int32_t bias =
output_zero_point -
static_cast<int32>(input_zero_point * input_scale / output_scale);
static_cast<int32_t>(input_zero_point * input_scale / output_scale);
double real_scale =
static_cast<double>(input_scale / (num_elements_in_axis * output_scale));

@@ -291,7 +301,7 @@ inline void Mean(const tflite::MeanParams& op_params,
QuantizeMultiplier(real_scale, &multiplier, &shift);
for (int out_b = 0; out_b < output_batch; ++out_b) {
for (int out_d = 0; out_d < output_depth; ++out_d) {
int32 acc = 0;
int32_t acc = 0;
for (int in_h = 0; in_h < input_height; ++in_h) {
for (int in_w = 0; in_w < input_width; ++in_w) {
acc += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)];
@@ -310,18 +320,21 @@ inline void Mean(const tflite::MeanParams& op_params,
// It does so in two stages, first calculates the sum of elements along the axis
// then divides it by the number of element in axis for quantized values.
template <typename T, typename U>
inline bool QuantizedMeanOrSum(const T* input_data, int32 input_zero_point,
inline bool QuantizedMeanOrSum(const T* input_data, int32_t input_zero_point,
float input_scale, const int* input_dims,
const int input_num_dims, T* output_data,
int32 output_zero_point, float output_scale,
int32_t output_zero_point, float output_scale,
const int* output_dims,
const int output_num_dims, const int* axis,
const int num_axis_dimensions, bool keep_dims,
int* temp_index, int* resolved_axis, U* temp_sum,
bool compute_sum) {
const bool uint8_case = std::is_same<T, int8_t>::value;
const bool uint8_case = std::is_same<T, uint8_t>::value;
const bool int16_case = std::is_same<T, int16_t>::value;
if (uint8_case) {
ruy::profiler::ScopeLabel label(compute_sum ? "Sum/Uint8" : "Mean/Uint8");
} else if (int16_case) {
ruy::profiler::ScopeLabel label(compute_sum ? "Sum/Int16" : "Mean/Int16");
} else {
ruy::profiler::ScopeLabel label(compute_sum ? "Sum/Int8" : "Mean/Int8");
}
@@ -354,11 +367,11 @@ inline bool QuantizedMeanOrSum(const T* input_data, int32 input_zero_point,
}

// Calculate mean by dividing output_data by num of aggregated element.
U num_elements_in_axis = 1;
size_t num_elements_in_axis = 1;
for (int idx = 0; idx < num_resolved_axis; ++idx) {
size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]);
// Overflow prevention.
if (current > (std::numeric_limits<U>::max() / num_elements_in_axis)) {
if (current > (std::numeric_limits<size_t>::max() / num_elements_in_axis)) {
return false;
}
num_elements_in_axis *= current;
@@ -368,8 +381,7 @@ inline bool QuantizedMeanOrSum(const T* input_data, int32 input_zero_point,
const float scale = input_scale / output_scale;
if (compute_sum) {
// TODO(b/116341117): Eliminate float and do this completely in 8bit.
const float bias =
-input_zero_point * scale * num_elements_in_axis + 0.5f;
const float bias = -input_zero_point * scale * num_elements_in_axis;
for (size_t idx = 0; idx < num_outputs; ++idx) {
const U value =
static_cast<U>(TfLiteRound(temp_sum[idx] * scale + bias)) +
@@ -377,15 +389,15 @@ inline bool QuantizedMeanOrSum(const T* input_data, int32 input_zero_point,
output_data[idx] = static_cast<T>(value);
}
} else {
const float bias = -input_zero_point * scale + 0.5f;
const float bias = -input_zero_point * scale;
for (size_t idx = 0; idx < num_outputs; ++idx) {
float float_mean = static_cast<float>(temp_sum[idx]) /
static_cast<float>(num_elements_in_axis);
float result =
std::min(TfLiteRound(float_mean * scale + bias) + output_zero_point,
static_cast<float>(std::numeric_limits<T>::max()));
result =
std::max(result, static_cast<float>(std::numeric_limits<T>::min()));
float result = TfLiteMin(
TfLiteRound(float_mean * scale + bias) + output_zero_point,
static_cast<float>(std::numeric_limits<T>::max()));
result = TfLiteMax(result,
static_cast<float>(std::numeric_limits<T>::min()));
output_data[idx] = static_cast<T>(result);
}
}

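A sketch (concrete types instead of the templates, and without the final clamping) of the rescaling performed in the mean branch of QuantizedMeanOrSum above, after the +0.5f terms were dropped in favour of TfLiteRound:

#include <cmath>
#include <cstdint>

// Average the integer sum, rescale from input to output scale, and fold in the
// zero points, mirroring the float_mean * scale + bias expression above.
int32_t RequantizedMean(int64_t sum, int64_t num_elements, float input_scale,
                        int32_t input_zero_point, float output_scale,
                        int32_t output_zero_point) {
  const float scale = input_scale / output_scale;
  const float bias = -input_zero_point * scale;
  const float mean = static_cast<float>(sum) / static_cast<float>(num_elements);
  return static_cast<int32_t>(std::round(mean * scale + bias)) +
         output_zero_point;
}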
@@ -17,28 +17,30 @@ limitations under the License.

#include <cmath>

#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {

namespace reference_ops {

inline int32 GetNearestNeighbor(const int input_value, const int32 input_size,
const int32 output_size,
const bool align_corners,
const bool half_pixel_centers) {
inline int32_t GetNearestNeighbor(const int input_value,
const int32_t input_size,
const int32_t output_size,
const bool align_corners,
const bool half_pixel_centers) {
const float scale =
(align_corners && output_size > 1)
? (input_size - 1) / static_cast<float>(output_size - 1)
: input_size / static_cast<float>(output_size);
const float offset = half_pixel_centers ? 0.5f : 0.0f;
int32 output_value = std::min(
int32_t output_value = std::min(
align_corners
? static_cast<int32>(std::round((input_value + offset) * scale))
: static_cast<int32>(std::floor((input_value + offset) * scale)),
? static_cast<int32_t>(TfLiteRound((input_value + offset) * scale))
: static_cast<int32_t>(std::floor((input_value + offset) * scale)),
input_size - 1);
if (half_pixel_centers) {
output_value = std::max(static_cast<int32>(0), output_value);
output_value = std::max(static_cast<int32_t>(0), output_value);
}
return output_value;
}
@@ -47,7 +49,7 @@ template <typename T>
inline void ResizeNearestNeighbor(
const tflite::ResizeNearestNeighborParams& op_params,
const RuntimeShape& unextended_input_shape, const T* input_data,
const RuntimeShape& output_size_shape, const int32* output_size_data,
const RuntimeShape& output_size_shape, const int32_t* output_size_data,
const RuntimeShape& unextended_output_shape, T* output_data) {
TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
@@ -57,16 +59,16 @@ inline void ResizeNearestNeighbor(
const RuntimeShape output_shape =
RuntimeShape::ExtendedShape(4, unextended_output_shape);

int32 batches = MatchingDim(input_shape, 0, output_shape, 0);
int32 input_height = input_shape.Dims(1);
int32 input_width = input_shape.Dims(2);
int32 depth = MatchingDim(input_shape, 3, output_shape, 3);
int32_t batches = MatchingDim(input_shape, 0, output_shape, 0);
int32_t input_height = input_shape.Dims(1);
int32_t input_width = input_shape.Dims(2);
int32_t depth = MatchingDim(input_shape, 3, output_shape, 3);

// The Tensorflow version of this op allows resize on the width and height
// axis only.
TFLITE_DCHECK_EQ(output_size_shape.FlatSize(), 2);
int32 output_height = output_size_data[0];
int32 output_width = output_size_data[1];
int32_t output_height = output_size_data[0];
int32_t output_width = output_size_data[1];

const int col_offset = input_shape.Dims(3);
const int row_offset = input_shape.Dims(2) * col_offset;
@@ -76,14 +78,14 @@ inline void ResizeNearestNeighbor(
T* output_ptr = output_data;
for (int b = 0; b < batches; ++b) {
for (int y = 0; y < output_height; ++y) {
int32 in_y = GetNearestNeighbor(y, input_height, output_height,
op_params.align_corners,
op_params.half_pixel_centers);
const T* y_input_ptr = input_ptr + in_y * row_offset;
for (int x = 0; x < output_width; ++x) {
int32 in_x = GetNearestNeighbor(x, input_width, output_width,
int32_t in_y = GetNearestNeighbor(y, input_height, output_height,
op_params.align_corners,
op_params.half_pixel_centers);
const T* y_input_ptr = input_ptr + in_y * row_offset;
for (int x = 0; x < output_width; ++x) {
int32_t in_x = GetNearestNeighbor(x, input_width, output_width,
op_params.align_corners,
op_params.half_pixel_centers);
const T* x_input_ptr = y_input_ptr + in_x * col_offset;
memcpy(output_ptr, x_input_ptr, depth * sizeof(T));
output_ptr += depth;

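A worked sketch of the index math in GetNearestNeighbor above for the default case (align_corners == false, half_pixel_centers == false): scale = input_size / output_size, and the source index is the floored product, capped at input_size - 1.

#include <cmath>

// For input_size = 4 and output_size = 6 this maps output rows 0..5 to input
// rows 0, 0, 1, 2, 2, 3.
int NearestIndexDefault(int out_index, int input_size, int output_size) {
  const float scale = input_size / static_cast<float>(output_size);
  const int in = static_cast<int>(std::floor(out_index * scale));
  return in < input_size - 1 ? in : input_size - 1;
}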
@@ -16,7 +16,6 @@ limitations under the License.
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SOFTMAX_H_
|
||||
|
||||
#include <limits>
|
||||
#include <vector>
|
||||
|
||||
#include "fixedpoint/fixedpoint.h"
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
@@ -49,26 +48,27 @@ inline void Softmax(const SoftmaxParams& params,
|
||||
// Compute sum.
|
||||
float sum = 0.f;
|
||||
for (int c = 0; c < depth; ++c) {
|
||||
sum += std::exp((input_data[i * depth + c] - max) *
|
||||
static_cast<float>(params.beta));
|
||||
const float exp_c = std::exp((input_data[i * depth + c] - max) *
|
||||
static_cast<float>(params.beta));
|
||||
output_data[i * depth + c] = exp_c;
|
||||
sum += exp_c;
|
||||
}
|
||||
|
||||
// Compute result.
|
||||
for (int c = 0; c < depth; ++c) {
|
||||
output_data[i * depth + c] = std::exp((input_data[i * depth + c] - max) *
|
||||
static_cast<float>(params.beta)) /
|
||||
sum;
|
||||
output_data[i * depth + c] = output_data[i * depth + c] / sum;
|
||||
}
|
||||
}
|
||||
}
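The rewritten float path above caches each exp value in output_data and then divides by the accumulated sum, so exp is evaluated only once per element. Below is a minimal, self-contained sketch of the same numerically stable computation on a single row, assuming beta == 1; the helper name is illustrative and not part of the upstream diff.

#include <algorithm>
#include <cmath>

// Numerically stable softmax over one row; mirrors the reference loop above.
inline void SoftmaxRow(const float* input, float* output, int depth) {
  const float max = *std::max_element(input, input + depth);
  float sum = 0.f;
  for (int c = 0; c < depth; ++c) {
    output[c] = std::exp(input[c] - max);  // cache the exp results, as above
    sum += output[c];
  }
  for (int c = 0; c < depth; ++c) {
    output[c] /= sum;  // normalize in place
  }
}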
|
||||
|
||||
// Quantized softmax with int8/uint8 input and int8/uint8/int16 output.
|
||||
// Quantized softmax with int8_t/uint8_t input and int8_t/uint8_t/int16_t
|
||||
// output.
|
||||
template <typename InputT, typename OutputT>
|
||||
inline void Softmax(const SoftmaxParams& params,
|
||||
const RuntimeShape& input_shape, const InputT* input_data,
|
||||
const RuntimeShape& output_shape, OutputT* output_data) {
|
||||
const int32 input_beta_multiplier = params.input_multiplier;
|
||||
const int32 input_beta_left_shift = params.input_left_shift;
|
||||
const int32_t input_beta_multiplier = params.input_multiplier;
|
||||
const int32_t input_beta_left_shift = params.input_left_shift;
|
||||
const int diff_min = params.diff_min;
|
||||
// The representation chosen for the input to the exp() function is Q5.26.
|
||||
// We need to leave extra space since values that we skip might be as large as
|
||||
@@ -78,9 +78,10 @@ inline void Softmax(const SoftmaxParams& params,
|
||||
static const int kScaledDiffIntegerBits = 5;
|
||||
static const int kAccumulationIntegerBits = 12;
|
||||
using FixedPointScaledDiff =
|
||||
gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>;
|
||||
using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
|
||||
using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
|
||||
gemmlowp::FixedPoint<int32_t, kScaledDiffIntegerBits>;
|
||||
using FixedPointAccum =
|
||||
gemmlowp::FixedPoint<int32_t, kAccumulationIntegerBits>;
|
||||
using FixedPoint0 = gemmlowp::FixedPoint<int32_t, 0>;
|
||||
|
||||
const int trailing_dim = input_shape.DimensionsCount() - 1;
|
||||
const int outer_size =
|
||||
@@ -96,10 +97,10 @@ inline void Softmax(const SoftmaxParams& params,
|
||||
|
||||
FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
|
||||
for (int c = 0; c < depth; ++c) {
|
||||
int32 input_diff =
|
||||
static_cast<int32>(input_data[i * depth + c]) - max_in_row;
|
||||
int32_t input_diff =
|
||||
static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
|
||||
if (input_diff >= diff_min) {
|
||||
const int32 input_diff_rescaled =
|
||||
const int32_t input_diff_rescaled =
|
||||
MultiplyByQuantizedMultiplierGreaterThanOne(
|
||||
input_diff, input_beta_multiplier, input_beta_left_shift);
|
||||
const FixedPointScaledDiff scaled_diff_f8 =
|
||||
@@ -114,28 +115,28 @@ inline void Softmax(const SoftmaxParams& params,
|
||||
sum_of_exps.raw(), kAccumulationIntegerBits, &num_bits_over_unit));
|
||||
|
||||
for (int c = 0; c < depth; ++c) {
|
||||
int32 input_diff =
|
||||
static_cast<int32>(input_data[i * depth + c]) - max_in_row;
|
||||
int32_t input_diff =
|
||||
static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
|
||||
if (input_diff >= diff_min) {
|
||||
const int32 input_diff_rescaled =
|
||||
const int32_t input_diff_rescaled =
|
||||
MultiplyByQuantizedMultiplierGreaterThanOne(
|
||||
input_diff, input_beta_multiplier, input_beta_left_shift);
|
||||
const FixedPointScaledDiff scaled_diff_f8 =
|
||||
FixedPointScaledDiff::FromRaw(input_diff_rescaled);
|
||||
|
||||
FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
|
||||
int32 unsat_output = gemmlowp::RoundingDivideByPOT(
|
||||
int32_t unsat_output = gemmlowp::RoundingDivideByPOT(
|
||||
(shifted_scale * exp_in_0).raw(),
|
||||
num_bits_over_unit + 31 - (sizeof(OutputT) * 8));
|
||||
|
||||
const int32 shifted_output =
|
||||
const int32_t shifted_output =
|
||||
unsat_output +
|
||||
static_cast<int32>(std::numeric_limits<OutputT>::min());
|
||||
static_cast<int32_t>(std::numeric_limits<OutputT>::min());
|
||||
|
||||
output_data[i * depth + c] = static_cast<OutputT>(std::max(
|
||||
std::min(shifted_output,
|
||||
static_cast<int32>(std::numeric_limits<OutputT>::max())),
|
||||
static_cast<int32>(std::numeric_limits<OutputT>::min())));
|
||||
static_cast<int32_t>(std::numeric_limits<OutputT>::max())),
|
||||
static_cast<int32_t>(std::numeric_limits<OutputT>::min())));
|
||||
} else {
|
||||
output_data[i * depth + c] = std::numeric_limits<OutputT>::min();
|
||||
}
|
||||
@@ -143,7 +144,24 @@ inline void Softmax(const SoftmaxParams& params,
|
||||
}
|
||||
}
|
||||
|
||||
// Quantized softmax with int16 input and int16 output.
|
||||
// Computes exp(input - max_input)
|
||||
inline int16_t SoftMaxCalculateExp(const SoftmaxParams& params,
|
||||
const int16_t* input_data, const int depth,
|
||||
int16_t max_in_row, int i, int c) {
|
||||
int32_t input_diff = input_data[i * depth + c] - max_in_row;
|
||||
// scale the input_diff such that [-65535, 0] correspond to [-10.0, 0.0]
|
||||
// exp lut generated with range [-10, 0], as exp(-10) is negligible.
|
||||
int32_t scaled_diff = MultiplyByQuantizedMultiplier(
|
||||
input_diff, params.input_multiplier, params.input_left_shift);
|
||||
// recenter to [-32768, 32767]
|
||||
int32_t sym_scaled_diff = scaled_diff + 32767;
|
||||
int16_t sat_sym_scaled_diff =
|
||||
std::min(std::max(sym_scaled_diff, static_cast<int32_t>(-32768)),
|
||||
static_cast<int32_t>(32767));
|
||||
// apply the exp() LUT activation function
|
||||
return generic_int16_table_lookup(sat_sym_scaled_diff, params.exp_lut);
|
||||
}
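SoftMaxCalculateExp maps the non-positive input difference onto the full signed 16-bit range expected by the exp lookup table. The sketch below isolates just that recenter-and-saturate step, with a worked example in the trailing comments; the helper name is made up and the input is assumed to already lie in roughly [-65535, 0].

#include <algorithm>
#include <cstdint>

// Illustrative sketch of the recentering used above (not upstream API).
inline int16_t RecenterToQ015(int32_t scaled_diff) {
  int32_t sym = scaled_diff + 32767;  // [-65535, 0] -> [-32768, 32767]
  sym = std::min(std::max(sym, static_cast<int32_t>(-32768)),
                 static_cast<int32_t>(32767));
  return static_cast<int16_t>(sym);
}
// RecenterToQ015(0)      == 32767  -> LUT entry for exp(0)   ~ 1
// RecenterToQ015(-65535) == -32768 -> LUT entry for exp(-10) ~ 0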
|
||||
// Quantized softmax with int16_t input and int16_t output.
|
||||
inline void SoftmaxInt16(const SoftmaxParams& params,
|
||||
const RuntimeShape& input_shape,
|
||||
const int16_t* input_data,
|
||||
@@ -162,28 +180,16 @@ inline void SoftmaxInt16(const SoftmaxParams& params,
|
||||
max_in_row = std::max(max_in_row, input_data[i * depth + c]);
|
||||
}
|
||||
|
||||
// Compute exp(input - max_input)
|
||||
std::vector<int16_t> exp_result_Q015(depth);
|
||||
// This loop computes the exp values and their sum. We will need the exp
// values later on in the function, so we cache them in the output_data
// buffer. This is an optimization done to avoid calculating the exp values
// twice, making use of the output_data buffer as scratch memory.
|
||||
int32_t sum_of_exps = 0; // Q16.15 fixed point format.
|
||||
int16_t* exp_results_Q015 = output_data + i * depth;
|
||||
for (int c = 0; c < depth; ++c) {
|
||||
int32_t input_diff = input_data[i * depth + c] - max_in_row;
|
||||
// scale the input_diff such that [-65535, 0] correspond to [-10.0, 0.0]
|
||||
int32_t scaled_diff = MultiplyByQuantizedMultiplier(
|
||||
input_diff, params.input_multiplier, params.input_left_shift);
|
||||
// recenter to [-32768, 32767]
|
||||
int32_t sym_scaled_diff = scaled_diff + 32767;
|
||||
int16_t sat_sym_scaled_diff =
|
||||
std::min(std::max(sym_scaled_diff, static_cast<int32_t>(-32768)),
|
||||
static_cast<int32_t>(32767));
|
||||
// apply the exp() LUT activation function
|
||||
exp_result_Q015[c] =
|
||||
generic_int16_table_lookup(sat_sym_scaled_diff, params.exp_lut);
|
||||
}
|
||||
|
||||
// sum_of_exps is a Q16.15 fixed point format.
|
||||
int32_t sum_of_exps = 0;
|
||||
for (int c = 0; c < depth; ++c) {
|
||||
// Q16.15 + Q0.15
|
||||
sum_of_exps += exp_result_Q015[c];
|
||||
exp_results_Q015[c] =
|
||||
SoftMaxCalculateExp(params, input_data, depth, max_in_row, i, c);
|
||||
sum_of_exps += exp_results_Q015[c];
|
||||
}
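The merged loop above accumulates Q0.15 exp values into a 32-bit Q16.15 sum, so the separate scratch vector from the old code is no longer needed. For reference, the fixed-point interpretation, written as illustrative helpers rather than upstream API:

#include <cstdint>

// Q0.15: real = raw / 2^15; each exp result lies in [0, 1).
inline float Q015ToReal(int16_t raw) { return raw / 32768.0f; }
// Q16.15: real = raw / 2^15; summing up to `depth` values in [0, 1) cannot
// overflow 32 bits for any realistic depth.
inline float Q1615ToReal(int32_t raw) { return raw / 32768.0f; }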
|
||||
|
||||
// Compute the reciprocal 1/sum_of_exps
|
||||
@@ -209,7 +215,7 @@ inline void SoftmaxInt16(const SoftmaxParams& params,
|
||||
for (int c = 0; c < depth; ++c) {
|
||||
uint8_t right_shift = 31 - headroom_plus_one;
|
||||
int64_t round = 1 << (right_shift - 1);
|
||||
int32_t result = (static_cast<int64_t>(exp_result_Q015[c]) *
|
||||
int32_t result = (static_cast<int64_t>(exp_results_Q015[c]) *
|
||||
static_cast<int64_t>(reciprocal_scale_Q015) +
|
||||
round) >>
|
||||
right_shift;
|
||||
|
||||
@@ -16,8 +16,10 @@ limitations under the License.
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_STRIDED_SLICE_H_
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/compatibility.h"
|
||||
#include "tensorflow/lite/kernels/internal/strided_slice_logic.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
namespace reference_ops {
|
||||
|
||||
@@ -15,9 +15,15 @@ limitations under the License.
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SUB_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SUB_H_
|
||||
|
||||
#include "fixedpoint/fixedpoint.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
|
||||
#include "ruy/profiler/instrumentation.h" // from @ruy
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/compatibility.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
@@ -41,11 +47,11 @@ inline void SubNonBroadcast(const ArithmeticParams& params,
|
||||
|
||||
inline void SubNonBroadcast(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape,
|
||||
const int32* input1_data,
|
||||
const int32_t* input1_data,
|
||||
const RuntimeShape& input2_shape,
|
||||
const int32* input2_data,
|
||||
const int32_t* input2_data,
|
||||
const RuntimeShape& output_shape,
|
||||
int32* output_data) {
|
||||
int32_t* output_data) {
|
||||
const int flat_size =
|
||||
MatchingElementsSize(input1_shape, input2_shape, output_shape);
|
||||
for (int i = 0; i < flat_size; ++i) {
|
||||
@@ -106,12 +112,12 @@ inline void BroadcastSubSlow(const ArithmeticParams& params,
|
||||
template <int N = 5>
|
||||
inline void BroadcastSubSlow(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape,
|
||||
const uint8* input1_data,
|
||||
const uint8_t* input1_data,
|
||||
const RuntimeShape& input2_shape,
|
||||
const uint8* input2_data,
|
||||
const uint8_t* input2_data,
|
||||
const RuntimeShape& output_shape,
|
||||
uint8* output_data) {
|
||||
ruy::profiler::ScopeLabel label("BroadcastSubSlow/uint8");
|
||||
uint8_t* output_data) {
|
||||
ruy::profiler::ScopeLabel label("BroadcastSubSlow/uint8_t");
|
||||
TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
|
||||
TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
|
||||
TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
|
||||
@@ -134,28 +140,28 @@ inline void BroadcastSubSlow(const ArithmeticParams& params,
|
||||
// nesting loops such that the innermost loop has the smallest stride for the
|
||||
// best cache behavior.
|
||||
auto sub_func = [&](int indexes[N]) {
|
||||
const int32 input1_val =
|
||||
const int32_t input1_val =
|
||||
params.input1_offset + input1_data[SubscriptToIndex(desc1, indexes)];
|
||||
const int32 input2_val =
|
||||
const int32_t input2_val =
|
||||
params.input2_offset + input2_data[SubscriptToIndex(desc2, indexes)];
|
||||
const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
|
||||
const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
|
||||
const int32 scaled_input1_val =
|
||||
const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
|
||||
const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
|
||||
const int32_t scaled_input1_val =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_input1_val, params.input1_multiplier, params.input1_shift);
|
||||
const int32 scaled_input2_val =
|
||||
const int32_t scaled_input2_val =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_input2_val, params.input2_multiplier, params.input2_shift);
|
||||
const int32 raw_sub = scaled_input1_val - scaled_input2_val;
|
||||
const int32 raw_output =
|
||||
const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
|
||||
const int32_t raw_output =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
raw_sub, params.output_multiplier, params.output_shift) +
|
||||
params.output_offset;
|
||||
const int32 clamped_output =
|
||||
const int32_t clamped_output =
|
||||
std::min(params.quantized_activation_max,
|
||||
std::max(params.quantized_activation_min, raw_output));
|
||||
output_data[SubscriptToIndex(output_desc, indexes)] =
|
||||
static_cast<uint8>(clamped_output);
|
||||
static_cast<uint8_t>(clamped_output);
|
||||
};
|
||||
NDOpsHelper<N>(output_desc, sub_func);
|
||||
}
|
||||
@@ -163,12 +169,12 @@ inline void BroadcastSubSlow(const ArithmeticParams& params,
|
||||
template <int N = 5>
|
||||
inline void BroadcastSubSlow(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape,
|
||||
const int32* input1_data,
|
||||
const int32_t* input1_data,
|
||||
const RuntimeShape& input2_shape,
|
||||
const int32* input2_data,
|
||||
const int32_t* input2_data,
|
||||
const RuntimeShape& output_shape,
|
||||
int32* output_data) {
|
||||
ruy::profiler::ScopeLabel label("BroadcastSubSlow/int32");
|
||||
int32_t* output_data) {
|
||||
ruy::profiler::ScopeLabel label("BroadcastSubSlow/int32_t");
|
||||
TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
|
||||
TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
|
||||
TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
|
||||
@@ -208,7 +214,7 @@ inline void BroadcastSubSlow(const ArithmeticParams& params,
|
||||
const int8_t* input2_data,
|
||||
const RuntimeShape& output_shape,
|
||||
int8_t* output_data) {
|
||||
ruy::profiler::ScopeLabel label("BroadcastSubSlow/int8");
|
||||
ruy::profiler::ScopeLabel label("BroadcastSubSlow/int8_t");
|
||||
NdArrayDesc<N> desc1;
|
||||
NdArrayDesc<N> desc2;
|
||||
NdArrayDesc<N> output_desc;
|
||||
@@ -254,6 +260,45 @@ inline void BroadcastSubSlow(const ArithmeticParams& params,
|
||||
NDOpsHelper<N>(output_desc, sub_func);
|
||||
}
|
||||
|
||||
template <int N = 5>
|
||||
void BroadcastSubSlow(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape,
|
||||
const int64_t* input1_data,
|
||||
const RuntimeShape& input2_shape,
|
||||
const int64_t* input2_data,
|
||||
const RuntimeShape& output_shape, int64_t* output_data) {
|
||||
ruy::profiler::ScopeLabel label("BroadcastSubSlow/int64_t");
|
||||
TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
|
||||
TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
|
||||
TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
|
||||
NdArrayDesc<N> desc1;
|
||||
NdArrayDesc<N> desc2;
|
||||
NdArrayDesc<N> output_desc;
|
||||
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
|
||||
&desc2);
|
||||
CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
|
||||
|
||||
// In Tensorflow, the dimensions are canonically named (batch_number, row,
|
||||
// col, channel), with extents (batches, height, width, depth), with the
|
||||
// trailing dimension changing most rapidly (channels has the smallest stride,
|
||||
// typically 1 element).
|
||||
//
|
||||
// In generated C code, we store arrays with the dimensions reversed. The
|
||||
// first dimension has smallest stride.
|
||||
//
|
||||
// We name our variables by their Tensorflow convention, but generate C code
|
||||
// nesting loops such that the innermost loop has the smallest stride for the
|
||||
// best cache behavior.
|
||||
auto sub_func = [&](int indexes[N]) {
|
||||
output_data[SubscriptToIndex(output_desc, indexes)] =
|
||||
ActivationFunctionWithMinMax(
|
||||
input1_data[SubscriptToIndex(desc1, indexes)] -
|
||||
input2_data[SubscriptToIndex(desc2, indexes)],
|
||||
params.int64_activation_min, params.int64_activation_max);
|
||||
};
|
||||
NDOpsHelper<N>(output_desc, sub_func);
|
||||
}
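A hypothetical call site for the new int64_t overload, assuming the sub.h and types.h headers from this diff are included; the shapes and values are made up for illustration.

#include <cstdint>
#include <limits>

void Int64BroadcastSubExample() {
  tflite::ArithmeticParams params;
  tflite::SetActivationParams(std::numeric_limits<int64_t>::min(),
                              std::numeric_limits<int64_t>::max(), &params);
  const int32_t a_dims[] = {2, 1, 1, 3};
  const int32_t b_dims[] = {1, 1, 1, 3};
  const tflite::RuntimeShape a_shape(4, a_dims), b_shape(4, b_dims);
  const int64_t a[] = {10, 20, 30, 40, 50, 60};
  const int64_t b[] = {1, 2, 3};
  int64_t out[6];
  // Broadcasts b over the batch dimension of a: out = {9, 18, 27, 39, 48, 57}.
  tflite::reference_ops::BroadcastSubSlow(params, a_shape, a, b_shape, b,
                                          a_shape, out);
}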
|
||||
|
||||
template <typename T, int N = 5>
|
||||
void BroadcastSubSlow(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape, const T* input1_data,
|
||||
@@ -294,33 +339,33 @@ void BroadcastSubSlow(const ArithmeticParams& params,
|
||||
// Element-wise Sub that can often be used for inner loop of broadcast sub as
|
||||
// well as the non-broadcast sub.
|
||||
inline void SubElementwise(int size, const ArithmeticParams& params,
|
||||
const uint8* input1_data, const uint8* input2_data,
|
||||
uint8* output_data) {
|
||||
const uint8_t* input1_data,
|
||||
const uint8_t* input2_data, uint8_t* output_data) {
|
||||
TFLITE_DCHECK_GT(params.input1_offset, -256);
|
||||
TFLITE_DCHECK_GT(params.input2_offset, -256);
|
||||
TFLITE_DCHECK_LT(params.input1_offset, 256);
|
||||
TFLITE_DCHECK_LT(params.input2_offset, 256);
|
||||
|
||||
for (int i = 0; i < size; ++i) {
|
||||
const int32 input1_val = params.input1_offset + input1_data[i];
|
||||
const int32 input2_val = params.input2_offset + input2_data[i];
|
||||
const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
|
||||
const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
|
||||
const int32 scaled_input1_val =
|
||||
const int32_t input1_val = params.input1_offset + input1_data[i];
|
||||
const int32_t input2_val = params.input2_offset + input2_data[i];
|
||||
const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
|
||||
const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
|
||||
const int32_t scaled_input1_val =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_input1_val, params.input1_multiplier, params.input1_shift);
|
||||
const int32 scaled_input2_val =
|
||||
const int32_t scaled_input2_val =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_input2_val, params.input2_multiplier, params.input2_shift);
|
||||
const int32 raw_sub = scaled_input1_val - scaled_input2_val;
|
||||
const int32 raw_output =
|
||||
const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
|
||||
const int32_t raw_output =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
raw_sub, params.output_multiplier, params.output_shift) +
|
||||
params.output_offset;
|
||||
const int32 clamped_output =
|
||||
const int32_t clamped_output =
|
||||
std::min(params.quantized_activation_max,
|
||||
std::max(params.quantized_activation_min, raw_output));
|
||||
output_data[i] = static_cast<uint8>(clamped_output);
|
||||
output_data[i] = static_cast<uint8_t>(clamped_output);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -336,22 +381,22 @@ inline void SubElementwise(int size, const ArithmeticParams& params,
|
||||
TFLITE_DCHECK_LE(params.input2_offset, int8_max_value);
|
||||
|
||||
for (int i = 0; i < size; ++i) {
|
||||
const int32 input1_val = params.input1_offset + input1_data[i];
|
||||
const int32 input2_val = params.input2_offset + input2_data[i];
|
||||
const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
|
||||
const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
|
||||
const int32 scaled_input1_val =
|
||||
const int32_t input1_val = params.input1_offset + input1_data[i];
|
||||
const int32_t input2_val = params.input2_offset + input2_data[i];
|
||||
const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
|
||||
const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
|
||||
const int32_t scaled_input1_val =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_input1_val, params.input1_multiplier, params.input1_shift);
|
||||
const int32 scaled_input2_val =
|
||||
const int32_t scaled_input2_val =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_input2_val, params.input2_multiplier, params.input2_shift);
|
||||
const int32 raw_sub = scaled_input1_val - scaled_input2_val;
|
||||
const int32 raw_output =
|
||||
const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
|
||||
const int32_t raw_output =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
raw_sub, params.output_multiplier, params.output_shift) +
|
||||
params.output_offset;
|
||||
const int32 clamped_output =
|
||||
const int32_t clamped_output =
|
||||
std::min(params.quantized_activation_max,
|
||||
std::max(params.quantized_activation_min, raw_output));
|
||||
output_data[i] = static_cast<int8_t>(clamped_output);
|
||||
@@ -359,9 +404,9 @@ inline void SubElementwise(int size, const ArithmeticParams& params,
|
||||
}
|
||||
|
||||
inline void Sub(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape, const uint8* input1_data,
|
||||
const RuntimeShape& input2_shape, const uint8* input2_data,
|
||||
const RuntimeShape& output_shape, uint8* output_data) {
|
||||
const RuntimeShape& input1_shape, const uint8_t* input1_data,
|
||||
const RuntimeShape& input2_shape, const uint8_t* input2_data,
|
||||
const RuntimeShape& output_shape, uint8_t* output_data) {
|
||||
TFLITE_DCHECK_LE(params.quantized_activation_min,
|
||||
params.quantized_activation_max);
|
||||
const int flat_size =
|
||||
@@ -428,40 +473,43 @@ void Sub(const ArithmeticParams& params, const RuntimeShape& input1_shape,
|
||||
}
|
||||
}
|
||||
|
||||
inline void SubWithActivation(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape,
|
||||
const int32* input1_data,
|
||||
const RuntimeShape& input2_shape,
|
||||
const int32* input2_data,
|
||||
const RuntimeShape& output_shape,
|
||||
int32* output_data) {
|
||||
inline void SetActivationMinMax(const ArithmeticParams& params,
|
||||
int32_t* activation_min,
|
||||
int32_t* activation_max) {
|
||||
*activation_min = params.quantized_activation_min;
|
||||
*activation_max = params.quantized_activation_max;
|
||||
}
|
||||
|
||||
inline void SetActivationMinMax(const ArithmeticParams& params,
|
||||
float* activation_min, float* activation_max) {
|
||||
*activation_min = params.float_activation_min;
|
||||
*activation_max = params.float_activation_max;
|
||||
}
|
||||
|
||||
inline void SetActivationMinMax(const ArithmeticParams& params,
|
||||
int64_t* activation_min,
|
||||
int64_t* activation_max) {
|
||||
*activation_min = params.int64_activation_min;
|
||||
*activation_max = params.int64_activation_max;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void SubWithActivation(
|
||||
const ArithmeticParams& params, const RuntimeShape& input1_shape,
|
||||
const T* input1_data, const RuntimeShape& input2_shape,
|
||||
const T* input2_data, const RuntimeShape& output_shape, T* output_data) {
|
||||
ruy::profiler::ScopeLabel label("SubWithActivation");
|
||||
const int flat_size =
|
||||
MatchingElementsSize(input1_shape, input2_shape, output_shape);
|
||||
T activation_min, activation_max;
|
||||
SetActivationMinMax(params, &activation_min, &activation_max);
|
||||
|
||||
for (int i = 0; i < flat_size; ++i) {
|
||||
output_data[i] = ActivationFunctionWithMinMax(
|
||||
input1_data[i] - input2_data[i], params.quantized_activation_min,
|
||||
params.quantized_activation_max);
|
||||
input1_data[i] - input2_data[i], activation_min, activation_max);
|
||||
}
|
||||
}
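With the three SetActivationMinMax overloads above, the single templated SubWithActivation picks its clamp bounds from ArithmeticParams based on T (quantized, float, or int64_t). A small illustration of that dispatch, with made-up values:

void ActivationDispatchExample() {
  tflite::ArithmeticParams p;
  p.float_activation_min = 0.0f;  // e.g. a fused Relu6
  p.float_activation_max = 6.0f;
  float lo = 0.0f, hi = 0.0f;
  tflite::reference_ops::SetActivationMinMax(p, &lo, &hi);  // float overload
  // lo == 0.0f, hi == 6.0f; SubWithActivation<float> clamps with these bounds.
}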
|
||||
|
||||
inline void SubWithActivation(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape,
|
||||
const float* input1_data,
|
||||
const RuntimeShape& input2_shape,
|
||||
const float* input2_data,
|
||||
const RuntimeShape& output_shape,
|
||||
float* output_data) {
|
||||
const int flat_size =
|
||||
MatchingElementsSize(input1_shape, input2_shape, output_shape);
|
||||
for (int i = 0; i < flat_size; ++i) {
|
||||
output_data[i] = ActivationFunctionWithMinMax(
|
||||
input1_data[i] - input2_data[i], params.float_activation_min,
|
||||
params.float_activation_max);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
|
||||
@@ -0,0 +1,129 @@
|
||||
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TANH_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TANH_H_
|
||||
|
||||
#include <cmath>
|
||||
|
||||
#include "fixedpoint/fixedpoint.h"
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/cppmath.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
#include "tensorflow/lite/kernels/op_macros.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_ops {
|
||||
|
||||
inline void Tanh(const RuntimeShape& input_shape, const float* input_data,
|
||||
const RuntimeShape& output_shape, float* output_data) {
|
||||
const int flat_size = MatchingFlatSize(input_shape, output_shape);
|
||||
|
||||
for (int i = 0; i < flat_size; i++) {
|
||||
float val = input_data[i];
|
||||
float result = std::tanh(val);
|
||||
output_data[i] = result;
|
||||
}
|
||||
}
|
||||
|
||||
// Convenience version that allows, for example, generated-code calls to be
|
||||
// uniform between data types.
|
||||
inline void Tanh(const TanhParams&, const RuntimeShape& input_shape,
|
||||
const float* input_data, const RuntimeShape& output_shape,
|
||||
float* output_data) {
|
||||
// Drop params: not needed.
|
||||
Tanh(input_shape, input_data, output_shape, output_data);
|
||||
}
|
||||
|
||||
inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape,
|
||||
const int16_t* input_data, const RuntimeShape& output_shape,
|
||||
int16_t* output_data) {
|
||||
const int input_left_shift = params.input_left_shift;
|
||||
// Support for shifts is limited until we have a parameterized version of
|
||||
// SaturatingRoundingMultiplyByPOT().
|
||||
TFLITE_DCHECK_GE(input_left_shift, 0);
|
||||
TFLITE_DCHECK_LE(input_left_shift, 1);
|
||||
|
||||
const int flat_size = MatchingFlatSize(input_shape, output_shape);
|
||||
|
||||
// F0 uses 0 integer bits, range [-1, 1].
|
||||
// This is the return type of math functions such as tanh, logistic,
|
||||
// whose range is in [-1, 1].
|
||||
using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
|
||||
// F3 uses 3 integer bits, range [-8, 8], the input range expected here.
|
||||
using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
|
||||
|
||||
if (input_left_shift == 0) {
|
||||
for (int i = 0; i < flat_size; i++) {
|
||||
F3 input = F3::FromRaw(input_data[i]);
|
||||
F0 output = gemmlowp::tanh(input);
|
||||
output_data[i] = output.raw();
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < flat_size; i++) {
|
||||
F3 input = F3::FromRaw(
|
||||
gemmlowp::SaturatingRoundingMultiplyByPOT<1>(input_data[i]));
|
||||
F0 output = gemmlowp::tanh(input);
|
||||
output_data[i] = output.raw();
|
||||
}
|
||||
}
|
||||
}
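For the int16_t path, the raw values are plain fixed-point numbers: F3 carries 3 integer and 12 fractional bits, F0 carries 15 fractional bits. Illustrative conversion helpers and an approximate worked example (not upstream API):

#include <cstdint>

inline float F3ToReal(int16_t raw) { return raw / 4096.0f; }   // 2^12, range [-8, 8)
inline float F0ToReal(int16_t raw) { return raw / 32768.0f; }  // 2^15, range [-1, 1)
// Example: a raw input of 4096 represents 1.0; tanh(1.0) ~= 0.7616, so the
// raw output is roughly 0.7616 * 32768 ~= 24957.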
|
||||
|
||||
inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape,
|
||||
const uint8_t* input_data, const RuntimeShape& output_shape,
|
||||
uint8_t* output_data) {
|
||||
const int32_t input_zero_point = params.input_zero_point;
|
||||
const int32_t input_range_radius = params.input_range_radius;
|
||||
const int32_t input_multiplier = params.input_multiplier;
|
||||
const int input_left_shift = params.input_left_shift;
|
||||
const int32_t output_zero_point = 128;
|
||||
const int flat_size = MatchingFlatSize(input_shape, output_shape);
|
||||
|
||||
for (int i = 0; i < flat_size; i++) {
|
||||
const uint8_t input_val_u8 = input_data[i];
|
||||
const int32_t input_val_centered =
|
||||
static_cast<int32_t>(input_val_u8) - input_zero_point;
|
||||
uint8_t output_val;
|
||||
if (input_val_centered <= -input_range_radius) {
|
||||
output_val = 0;
|
||||
} else if (input_val_centered >= input_range_radius) {
|
||||
output_val = 255;
|
||||
} else {
|
||||
const int32_t input_val_rescaled =
|
||||
MultiplyByQuantizedMultiplierGreaterThanOne(
|
||||
input_val_centered, input_multiplier, input_left_shift);
|
||||
using FixedPoint4 = gemmlowp::FixedPoint<int32_t, 4>;
|
||||
using FixedPoint0 = gemmlowp::FixedPoint<int32_t, 0>;
|
||||
const FixedPoint4 input_val_f4 = FixedPoint4::FromRaw(input_val_rescaled);
|
||||
const FixedPoint0 output_val_f0 = gemmlowp::tanh(input_val_f4);
|
||||
// Convert from Q0.31 to Q24.7.
|
||||
using gemmlowp::RoundingDivideByPOT;
|
||||
int32_t output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 24);
|
||||
output_val_s32 += output_zero_point;
|
||||
if (output_val_s32 == 256) {
|
||||
output_val_s32 = 255;
|
||||
}
|
||||
// Reinterpret as Q0.7, encoded in uint8_t.
|
||||
TFLITE_DCHECK_GE(output_val_s32, 0);
|
||||
TFLITE_DCHECK_LE(output_val_s32, 255);
|
||||
output_val = static_cast<uint8_t>(output_val_s32);
|
||||
}
|
||||
output_data[i] = output_val;
|
||||
}
|
||||
}
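The uint8_t output above is the Q0.7 tanh value shifted by a fixed zero point of 128, which amounts to an effective scale of 1/128. A hedged helper to read such an output back as a float (the name and the standalone form are illustrative):

#include <cstdint>

inline float DequantizeTanhOutput(uint8_t q) {
  // 0 -> -1.0, 128 -> 0.0, 255 -> ~0.992 (the saturated positive branch).
  return (static_cast<int32_t>(q) - 128) / 128.0f;
}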
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TANH_H_
|
||||
@@ -18,6 +18,7 @@ limitations under the License.
|
||||
|
||||
#include <limits>
|
||||
#include <vector>
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/compatibility.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
@@ -69,8 +70,8 @@ inline void StridedSlicePadIndices(tflite::StridedSliceParams* p,
|
||||
}
|
||||
|
||||
// Return the index for the first element along that axis. This index will be a
|
||||
// positive integer between [0, axis_size - 1] that can be used to index
|
||||
// directly into the data.
|
||||
// positive integer between [0, axis_size] (or [-1, axis_size - 1] if stride < 0)
|
||||
// that can be used to index directly into the data.
|
||||
inline int StartForAxis(const tflite::StridedSliceParams& params,
|
||||
const RuntimeShape& input_shape, int axis) {
|
||||
const auto begin_mask = params.begin_mask;
|
||||
@@ -102,7 +103,13 @@ inline int StartForAxis(const tflite::StridedSliceParams& params,
|
||||
}
|
||||
|
||||
// Clamping
|
||||
start = Clamp(start, 0, axis_size - 1);
|
||||
if (strides[axis] > 0) {
|
||||
// Forward iteration
|
||||
start = Clamp(start, 0, axis_size);
|
||||
} else {
|
||||
// Backward iteration
|
||||
start = Clamp(start, -1, axis_size - 1);
|
||||
}
|
||||
|
||||
return start;
|
||||
}
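The new clamp bounds depend on the iteration direction: a forward slice may now start at axis_size (an empty range) and a reverse slice may start at -1 (also empty). A minimal sketch with worked values, assuming the usual Clamp semantics; the helper name is made up.

#include <algorithm>

inline int ClampStart(int start, int stride, int axis_size) {
  if (stride > 0) return std::min(std::max(start, 0), axis_size);
  return std::min(std::max(start, -1), axis_size - 1);
}
// ClampStart( 9, +1, 4) == 4   -> forward slice yields no elements
// ClampStart(-7, -1, 4) == -1  -> reverse slice yields no elements
// ClampStart( 2, -1, 4) == 2   -> unchanged when already in range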
|
||||
|
||||
@@ -24,24 +24,29 @@ limitations under the License.
|
||||
|
||||
namespace tflite {
|
||||
|
||||
enum class FusedActivationFunctionType : uint8 { kNone, kRelu6, kRelu1, kRelu };
|
||||
enum class PaddingType : uint8 { kNone, kSame, kValid };
|
||||
enum class FusedActivationFunctionType : uint8_t {
|
||||
kNone,
|
||||
kRelu6,
|
||||
kRelu1,
|
||||
kRelu
|
||||
};
|
||||
enum class PaddingType : uint8_t { kNone, kSame, kValid };
|
||||
|
||||
struct PaddingValues {
|
||||
int16 width;
|
||||
int16 height;
|
||||
int16_t width;
|
||||
int16_t height;
|
||||
// offset is used for calculating "remaining" padding, for example, `width`
|
||||
// is 1 and `width_offset` is 1, so padding_left is 1 while padding_right is
|
||||
// 1 + 1 = 2.
|
||||
int16 width_offset;
|
||||
int16_t width_offset;
|
||||
// Same as width_offset except it's over the height dimension.
|
||||
int16 height_offset;
|
||||
int16_t height_offset;
|
||||
};
|
||||
|
||||
// This enumeration allows for non-default formats for the weights array
|
||||
// of a fully-connected operator, allowing the use of special optimized
|
||||
// runtime paths.
|
||||
enum class FullyConnectedWeightsFormat : uint8 {
|
||||
enum class FullyConnectedWeightsFormat : uint8_t {
|
||||
// Default format (flat 2D layout, the inner contiguous dimension
|
||||
// is input_depth, the outer non-contiguous dimension is output_depth)
|
||||
kDefault,
|
||||
@@ -88,11 +93,11 @@ enum class FullyConnectedWeightsFormat : uint8 {
|
||||
// maximize arithmetic throughput.
|
||||
//
|
||||
// Finally, the 'Int8' part in the name refers to the fact that this
|
||||
// weights format has each weights value encoded as a signed int8 value,
|
||||
// even if the data type of the weights buffer is uint8. This is intended
|
||||
// weights format has each weights value encoded as a signed int8_t value,
|
||||
// even if the data type of the weights buffer is uint8_t. This is intended
|
||||
// to save runtime kernels the effort to have to XOR the top bit of these
|
||||
// bytes before using them in signed arithmetic, see this file for more
|
||||
// explanations on the 'signed int8 trick' in matrix multiplication kernels:
|
||||
// explanations on the 'signed int8_t trick' in matrix multiplication kernels:
|
||||
//
|
||||
// tensorflow/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc
|
||||
//
|
||||
@@ -111,7 +116,7 @@ enum class FullyConnectedWeightsFormat : uint8 {
|
||||
// the real 0 value, and scale designates the difference between the real values
|
||||
// corresponding to consecutive quantized values differing by 1.
|
||||
struct QuantizationParams {
|
||||
int32 zero_point = 0;
|
||||
int32_t zero_point = 0;
|
||||
double scale = 0.0;
|
||||
};
|
||||
|
||||
@@ -140,20 +145,20 @@ class RuntimeShape {
|
||||
if (dimensions_count > kMaxSmallSize) {
|
||||
#ifdef TF_LITE_STATIC_MEMORY
|
||||
TFLITE_CHECK(false && "No shape resizing supported on this platform");
|
||||
#else // TF_LITE_STATIC_MEMORY
|
||||
dims_pointer_ = new int32[dimensions_count];
|
||||
#else // TF_LITE_STATIC_MEMORY
|
||||
dims_pointer_ = new int32_t[dimensions_count];
|
||||
#endif // TF_LITE_STATIC_MEMORY
|
||||
}
|
||||
}
|
||||
|
||||
RuntimeShape(int shape_size, int32 value) : size_(0) {
|
||||
RuntimeShape(int shape_size, int32_t value) : size_(0) {
|
||||
Resize(shape_size);
|
||||
for (int i = 0; i < shape_size; ++i) {
|
||||
SetDim(i, value);
|
||||
}
|
||||
}
|
||||
|
||||
RuntimeShape(int dimensions_count, const int32* dims_data) : size_(0) {
|
||||
RuntimeShape(int dimensions_count, const int32_t* dims_data) : size_(0) {
|
||||
ReplaceWith(dimensions_count, dims_data);
|
||||
}
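Hypothetical usage of the two constructors shown above; the dimension values are made up.

#include <cstdint>

void RuntimeShapeExample() {
  const int32_t dims[] = {1, 28, 28, 3};
  tflite::RuntimeShape from_array(4, dims);  // copies the four dimensions
  tflite::RuntimeShape all_ones(4, 1);       // every dimension set to 1
  // from_array.DimensionsCount() == 4, from_array.Dims(1) == 28
}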
|
||||
|
||||
@@ -165,33 +170,34 @@ class RuntimeShape {
|
||||
// rolls out.
|
||||
RuntimeShape(RuntimeShape const& other) : size_(other.DimensionsCount()) {
|
||||
if (size_ > kMaxSmallSize) {
|
||||
dims_pointer_ = new int32[size_];
|
||||
dims_pointer_ = new int32_t[size_];
|
||||
}
|
||||
std::memcpy(DimsData(), other.DimsData(), sizeof(int32) * size_);
|
||||
std::memcpy(DimsData(), other.DimsData(), sizeof(int32_t) * size_);
|
||||
}
|
||||
|
||||
bool operator==(const RuntimeShape& comp) const {
|
||||
return this->size_ == comp.size_ &&
|
||||
std::memcmp(DimsData(), comp.DimsData(), size_ * sizeof(int32)) == 0;
|
||||
std::memcmp(DimsData(), comp.DimsData(), size_ * sizeof(int32_t)) ==
|
||||
0;
|
||||
}
|
||||
|
||||
~RuntimeShape() {
|
||||
if (size_ > kMaxSmallSize) {
|
||||
#ifdef TF_LITE_STATIC_MEMORY
|
||||
TFLITE_CHECK(false && "No shape resizing supported on this platform");
|
||||
#else // TF_LITE_STATIC_MEMORY
|
||||
#else // TF_LITE_STATIC_MEMORY
|
||||
delete[] dims_pointer_;
|
||||
#endif // TF_LITE_STATIC_MEMORY
|
||||
}
|
||||
}
|
||||
|
||||
inline int32 DimensionsCount() const { return size_; }
|
||||
inline int32 Dims(int i) const {
|
||||
inline int32_t DimensionsCount() const { return size_; }
|
||||
inline int32_t Dims(int i) const {
|
||||
TFLITE_DCHECK_GE(i, 0);
|
||||
TFLITE_DCHECK_LT(i, size_);
|
||||
return size_ > kMaxSmallSize ? dims_pointer_[i] : dims_[i];
|
||||
}
|
||||
inline void SetDim(int i, int32 val) {
|
||||
inline void SetDim(int i, int32_t val) {
|
||||
TFLITE_DCHECK_GE(i, 0);
|
||||
TFLITE_DCHECK_LT(i, size_);
|
||||
if (size_ > kMaxSmallSize) {
|
||||
@@ -201,20 +207,20 @@ class RuntimeShape {
|
||||
}
|
||||
}
|
||||
|
||||
inline int32* DimsData() {
|
||||
inline int32_t* DimsData() {
|
||||
return size_ > kMaxSmallSize ? dims_pointer_ : dims_;
|
||||
}
|
||||
inline const int32* DimsData() const {
|
||||
inline const int32_t* DimsData() const {
|
||||
return size_ > kMaxSmallSize ? dims_pointer_ : dims_;
|
||||
}
|
||||
// The caller must ensure that the shape is no bigger than 5-D.
|
||||
inline const int32* DimsDataUpTo5D() const { return dims_; }
|
||||
inline const int32_t* DimsDataUpTo5D() const { return dims_; }
|
||||
|
||||
inline void Resize(int dimensions_count) {
|
||||
if (size_ > kMaxSmallSize) {
|
||||
#ifdef TF_LITE_STATIC_MEMORY
|
||||
TFLITE_CHECK(false && "No shape resizing supported on this platform");
|
||||
#else // TF_LITE_STATIC_MEMORY
|
||||
#else // TF_LITE_STATIC_MEMORY
|
||||
delete[] dims_pointer_;
|
||||
#endif // TF_LITE_STATIC_MEMORY
|
||||
}
|
||||
@@ -222,16 +228,16 @@ class RuntimeShape {
|
||||
if (dimensions_count > kMaxSmallSize) {
|
||||
#ifdef TF_LITE_STATIC_MEMORY
|
||||
TFLITE_CHECK(false && "No shape resizing supported on this platform");
|
||||
#else // TF_LITE_STATIC_MEMORY
|
||||
dims_pointer_ = new int32[dimensions_count];
|
||||
#else // TF_LITE_STATIC_MEMORY
|
||||
dims_pointer_ = new int32_t[dimensions_count];
|
||||
#endif // TF_LITE_STATIC_MEMORY
|
||||
}
|
||||
}
|
||||
|
||||
inline void ReplaceWith(int dimensions_count, const int32* dims_data) {
|
||||
inline void ReplaceWith(int dimensions_count, const int32_t* dims_data) {
|
||||
Resize(dimensions_count);
|
||||
int32* dst_dims = DimsData();
|
||||
std::memcpy(dst_dims, dims_data, dimensions_count * sizeof(int32));
|
||||
int32_t* dst_dims = DimsData();
|
||||
std::memcpy(dst_dims, dims_data, dimensions_count * sizeof(int32_t));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@@ -239,7 +245,7 @@ class RuntimeShape {
|
||||
const int dimensions_count =
|
||||
std::distance(src_iterable.begin(), src_iterable.end());
|
||||
Resize(dimensions_count);
|
||||
int32* data = DimsData();
|
||||
int32_t* data = DimsData();
|
||||
for (auto it : src_iterable) {
|
||||
*data = it;
|
||||
++data;
|
||||
@@ -288,13 +294,13 @@ class RuntimeShape {
|
||||
SetDim(i, pad_value);
|
||||
}
|
||||
std::memcpy(DimsData() + size_increase, shape.DimsData(),
|
||||
sizeof(int32) * shape.DimensionsCount());
|
||||
sizeof(int32_t) * shape.DimensionsCount());
|
||||
}
|
||||
|
||||
int32 size_;
|
||||
int32_t size_;
|
||||
union {
|
||||
int32 dims_[kMaxSmallSize];
|
||||
int32* dims_pointer_;
|
||||
int32_t dims_[kMaxSmallSize];
|
||||
int32_t* dims_pointer_;
|
||||
};
|
||||
};
|
||||
|
||||
@@ -432,7 +438,7 @@ int MatchingArraySize(const ArrayType1& array1, int index1,
|
||||
inline int MatchingDim(const RuntimeShape& shape1, int index1,
|
||||
const RuntimeShape& shape2, int index2) {
|
||||
TFLITE_DCHECK_EQ(shape1.Dims(index1), shape2.Dims(index2));
|
||||
return shape1.Dims(index1);
|
||||
return std::min(shape1.Dims(index1), shape2.Dims(index2));
|
||||
}
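Note the behavioral change above: with TFLITE_DCHECK compiled out, a mismatched pair of dimensions now yields the smaller value instead of silently trusting shape1. An illustration with hypothetical shapes:

#include <cstdint>

void MatchingDimExample() {
  const int32_t d1[] = {2, 8};
  const int32_t d2[] = {2, 6};
  tflite::RuntimeShape s1(2, d1), s2(2, d2);
  // Debug builds: the DCHECK fires. Release builds: returns std::min(8, 6) == 6.
  int matched = tflite::MatchingDim(s1, 1, s2, 1);
  (void)matched;
}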
|
||||
|
||||
template <typename... Args>
|
||||
@@ -713,7 +719,7 @@ void ComputeStrides(Dims<N>* dims) {
|
||||
}
|
||||
}
|
||||
|
||||
enum class BroadcastableOpCategory : uint8 {
|
||||
enum class BroadcastableOpCategory : uint8_t {
|
||||
kNone,
|
||||
kNonBroadcast, // Matching input shapes.
|
||||
kFirstInputBroadcastsFast, // Fivefold nested loops.
|
||||
@@ -729,21 +735,21 @@ static_assert(sizeof(MinMax) == 8, "");
|
||||
|
||||
struct ActivationParams {
|
||||
FusedActivationFunctionType activation_type;
|
||||
// uint8, etc, activation params.
|
||||
int32 quantized_activation_min;
|
||||
int32 quantized_activation_max;
|
||||
// uint8_t, etc, activation params.
|
||||
int32_t quantized_activation_min;
|
||||
int32_t quantized_activation_max;
|
||||
};
|
||||
|
||||
struct ReluParams : public ActivationParams {
|
||||
int32 input_offset;
|
||||
int32 output_offset;
|
||||
int32 output_multiplier;
|
||||
int32 output_shift;
|
||||
int32_t input_offset;
|
||||
int32_t output_offset;
|
||||
int32_t output_multiplier;
|
||||
int output_shift;
|
||||
};
|
||||
|
||||
// Styles of resizing op usages. For example, kImageStyle can be used with a Pad
|
||||
// op for pattern-specific optimization.
|
||||
enum class ResizingCategory : uint8 {
|
||||
enum class ResizingCategory : uint8_t {
|
||||
kNone,
|
||||
kImageStyle, // 4D, operating on inner dimensions, say {0, a, b, 0}.
|
||||
kGenericResize,
|
||||
@@ -753,24 +759,29 @@ enum class ResizingCategory : uint8 {
|
||||
struct ArithmeticParams {
|
||||
// Shape dependent / common to data / op types.
|
||||
BroadcastableOpCategory broadcast_category;
|
||||
// uint8 inference params.
|
||||
int32 input1_offset;
|
||||
int32 input2_offset;
|
||||
int32 output_offset;
|
||||
int32 output_multiplier;
|
||||
// uint8_t inference params.
|
||||
int32_t input1_offset;
|
||||
int32_t input2_offset;
|
||||
int32_t output_offset;
|
||||
int32_t output_multiplier;
|
||||
int output_shift;
|
||||
// Add / Sub, not Mul, uint8 inference params.
|
||||
// Add / Sub, not Mul, uint8_t inference params.
|
||||
int left_shift;
|
||||
int32 input1_multiplier;
|
||||
int32_t input1_multiplier;
|
||||
int input1_shift;
|
||||
int32 input2_multiplier;
|
||||
int32_t input2_multiplier;
|
||||
int input2_shift;
|
||||
// uint8, etc, activation params.
|
||||
int32 quantized_activation_min;
|
||||
int32 quantized_activation_max;
|
||||
|
||||
// TODO(b/158622529): Union the following activation params.
|
||||
// uint8_t, etc, activation params.
|
||||
int32_t quantized_activation_min;
|
||||
int32_t quantized_activation_max;
|
||||
// float activation params.
|
||||
float float_activation_min;
|
||||
float float_activation_max;
|
||||
// int64_t activation params.
|
||||
int64_t int64_activation_min;
|
||||
int64_t int64_activation_max;
|
||||
|
||||
// Processed output dimensions.
|
||||
// Let input "a" be the one that broadcasts in the faster-changing dimension.
|
||||
@@ -785,22 +796,22 @@ struct ArithmeticParams {
|
||||
};
|
||||
|
||||
struct ConcatenationParams {
|
||||
int8 axis;
|
||||
const int32* input_zeropoint;
|
||||
int8_t axis;
|
||||
const int32_t* input_zeropoint;
|
||||
const float* input_scale;
|
||||
uint16 inputs_count;
|
||||
int32 output_zeropoint;
|
||||
uint16_t inputs_count;
|
||||
int32_t output_zeropoint;
|
||||
float output_scale;
|
||||
};
|
||||
|
||||
struct ComparisonParams {
|
||||
// uint8 inference params.
|
||||
// uint8_t inference params.
|
||||
int left_shift;
|
||||
int32 input1_offset;
|
||||
int32 input1_multiplier;
|
||||
int32_t input1_offset;
|
||||
int32_t input1_multiplier;
|
||||
int input1_shift;
|
||||
int32 input2_offset;
|
||||
int32 input2_multiplier;
|
||||
int32_t input2_offset;
|
||||
int32_t input2_multiplier;
|
||||
int input2_shift;
|
||||
// Shape dependent / common to inference types.
|
||||
bool is_broadcast;
|
||||
@@ -810,81 +821,81 @@ struct ConvParams {
|
||||
PaddingType padding_type;
|
||||
PaddingValues padding_values;
|
||||
// TODO(starka): This was just "stride", so check that width+height is OK.
|
||||
int16 stride_width;
|
||||
int16 stride_height;
|
||||
int16 dilation_width_factor;
|
||||
int16 dilation_height_factor;
|
||||
// uint8 inference params.
|
||||
int16_t stride_width;
|
||||
int16_t stride_height;
|
||||
int16_t dilation_width_factor;
|
||||
int16_t dilation_height_factor;
|
||||
// uint8_t inference params.
|
||||
// TODO(b/65838351): Use smaller types if appropriate.
|
||||
int32 input_offset;
|
||||
int32 weights_offset;
|
||||
int32 output_offset;
|
||||
int32 output_multiplier;
|
||||
int32_t input_offset;
|
||||
int32_t weights_offset;
|
||||
int32_t output_offset;
|
||||
int32_t output_multiplier;
|
||||
int output_shift;
|
||||
// uint8, etc, activation params.
|
||||
int32 quantized_activation_min;
|
||||
int32 quantized_activation_max;
|
||||
// uint8_t, etc, activation params.
|
||||
int32_t quantized_activation_min;
|
||||
int32_t quantized_activation_max;
|
||||
// float activation params.
|
||||
float float_activation_min;
|
||||
float float_activation_max;
|
||||
};
|
||||
|
||||
struct DepthToSpaceParams {
|
||||
int32 block_size;
|
||||
int32_t block_size;
|
||||
};
|
||||
|
||||
struct DepthwiseParams {
|
||||
PaddingType padding_type;
|
||||
PaddingValues padding_values;
|
||||
int16 stride_width;
|
||||
int16 stride_height;
|
||||
int16 dilation_width_factor;
|
||||
int16 dilation_height_factor;
|
||||
int16 depth_multiplier;
|
||||
// uint8 inference params.
|
||||
int16_t stride_width;
|
||||
int16_t stride_height;
|
||||
int16_t dilation_width_factor;
|
||||
int16_t dilation_height_factor;
|
||||
int16_t depth_multiplier;
|
||||
// uint8_t inference params.
|
||||
// TODO(b/65838351): Use smaller types if appropriate.
|
||||
int32 input_offset;
|
||||
int32 weights_offset;
|
||||
int32 output_offset;
|
||||
int32 output_multiplier;
|
||||
int32_t input_offset;
|
||||
int32_t weights_offset;
|
||||
int32_t output_offset;
|
||||
int32_t output_multiplier;
|
||||
int output_shift;
|
||||
// uint8, etc, activation params.
|
||||
int32 quantized_activation_min;
|
||||
int32 quantized_activation_max;
|
||||
// uint8_t, etc, activation params.
|
||||
int32_t quantized_activation_min;
|
||||
int32_t quantized_activation_max;
|
||||
// float activation params.
|
||||
float float_activation_min;
|
||||
float float_activation_max;
|
||||
const int32* output_multiplier_per_channel;
|
||||
const int32* output_shift_per_channel;
|
||||
const int32_t* output_multiplier_per_channel;
|
||||
const int32_t* output_shift_per_channel;
|
||||
};
|
||||
|
||||
struct DequantizationParams {
|
||||
double scale;
|
||||
int32 zero_point;
|
||||
int32_t zero_point;
|
||||
};
|
||||
|
||||
struct PerChannelDequantizationParams {
|
||||
const float* scale;
|
||||
const int32* zero_point;
|
||||
int32 quantized_dimension;
|
||||
const int32_t* zero_point;
|
||||
int32_t quantized_dimension;
|
||||
};
|
||||
|
||||
struct FakeQuantParams {
|
||||
MinMax minmax;
|
||||
int32 num_bits;
|
||||
int32_t num_bits;
|
||||
};
|
||||
|
||||
struct FullyConnectedParams {
|
||||
// uint8 inference params.
|
||||
// uint8_t inference params.
|
||||
// TODO(b/65838351): Use smaller types if appropriate.
|
||||
int32 input_offset;
|
||||
int32 weights_offset;
|
||||
int32 output_offset;
|
||||
int32 output_multiplier;
|
||||
int32_t input_offset;
|
||||
int32_t weights_offset;
|
||||
int32_t output_offset;
|
||||
int32_t output_multiplier;
|
||||
int output_shift;
|
||||
// uint8, etc, activation params.
|
||||
int32 quantized_activation_min;
|
||||
int32 quantized_activation_max;
|
||||
// uint8_t, etc, activation params.
|
||||
int32_t quantized_activation_min;
|
||||
int32_t quantized_activation_max;
|
||||
// float activation params.
|
||||
float float_activation_min;
|
||||
float float_activation_max;
|
||||
@@ -895,16 +906,16 @@ struct FullyConnectedParams {
|
||||
};
|
||||
|
||||
struct GatherParams {
|
||||
int16 axis;
|
||||
int16_t axis;
|
||||
};
|
||||
|
||||
struct L2NormalizationParams {
|
||||
// uint8 inference params.
|
||||
int32 input_zero_point;
|
||||
// uint8_t inference params.
|
||||
int32_t input_zero_point;
|
||||
};
|
||||
|
||||
struct LocalResponseNormalizationParams {
|
||||
int32 range;
|
||||
int32_t range;
|
||||
double bias;
|
||||
double alpha;
|
||||
double beta;
|
||||
@@ -932,48 +943,50 @@ struct HardSwishParams {
|
||||
};
|
||||
|
||||
struct LogisticParams {
|
||||
// uint8 inference params.
|
||||
int32 input_zero_point;
|
||||
int32 input_range_radius;
|
||||
int32 input_multiplier;
|
||||
// uint8_t inference params.
|
||||
int32_t input_zero_point;
|
||||
int32_t input_range_radius;
|
||||
int32_t input_multiplier;
|
||||
int input_left_shift;
|
||||
};
|
||||
|
||||
struct LstmCellParams {
|
||||
int32 weights_zero_point;
|
||||
int32 accum_multiplier;
|
||||
int32_t weights_zero_point;
|
||||
int32_t accum_multiplier;
|
||||
int accum_shift;
|
||||
int state_integer_bits;
|
||||
};
|
||||
|
||||
struct MeanParams {
|
||||
int8 axis_count;
|
||||
int16 axis[4];
|
||||
int8_t axis_count;
|
||||
int16_t axis[4];
|
||||
};
|
||||
|
||||
struct PackParams {
|
||||
int8 axis;
|
||||
const int32* input_zeropoint;
|
||||
int8_t axis;
|
||||
const int32_t* input_zeropoint;
|
||||
const float* input_scale;
|
||||
uint16 inputs_count;
|
||||
int32 output_zeropoint;
|
||||
uint16_t inputs_count;
|
||||
int32_t output_zeropoint;
|
||||
float output_scale;
|
||||
};
|
||||
|
||||
struct PadParams {
|
||||
int8 left_padding_count;
|
||||
int32 left_padding[4];
|
||||
int8 right_padding_count;
|
||||
int32 right_padding[4];
|
||||
int8_t left_padding_count;
|
||||
int32_t left_padding[4];
|
||||
int8_t right_padding_count;
|
||||
int32_t right_padding[4];
|
||||
ResizingCategory resizing_category;
|
||||
};
|
||||
|
||||
struct PreluParams {
|
||||
int32 input_offset;
|
||||
int32 alpha_offset;
|
||||
int32 output_offset;
|
||||
int32 output_multiplier;
|
||||
int output_shift;
|
||||
int32_t input_offset;
|
||||
int32_t alpha_offset;
|
||||
int32_t output_offset;
|
||||
int32_t output_multiplier_1;
|
||||
int output_shift_1;
|
||||
int32_t output_multiplier_2;
|
||||
int output_shift_2;
|
||||
};
|
||||
|
||||
struct PoolParams {
|
||||
@@ -984,17 +997,17 @@ struct PoolParams {
|
||||
int stride_width;
|
||||
int filter_height;
|
||||
int filter_width;
|
||||
// uint8, etc, activation params.
|
||||
int32 quantized_activation_min;
|
||||
int32 quantized_activation_max;
|
||||
// uint8_t, etc, activation params.
|
||||
int32_t quantized_activation_min;
|
||||
int32_t quantized_activation_max;
|
||||
// float activation params.
|
||||
float float_activation_min;
|
||||
float float_activation_max;
|
||||
};
|
||||
|
||||
struct ReshapeParams {
|
||||
int8 shape_count;
|
||||
int32 shape[4];
|
||||
int8_t shape_count;
|
||||
int32_t shape[4];
|
||||
};
|
||||
|
||||
struct ResizeBilinearParams {
|
||||
@@ -1011,91 +1024,95 @@ struct ResizeNearestNeighborParams {
|
||||
};
|
||||
|
||||
struct SliceParams {
|
||||
int8 begin_count;
|
||||
int32 begin[4];
|
||||
int8 size_count;
|
||||
int32 size[4];
|
||||
int8_t begin_count;
|
||||
int32_t begin[4];
|
||||
int8_t size_count;
|
||||
int32_t size[4];
|
||||
};
|
||||
|
||||
struct SoftmaxParams {
|
||||
// beta is not really used (not a Tensorflow parameter) and not implemented
|
||||
// for LogSoftmax.
|
||||
double beta;
|
||||
// uint8 inference params. Used even when beta defaults to 1.0.
|
||||
int32 input_multiplier;
|
||||
int32 input_left_shift;
|
||||
// uint8_t inference params. Used even when beta defaults to 1.0.
|
||||
int32_t input_multiplier;
|
||||
int32_t input_left_shift;
|
||||
// Reverse scaling is only used by LogSoftmax.
|
||||
int32 reverse_scaling_divisor;
|
||||
int32 reverse_scaling_right_shift;
|
||||
int32_t reverse_scaling_divisor;
|
||||
int32_t reverse_scaling_right_shift;
|
||||
int diff_min;
|
||||
int32_t zero_point;
|
||||
float scale;
|
||||
float* table;
|
||||
// int16 LUT for exp(x), where x is uniformly distributed in [-10.0, 0.0].
int16_t* exp_lut;
// int16 LUT for 1 / (1 + x), where x is uniformly distributed in [0.0, 1.0].
|
||||
int16_t* one_over_one_plus_x_lut;
|
||||
uint8_t* uint8_table1;
|
||||
uint8_t* uint8_table2;
|
||||
};
|
||||
|
||||
struct SpaceToBatchParams {
|
||||
// "Zero" padding for uint8 means padding with the output offset.
|
||||
int32 output_offset;
|
||||
// "Zero" padding for uint8_t means padding with the output offset.
|
||||
int32_t output_offset;
|
||||
};
|
||||
|
||||
struct SpaceToDepthParams {
|
||||
int32 block_size;
|
||||
int32_t block_size;
|
||||
};
|
||||
|
||||
struct SplitParams {
|
||||
// Graphs that split into, say, 2000 nodes are encountered. The indices in
|
||||
// OperatorEdges are of type uint16.
|
||||
uint16 num_split;
|
||||
int16 axis;
|
||||
// OperatorEdges are of type uint16_t.
|
||||
uint16_t num_split;
|
||||
int16_t axis;
|
||||
};
|
||||
|
||||
struct SqueezeParams {
|
||||
int8 squeeze_dims_count;
|
||||
int32 squeeze_dims[4];
|
||||
int8_t squeeze_dims_count;
|
||||
int32_t squeeze_dims[4];
|
||||
};
|
||||
|
||||
struct StridedSliceParams {
|
||||
int8 start_indices_count;
|
||||
int32 start_indices[5];
|
||||
int8 stop_indices_count;
|
||||
int32 stop_indices[5];
|
||||
int8 strides_count;
|
||||
int32 strides[5];
|
||||
int8_t start_indices_count;
|
||||
int32_t start_indices[5];
|
||||
int8_t stop_indices_count;
|
||||
int32_t stop_indices[5];
|
||||
int8_t strides_count;
|
||||
int32_t strides[5];
|
||||
|
||||
int16 begin_mask;
|
||||
int16 ellipsis_mask;
|
||||
int16 end_mask;
|
||||
int16 new_axis_mask;
|
||||
int16 shrink_axis_mask;
|
||||
int16_t begin_mask;
|
||||
int16_t ellipsis_mask;
|
||||
int16_t end_mask;
|
||||
int16_t new_axis_mask;
|
||||
int16_t shrink_axis_mask;
|
||||
};
|
||||
|
||||
struct TanhParams {
|
||||
int32 input_zero_point;
|
||||
int32 input_range_radius;
|
||||
int32 input_multiplier;
|
||||
int32_t input_zero_point;
|
||||
int32_t input_range_radius;
|
||||
int32_t input_multiplier;
|
||||
int input_left_shift;
|
||||
};
|
||||
|
||||
struct TransposeParams {
|
||||
int8 perm_count;
|
||||
int32 perm[5];
|
||||
int8_t perm_count;
|
||||
int32_t perm[5];
|
||||
};
|
||||
|
||||
struct UnpackParams {
|
||||
uint16 num_split;
|
||||
int16 axis;
|
||||
uint16_t num_split;
|
||||
int16_t axis;
|
||||
};
|
||||
|
||||
struct LeakyReluParams {
|
||||
float alpha;
|
||||
int32 input_offset;
|
||||
int32 output_offset;
|
||||
int32 output_multiplier_alpha;
|
||||
int32 output_shift_alpha;
|
||||
int32 output_multiplier_identity;
|
||||
int32 output_shift_identity;
|
||||
int32_t input_offset;
|
||||
int32_t output_offset;
|
||||
int32_t output_multiplier_alpha;
|
||||
int32_t output_shift_alpha;
|
||||
int32_t output_multiplier_identity;
|
||||
int32_t output_shift_identity;
|
||||
};
|
||||
|
||||
template <typename P>
|
||||
@@ -1105,13 +1122,19 @@ inline void SetActivationParams(float min, float max, P* params) {
|
||||
}
|
||||
|
||||
template <typename P>
|
||||
inline void SetActivationParams(int32 min, int32 max, P* params) {
|
||||
inline void SetActivationParams(int32_t min, int32_t max, P* params) {
|
||||
params->quantized_activation_min = min;
|
||||
params->quantized_activation_max = max;
|
||||
}
|
||||
|
||||
template <typename P>
|
||||
inline void GetActivationParams(const P& params, int32* min, int32* max) {
|
||||
inline void SetActivationParams(int64_t min, int64_t max, P* params) {
|
||||
params->int64_activation_min = min;
|
||||
params->int64_activation_max = max;
|
||||
}
|
||||
|
||||
template <typename P>
|
||||
inline void GetActivationParams(const P& params, int32_t* min, int32_t* max) {
|
||||
*min = params.quantized_activation_min;
|
||||
*max = params.quantized_activation_max;
|
||||
}
|
||||
@@ -1122,6 +1145,11 @@ inline void GetActivationParams(const P& params, float* min, float* max) {
|
||||
*max = params.float_activation_max;
|
||||
}
|
||||
|
||||
template <typename P>
|
||||
inline void GetActivationParams(const P& params, int64_t* min, int64_t* max) {
|
||||
*min = params.int64_activation_min;
|
||||
*max = params.int64_activation_max;
|
||||
}
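The Set/Get pairs above let a templated kernel fetch the clamp bounds that match its element type; the new int64_t overloads complete the set. A hypothetical round trip:

#include <cstdint>

void Int64ActivationParamsExample() {
  tflite::ArithmeticParams params;
  tflite::SetActivationParams(static_cast<int64_t>(-100),
                              static_cast<int64_t>(100), &params);
  int64_t lo = 0, hi = 0;
  tflite::GetActivationParams(params, &lo, &hi);  // lo == -100, hi == 100
}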
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_TYPES_H_

@@ -14,15 +14,176 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/kernel_util.h"

#include <stdint.h>
#include <stdlib.h>

#include <algorithm>
#include <cmath>
#include <complex>
#include <limits>
#include <memory>

#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"

namespace tflite {

namespace {

// Assumes tensor_index is a valid index (in bounds)
inline TfLiteTensor* GetTensorAtIndex(const TfLiteContext* context,
int tensor_index) {
if (context->tensors != nullptr) {
return &context->tensors[tensor_index];
} else {
return context->GetTensor(context, tensor_index);
}
}

// Validate in a single place to reduce binary size
inline TfLiteStatus ValidateTensorIndexingSafe(const TfLiteContext* context,
int index, int max_size,
const int* tensor_indices,
int* tensor_index) {
if (index < 0 || index >= max_size) {
TF_LITE_KERNEL_LOG(const_cast<TfLiteContext*>(context),
"Invalid tensor index %d (not in [0, %d))\n", index,
max_size);
return kTfLiteError;
}
if (tensor_indices[index] == kTfLiteOptionalTensor) {
TF_LITE_KERNEL_LOG(const_cast<TfLiteContext*>(context),
"Tensor at index %d was optional but was expected\n",
index);
return kTfLiteError;
}

*tensor_index = tensor_indices[index];
return kTfLiteOk;
}

// Same as above but returns -1 for invalid inputs instead of status + logging
// error.
inline int ValidateTensorIndexing(const TfLiteContext* context, int index,
int max_size, const int* tensor_indices) {
if (index >= 0 && index < max_size) {
const int tensor_index = tensor_indices[index];
if (tensor_index != kTfLiteOptionalTensor) {
return tensor_index;
}
}
return -1;
}

inline TfLiteTensor* GetMutableInput(const TfLiteContext* context,
const TfLiteNode* node, int index) {
const int tensor_index = ValidateTensorIndexing(
context, index, node->inputs->size, node->inputs->data);
if (tensor_index < 0) {
return nullptr;
}
return GetTensorAtIndex(context, tensor_index);
}

inline TfLiteStatus GetMutableInputSafe(const TfLiteContext* context,
const TfLiteNode* node, int index,
const TfLiteTensor** tensor) {
int tensor_index;
TF_LITE_ENSURE_OK(
context, ValidateTensorIndexingSafe(context, index, node->inputs->size,
node->inputs->data, &tensor_index));
*tensor = GetTensorAtIndex(context, tensor_index);
return kTfLiteOk;
}

} // anonymous namespace.

const TfLiteTensor* GetInput(const TfLiteContext* context,
const TfLiteNode* node, int index) {
return GetMutableInput(context, node, index);
}

TfLiteStatus GetInputSafe(const TfLiteContext* context, const TfLiteNode* node,
int index, const TfLiteTensor** tensor) {
return GetMutableInputSafe(context, node, index, tensor);
}

TfLiteTensor* GetVariableInput(TfLiteContext* context, const TfLiteNode* node,
int index) {
TfLiteTensor* tensor = GetMutableInput(context, node, index);
return tensor->is_variable ? tensor : nullptr;
}

TfLiteTensor* GetOutput(TfLiteContext* context, const TfLiteNode* node,
int index) {
const int tensor_index = ValidateTensorIndexing(
context, index, node->outputs->size, node->outputs->data);
if (tensor_index < 0) {
return nullptr;
}
return GetTensorAtIndex(context, tensor_index);
}

TfLiteStatus GetOutputSafe(const TfLiteContext* context, const TfLiteNode* node,
int index, TfLiteTensor** tensor) {
int tensor_index;
TF_LITE_ENSURE_OK(
context, ValidateTensorIndexingSafe(context, index, node->outputs->size,
node->outputs->data, &tensor_index));
*tensor = GetTensorAtIndex(context, tensor_index);
return kTfLiteOk;
}

const TfLiteTensor* GetOptionalInputTensor(const TfLiteContext* context,
const TfLiteNode* node, int index) {
return GetInput(context, node, index);
}

#ifndef TF_LITE_STATIC_MEMORY
TfLiteTensor* GetTemporary(TfLiteContext* context, const TfLiteNode* node,
int index) {
const int tensor_index = ValidateTensorIndexing(
context, index, node->temporaries->size, node->temporaries->data);
if (tensor_index < 0) {
return nullptr;
}
return GetTensorAtIndex(context, tensor_index);
}

TfLiteStatus GetTemporarySafe(const TfLiteContext* context,
const TfLiteNode* node, int index,
TfLiteTensor** tensor) {
int tensor_index;
TF_LITE_ENSURE_OK(context, ValidateTensorIndexingSafe(
context, index, node->temporaries->size,
node->temporaries->data, &tensor_index));
*tensor = GetTensorAtIndex(context, tensor_index);
return kTfLiteOk;
}

const TfLiteTensor* GetIntermediates(TfLiteContext* context,
const TfLiteNode* node, int index) {
const int tensor_index = ValidateTensorIndexing(
context, index, node->intermediates->size, node->intermediates->data);
if (tensor_index < 0) {
return nullptr;
}
return GetTensorAtIndex(context, tensor_index);
}

TfLiteStatus GetIntermediatesSafe(const TfLiteContext* context,
const TfLiteNode* node, int index,
TfLiteTensor** tensor) {
int tensor_index;
TF_LITE_ENSURE_OK(context, ValidateTensorIndexingSafe(
context, index, node->intermediates->size,
node->intermediates->data, &tensor_index));
*tensor = GetTensorAtIndex(context, tensor_index);
return kTfLiteOk;
}
#endif // TF_LITE_STATIC_MEMORY

// Per-axis
TfLiteStatus PopulateConvolutionQuantizationParams(
TfLiteContext* context, const TfLiteTensor* input,
@@ -126,11 +287,27 @@ TfLiteStatus GetQuantizedConvolutionMultipler(TfLiteContext* context,
// pipeline.
if (bias) {
const double bias_scale = static_cast<double>(bias->params.scale);
// Here we're making sure the input_product_scale & bias_scale the same.
// Normally this should be guaranteed by the training pipeline, we are
// setting the threshold to be 2e-6 to allow some numeric stability
// difference.
TF_LITE_ENSURE(context, std::abs(input_product_scale - bias_scale) <= 2e-6);
// Here we're making sure the input_product_scale & bias_scale are about the
// same. Since we have:
// (output - output_zp) * output_scale =
// input_product_scale * input_product + bias * bias_scale ---- (0)
//
// (0) equals:
// (input_product + bias) * input_product_scale ----- (1)
// +
// bias * (bias_scale - input_product_scale) ------ (2)
//
// For the real kernel computation, we're doing (1), so we really need to
// make sure (2) has minimum impact on the output, so:
// bias * (bias_scale - input_product_scale) / output_scale should be
// a small number for an integer.
// Since normally bias should be within a small range.
// We should expect (bias_scale - input_product_scale) / output_scale to
// be a small number like 0.02.
const double scale_diff = std::abs(input_product_scale - bias_scale);
const double output_scale = static_cast<double>(output->params.scale);

TF_LITE_ENSURE(context, scale_diff / output_scale <= 0.02);
}
return GetQuantizedConvolutionMultipler(context, input, filter, output,
multiplier);
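// [Editor's note] Standalone worked example (made-up numbers, not part of the
// diff) of the relaxed check above. These scales would fail the old absolute
// 2e-6 threshold but pass the new relative one, because term (2) stays small
// compared to the output scale. Assumes the <cmath> include added earlier in
// this file.
inline bool ExampleScaleCheckPasses() {
  const double input_product_scale = 1.0e-3;   // input_scale * filter_scale
  const double bias_scale = 1.05e-3;           // off by 5e-5 (old check: fail)
  const double output_scale = 5.0e-3;
  const double scale_diff = std::abs(input_product_scale - bias_scale);
  // 5e-5 / 5e-3 = 0.01 <= 0.02, so the relative check accepts this model.
  return scale_diff / output_scale <= 0.02;    // true
}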
@@ -167,7 +344,7 @@ void CalculateActivationRangeQuantizedImpl(TfLiteFusedActivation activation,
} else if (activation == kTfLiteActRelu6) {
*act_min = std::max(qmin, quantize(0.0));
*act_max = std::min(qmax, quantize(6.0));
} else if (activation == kTfLiteActRelu1) {
} else if (activation == kTfLiteActReluN1To1) {
*act_min = std::max(qmin, quantize(-1.0));
*act_max = std::min(qmax, quantize(1.0));
} else {
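// [Editor's note] Sketch (made-up scale/zero-point, not part of the diff) of
// what the quantize() helper above computes for an int8 tensor, and how the
// Relu6 bounds land inside [qmin, qmax]. Assumes the <algorithm>, <cmath> and
// <stdint.h> includes added earlier in this file; TfLiteRound from cppmath.h
// could stand in for std::round.
inline int32_t ExampleQuantizeBound(float bound, float scale, int32_t zero_point) {
  // q = round(bound / scale) + zero_point
  return zero_point + static_cast<int32_t>(std::round(bound / scale));
}

inline void ExampleRelu6Range(int32_t* act_min, int32_t* act_max) {
  const float scale = 0.05f;       // example quantization scale
  const int32_t zero_point = -10;  // example zero point
  const int32_t qmin = -128, qmax = 127;
  *act_min = std::max(qmin, ExampleQuantizeBound(0.0f, scale, zero_point));  // -10
  *act_max = std::min(qmax, ExampleQuantizeBound(6.0f, scale, zero_point));  // 110
}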
@@ -258,4 +435,44 @@ TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
}
#endif // TF_LITE_STATIC_MEMORY

// Size of string is not constant, return 0 in such case.
int TfLiteTypeGetSize(TfLiteType type) {
switch (type) {
case kTfLiteUInt8:
TF_LITE_ASSERT_EQ(sizeof(uint8_t), 1);
return 1;
case kTfLiteInt8:
TF_LITE_ASSERT_EQ(sizeof(int8_t), 1);
return 1;
case kTfLiteBool:
return sizeof(bool);
case kTfLiteInt16:
TF_LITE_ASSERT_EQ(sizeof(int16_t), 2);
return 2;
case kTfLiteFloat16:
TF_LITE_ASSERT_EQ(sizeof(int16_t), 2);
return 2;
case kTfLiteFloat32:
TF_LITE_ASSERT_EQ(sizeof(float), 4);
return 4;
case kTfLiteInt32:
TF_LITE_ASSERT_EQ(sizeof(int32_t), 4);
return 4;
case kTfLiteInt64:
TF_LITE_ASSERT_EQ(sizeof(int64_t), 8);
return 8;
case kTfLiteFloat64:
TF_LITE_ASSERT_EQ(sizeof(double), 8);
return 8;
case kTfLiteComplex64:
TF_LITE_ASSERT_EQ(sizeof(std::complex<float>), 8);
return 8;
case kTfLiteComplex128:
TF_LITE_ASSERT_EQ(sizeof(std::complex<double>), 16);
return 16;
default:
return 0;
}
}
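// [Editor's note] Illustrative sketch (not part of the diff): a typical use of
// TfLiteTypeGetSize is computing a tensor's payload size from its element
// count, e.g. when sizing scratch buffers. ExampleTensorBytes is a made-up
// helper name; size_t comes from <stdlib.h> included above.
inline size_t ExampleTensorBytes(TfLiteType type, const TfLiteIntArray* dims) {
  size_t bytes = TfLiteTypeGetSize(type);  // 0 for kTfLiteString and unknown types
  for (int i = 0; i < dims->size; ++i) {
    bytes *= static_cast<size_t>(dims->data[i]);
  }
  return bytes;  // e.g. kTfLiteFloat32 with dims {1, 28, 28} -> 4 * 784 = 3136
}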

} // namespace tflite

@@ -15,52 +15,148 @@ limitations under the License.
#ifndef TENSORFLOW_LITE_KERNELS_KERNEL_UTIL_H_
#define TENSORFLOW_LITE_KERNELS_KERNEL_UTIL_H_

#include <algorithm>
#include <stdint.h>

#include <limits>

#include "flatbuffers/flatbuffers.h"
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"

namespace tflite {

// A fair number of functions in this header have historically been inline.
// It is ok to change functions to not be inline if the latency with
// benchmark_model for MobileNet + MobileBERT is unaffected. If such a change is
// made, move the newly non-inlined function declarations to the top of this
// header file.

// Note: You must check if result is not null:
//
// TfLiteTensor* my_tensor = GetInput(context, node, kMyTensorIdx);
// TF_LITE_ENSURE(context, my_tensor != nullptr);
//
// This is because the index might point to the optional tensor constant
// (kTfLiteOptionalTensor) in which case there is no tensor to return.
const TfLiteTensor* GetInput(const TfLiteContext* context,
const TfLiteNode* node, int index);

// Same as `GetInput` but returns boolean and uses output argument for tensor.
//
// TfLiteTensor* my_tensor;
// TF_LITE_ENSURE_OK(context,
// GetInputSafe(context, node, kMyTensorIdx, &my_tensor));
// // can use my_tensor directly from here onwards, it is not nullptr
//
// Should be used in cases where the binary size is too large.
TfLiteStatus GetInputSafe(const TfLiteContext* context, const TfLiteNode* node,
int index, const TfLiteTensor** tensor);

// Note: You must check if result is not null:
//
// TfLiteTensor* my_tensor = GetVariableInput(context, node, kMyTensorIdx);
// TF_LITE_ENSURE(context, my_tensor != nullptr);
//
// This is because the index might point to the optional tensor constant
// (kTfLiteOptionalTensor) in which case there is no tensor to return.
TfLiteTensor* GetVariableInput(TfLiteContext* context, const TfLiteNode* node,
int index);

// Note: You must check if result is not null:
//
// TfLiteTensor* my_tensor = GetOutput(context, node, kMyTensorIdx);
// TF_LITE_ENSURE(context, my_tensor != nullptr);
//
// This is because the index might point to the optional tensor constant
// (kTfLiteOptionalTensor) in which case there is no tensor to return.
TfLiteTensor* GetOutput(TfLiteContext* context, const TfLiteNode* node,
int index);

// Same as `GetOutput` but returns boolean and uses output argument for tensor.
//
// TfLiteTensor* my_tensor;
// TF_LITE_ENSURE_OK(context,
// GetOutputSafe(context, node, kMyTensorIdx, &my_tensor));
// // can use my_tensor directly from here onwards, it is not nullptr
//
// Should be used in cases where the binary size is too large.
TfLiteStatus GetOutputSafe(const TfLiteContext* context, const TfLiteNode* node,
int index, TfLiteTensor** tensor);

// Note: You must check if result is not null:
//
// TfLiteTensor* my_tensor = GetOptionalInputTensor(context, node, kIdx);
// TF_LITE_ENSURE(context, my_tensor != nullptr);
//
// This is because the index might point to the optional tensor constant
// (kTfLiteOptionalTensor) in which case there is no tensor to return.
//
// Deprecated. GetInput has the same functionality.
const TfLiteTensor* GetOptionalInputTensor(const TfLiteContext* context,
const TfLiteNode* node, int index);

#ifndef TF_LITE_STATIC_MEMORY
// Note: You must check if result is not null:
//
// TfLiteTensor* my_tensor = GetTemporary(context, node, kMyTensorIdx);
// TF_LITE_ENSURE(context, my_tensor != nullptr);
//
// This is because the index might point to the optional tensor constant
// (kTfLiteOptionalTensor) in which case there is no tensor to return.
TfLiteTensor* GetTemporary(TfLiteContext* context, const TfLiteNode* node,
int index);

// Same as `GetTemporary` but returns boolean and uses output argument for
// tensor.
//
// TfLiteTensor* my_tensor;
// TF_LITE_ENSURE_OK(context,
// GetTemporarySafe(context, node, kMyTensorIdx,
// &my_tensor));
// // can use my_tensor directly from here onwards, it is not nullptr
//
// Should be used in cases where the binary size is too large.
TfLiteStatus GetTemporarySafe(const TfLiteContext* context,
const TfLiteNode* node, int index,
TfLiteTensor** tensor);

// Note: You must check if result is not null:
//
// TfLiteTensor* my_tensor = GetIntermediates(context, node, kMyTensorIdx);
// TF_LITE_ENSURE(context, my_tensor != nullptr);
//
// This is because the index might point to the optional tensor constant
// (kTfLiteOptionalTensor) in which case there is no tensor to return.
const TfLiteTensor* GetIntermediates(TfLiteContext* context,
const TfLiteNode* node, int index);

// Same as `GetIntermediates` but returns boolean and uses output argument for
// tensor.
//
// TfLiteTensor* my_tensor;
// TF_LITE_ENSURE_OK(context,
// GetIntermediatesSafe(context, node, kMyTensorIdx,
// &my_tensor));
// // can use my_tensor directly from here onwards, it is not nullptr
//
// Should be used in cases where the binary size is too large.
TfLiteStatus GetIntermediatesSafe(const TfLiteContext* context,
const TfLiteNode* node, int index,
TfLiteTensor** tensor);
#endif // TF_LITE_STATIC_MEMORY

inline int NumDimensions(const TfLiteTensor* t) { return t->dims->size; }
inline int SizeOfDimension(const TfLiteTensor* t, int dim) {
return t->dims->data[dim];
}
inline const TfLiteTensor* GetInput(TfLiteContext* context,
const TfLiteNode* node, int index) {
return &context
->tensors[flatbuffers::EndianScalar(node->inputs->data[index])];
}
// Note: You must check if result is not null:
// TfLiteTensor* my_tensor = GetVariableInput(context, node, kMyTensorIdx);
// TF_LITE_ENSURE(context, my_tensor != nullptr);
inline TfLiteTensor* GetVariableInput(TfLiteContext* context,
const TfLiteNode* node, int index) {
TfLiteTensor* tensor =
&context->tensors[flatbuffers::EndianScalar(node->inputs->data[index])];
return (tensor->is_variable) ? tensor : nullptr;
}
inline TfLiteTensor* GetOutput(TfLiteContext* context, const TfLiteNode* node,
int index) {
return &context
->tensors[flatbuffers::EndianScalar(node->outputs->data[index])];
}
inline TfLiteTensor* GetTemporary(TfLiteContext* context,
const TfLiteNode* node, int index) {
return &context->tensors[flatbuffers::EndianScalar(
node->temporaries->data[index])];
}
inline const TfLiteTensor* GetIntermediates(TfLiteContext* context,
const TfLiteNode* node, int index) {
return &context->tensors[node->intermediates->data[index]];
}

inline int NumInputs(const TfLiteNode* node) { return node->inputs->size; }
inline int NumOutputs(const TfLiteNode* node) { return node->outputs->size; }

#ifndef TF_LITE_STATIC_MEMORY
inline int NumIntermediates(const TfLiteNode* node) {
return node->intermediates->size;
}
#endif // TF_LITE_STATIC_MEMORY

inline int64_t NumElements(const TfLiteIntArray* dims) {
int64_t count = 1;
@@ -74,19 +170,11 @@ inline int64_t NumElements(const TfLiteTensor* t) {
return NumElements(t->dims);
}

inline const TfLiteTensor* GetOptionalInputTensor(TfLiteContext* context,
const TfLiteNode* node,
int index) {
const bool use_tensor = index < node->inputs->size &&
node->inputs->data[index] != kTfLiteOptionalTensor;
if (use_tensor) {
return &context
->tensors[flatbuffers::EndianScalar(node->inputs->data[index])];
}
return nullptr;
}

// Determines whether tensor is constant.
// TODO(b/138199592): Introduce new query which checks for constant OR
// persistent-read-only, which would be useful for most tensor kernels that
// are potentially dynamic based on the input tensor value availability at the
// time of prepare.
inline bool IsConstantTensor(const TfLiteTensor* tensor) {
return tensor->allocation_type == kTfLiteMmapRo;
}
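// [Editor's note] Illustrative sketch (not part of the diff): kernels commonly
// use IsConstantTensor in Prepare() to decide whether the output can be
// resized now or must stay dynamic until Eval(). ExamplePrepare and the tensor
// indices are made-up; the resize branch is elided.
inline TfLiteStatus ExamplePrepare(TfLiteContext* context, TfLiteNode* node) {
  const TfLiteTensor* shape;
  TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, /*index=*/1, &shape));
  TfLiteTensor* output;
  TF_LITE_ENSURE_OK(context, GetOutputSafe(context, node, /*index=*/0, &output));
  if (!IsConstantTensor(shape)) {
    // Shape only known at Eval() time: defer allocation.
    SetTensorToDynamic(output);
    return kTfLiteOk;
  }
  // Shape is baked into the model: resize the output here instead (elided).
  return kTfLiteOk;
}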
@@ -105,6 +193,14 @@ inline void SetTensorToDynamic(TfLiteTensor* tensor) {
}
}

// Sets tensor to persistent and read-only.
inline void SetTensorToPersistentRo(TfLiteTensor* tensor) {
if (tensor->allocation_type != kTfLitePersistentRo) {
tensor->allocation_type = kTfLitePersistentRo;
tensor->data.raw = nullptr;
}
}

// Determines whether it is a hybrid op - one that has float inputs and
// quantized weights.
inline bool IsHybridOp(const TfLiteTensor* input, const TfLiteTensor* weight) {
@@ -162,7 +258,7 @@ void CalculateActivationRange(TfLiteFusedActivation activation,
} else if (activation == kTfLiteActRelu6) {
*activation_min = 0;
*activation_max = 6;
} else if (activation == kTfLiteActRelu1) {
} else if (activation == kTfLiteActReluN1To1) {
*activation_min = -1;
*activation_max = 1;
} else {
@@ -188,6 +284,10 @@ TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
const TfLiteTensor* input2,
const TfLiteTensor* input3,
TfLiteIntArray** output_shape);

// Return the size of given type in bytes. Return 0 in case of string.
int TfLiteTypeGetSize(TfLiteType type);

} // namespace tflite

#endif // TENSORFLOW_LITE_KERNELS_KERNEL_UTIL_H_

@@ -19,7 +19,7 @@ limitations under the License.
// non-portable function.
#ifdef TF_LITE_MCU_DEBUG_LOG

#include "tensorflow/lite/micro/micro_error_reporter.h"
#include "tensorflow/lite/micro/debug_log.h"

#define DEBUG_LOG(x) \
do { \
@@ -36,7 +36,6 @@ inline void InfiniteLoop() {

#else // TF_LITE_MCU_DEBUG_LOG

#include <cassert>
#include <cstdio>
#include <cstdlib>

@@ -45,6 +44,15 @@ inline void InfiniteLoop() {
fprintf(stderr, "%s", (x)); \
} while (0)

// Report Error for unsupported type by op 'op_name' and returns kTfLiteError.
#define TF_LITE_UNSUPPORTED_TYPE(context, type, op_name) \
do { \
TF_LITE_KERNEL_LOG((context), "%s:%d Type %s is unsupported by op %s.", \
__FILE__, __LINE__, TfLiteTypeGetName(type), \
(op_name)); \
return kTfLiteError; \
} while (0)
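// [Editor's note] Illustrative sketch (not part of the diff) of how a kernel's
// Eval() might use the new macro in its type dispatch. ExampleEval and the
// per-type branches are made-up; it assumes kernel_util.h's GetInputSafe is
// available in the including translation unit.
TfLiteStatus ExampleEval(TfLiteContext* context, TfLiteNode* node) {
  const TfLiteTensor* input;
  TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, /*index=*/0, &input));
  switch (input->type) {
    case kTfLiteFloat32:
      // ... float path ...
      return kTfLiteOk;
    case kTfLiteInt8:
      // ... int8 path ...
      return kTfLiteOk;
    default:
      // Logs file/line, the type name, and the op name, then returns
      // kTfLiteError from ExampleEval.
      TF_LITE_UNSUPPORTED_TYPE(context, input->type, "ExampleOp");
  }
}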

#define TFLITE_ABORT abort()

#endif // TF_LITE_MCU_DEBUG_LOG