Mirror of https://github.com/jomjol/AI-on-the-edge-device.git (synced 2025-12-10 21:46:55 +03:00)
Commit: rolling 20210708
@@ -279,81 +279,125 @@ inline Integer FloorLog2(Integer n) {
  }
}

// generate INT16 LUT for function(), e.g., table exp(x) and 1/(1+x) used in
// softmax
// func - the function to build the LUT for (e.g exp(x))
// min,max - table limits
// table - pointer to buffer
// num - number of elements in the LUT
inline void gen_lut(double (*func)(double), double min, double max,
                    int16_t* table, const int num) {
  // size of table should equal to num + 1
  // last element only for slope calculation
  double step = (max - min) / (num - 1);
  double half_step = step / 2.0;
  for (int i = 0; i < num - 1; i++) {
    double sample_val = TfLiteRound(func(min + i * step) * 32768.0);
    double midpoint_interp_val =
        TfLiteRound((func(min + (i + 1) * step) * 32768.0 +
                     TfLiteRound(func(min + i * step) * 32768.0)) /
                    2.0);
    double midpoint_val =
        TfLiteRound(func(min + i * step + half_step) * 32768.0);
    double midpoint_err = midpoint_interp_val - midpoint_val;
    double bias = TfLiteRound(midpoint_err / 2.0);
    table[i] = std::min<double>(std::max<double>(sample_val - bias, -32768.0),
                                32767.0);
  }
  table[num - 1] = std::min<double>(
      std::max<double>(TfLiteRound(func(max) * 32768.0), -32768.0), 32767.0);
// The size of the LUT depends on the type of input. For int8 inputs a simple
// 256 entries LUT is used. For int16 inputs the high 9 bits are used for
// indexing and the 7 remaining bits are used for interpolation. We thus use a
// 513-entries LUT for int16 cases, 512 for the 9-bit indexing and 1 extra entry
// to interpolate the last value.
template <typename LutInT>
constexpr int lut_size() {
  static_assert(std::is_same<LutInT, int8_t>::value ||
                    std::is_same<LutInT, int16_t>::value,
                "Only LUTs with int8 or int16 inputs are supported.");
  return std::is_same<LutInT, int8_t>::value ? 256 : 513;
}
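Note: the 256/513 sizing above follows from how an int16 lookup key is split: the top 9 bits select one of 512 base entries and the low 7 bits interpolate toward the next entry, so one extra entry is stored past the end. A short sketch of that split (it only restates what lut_lookup_with_interpolation does further down in this hunk; no new API is introduced):

    // 'value' is the signed int16 lookup key.
    const uint16_t index = static_cast<uint16_t>(256 + (value >> 7));  // 9-bit base index, 0..511
    const int16_t offset = value & 0x7f;                               // 7-bit fraction, 0..127
    // The result blends lut[index] and lut[index + 1] by offset / 128.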

// generate INT16 LUT for function(), e.g., table exp(x) and 1/(1+x) used in
// softmax
// func - the function to build the LUT for (e.g exp(x))
// min,max - table limits
// table - pointer to buffer
// num - number of elements in the LUT
inline void gen_lut(float (*func)(float), float min, float max, int16_t* table,
                    const int num) {
  // size of table should equal to num + 1
  // last element only for slope calculation
  float step = (max - min) / (num - 1);
  float half_step = step / 2.0f;
  for (int i = 0; i < num - 1; i++) {
    float sample_val = TfLiteRound(func(min + i * step) * 32768.0f);
    float midpoint_interp_val =
        TfLiteRound((func(min + (i + 1) * step) * 32768.0f +
                     TfLiteRound(func(min + i * step) * 32768.0f)) /
                    2.0f);
    float midpoint_val =
        TfLiteRound(func(min + i * step + half_step) * 32768.0f);
    float midpoint_err = midpoint_interp_val - midpoint_val;
    float bias = TfLiteRound(midpoint_err / 2.0f);
    table[i] = std::min<float>(std::max<float>(sample_val - bias, -32768.0f),
                               32767.0f);
// Generate a LUT for 'func' which can be used to approximate functions like
// exp, log, ...
//
// - func: the function to build the LUT for (e.g exp(x))
// - input_min, input_max: range of the func inputs
// - output_min, output_max: range of the func outputs
// - lut: pointer to the LUT table to fill, the table must be of size
//   lut_size<LutInT>()
template <typename FloatT, typename LutInT, typename LutOutT>
inline void gen_lut(FloatT (*func)(FloatT), FloatT input_min, FloatT input_max,
                    FloatT output_min, FloatT output_max, LutOutT* lut) {
  static_assert(std::is_same<LutInT, int8_t>::value ||
                    std::is_same<LutInT, int16_t>::value,
                "Only LUTs with int8 or int16 inputs are supported.");
  static_assert(std::is_same<LutOutT, int8_t>::value ||
                    std::is_same<LutOutT, int16_t>::value,
                "Only LUTs with int8 or int16 outputs are supported.");
  static_assert(std::is_floating_point<FloatT>::value,
                "FloatT must be a floating-point type.");

  const int nb_steps = std::is_same<LutInT, int8_t>::value ? 256 : 512;
  const FloatT step = (input_max - input_min) / nb_steps;
  const FloatT half_step = step / 2;
  const FloatT output_scaling_inv =
      static_cast<FloatT>(std::numeric_limits<LutOutT>::max() -
                          std::numeric_limits<LutOutT>::min() + 1) /
      (output_max - output_min);
  const FloatT table_min =
      static_cast<FloatT>(std::numeric_limits<LutOutT>::min());
  const FloatT table_max =
      static_cast<FloatT>(std::numeric_limits<LutOutT>::max());

  for (int i = 0; i < nb_steps; i++) {
    const FloatT val = func(input_min + i * step);
    const FloatT val_midpoint = func(input_min + i * step + half_step);
    const FloatT val_next = func(input_min + (i + 1) * step);

    const FloatT sample_val = TfLiteRound(val * output_scaling_inv);
    const FloatT midpoint_interp_val =
        TfLiteRound((val_next * output_scaling_inv +
                     TfLiteRound(val * output_scaling_inv)) /
                    2);
    const FloatT midpoint_val = TfLiteRound(val_midpoint * output_scaling_inv);
    const FloatT midpoint_err = midpoint_interp_val - midpoint_val;
    const FloatT bias = TfLiteRound(midpoint_err / 2);

    lut[i] = static_cast<LutOutT>(std::min<FloatT>(
        std::max<FloatT>(sample_val - bias, table_min), table_max));
  }

  const bool with_extra_interpolation_value =
      std::is_same<LutInT, int16_t>::value;
  if (with_extra_interpolation_value) {
    lut[nb_steps] = static_cast<LutOutT>(std::min<FloatT>(
        std::max<FloatT>(TfLiteRound(func(input_max) * output_scaling_inv),
                         table_min),
        table_max));
  }
  table[num - 1] = std::min<float>(
      std::max<float>(TfLiteRound(func(max) * 32768.0f), -32768.0f), 32767.0f);
}

// int16_t func table lookup, e.g., lookup exp() and 1/(1+x) used in softmax
inline int16_t generic_int16_table_lookup(int16_t value, const int16_t* lut) {
  // 512 base value, lut[513] only for calculate slope
  uint16_t index = static_cast<uint16_t>(256 + (value >> 7));
// LUT must have 513 values
template <typename LutOutT>
inline LutOutT lut_lookup_with_interpolation(int16_t value,
                                             const LutOutT* lut) {
  static_assert(std::is_same<LutOutT, int8_t>::value ||
                    std::is_same<LutOutT, int16_t>::value,
                "Only LUTs with int8 or int16 outputs are supported.");
  // 512 base values, lut[513] is only used to calculate the slope
  const uint16_t index = static_cast<uint16_t>(256 + (value >> 7));
  assert(index < 512 && "LUT index out of range.");
  int16_t offset = value & 0x7f;
  const int16_t offset = value & 0x7f;

  // base and slope are Q0.15
  int16_t base = lut[index];
  int16_t slope = lut[index + 1] - lut[index];
  // Base and slope are Q0.x
  const LutOutT base = lut[index];
  const LutOutT slope = lut[index + 1] - lut[index];

  // Q0.15 * Q0.7 = Q0.22
  // Round and convert from Q0.22 to Q0.15
  int32_t delta = (static_cast<int32_t>(slope) * offset + 64) >> 7;
  // Q0.x * Q0.7 = Q0.(x + 7)
  // Round and convert from Q0.(x + 7) to Q0.x
  const int delta = (slope * offset + 64) >> 7;

  // Q0.15 + Q0.15
  return base + delta;
  return static_cast<LutOutT>(base + delta);
}

// int16_t -> int16_t table lookup with interpolation
// LUT must have 513 values
inline int16_t lut_lookup(int16_t value, const int16_t* lut) {
  return lut_lookup_with_interpolation(value, lut);
}

// int16_t -> int8_t table lookup with interpolation
// LUT must have 513 values
inline int8_t lut_lookup(int16_t value, const int8_t* lut) {
  return lut_lookup_with_interpolation(value, lut);
}

// int8_t -> int8_t table lookup without interpolation
// LUT must have 256 values
inline int8_t lut_lookup(int8_t value, const int8_t* lut) {
  return lut[128 + value];
}

// int8_t -> int16_t table lookup without interpolation
// LUT must have 256 values
inline int16_t lut_lookup(int8_t value, const int16_t* lut) {
  return lut[128 + value];
}
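Note: a minimal usage sketch of the templated gen_lut / lut_lookup pair above. The exp() ranges are illustrative (chosen to resemble how an int16 softmax would build its table), not taken from this commit:

    int16_t exp_lut[513];  // lut_size<int16_t>() entries
    gen_lut<float, int16_t, int16_t>(
        [](float value) { return std::exp(value); }, /*input_min=*/-10.0f,
        /*input_max=*/0.0f, /*output_min=*/-1.0f, /*output_max=*/1.0f, exp_lut);
    // Interpolated lookup for one quantized int16 activation value.
    const int16_t approx_exp = lut_lookup(static_cast<int16_t>(-12345), exp_lut);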

// Table of sigmoid(i/24) at 0.16 format - 256 elements.
@@ -575,7 +619,8 @@ log_x_for_x_greater_than_or_equal_to_1_impl(
  //                         InputIntegerBits - z_b_headroom - 0.25);
  const FixedPointAccum z_a_pow_2_adj = SaturatingAddNonGemmlowp(
      FixedPointAccum::FromRaw(SaturatingRoundingMultiplyByPOTParam(
          InputIntegerBits - z_a_headroom_plus_1, 31 - kAccumIntegerBits)),
          static_cast<int32_t>(InputIntegerBits - z_a_headroom_plus_1),
          31 - kAccumIntegerBits)),
      shifted_quarter);

  // z_b is treated like z_a, but premultiplying by sqrt(0.5).
@@ -585,7 +630,8 @@ log_x_for_x_greater_than_or_equal_to_1_impl(
      SaturatingRoundingMultiplyByPOTParam(z_a.raw(), z_b_headroom);
  const FixedPointAccum z_b_pow_2_adj = SaturatingSub(
      FixedPointAccum::FromRaw(SaturatingRoundingMultiplyByPOTParam(
          InputIntegerBits - z_b_headroom, 31 - kAccumIntegerBits)),
          static_cast<int32_t>(InputIntegerBits - z_b_headroom),
          31 - kAccumIntegerBits)),
      shifted_quarter);

  const FixedPoint0 r = FixedPoint0::FromRaw(std::min(r_a_raw, r_b_raw));

@@ -19,9 +19,8 @@ limitations under the License.

namespace tflite {

#if defined(TF_LITE_USE_GLOBAL_CMATH_FUNCTIONS) || \
    (defined(__ANDROID__) && !defined(__NDK_MAJOR__)) || defined(ARDUINO) || \
    defined(__ZEPHYR__)
#if defined(TF_LITE_USE_GLOBAL_CMATH_FUNCTIONS) || \
    (defined(__ANDROID__) && !defined(__NDK_MAJOR__)) || defined(__ZEPHYR__)
#define TF_LITE_GLOBAL_STD_PREFIX
#else
#define TF_LITE_GLOBAL_STD_PREFIX std
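Note: the empty-vs-std prefix only matters where it is spliced in front of <cmath> calls. The usual consumption pattern in cppmath.h looks roughly like the sketch below; the macro name is an assumption about the upstream file and is not part of this diff:

    #define DECLARE_STD_GLOBAL_SWITCH1(tf_name, std_name) \
      template <class T>                                  \
      inline T tf_name(const T x) {                       \
        return TF_LITE_GLOBAL_STD_PREFIX::std_name(x);    \
      }

    DECLARE_STD_GLOBAL_SWITCH1(TfLiteRound, round);  // expands to ::round or std::round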

@@ -15,26 +15,6 @@ limitations under the License.
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_CHECK_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_CHECK_H_

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#define USE_NEON
#include <arm_neon.h>
#endif

#if defined __GNUC__ && defined __SSE4_1__ && !defined TF_LITE_DISABLE_X86_NEON
#define USE_NEON
#include "NEON_2_SSE.h"
#endif

// NEON_OR_PORTABLE(SomeFunc, args) calls NeonSomeFunc(args) if USE_NEON is
// defined, PortableSomeFunc(args) otherwise.
#ifdef USE_NEON
// Always use Neon code
#define NEON_OR_PORTABLE(funcname, ...) Neon##funcname(__VA_ARGS__)

#else
// No NEON available: Use Portable code
#define NEON_OR_PORTABLE(funcname, ...) Portable##funcname(__VA_ARGS__)

#endif  // defined(USE_NEON)
// TFLM does not need to utilize any Neon optimizations.

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_CHECK_H_

@@ -15,6 +15,8 @@ limitations under the License.
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_H_

#include <type_traits>

#include "fixedpoint/fixedpoint.h"
#include "tensorflow/lite/kernels/internal/common.h"

@@ -27,25 +29,14 @@ inline void Add(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const T* input1_data,
                const RuntimeShape& input2_shape, const T* input2_data,
                const RuntimeShape& output_shape, T* output_data) {
  T activation_min, activation_max;
  GetActivationParams(params, &activation_min, &activation_max);

  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);
  for (int i = 0; i < flat_size; ++i) {
    output_data[i] = ActivationFunctionWithMinMax(
        input1_data[i] + input2_data[i], params.quantized_activation_min,
        params.quantized_activation_max);
  }
}

inline void Add(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const float* input1_data,
                const RuntimeShape& input2_shape, const float* input2_data,
                const RuntimeShape& output_shape, float* output_data) {
  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);
  for (int i = 0; i < flat_size; i++) {
    auto x = input1_data[i] + input2_data[i];
    output_data[i] = ActivationFunctionWithMinMax(
        x, params.float_activation_min, params.float_activation_max);
        input1_data[i] + input2_data[i], activation_min, activation_max);
  }
}
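Note: a small illustrative caller for the consolidated element-wise Add above (float path; GetActivationParams reads float_activation_min/max when T is a floating-point type). Shapes and values are made up for the example:

    tflite::ArithmeticParams params = {};
    params.float_activation_min = 0.0f;  // ReLU6-style clamp
    params.float_activation_max = 6.0f;
    const tflite::RuntimeShape shape({1, 2, 2, 1});
    const float in1[] = {1.f, -2.f, 3.f, 8.f};
    const float in2[] = {1.f, 1.f, 1.f, 1.f};
    float out[4];
    tflite::reference_ops::Add(params, shape, in1, shape, in2, shape, out);
    // out == {2, 0, 4, 6} after clamping to [0, 6].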

@@ -202,13 +193,12 @@ inline void Add(const ArithmeticParams& params,
  }
}

inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
                               const RuntimeShape& input1_shape,
                               const float* input1_data,
                               const RuntimeShape& input2_shape,
                               const float* input2_data,
                               const RuntimeShape& output_shape,
                               float* output_data) {
template <typename T>
inline typename std::enable_if<!is_small_integer<T>::value, void>::type
BroadcastAdd4DSlow(const ArithmeticParams& params,
                   const RuntimeShape& input1_shape, const T* input1_data,
                   const RuntimeShape& input2_shape, const T* input2_data,
                   const RuntimeShape& output_shape, T* output_data) {
  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
@@ -216,6 +206,9 @@ inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
  const RuntimeShape extended_output_shape =
      RuntimeShape::ExtendedShape(4, output_shape);

  T activation_min, activation_max;
  GetActivationParams(params, &activation_min, &activation_max);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
@@ -232,51 +225,10 @@ inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
          output_data[Offset(extended_output_shape, b, y, x, c)] =
              ActivationFunctionWithMinMax(
              ActivationFunctionWithMinMax<T>(
                  input1_data[SubscriptToIndex(desc1, b, y, x, c)] +
                      input2_data[SubscriptToIndex(desc2, b, y, x, c)],
                  params.float_activation_min, params.float_activation_max);
        }
      }
    }
  }
}

inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
                               const RuntimeShape& input1_shape,
                               const int32_t* input1_data,
                               const RuntimeShape& input2_shape,
                               const int32_t* input2_data,
                               const RuntimeShape& output_shape,
                               int32_t* output_data) {
  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  const RuntimeShape extended_output_shape =
      RuntimeShape::ExtendedShape(4, output_shape);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
          output_data[Offset(extended_output_shape, b, y, x, c)] =
              ActivationFunctionWithMinMax(
                  input1_data[SubscriptToIndex(desc1, b, y, x, c)] +
                      input2_data[SubscriptToIndex(desc2, b, y, x, c)],
                  params.quantized_activation_min,
                  params.quantized_activation_max);
                  activation_min, activation_max);
        }
      }
    }
@@ -287,10 +239,11 @@ inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
// is 32-bit for both cases. The overflow does not happen due to the
// choice of the shift (20 or 15, accordingly - see add.cc for more comments).
template <typename T>
inline void BroadcastAdd4DSlow(
    const ArithmeticParams& params, const RuntimeShape& input1_shape,
    const T* input1_data, const RuntimeShape& input2_shape,
    const T* input2_data, const RuntimeShape& output_shape, T* output_data) {
inline typename std::enable_if<is_small_integer<T>::value, void>::type
BroadcastAdd4DSlow(const ArithmeticParams& params,
                   const RuntimeShape& input1_shape, const T* input1_data,
                   const RuntimeShape& input2_shape, const T* input2_data,
                   const RuntimeShape& output_shape, T* output_data) {
  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,

@@ -15,7 +15,10 @@ limitations under the License.
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_N_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_N_H_

#include "tensorflow/lite/kernels/internal/types.h"
#include <algorithm>
#include <limits>

#include "tensorflow/lite/kernels/internal/common.h"

namespace tflite {
namespace reference_ops {
@@ -36,6 +39,47 @@ inline void AddN(const RuntimeShape& input_shape, const size_t num_inputs,
  }
}

inline void AddN(const ArithmeticParams& params,
                 const RuntimeShape& input_shape, const size_t num_inputs,
                 const int8_t* const* input_data, int8_t* output_data) {
  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);
  // Input offset is negative input zero point. Activation tensors are
  // asymmetric quantized so they span the full int8 range.
  // All inputs should have same zero-point and scale, this is checked during
  // Prepare stage.
  TFLITE_DCHECK_GE(-params.input1_offset, std::numeric_limits<int8_t>::min());
  TFLITE_DCHECK_LE(-params.input1_offset, std::numeric_limits<int8_t>::max());

  // All inputs and output should have the same shape, this is checked during
  // Prepare stage.
  const size_t size = input_shape.FlatSize();
  for (size_t i = 0; i < size; ++i) {
    // accumulate in scaled_x before clamping to avoid overflow
    const int32_t x = params.input1_offset;  // x = 0
    const int32_t shifted_x = x * (1 << params.left_shift);
    int32_t scaled_x = MultiplyByQuantizedMultiplierSmallerThanOneExp(
        shifted_x, params.input1_multiplier, params.input1_shift);

    for (size_t j = 0; j < num_inputs; ++j) {
      const int32_t y = params.input1_offset + input_data[j][i];
      const int32_t shifted_y = y * (1 << params.left_shift);
      int32_t scaled_y = MultiplyByQuantizedMultiplierSmallerThanOneExp(
          shifted_y, params.input1_multiplier, params.input1_shift);
      scaled_x += scaled_y;
    }

    const int32_t raw_output =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            scaled_x, params.output_multiplier, params.output_shift) +
        params.output_offset;
    const int32_t clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, raw_output));
    output_data[i] = static_cast<int8_t>(clamped_output);
  }
}

}  // namespace reference_ops
}  // namespace tflite
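Note: in real-valued terms the quantized AddN loop above targets, per element, roughly

    q_out[i] = z_out + round( (s_in / s_out) * sum_j (q_j[i] - z_in) )

with z_in = -input1_offset and z_out = output_offset; the multiplier/shift pairs stand in for the scale ratio in fixed point, the 1 << left_shift pre-scaling preserves precision before requantization, and the final value is clamped to the quantized activation range.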


@@ -0,0 +1,275 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BATCH_MATMUL_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BATCH_MATMUL_H_

#include <algorithm>
#include <cstdint>

#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/tensor_utils_common.h"
#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {
namespace reference_ops {
namespace batch_matmul {

// Determine which dimension is the broadcast dimension.
inline int broadcast_dim(int lhs_dim, int rhs_dim) {
  if (lhs_dim == rhs_dim) return lhs_dim;
  if (lhs_dim == 1) return rhs_dim;
  TFLITE_DCHECK_EQ(rhs_dim, 1);
  return lhs_dim;
}

// Compute the "extent" for iterating on this dimension.
// If we are broadcasting, then don't advance (i.e return 0).
inline int extent(const RuntimeShape& shape, int x) {
  if (shape.Dims(x) == 1) {
    return 0;
  }
  int prod = 1;
  for (int i = x + 1; i < shape.DimensionsCount(); ++i) {
    prod *= shape.Dims(i);
  }
  return prod;
}

}  // namespace batch_matmul
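Note: a small worked example of the two helpers above, using made-up 5-D extended shapes [b0, b1, b2, rows, cols]:

    // lhs: [2, 1, 3, 4, 5]    rhs: [1, 4, 3, 5, 6]
    // broadcast_dim per batch axis -> output batch dims [2, 4, 3]
    // extent(lhs, 1) == 0          (size-1 axis: reuse the same lhs slice)
    // extent(rhs, 1) == 3 * 5 * 6  (elements to skip to reach the next b1 slice of rhs)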

template <typename Ta, typename Tb, typename Tout>
inline void BatchMatMul(const RuntimeShape& lhs_shape, const Ta* lhs_data,
                        const RuntimeShape& rhs_shape, const Tb* rhs_data,
                        const RuntimeShape& output_shape, Tout* output_data) {
  const RuntimeShape extended_lhs_shape =
      RuntimeShape::ExtendedShape(5, lhs_shape);
  const RuntimeShape extended_rhs_shape =
      RuntimeShape::ExtendedShape(5, rhs_shape);

  const int batch_dim0 = batch_matmul::broadcast_dim(
      extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0));
  const int batch_dim1 = batch_matmul::broadcast_dim(
      extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1));
  const int batch_dim2 = batch_matmul::broadcast_dim(
      extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2));

  const int lhs_ext0 = batch_matmul::extent(extended_lhs_shape, 0);
  const int lhs_ext1 = batch_matmul::extent(extended_lhs_shape, 1);
  const int lhs_ext2 = batch_matmul::extent(extended_lhs_shape, 2);
  const int rhs_ext0 = batch_matmul::extent(extended_rhs_shape, 0);
  const int rhs_ext1 = batch_matmul::extent(extended_rhs_shape, 1);
  const int rhs_ext2 = batch_matmul::extent(extended_rhs_shape, 2);

  // Set params for each matrix multiply.
  const int lhs_rows = extended_lhs_shape.Dims(3);
  const int rhs_cols = extended_rhs_shape.Dims(4);
  const int accum_depth = extended_lhs_shape.Dims(4);

  for (int b0 = 0; b0 < batch_dim0; ++b0) {
    const Ta* lhs_ptr0 = lhs_data + (b0 * lhs_ext0);
    const Tb* rhs_ptr0 = rhs_data + (b0 * rhs_ext0);
    for (int b1 = 0; b1 < batch_dim1; ++b1) {
      const Ta* lhs_ptr1 = lhs_ptr0 + b1 * lhs_ext1;
      const Tb* rhs_ptr1 = rhs_ptr0 + b1 * rhs_ext1;
      for (int b2 = 0; b2 < batch_dim2; ++b2) {
        const Ta* lhs_ptr2 = lhs_ptr1 + b2 * lhs_ext2;
        const Tb* rhs_ptr2 = rhs_ptr1 + b2 * rhs_ext2;
        Tout* out_ptr = output_data + ((b0 * batch_dim1 * batch_dim2) +
                                       b1 * batch_dim2 + b2) *
                                          lhs_rows * rhs_cols;
        for (int j = 0; j < rhs_cols; ++j) {
          for (int i = 0; i < lhs_rows; ++i) {
            Tout total = 0;
            for (int k = 0; k < accum_depth; ++k) {
              total += static_cast<Tout>(lhs_ptr2[accum_depth * i + k]) *
                       static_cast<Tout>(rhs_ptr2[j * accum_depth + k]);
            }
            int idx = lhs_rows * j + i;
            out_ptr[idx] = total;
          }
        }
      }
    }
  }
}

inline void BatchMatMul(const RuntimeShape& lhs_shape, const int8_t* lhs_data,
                        const RuntimeShape& rhs_shape, const int8_t* rhs_data,
                        const float* scaling_factors,
                        const int32_t* input_offset, int32_t* row_sums,
                        const RuntimeShape& output_shape, float* output_data,
                        bool* compute_row_sums) {
  const RuntimeShape extended_lhs_shape =
      RuntimeShape::ExtendedShape(5, lhs_shape);
  const RuntimeShape extended_rhs_shape =
      RuntimeShape::ExtendedShape(5, rhs_shape);

  const int batch_dim0 = batch_matmul::broadcast_dim(
      extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0));
  const int batch_dim1 = batch_matmul::broadcast_dim(
      extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1));
  const int batch_dim2 = batch_matmul::broadcast_dim(
      extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2));

  const int lhs_ext0 = batch_matmul::extent(extended_lhs_shape, 0);
  const int lhs_ext1 = batch_matmul::extent(extended_lhs_shape, 1);
  const int lhs_ext2 = batch_matmul::extent(extended_lhs_shape, 2);
  const int rhs_ext0 = batch_matmul::extent(extended_rhs_shape, 0);
  const int rhs_ext1 = batch_matmul::extent(extended_rhs_shape, 1);
  const int rhs_ext2 = batch_matmul::extent(extended_rhs_shape, 2);

  // Set params for each matrix multiply.
  const int lhs_rows = extended_lhs_shape.Dims(3);
  const int rhs_cols = extended_rhs_shape.Dims(4);
  const int accum_depth = extended_lhs_shape.Dims(4);

  const int ioff_ext0 = rhs_ext0 == 0 ? 0 : rhs_cols;
  const int ioff_ext1 = rhs_ext1 == 0 ? 0 : rhs_cols;
  const int ioff_ext2 = rhs_ext2 == 0 ? 0 : rhs_cols;
  const int woff_ext0 = lhs_ext0 == 0 ? 0 : lhs_rows;
  const int woff_ext1 = lhs_ext1 == 0 ? 0 : lhs_rows;
  const int woff_ext2 = lhs_ext2 == 0 ? 0 : lhs_rows;

  if (!compute_row_sums || *compute_row_sums) {
    int num_weights_matrices = 1;
    for (int i = 1; i < extended_lhs_shape.DimensionsCount() - 2; ++i) {
      num_weights_matrices *= extended_lhs_shape.Dims(i);
    }
    tensor_utils::ReductionSumVector(
        lhs_data, row_sums, num_weights_matrices * lhs_rows, accum_depth);
    if (compute_row_sums) {
      *compute_row_sums = false;
    }
  }

  for (int b0 = 0; b0 < batch_dim0; ++b0) {
    const int8_t* lhs_ptr0 = lhs_data + (b0 * lhs_ext0);
    const int8_t* rhs_ptr0 = rhs_data + (b0 * rhs_ext0);
    const int32_t* ioff_ptr0 = input_offset + (b0 * ioff_ext0);
    const float* scale_ptr0 = scaling_factors + (b0 * ioff_ext0);
    const int32_t* woff_ptr0 = row_sums + (b0 * woff_ext0);
    for (int b1 = 0; b1 < batch_dim1; ++b1) {
      const int8_t* lhs_ptr1 = lhs_ptr0 + b1 * lhs_ext1;
      const int8_t* rhs_ptr1 = rhs_ptr0 + b1 * rhs_ext1;
      const int32_t* ioff_ptr1 = ioff_ptr0 + (b1 * ioff_ext1);
      const float* scale_ptr1 = scale_ptr0 + (b1 * ioff_ext1);
      const int32_t* woff_ptr1 = woff_ptr0 + (b1 * woff_ext1);
      for (int b2 = 0; b2 < batch_dim2; ++b2) {
        const int8_t* lhs_ptr2 = lhs_ptr1 + b2 * lhs_ext2;
        const int8_t* rhs_ptr2 = rhs_ptr1 + b2 * rhs_ext2;
        const int32_t* ioff_ptr2 = ioff_ptr1 + (b2 * ioff_ext2);
        const float* scale_ptr2 = scale_ptr1 + (b2 * ioff_ext2);
        const int32_t* woff_ptr2 = woff_ptr1 + (b2 * woff_ext2);
        float* out_ptr = output_data + ((b0 * batch_dim1 * batch_dim2) +
                                        b1 * batch_dim2 + b2) *
                                           lhs_rows * rhs_cols;
        for (int j = 0; j < rhs_cols; ++j) {
          const float batch_scaling_factor = scale_ptr2[j];
          const float batch_offset = static_cast<float>(ioff_ptr2[j]);
          for (int i = 0; i < lhs_rows; ++i) {
            int32_t total = 0;
            for (int k = 0; k < accum_depth; ++k) {
              total +=
                  lhs_ptr2[accum_depth * i + k] * rhs_ptr2[j * accum_depth + k];
            }
            int32_t row_sum = woff_ptr2[i];
            total -= row_sum * batch_offset;
            int idx = lhs_rows * j + i;
            out_ptr[idx] += batch_scaling_factor * total;
          }
        }
      }
    }
  }
}

template <typename T, typename AccumT>
inline void BatchMatMul(const FullyConnectedParams& params,
                        const RuntimeShape& lhs_shape, const T* lhs_data,
                        const RuntimeShape& rhs_shape, const T* rhs_data,
                        const RuntimeShape& output_shape, T* output_data) {
  const RuntimeShape extended_lhs_shape =
      RuntimeShape::ExtendedShape(5, lhs_shape);
  const RuntimeShape extended_rhs_shape =
      RuntimeShape::ExtendedShape(5, rhs_shape);

  const int batch_dim0 = batch_matmul::broadcast_dim(
      extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0));
  const int batch_dim1 = batch_matmul::broadcast_dim(
      extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1));
  const int batch_dim2 = batch_matmul::broadcast_dim(
      extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2));

  const int lhs_ext0 = batch_matmul::extent(extended_lhs_shape, 0);
  const int lhs_ext1 = batch_matmul::extent(extended_lhs_shape, 1);
  const int lhs_ext2 = batch_matmul::extent(extended_lhs_shape, 2);
  const int rhs_ext0 = batch_matmul::extent(extended_rhs_shape, 0);
  const int rhs_ext1 = batch_matmul::extent(extended_rhs_shape, 1);
  const int rhs_ext2 = batch_matmul::extent(extended_rhs_shape, 2);

  // Set params for each matrix multiply.
  const int lhs_rows = extended_lhs_shape.Dims(3);
  const int rhs_cols = extended_rhs_shape.Dims(4);
  const int accum_depth = extended_lhs_shape.Dims(4);

  const int32_t input_offset = params.input_offset;
  const int32_t filter_offset = params.weights_offset;
  const int32_t output_offset = params.output_offset;
  const int32_t output_multiplier = params.output_multiplier;
  const int output_shift = params.output_shift;
  const int32_t output_activation_min = params.quantized_activation_min;
  const int32_t output_activation_max = params.quantized_activation_max;
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);

  for (int b0 = 0; b0 < batch_dim0; ++b0) {
    const T* lhs_ptr0 = lhs_data + (b0 * lhs_ext0);
    const T* rhs_ptr0 = rhs_data + (b0 * rhs_ext0);
    for (int b1 = 0; b1 < batch_dim1; ++b1) {
      const T* lhs_ptr1 = lhs_ptr0 + b1 * lhs_ext1;
      const T* rhs_ptr1 = rhs_ptr0 + b1 * rhs_ext1;
      for (int b2 = 0; b2 < batch_dim2; ++b2) {
        const T* lhs_ptr2 = lhs_ptr1 + b2 * lhs_ext2;
        const T* rhs_ptr2 = rhs_ptr1 + b2 * rhs_ext2;
        T* out_ptr = output_data +
                     ((b0 * batch_dim1 * batch_dim2) + b1 * batch_dim2 + b2) *
                         lhs_rows * rhs_cols;

        for (int j = 0; j < rhs_cols; ++j) {
          for (int i = 0; i < lhs_rows; ++i) {
            AccumT total = 0;
            for (int k = 0; k < accum_depth; ++k) {
              AccumT lhs_val = lhs_ptr2[accum_depth * i + k];
              AccumT rhs_val = rhs_ptr2[accum_depth * j + k];
              total += (lhs_val + filter_offset) * (rhs_val + input_offset);
            }
            int32_t total_scaled = MultiplyByQuantizedMultiplier(
                total, output_multiplier, output_shift);
            total_scaled += output_offset;
            total_scaled = std::max(total_scaled, output_activation_min);
            total_scaled = std::min(total_scaled, output_activation_max);
            const int idx = lhs_rows * j + i;
            out_ptr[idx] = static_cast<T>(total_scaled);
          }
        }
      }
    }
  }
}

}  // namespace reference_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BATCH_MATMUL_H_
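Note: the fully-quantized BatchMatMul variant above accumulates sum_k (lhs + filter_offset) * (rhs + input_offset), i.e. sum_k (q_lhs - z_lhs) * (q_rhs - z_rhs) since the offsets are negated zero points, and then produces roughly

    q_out = clamp( z_out + round( M * 2^shift * total ) )   // M * 2^shift ~ s_lhs * s_rhs / s_out

via MultiplyByQuantizedMultiplier, so the multiplier/shift pair stands in for the combined scale ratio.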
@@ -0,0 +1,175 @@
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CUMSUM_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CUMSUM_H_

#include <algorithm>
#include <cstdint>
#include <limits>

#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"

namespace tflite {
namespace reference_ops {

template <typename T>
inline void CumSum(const T* input_data, const RuntimeShape& shape, int32_t axis,
                   bool exclusive, bool reverse, T* output_data) {
  const int32_t rank = shape.DimensionsCount();
  TFLITE_DCHECK_GE(rank, 1);
  TFLITE_DCHECK_GE(axis, 0);
  TFLITE_DCHECK_LT(axis, rank);

  size_t inner = 1;
  size_t outer = 1;
  size_t depth = 1;
  for (int32_t i = 0; i < rank; i++) {
    if (i < axis)
      inner *= shape.Dims(i);
    else if (i > axis)
      outer *= shape.Dims(i);
    else
      depth = shape.Dims(i);
  }

  for (size_t outer_index = 0; outer_index < outer; outer_index++) {
    size_t outer_index_adj;
    if (reverse)
      outer_index_adj = (outer - 1) - outer_index;
    else
      outer_index_adj = outer_index;
    for (size_t inner_index = 0; inner_index < inner; inner_index++) {
      T accumulator = 0;
      size_t inner_index_adj;
      if (reverse)
        inner_index_adj = (inner - 1) - inner_index;
      else
        inner_index_adj = inner_index;
      for (size_t depth_index = 0; depth_index < depth; depth_index++) {
        size_t depth_index_adj;
        if (reverse)
          depth_index_adj = (depth - 1) - depth_index;
        else
          depth_index_adj = depth_index;

        size_t index = outer_index_adj;
        index += inner_index_adj * depth * outer;
        index += depth_index_adj * outer;

        if (exclusive) {
          output_data[index] = accumulator;
          accumulator += input_data[index];
        } else {
          accumulator += input_data[index];
          output_data[index] = accumulator;
        }
      }
    }
  }
}
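Note: a worked example of the exclusive/reverse flags above for a 1-D input {1, 2, 3}:

    // exclusive=false, reverse=false -> {1, 3, 6}
    // exclusive=true,  reverse=false -> {0, 1, 3}
    // exclusive=false, reverse=true  -> {6, 5, 3}
    // exclusive=true,  reverse=true  -> {5, 3, 0}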

//
// Quantized INT8 CUMSUM
//
inline void CumSum(const ArithmeticParams& params, const int8_t* input_data,
                   const RuntimeShape& shape, int32_t axis, bool exclusive,
                   bool reverse, int8_t* output_data) {
  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);
  // Input offset is negative input zero point. Activation tensors are
  // asymmetric quantized so they span the full int8 range.
  // All inputs should have same zero-point and scale, this is checked during
  // Prepare stage.
  TFLITE_DCHECK_GE(-params.input1_offset, std::numeric_limits<int8_t>::min());
  TFLITE_DCHECK_LE(-params.input1_offset, std::numeric_limits<int8_t>::max());

  const int32_t rank = shape.DimensionsCount();
  TFLITE_DCHECK_GE(rank, 1);
  TFLITE_DCHECK_GE(axis, 0);
  TFLITE_DCHECK_LT(axis, rank);

  size_t inner = 1;
  size_t outer = 1;
  size_t depth = 1;
  for (int32_t i = 0; i < rank; i++) {
    if (i < axis)
      inner *= shape.Dims(i);
    else if (i > axis)
      outer *= shape.Dims(i);
    else
      depth = shape.Dims(i);
  }

  for (size_t outer_index = 0; outer_index < outer; outer_index++) {
    size_t outer_index_adj;
    if (reverse)
      outer_index_adj = (outer - 1) - outer_index;
    else
      outer_index_adj = outer_index;
    for (size_t inner_index = 0; inner_index < inner; inner_index++) {
      int32_t accumulator = params.input1_offset;  // accumulator = 0
      accumulator *= (1 << params.left_shift);
      accumulator = MultiplyByQuantizedMultiplierSmallerThanOneExp(
          accumulator, params.input1_multiplier, params.input1_shift);

      size_t inner_index_adj;
      if (reverse)
        inner_index_adj = (inner - 1) - inner_index;
      else
        inner_index_adj = inner_index;

      for (size_t depth_index = 0; depth_index < depth; depth_index++) {
        size_t depth_index_adj;
        if (reverse)
          depth_index_adj = (depth - 1) - depth_index;
        else
          depth_index_adj = depth_index;

        size_t index = outer_index_adj;
        index += inner_index_adj * depth * outer;
        index += depth_index_adj * outer;

        const int32_t y = params.input1_offset + input_data[index];
        const int32_t shifted_y = y * (1 << params.left_shift);
        const int32_t scaled_y = MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_y, params.input1_multiplier, params.input1_shift);

        int32_t scaled_output;
        if (exclusive) {
          scaled_output = accumulator;
          accumulator += scaled_y;
        } else {
          accumulator += scaled_y;
          scaled_output = accumulator;
        }

        const int32_t raw_output =
            MultiplyByQuantizedMultiplierSmallerThanOneExp(
                scaled_output, params.output_multiplier, params.output_shift) +
            params.output_offset;
        const int32_t clamped_output =
            std::min(params.quantized_activation_max,
                     std::max(params.quantized_activation_min, raw_output));
        output_data[index] = static_cast<int8_t>(clamped_output);
      }
    }
  }
}

}  // namespace reference_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CUMSUM_H_
@@ -0,0 +1,79 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTH_TO_SPACE_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTH_TO_SPACE_H_

#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {
namespace reference_ops {

template <typename T>
inline void DepthToSpace(const tflite::DepthToSpaceParams& op_params,
                         const RuntimeShape& unextended_input_shape,
                         const T* input_data,
                         const RuntimeShape& unextended_output_shape,
                         T* output_data) {
  TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
  const RuntimeShape input_shape =
      RuntimeShape::ExtendedShape(4, unextended_input_shape);
  const RuntimeShape output_shape =
      RuntimeShape::ExtendedShape(4, unextended_output_shape);

  const int input_depth = input_shape.Dims(3);
  const int input_width = input_shape.Dims(2);
  const int input_height = input_shape.Dims(1);
  const int input_batch = input_shape.Dims(0);

  const int output_depth = output_shape.Dims(3);
  const int output_width = output_shape.Dims(2);
  const int output_height = output_shape.Dims(1);
  const int output_batch = output_shape.Dims(0);

  const int32_t block_size = op_params.block_size;

  TFLITE_DCHECK_EQ(input_width * block_size, output_width);
  TFLITE_DCHECK_EQ(input_height * block_size, output_height);
  TFLITE_DCHECK_EQ(input_depth, output_depth * block_size * block_size);
  TFLITE_DCHECK_EQ(input_batch, output_batch);

  for (int out_b = 0; out_b < output_batch; ++out_b) {
    for (int out_h = 0; out_h < output_height; ++out_h) {
      for (int out_w = 0; out_w < output_width; ++out_w) {
        for (int out_d = 0; out_d < output_depth; ++out_d) {
          const int in_d =
              out_d + ((out_h % block_size) * block_size + out_w % block_size) *
                          output_depth;

          const int in_w = out_w / block_size;
          const int in_h = out_h / block_size;
          const int in_b = out_b;

          const int input_index = Offset(input_shape, in_b, in_h, in_w, in_d);
          const int output_index =
              Offset(output_shape, out_b, out_h, out_w, out_d);

          output_data[output_index] = input_data[input_index];
        }
      }
    }
  }
}

}  // namespace reference_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTH_TO_SPACE_H_
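Note: a small example of the index mapping above with block_size = 2: an input of shape (1, 1, 1, 4) holding {a, b, c, d} becomes an output of shape (1, 2, 2, 1) with

    // output row 0: a b
    // output row 1: c d

i.e. each group of block_size * block_size input channels is unpacked into one spatial block.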
@@ -1,239 +0,0 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DIV_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DIV_H_

#include <algorithm>

#include "tensorflow/lite/kernels/internal/common.h"

namespace tflite {

namespace reference_ops {

template <typename T>
inline void DivCheckArithmeticParams(const ArithmeticParams& params) {
  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);
  // Input offset is negative input zero point. Activation tensors are
  // asymmetric quantized so they span the full int8 range.
  constexpr int32_t max_value =
      static_cast<int32_t>(std::numeric_limits<T>::max());
  TFLITE_DCHECK_GE(params.input1_offset, -max_value);
  TFLITE_DCHECK_LE(params.input1_offset, max_value);
  TFLITE_DCHECK_GE(params.input2_offset, -max_value);
  TFLITE_DCHECK_LE(params.input2_offset, max_value);
  TFLITE_DCHECK_GE(params.output_offset, -max_value);
  TFLITE_DCHECK_LE(params.output_offset, max_value);
}

// Element-wise div that can often be used for inner loop of broadcast Div as
// well as the non-broadcast Div.
template <typename T>
inline void DivElementwise(int size, const ArithmeticParams& params,
                           const T* input1_data, const T* input2_data,
                           T* output_data) {
  DivCheckArithmeticParams<T>(params);

  for (int i = 0; i < size; ++i) {
    const int32_t input1_val = params.input1_offset + input1_data[i];
    const int32_t input2_val = params.input2_offset + input2_data[i];
    TFLITE_DCHECK_NE(input2_val, 0);
    int recip_shift;
    const int32_t input2_inv =
        (input2_val > 0) ? GetReciprocal(input2_val, 31, &recip_shift)
                         : -GetReciprocal(-input2_val, 31, &recip_shift);
    const int headroom = CountLeadingSignBits(input1_val);
    const int32_t unscaled_quotient =
        MultiplyByQuantizedMultiplierGreaterThanOne(input1_val, input2_inv,
                                                    headroom);
    const int total_shift = params.output_shift - recip_shift - headroom;
    const int32_t unclamped_result =
        params.output_offset +
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            unscaled_quotient, params.output_multiplier, total_shift);
    const int32_t clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, unclamped_result));
    output_data[i] = static_cast<T>(clamped_output);
  }
}

inline void Div(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const uint8_t* input1_data,
                const RuntimeShape& input2_shape, const uint8_t* input2_data,
                const RuntimeShape& output_shape, uint8_t* output_data) {
  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);
  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);

  DivElementwise(flat_size, params, input1_data, input2_data, output_data);
}

inline void Div(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const int8_t* input1_data,
                const RuntimeShape& input2_shape, const int8_t* input2_data,
                const RuntimeShape& output_shape, int8_t* output_data) {
  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);
  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);

  DivElementwise(flat_size, params, input1_data, input2_data, output_data);
}

template <typename T, int N = 5>
inline void BroadcastDivSlowQuantized(
    const ArithmeticParams& params, const RuntimeShape& unextended_input1_shape,
    const T* input1_data, const RuntimeShape& unextended_input2_shape,
    const T* input2_data, const RuntimeShape& unextended_output_shape,
    T* output_data) {
  TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), N);

  NdArrayDesc<N> desc1;
  NdArrayDesc<N> desc2;
  NdArrayDesc<N> output_desc;
  NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
                                      unextended_input2_shape, &desc1, &desc2);
  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape),
                 &output_desc);

  DivCheckArithmeticParams<T>(params);

  auto div_func = [&](int indexes[N]) {
    const int32_t input1_val =
        params.input1_offset + input1_data[SubscriptToIndex(desc1, indexes)];
    const int32_t input2_val =
        params.input2_offset + input2_data[SubscriptToIndex(desc2, indexes)];
    TFLITE_DCHECK_NE(input2_val, 0);
    int recip_shift;
    const int32_t input2_inv =
        (input2_val > 0) ? GetReciprocal(input2_val, 31, &recip_shift)
                         : -GetReciprocal(-input2_val, 31, &recip_shift);
    const int headroom = CountLeadingSignBits(input1_val);
    const int32_t unscaled_quotient =
        MultiplyByQuantizedMultiplierGreaterThanOne(input1_val, input2_inv,
                                                    headroom);
    const int total_shift = params.output_shift - recip_shift - headroom;
    const int32_t unclamped_result =
        params.output_offset +
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            unscaled_quotient, params.output_multiplier, total_shift);
    const int32_t clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, unclamped_result));
    output_data[SubscriptToIndex(output_desc, indexes)] =
        static_cast<T>(clamped_output);
  };
  NDOpsHelper<N>(output_desc, div_func);
}

template <int N = 5>
inline void BroadcastDivSlow(const ArithmeticParams& params,
                             const RuntimeShape& unextended_input1_shape,
                             const uint8_t* input1_data,
                             const RuntimeShape& unextended_input2_shape,
                             const uint8_t* input2_data,
                             const RuntimeShape& unextended_output_shape,
                             uint8_t* output_data) {
  BroadcastDivSlowQuantized<uint8_t, N>(
      params, unextended_input1_shape, input1_data, unextended_input2_shape,
      input2_data, unextended_output_shape, output_data);
}

template <int N = 5>
inline void BroadcastDivSlow(const ArithmeticParams& params,
                             const RuntimeShape& unextended_input1_shape,
                             const int8_t* input1_data,
                             const RuntimeShape& unextended_input2_shape,
                             const int8_t* input2_data,
                             const RuntimeShape& unextended_output_shape,
                             int8_t* output_data) {
  BroadcastDivSlowQuantized<int8_t, N>(
      params, unextended_input1_shape, input1_data, unextended_input2_shape,
      input2_data, unextended_output_shape, output_data);
}

// TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
// dimensionality if the runtime code does a single loop over one dimension
// that handles broadcasting as the base case. The code generator would then
// generate max(D1, D2) nested for loops.
template <typename T, int N = 5>
void BroadcastDivSlow(const ArithmeticParams& params,
                      const RuntimeShape& unextended_input1_shape,
                      const T* input1_data,
                      const RuntimeShape& unextended_input2_shape,
                      const T* input2_data,
                      const RuntimeShape& unextended_output_shape,
                      T* output_data) {
  T output_activation_min;
  T output_activation_max;
  GetActivationParams(params, &output_activation_min, &output_activation_max);

  TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), N);

  NdArrayDesc<N> desc1;
  NdArrayDesc<N> desc2;
  NdArrayDesc<N> output_desc;
  NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
                                      unextended_input2_shape, &desc1, &desc2);
  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape),
                 &output_desc);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest
  // stride, typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.

  auto div_func = [&](int indexes[N]) {
    output_data[SubscriptToIndex(output_desc, indexes)] =
        ActivationFunctionWithMinMax(
            input1_data[SubscriptToIndex(desc1, indexes)] /
                input2_data[SubscriptToIndex(desc2, indexes)],
            output_activation_min, output_activation_max);
  };
  NDOpsHelper<N>(output_desc, div_func);
}

template <typename T>
inline void Div(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const T* input1_data,
                const RuntimeShape& input2_shape, const T* input2_data,
                const RuntimeShape& output_shape, T* output_data) {
  T output_activation_min;
  T output_activation_max;
  GetActivationParams(params, &output_activation_min, &output_activation_max);

  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);
  for (int i = 0; i < flat_size; ++i) {
    output_data[i] = ActivationFunctionWithMinMax(
        input1_data[i] / input2_data[i], output_activation_min,
        output_activation_max);
  }
}

}  // namespace reference_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DIV_H_
@@ -0,0 +1,35 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FLOOR_DIV_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FLOOR_DIV_H_

#include <cmath>
#include <functional>

#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {
namespace reference_ops {

template <typename T>
T FloorDiv(T input1, T input2) {
  return std::floor(std::divides<double>()(static_cast<double>(input1),
                                           static_cast<double>(input2)));
}

}  // namespace reference_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FLOOR_DIV_H_
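Note: FloorDiv rounds toward negative infinity rather than toward zero, e.g. FloorDiv(-7, 2) == -4 (plain truncating integer division would give -3), while FloorDiv(7, 2) == 3 under both conventions.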
@@ -0,0 +1,44 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FLOOR_MOD_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FLOOR_MOD_H_

#include <cmath>
#include <functional>

namespace tflite {

namespace reference_ops {

template <typename T>
T FloorMod(T input1, T input2) {
  struct FloatMod {
    float operator()(const float lhs, const float rhs) const {
      return std::fmod(lhs, rhs);
    }
  };
  using ModFunc = typename std::conditional<std::is_integral<T>::value,
                                            std::modulus<T>, FloatMod>::type;
  ModFunc mod_func;
  T trunc_mod = mod_func(input1, input2);
  return (trunc_mod != 0) && ((input2 < 0) != (trunc_mod < 0))
             ? (trunc_mod + input2)
             : trunc_mod;
}

}  // namespace reference_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FLOOR_MOD_H_
|
||||
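FloorMod adjusts the truncated remainder so that the result takes the sign of the divisor. A small self-contained check of the integer case; the sketch only re-states the logic above for illustration.

#include <cassert>

// Illustrative only: integer specialization of the logic in FloorMod above.
int FloorModSketch(int input1, int input2) {
  int trunc_mod = input1 % input2;
  return (trunc_mod != 0) && ((input2 < 0) != (trunc_mod < 0))
             ? (trunc_mod + input2)
             : trunc_mod;
}

int main() {
  assert(FloorModSketch(7, 3) == 1);    // plain % also gives 1
  assert(FloorModSketch(-7, 3) == 2);   // plain % gives -1
  assert(FloorModSketch(7, -3) == -2);  // result follows the divisor's sign
  return 0;
}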
@@ -21,7 +21,7 @@ limitations under the License.
namespace tflite {
namespace reference_integer_ops {

inline void AveragePool(const PoolParams& params,
inline bool AveragePool(const PoolParams& params,
                        const RuntimeShape& input_shape,
                        const int8_t* input_data,
                        const RuntimeShape& output_shape, int8_t* output_data) {
@@ -66,6 +66,7 @@ inline void AveragePool(const PoolParams& params,
            filter_count++;
          }
        }
        if (filter_count == 0) return false;
        // Round to the closest integer value.
        acc = acc > 0 ? (acc + filter_count / 2) / filter_count
                      : (acc - filter_count / 2) / filter_count;
@@ -77,6 +78,7 @@ inline void AveragePool(const PoolParams& params,
      }
    }
  }
  return true;
}

inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
@@ -136,7 +138,7 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
  }
}

inline void AveragePool(const PoolParams& params,
inline bool AveragePool(const PoolParams& params,
                        const RuntimeShape& input_shape,
                        const int16_t* input_data,
                        const RuntimeShape& output_shape,
@@ -182,6 +184,7 @@ inline void AveragePool(const PoolParams& params,
            filter_count++;
          }
        }
        if (filter_count == 0) return false;
        // Round to the closest integer value.
        acc = acc > 0 ? (acc + filter_count / 2) / filter_count
                      : (acc - filter_count / 2) / filter_count;
@@ -193,6 +196,7 @@ inline void AveragePool(const PoolParams& params,
      }
    }
  }
  return true;
}

inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
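The hunks above change AveragePool from void to bool: it now returns false when a pooling window contains no valid elements (filter_count == 0) instead of dividing by zero. A hedged sketch of how a calling kernel might surface that failure; the wrapper function name and its exact wiring are hypothetical, only kTfLiteOk, kTfLiteError, TF_LITE_KERNEL_LOG and the AveragePool call itself come from TFLite, and includes are omitted.

// Hypothetical fragment of a kernel's Eval(); not taken from this commit.
TfLiteStatus EvalAveragePoolInt8Sketch(TfLiteContext* context,
                                       const PoolParams& params,
                                       const RuntimeShape& input_shape,
                                       const int8_t* input_data,
                                       const RuntimeShape& output_shape,
                                       int8_t* output_data) {
  // Propagate the new failure mode (an all-padding window) to the runtime
  // instead of silently dividing by zero.
  if (!reference_integer_ops::AveragePool(params, input_shape, input_data,
                                          output_shape, output_data)) {
    TF_LITE_KERNEL_LOG(context, "AveragePool: empty pooling window.");
    return kTfLiteError;
  }
  return kTfLiteOk;
}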
@@ -0,0 +1,256 @@
|
||||
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LOG_SOFTMAX_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LOG_SOFTMAX_H_
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstddef>
|
||||
#include <limits>
|
||||
|
||||
#include "fixedpoint/fixedpoint.h"
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_ops {
|
||||
|
||||
inline void LogSoftmax(const SoftmaxParams& params,
|
||||
const RuntimeShape& input_shape, const float* input_data,
|
||||
const RuntimeShape& output_shape, float* output_data) {
|
||||
const int trailing_dim = input_shape.DimensionsCount() - 1;
|
||||
const int outer_size =
|
||||
MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
|
||||
const int depth =
|
||||
MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
|
||||
|
||||
for (int i = 0; i < outer_size; ++i) {
|
||||
// Find max element value which we'll use to ensure numerical stability
|
||||
// taking advantage of the following equality:
|
||||
// log(exp(x[i])/sum(exp(x[i]))) == log(exp(x[i]+C)/sum(exp(x[i]+C)))
|
||||
float max = std::numeric_limits<float>::lowest();
|
||||
for (int c = 0; c < depth; ++c) {
|
||||
max = std::max(max, input_data[i * depth + c]);
|
||||
}
|
||||
|
||||
// Compute sum.
|
||||
float sum = 0.f;
|
||||
for (int c = 0; c < depth; ++c) {
|
||||
sum += std::exp(input_data[i * depth + c] - max);
|
||||
}
|
||||
|
||||
// Compute result.
|
||||
const float log_sum = std::log(sum);
|
||||
for (int c = 0; c < depth; ++c) {
|
||||
output_data[i * depth + c] = input_data[i * depth + c] - max - log_sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void LogSoftmax(const SoftmaxParams& params,
|
||||
const RuntimeShape& input_shape,
|
||||
const uint8_t* input_data,
|
||||
const RuntimeShape& output_shape, uint8_t* output_data) {
|
||||
const int32_t input_multiplier = params.input_multiplier;
|
||||
const int32_t input_left_shift = params.input_left_shift;
|
||||
const int32_t reverse_scaling_divisor = params.reverse_scaling_divisor;
|
||||
const int32_t reverse_scaling_right_shift =
|
||||
params.reverse_scaling_right_shift;
|
||||
const int diff_min = params.diff_min;
|
||||
// The representation chosen for the input to the exp() function is Q5.26.
|
||||
// We need to leave extra space since values that we skip might be as large
|
||||
// as -32 before multiplying by input_beta_multiplier, and therefore as
|
||||
// large as -16 afterwards. Note that exp(-8) is definitely not
|
||||
// insignificant to accumulation, but exp(-16) definitely is.
|
||||
static constexpr int kScaledDiffIntegerBits = 5;
|
||||
static constexpr int kAccumulationIntegerBits = 12;
|
||||
static constexpr int kOutputIntegerBits = 4;
|
||||
using FixedPointScaledDiff =
|
||||
gemmlowp::FixedPoint<int32_t, kScaledDiffIntegerBits>;
|
||||
using FixedPointAccum =
|
||||
gemmlowp::FixedPoint<int32_t, kAccumulationIntegerBits>;
|
||||
|
||||
const int trailing_dim = input_shape.DimensionsCount() - 1;
|
||||
const int outer_size =
|
||||
MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
|
||||
const int depth =
|
||||
MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
|
||||
|
||||
for (int i = 0; i < outer_size; ++i) {
|
||||
uint8_t max_in_row = 0;
|
||||
for (int c = 0; c < depth; ++c) {
|
||||
max_in_row = std::max(max_in_row, input_data[i * depth + c]);
|
||||
}
|
||||
|
||||
FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
|
||||
for (int c = 0; c < depth; ++c) {
|
||||
int32_t input_diff =
|
||||
static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
|
||||
if (input_diff >= diff_min) {
|
||||
const int32_t input_diff_rescaled =
|
||||
MultiplyByQuantizedMultiplierGreaterThanOne(
|
||||
input_diff, input_multiplier, input_left_shift);
|
||||
const FixedPointScaledDiff scaled_diff_f8 =
|
||||
FixedPointScaledDiff::FromRaw(input_diff_rescaled);
|
||||
sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
|
||||
exp_on_negative_values(scaled_diff_f8));
|
||||
}
|
||||
}
|
||||
|
||||
const int32_t fixed_log_sum_of_exps =
|
||||
log_x_for_x_greater_than_or_equal_to_1<kScaledDiffIntegerBits>(
|
||||
sum_of_exps)
|
||||
.raw();
|
||||
|
||||
// rescaled_diff_min is smallest representable in
|
||||
// Q(kScaledDiffIntegerBits).(31-kScaledDiffIntegerBits) plus the
|
||||
// log-sub-exps that will be subtracted in the loop.
|
||||
//
|
||||
// The thresholds diff_min, etc are negative.
|
||||
const int rescaled_diff_min =
|
||||
fixed_log_sum_of_exps + std::numeric_limits<int32_t>::lowest();
|
||||
const int adjusted_diff_min =
|
||||
std::max(static_cast<int32_t>(
|
||||
diff_min - 1), // Note use of > below instead of >= above.
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
rescaled_diff_min, reverse_scaling_divisor,
|
||||
-reverse_scaling_right_shift));
|
||||
|
||||
for (int c = 0; c < depth; ++c) {
|
||||
int32_t input_diff =
|
||||
static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
|
||||
if (input_diff > adjusted_diff_min) {
|
||||
const int32_t input_diff_rescaled =
|
||||
MultiplyByQuantizedMultiplierGreaterThanOne(
|
||||
input_diff, input_multiplier, input_left_shift);
|
||||
int32_t unsat_output =
|
||||
gemmlowp::RoundingDivideByPOT(
|
||||
(input_diff_rescaled - fixed_log_sum_of_exps),
|
||||
31 - kScaledDiffIntegerBits - kOutputIntegerBits) +
|
||||
255;
|
||||
|
||||
output_data[i * depth + c] = static_cast<uint8_t>(
|
||||
std::max(std::min(unsat_output, static_cast<int32_t>(255)),
|
||||
static_cast<int32_t>(0)));
|
||||
} else {
|
||||
// Set output to smallest value.
|
||||
output_data[i * depth + c] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void LogSoftmaxQuantized(const SoftmaxParams& params,
|
||||
const size_t outer_size, const size_t depth,
|
||||
const RuntimeShape& input_shape,
|
||||
const T* input_data,
|
||||
const RuntimeShape& output_shape,
|
||||
T* output_data) {
|
||||
const int32_t input_multiplier = params.input_multiplier;
|
||||
const int32_t input_left_shift = params.input_left_shift;
|
||||
const int32_t reverse_scaling_divisor = params.reverse_scaling_divisor;
|
||||
const int32_t reverse_scaling_right_shift =
|
||||
params.reverse_scaling_right_shift;
|
||||
const int diff_min = params.diff_min;
|
||||
|
||||
static constexpr T kMinT8 = std::numeric_limits<T>::min();
|
||||
static constexpr T kMaxT8 = std::numeric_limits<T>::max();
|
||||
static constexpr int32_t kMinInt32 = std::numeric_limits<int32_t>::min();
|
||||
|
||||
// All IntegerBits must agree with Prepare function.
|
||||
// Input is chosen as Q5.26 so exp(-1 * 2^5 * 2^-1) = exp(-16) is negligible.
|
||||
static constexpr int kInputIntegerBits = 5;
|
||||
static constexpr int kAccumulationIntegerBits = 12;
|
||||
static constexpr int kOutputIntegerBits = 4;
|
||||
using F5 = gemmlowp::FixedPoint<int32_t, kInputIntegerBits>;
|
||||
using F12 = gemmlowp::FixedPoint<int32_t, kAccumulationIntegerBits>;
|
||||
|
||||
for (size_t outer_index = 0; outer_index < outer_size; ++outer_index) {
|
||||
T max_in_row = kMinT8;
|
||||
for (size_t inner_index = 0; inner_index < depth; ++inner_index) {
|
||||
max_in_row =
|
||||
std::max(max_in_row, input_data[outer_index * depth + inner_index]);
|
||||
}
|
||||
|
||||
// Accumulator "sum_of_exps_in_q12" is safe from overflowing in 2^12 steps.
|
||||
F12 sum_of_exps_in_q12 = F12::FromRaw(0);
|
||||
for (size_t inner_index = 0; inner_index < depth; ++inner_index) {
|
||||
int32_t input_diff =
|
||||
static_cast<int32_t>(input_data[outer_index * depth + inner_index]) -
|
||||
max_in_row;
|
||||
if (input_diff >= diff_min) {
|
||||
const int32_t input_diff_in_q5 = MultiplyByQuantizedMultiplier(
|
||||
input_diff, input_multiplier, input_left_shift);
|
||||
sum_of_exps_in_q12 =
|
||||
sum_of_exps_in_q12 +
|
||||
gemmlowp::Rescale<kAccumulationIntegerBits>(
|
||||
exp_on_negative_values(F5::FromRaw(input_diff_in_q5)));
|
||||
}
|
||||
}
|
||||
|
||||
const int32_t log_sum_of_exps_in_q5 =
|
||||
log_x_for_x_greater_than_or_equal_to_1<kInputIntegerBits>(
|
||||
sum_of_exps_in_q12)
|
||||
.raw();
|
||||
|
||||
    // Potentially reduces the valid range. shifted_log_sum_of_exps_in_q5 is
    // the smallest value representable in Q5.26 plus the log_sum_of_exps.
|
||||
const int32_t shifted_log_sum_of_exps_in_q5 =
|
||||
log_sum_of_exps_in_q5 + kMinInt32;
|
||||
const int32_t adjusted_diff_min =
|
||||
std::max(static_cast<int32_t>(diff_min - 1),
|
||||
MultiplyByQuantizedMultiplier(shifted_log_sum_of_exps_in_q5,
|
||||
reverse_scaling_divisor,
|
||||
-reverse_scaling_right_shift));
|
||||
|
||||
for (size_t inner_index = 0; inner_index < depth; ++inner_index) {
|
||||
int32_t input_diff =
|
||||
static_cast<int32_t>(input_data[outer_index * depth + inner_index]) -
|
||||
max_in_row;
|
||||
// Note use of > below instead of >= above.
|
||||
if (input_diff > adjusted_diff_min) {
|
||||
const int32_t input_diff_in_q5 = MultiplyByQuantizedMultiplier(
|
||||
input_diff, input_multiplier, input_left_shift);
|
||||
|
||||
// Rescale and downcast.
|
||||
int32_t output_in_q27 =
|
||||
gemmlowp::RoundingDivideByPOT(
|
||||
(input_diff_in_q5 - log_sum_of_exps_in_q5),
|
||||
31 - kInputIntegerBits - kOutputIntegerBits) +
|
||||
kMaxT8;
|
||||
|
||||
output_in_q27 =
|
||||
std::max(std::min(output_in_q27, static_cast<int32_t>(kMaxT8)),
|
||||
static_cast<int32_t>(kMinT8));
|
||||
output_data[outer_index * depth + inner_index] =
|
||||
static_cast<T>(output_in_q27);
|
||||
} else {
|
||||
output_data[outer_index * depth + inner_index] = kMinT8;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void LogSoftmax(const SoftmaxParams& params, const size_t outer_size,
|
||||
const size_t depth, const RuntimeShape& input_shape,
|
||||
const int8_t* input_data,
|
||||
const RuntimeShape& output_shape, int8_t* output_data) {
|
||||
LogSoftmaxQuantized(params, outer_size, depth, input_shape, input_data,
|
||||
output_shape, output_data);
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LOG_SOFTMAX_H_
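The float LogSoftmax above relies on the identity log(exp(x_i) / sum_j exp(x_j)) = (x_i - max) - log(sum_j exp(x_j - max)): subtracting the row maximum keeps exp() from overflowing. A self-contained sketch of that computation on a std::vector; it mirrors the reference loop but is not the TFLite function.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Standalone sketch of the stabilised log-softmax used by the float kernel:
// subtract the row maximum before exponentiating, then subtract log(sum).
std::vector<float> LogSoftmaxSketch(const std::vector<float>& x) {
  const float max = *std::max_element(x.begin(), x.end());
  float sum = 0.f;
  for (float v : x) sum += std::exp(v - max);
  const float log_sum = std::log(sum);
  std::vector<float> out(x.size());
  for (size_t i = 0; i < x.size(); ++i) out[i] = x[i] - max - log_sum;
  return out;
}

int main() {
  // exp() of the outputs sums to 1, as for softmax, but the values stay in
  // log space and are therefore all <= 0.
  for (float v : LogSoftmaxSketch({1.f, 2.f, 3.f})) std::printf("%f\n", v);
  return 0;
}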
@@ -51,7 +51,7 @@ inline void Mul(const ArithmeticParams& params,
  GetActivationParams(params, &output_activation_min, &output_activation_max);

  const int flat_size =
      MatchingFlatSize(input1_shape, input2_shape, output_shape);
      MatchingExtendedShapeFlatSize(input1_shape, input2_shape, output_shape);
  for (int i = 0; i < flat_size; ++i) {
    output_data[i] = ActivationFunctionWithMinMax(
        input1_data[i] * input2_data[i], output_activation_min,
@@ -66,7 +66,7 @@ inline void Mul(const ArithmeticParams& params,
  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);
  const int flat_size =
      MatchingFlatSize(input1_shape, input2_shape, output_shape);
      MatchingExtendedShapeFlatSize(input1_shape, input2_shape, output_shape);

  MulElementwise(flat_size, params, input1_data, input2_data, output_data);
}
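The hunks above swap MatchingFlatSize for MatchingExtendedShapeFlatSize, which, as I read the name, first extends the shapes to a common rank (prepending size-1 dimensions) before checking that they match. The helper below only illustrates that shape-extension idea with plain vectors; it is not the TFLite function and its behaviour here is an assumption.

#include <algorithm>
#include <cassert>
#include <vector>

// Illustrative helper (not the TFLite one): prepend 1s so both shapes have
// the same rank, the "extended shape" idea behind the renamed call.
std::vector<int> ExtendShapeTo(const std::vector<int>& shape, size_t rank) {
  std::vector<int> extended(rank, 1);
  std::copy(shape.begin(), shape.end(),
            extended.begin() + (rank - shape.size()));
  return extended;
}

int main() {
  // {2, 3} extended to rank 4 becomes {1, 1, 2, 3}; its flat size (2 * 3)
  // is unchanged, so element-wise Mul can still iterate one flat loop.
  const std::vector<int> shape = {2, 3};
  const std::vector<int> extended = ExtendShapeTo(shape, 4);
  const std::vector<int> expected = {1, 1, 2, 3};
  assert(extended == expected);
  return 0;
}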
@@ -24,8 +24,8 @@ namespace tflite {

namespace reference_ops {

// TFLite Pad supports activation tensors with up to 4 dimensions.
constexpr int PadKernelMaxDimensionCount() { return 4; }
// TFLite Pad supports activation tensors with up to 5 dimensions.
constexpr int PadKernelMaxDimensionCount() { return 5; }

// There are two versions of pad: Pad and PadV2. In PadV2 there is a second
// scalar input that provides the padding value. Therefore pad_value_ptr can be
@@ -46,8 +46,8 @@ inline void PadImpl(const tflite::PadParams& op_params,
  TFLITE_DCHECK_LE(op_params.left_padding_count, PadKernelMaxDimensionCount());
  TFLITE_DCHECK_LE(op_params.right_padding_count, PadKernelMaxDimensionCount());

  // Runtime calls are currently fixed at 4 dimensions. Copy inputs so we can
  // pad them to 4 dims (yes, we are "padding the padding").
  // Runtime calls are currently fixed at 5 dimensions. Copy inputs so we can
  // pad them to 5 dims (yes, we are "padding the padding").
  int left_padding_copy[PadKernelMaxDimensionCount()];
  for (int i = 0; i < PadKernelMaxDimensionCount(); i++) {
    left_padding_copy[i] = 0;
@@ -67,39 +67,46 @@ inline void PadImpl(const tflite::PadParams& op_params,
  }

  const int output_batch = ext_output_shape.Dims(0);
  const int output_height = ext_output_shape.Dims(1);
  const int output_width = ext_output_shape.Dims(2);
  const int output_depth = ext_output_shape.Dims(3);
  const int output_plane = ext_output_shape.Dims(1);
  const int output_height = ext_output_shape.Dims(2);
  const int output_width = ext_output_shape.Dims(3);
  const int output_depth = ext_output_shape.Dims(4);

  const int left_b_padding = left_padding_copy[0];
  const int left_h_padding = left_padding_copy[1];
  const int left_w_padding = left_padding_copy[2];
  const int left_d_padding = left_padding_copy[3];
  const int left_p_padding = left_padding_copy[1];
  const int left_h_padding = left_padding_copy[2];
  const int left_w_padding = left_padding_copy[3];
  const int left_d_padding = left_padding_copy[4];

  const int right_b_padding = right_padding_copy[0];
  const int right_h_padding = right_padding_copy[1];
  const int right_w_padding = right_padding_copy[2];
  const int right_d_padding = right_padding_copy[3];
  const int right_p_padding = right_padding_copy[1];
  const int right_h_padding = right_padding_copy[2];
  const int right_w_padding = right_padding_copy[3];
  const int right_d_padding = right_padding_copy[4];

  const T pad_value = *pad_value_ptr;

  const T* in_ptr = input_data;
  T* out_ptr = output_data;
  for (int out_b = 0; out_b < output_batch; ++out_b) {
    for (int out_h = 0; out_h < output_height; ++out_h) {
      for (int out_w = 0; out_w < output_width; ++out_w) {
        for (int out_d = 0; out_d < output_depth; ++out_d) {
          if (out_b < left_b_padding ||
              out_b >= output_batch - right_b_padding ||
              out_h < left_h_padding ||
              out_h >= output_height - right_h_padding ||
              out_w < left_w_padding ||
              out_w >= output_width - right_w_padding ||
              out_d < left_d_padding ||
              out_d >= output_depth - right_d_padding) {
            *out_ptr++ = pad_value;
          } else {
            *out_ptr++ = *in_ptr++;
    for (int out_p = 0; out_p < output_plane; ++out_p) {
      for (int out_h = 0; out_h < output_height; ++out_h) {
        for (int out_w = 0; out_w < output_width; ++out_w) {
          for (int out_d = 0; out_d < output_depth; ++out_d) {
            if (out_b < left_b_padding ||
                out_b >= output_batch - right_b_padding ||
                out_p < left_p_padding ||
                out_p >= output_plane - right_p_padding ||
                out_h < left_h_padding ||
                out_h >= output_height - right_h_padding ||
                out_w < left_w_padding ||
                out_w >= output_width - right_w_padding ||
                out_d < left_d_padding ||
                out_d >= output_depth - right_d_padding) {
              *out_ptr++ = pad_value;
            } else {
              *out_ptr++ = *in_ptr++;
            }
          }
        }
      }
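PadImpl "pads the padding": a caller-supplied padding spec with fewer than five entries is copied into fixed five-entry arrays, right-aligned onto the innermost dimensions, with the leading entries zeroed. A hedged sketch of that preparation step; the function name is hypothetical and only mirrors the copy loops in PadImpl.

#include <cassert>

// Illustrative only: mirrors the left_padding_copy/right_padding_copy
// preparation in PadImpl for a rank-3 padding spec and the 5-D kernel.
constexpr int kMaxDims = 5;  // stands in for PadKernelMaxDimensionCount()

void PadThePadding(const int* padding, int padding_count,
                   int padding_copy[kMaxDims]) {
  for (int i = 0; i < kMaxDims; ++i) padding_copy[i] = 0;
  // Align the given entries to the innermost dimensions.
  for (int i = 0; i < padding_count; ++i) {
    padding_copy[i + kMaxDims - padding_count] = padding[i];
  }
}

int main() {
  const int left_padding[3] = {1, 2, 0};  // rank-3 operator-level spec
  int left_padding_copy[kMaxDims];
  PadThePadding(left_padding, 3, left_padding_copy);
  // Becomes {0, 0, 1, 2, 0}: batch and plane get zero padding.
  assert(left_padding_copy[0] == 0 && left_padding_copy[1] == 0 &&
         left_padding_copy[2] == 1 && left_padding_copy[3] == 2 &&
         left_padding_copy[4] == 0);
  return 0;
}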
@@ -23,7 +23,7 @@ limitations under the License.
namespace tflite {
namespace reference_ops {

inline void AveragePool(const PoolParams& params,
inline bool AveragePool(const PoolParams& params,
                        const RuntimeShape& input_shape,
                        const float* input_data,
                        const RuntimeShape& output_shape, float* output_data) {
@@ -66,6 +66,7 @@ inline void AveragePool(const PoolParams& params,
            filter_count++;
          }
        }
        if (filter_count == 0) return false;
        const float average = total / filter_count;
        output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
            ActivationFunctionWithMinMax(average, params.float_activation_min,
@@ -74,9 +75,10 @@ inline void AveragePool(const PoolParams& params,
      }
    }
  }
  return true;
}

inline void AveragePool(const PoolParams& params,
inline bool AveragePool(const PoolParams& params,
                        const RuntimeShape& input_shape,
                        const uint8_t* input_data,
                        const RuntimeShape& output_shape,
@@ -122,6 +124,7 @@ inline void AveragePool(const PoolParams& params,
            filter_count++;
          }
        }
        if (filter_count == 0) return false;
        acc = (acc + filter_count / 2) / filter_count;
        acc = std::max(acc, params.quantized_activation_min);
        acc = std::min(acc, params.quantized_activation_max);
@@ -131,6 +134,7 @@ inline void AveragePool(const PoolParams& params,
      }
    }
  }
  return true;
}

inline void L2Pool(const PoolParams& params, const RuntimeShape& input_shape,
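The quantized AveragePool variants above round the window sum to the nearest integer: the uint8 path adds filter_count / 2 before dividing (the sum is never negative), while the int8/int16 paths subtract it for negative sums so the rounding stays symmetric. A small self-contained check of that arithmetic; the helper is illustrative only.

#include <cassert>

// Round-to-nearest integer division as used by the pooling kernels above.
int RoundedAverage(int acc, int filter_count) {
  return acc >= 0 ? (acc + filter_count / 2) / filter_count
                  : (acc - filter_count / 2) / filter_count;
}

int main() {
  assert(RoundedAverage(10, 4) == 3);    // 2.5 rounds up to 3
  assert(RoundedAverage(9, 4) == 2);     // 2.25 rounds down to 2
  assert(RoundedAverage(-10, 4) == -3);  // -2.5 rounds away from zero
  return 0;
}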
@@ -0,0 +1,774 @@
|
||||
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <limits>
|
||||
#include <utility>
|
||||
|
||||
#include "fixedpoint/fixedpoint.h"
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/compatibility.h"
|
||||
#include "tensorflow/lite/kernels/internal/cppmath.h"
|
||||
#include "tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h"
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#define __restrict__ __restrict
|
||||
#endif
|
||||
|
||||
namespace tflite {
|
||||
namespace tensor_utils {
|
||||
|
||||
namespace {
|
||||
const int32_t kInt16Max = std::numeric_limits<int16_t>::max();
|
||||
const int32_t kInt16Min = std::numeric_limits<int16_t>::min();
|
||||
} // namespace
|
||||
|
||||
void PortableSymmetricQuantizeFloats(const float* values, const int size,
|
||||
int8_t* quantized_values, float* min_value,
|
||||
float* max_value, float* scaling_factor) {
|
||||
auto minmax = std::minmax_element(values, values + size);
|
||||
*min_value = *minmax.first;
|
||||
*max_value = *minmax.second;
|
||||
|
||||
PortableSymmetricQuantizeFloats(values, size, quantized_values, *min_value,
|
||||
*max_value, scaling_factor);
|
||||
}
|
||||
|
||||
void PortableSymmetricQuantizeFloats(const float* values, const int size,
|
||||
int8_t* quantized_values, float min_value,
|
||||
float max_value, float* scaling_factor) {
|
||||
const int32_t kScale = 127;
|
||||
const float range = std::max(std::abs(min_value), std::abs(max_value));
|
||||
if (range == 0) {
|
||||
memset(quantized_values, 0, size * sizeof(int8_t));
|
||||
*scaling_factor = 1;
|
||||
return;
|
||||
}
|
||||
*scaling_factor = range / kScale;
|
||||
const float scaling_factor_inv = kScale / range;
|
||||
for (int i = 0; i < size; ++i) {
|
||||
const int32_t quantized_value =
|
||||
static_cast<int32_t>(TfLiteRound(values[i] * scaling_factor_inv));
|
||||
// Clamp: just in case some odd numeric offset.
|
||||
quantized_values[i] = static_cast<int8_t>(
|
||||
std::min(kScale, std::max(-kScale, quantized_value)));
|
||||
}
|
||||
}
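PortableSymmetricQuantizeFloats above maps the value range symmetrically onto int8 with scale = max|x| / 127. A standalone sketch of that scheme with a tiny input; the function below is illustrative and is not the Portable* routine itself.

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <vector>

// Standalone sketch of the symmetric scheme above: scale = max|x| / 127,
// quantized = round(x / scale), clamped to [-127, 127].
void SymmetricQuantizeSketch(const std::vector<float>& values,
                             std::vector<int8_t>* quantized, float* scale) {
  float range = 0.f;
  for (float v : values) range = std::max(range, std::abs(v));
  *scale = range / 127.f;
  quantized->resize(values.size());
  for (size_t i = 0; i < values.size(); ++i) {
    const int32_t q = static_cast<int32_t>(std::round(values[i] / *scale));
    (*quantized)[i] = static_cast<int8_t>(std::min(127, std::max(-127, q)));
  }
}

int main() {
  std::vector<int8_t> q;
  float scale = 0.f;
  SymmetricQuantizeSketch({-5.0f, 2.0f, 0.f}, &q, &scale);
  // scale == 5 / 127, so -5 maps to -127 and 2.0 maps to round(50.8) == 51.
  assert(q[0] == -127 && q[1] == 51 && q[2] == 0);
  return 0;
}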
|
||||
|
||||
void PortableAsymmetricQuantizeFloats(const float* values, const int size,
|
||||
int8_t* quantized_values,
|
||||
float* scaling_factor, int32_t* offset) {
|
||||
const int32_t kMinScale = -128;
|
||||
const int32_t kMaxScale = 127;
|
||||
const double qmin_double = kMinScale;
|
||||
const double qmax_double = kMaxScale;
|
||||
const auto minmax = std::minmax_element(values, values + size);
|
||||
const double rmin = std::fmin(0, *minmax.first);
|
||||
const double rmax = std::fmax(0, *minmax.second);
|
||||
if (rmin == rmax) {
|
||||
memset(quantized_values, 0, size * sizeof(int8_t));
|
||||
*scaling_factor = 1;
|
||||
*offset = 0;
|
||||
return;
|
||||
} else {
|
||||
double scale = (rmax - rmin) / (qmax_double - qmin_double);
|
||||
const double zero_point_from_min = qmin_double - rmin / scale;
|
||||
const double zero_point_from_max = qmax_double - rmax / scale;
|
||||
const double zero_point_from_min_error =
|
||||
std::abs(qmin_double) + std::abs(rmin / scale);
|
||||
const double zero_point_from_max_error =
|
||||
std::abs(qmax_double) + std::abs(rmax / scale);
|
||||
const double zero_point_double =
|
||||
zero_point_from_min_error < zero_point_from_max_error
|
||||
? zero_point_from_min
|
||||
: zero_point_from_max;
|
||||
int8_t nudged_zero_point = 0;
|
||||
if (zero_point_double <= qmin_double) {
|
||||
nudged_zero_point = kMinScale;
|
||||
} else if (zero_point_double >= qmax_double) {
|
||||
nudged_zero_point = kMaxScale;
|
||||
} else {
|
||||
nudged_zero_point = static_cast<int8_t>(round(zero_point_double));
|
||||
}
|
||||
*scaling_factor = scale;
|
||||
*offset = nudged_zero_point;
|
||||
}
|
||||
const float scaling_factor_inv = 1.0f / *scaling_factor;
|
||||
for (int i = 0; i < size; ++i) {
|
||||
const int32_t quantized_value = static_cast<int32_t>(
|
||||
TfLiteRound(*offset + values[i] * scaling_factor_inv));
|
||||
quantized_values[i] =
|
||||
std::min(kMaxScale, std::max(kMinScale, quantized_value));
|
||||
}
|
||||
}
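PortableAsymmetricQuantizeFloats above derives a scale and a nudged zero point so that real 0.0 has an exact int8 representation. A simplified, hedged sketch of that parameter computation: unlike the reference, it derives the zero point only from the range minimum instead of picking the lower-error of the two candidates, and the helper name is hypothetical.

#include <cassert>
#include <cmath>

// Simplified sketch of the asymmetric parameters: map [rmin, rmax] (extended
// to include 0) onto [-128, 127] and nudge the zero point into that range.
void AsymmetricParamsSketch(float rmin, float rmax, float* scale,
                            int* zero_point) {
  rmin = std::fmin(0.f, rmin);
  rmax = std::fmax(0.f, rmax);
  *scale = (rmax - rmin) / (127.f - (-128.f));
  const float zp = -128.f - rmin / *scale;
  *zero_point =
      static_cast<int>(std::round(std::fmin(127.f, std::fmax(-128.f, zp))));
}

int main() {
  float scale;
  int zero_point;
  AsymmetricParamsSketch(0.f, 25.5f, &scale, &zero_point);
  assert(zero_point == -128);  // real 0.0 maps to the int8 minimum
  assert(std::abs(scale - 0.1f) < 1e-6f);
  return 0;
}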
|
||||
|
||||
void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
|
||||
int m_rows, int m_cols,
|
||||
const float* vector,
|
||||
int n_batch, float* result) {
|
||||
float* result_in_batch = result;
|
||||
for (int b = 0; b < n_batch; b++) {
|
||||
const float* matrix_ptr = matrix;
|
||||
for (int r = 0; r < m_rows; r++) {
|
||||
float dot_prod = 0.0f;
|
||||
const float* vector_in_batch = vector + b * m_cols;
|
||||
for (int c = 0; c < m_cols; c++) {
|
||||
dot_prod += *matrix_ptr++ * *vector_in_batch++;
|
||||
}
|
||||
*result_in_batch += dot_prod;
|
||||
++result_in_batch;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void PortableMatrixBatchVectorMultiplyAccumulate(
|
||||
const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
|
||||
const int8_t* __restrict__ vectors, const float* scaling_factors,
|
||||
int n_batch, float* __restrict__ result) {
|
||||
for (int batch = 0; batch < n_batch; ++batch, vectors += m_cols) {
|
||||
const float batch_scaling_factor = scaling_factors[batch];
|
||||
// Get the address of the first row.
|
||||
const int8_t* row_ptr = matrix;
|
||||
for (int row = 0; row < m_rows; ++row) {
|
||||
// Initialize the dot product sum for the row to 0.
|
||||
int32_t dotprod = 0;
|
||||
#if defined(__GNUC__)
|
||||
// Prefetch the row to cache.
|
||||
__builtin_prefetch(row_ptr, 0 /* prefetch for read */,
|
||||
3 /* temporal locality */);
|
||||
#endif
|
||||
for (int col = 0; col < m_cols; ++col, ++row_ptr) {
|
||||
dotprod += (*row_ptr) * (vectors[col]);
|
||||
} // for col
|
||||
*result += dotprod * batch_scaling_factor;
|
||||
++result;
|
||||
} // for row
|
||||
} // for batch
|
||||
}
|
||||
|
||||
void PortableMatrixBatchVectorMultiplyAccumulate(
|
||||
const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
|
||||
const int8_t* __restrict__ vectors, const float* scaling_factors,
|
||||
int n_batch, float* __restrict__ result, const float* per_channel_scale,
|
||||
const int32_t* input_offset, int32_t* scratch, int32_t* row_sums,
|
||||
bool* compute_row_sums, CpuBackendContext* context) {
|
||||
if (input_offset == nullptr) {
|
||||
PortableMatrixBatchVectorMultiplyAccumulate(
|
||||
matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result);
|
||||
return;
|
||||
}
|
||||
if (!compute_row_sums || *compute_row_sums) {
|
||||
PortableReductionSumVector(matrix, row_sums, m_rows, m_cols);
|
||||
if (compute_row_sums) {
|
||||
*compute_row_sums = false;
|
||||
}
|
||||
}
|
||||
|
||||
for (int batch = 0; batch < n_batch; ++batch, vectors += m_cols) {
|
||||
const float batch_scaling_factor = scaling_factors[batch];
|
||||
const int32_t batch_offset = input_offset[batch];
|
||||
const int8_t* row_ptr = matrix;
|
||||
for (int row = 0; row < m_rows; ++row) {
|
||||
int32_t dotprod = 0;
|
||||
float scale = batch_scaling_factor;
|
||||
if (per_channel_scale) {
|
||||
scale *= per_channel_scale[row];
|
||||
}
|
||||
#if defined(__GNUC__)
|
||||
// Prefetch the row to cache.
|
||||
__builtin_prefetch(row_ptr, 0 /* prefetch for read */,
|
||||
3 /* temporal locality */);
|
||||
#endif
|
||||
for (int col = 0; col < m_cols; ++col, ++row_ptr) {
|
||||
dotprod += (*row_ptr) * vectors[col];
|
||||
} // for col
|
||||
dotprod -= row_sums[row] * batch_offset;
|
||||
*result += dotprod * scale;
|
||||
++result;
|
||||
} // for row
|
||||
} // for batch
|
||||
}
|
||||
|
||||
void PortableSparseMatrixBatchVectorMultiplyAccumulate1x4(
|
||||
const float* __restrict__ matrix, const int32_t* __restrict__ segments,
|
||||
const int32_t* __restrict__ indices, int m_rows, int m_cols,
|
||||
const float* __restrict__ vector, int n_batch, float* __restrict__ result) {
|
||||
const int kBlockSize = 4;
|
||||
TFLITE_DCHECK_EQ(m_cols % kBlockSize, 0);
|
||||
for (int batch = 0; batch < n_batch; batch++) {
|
||||
const float* matrix_ptr = matrix;
|
||||
for (int row = 0; row < m_rows; row++) {
|
||||
float dot_prod = 0.0f;
|
||||
const float* vector_in_batch = vector + batch * m_cols;
|
||||
for (int i = segments[row]; i < segments[row + 1]; i++) {
|
||||
const int block_start_index = indices[i] * kBlockSize;
|
||||
const float* vector_block_in_batch_ptr =
|
||||
vector_in_batch + block_start_index;
|
||||
for (int c = 0; c < kBlockSize; c++) {
|
||||
dot_prod += *matrix_ptr++ * *vector_block_in_batch_ptr++;
|
||||
}
|
||||
}
|
||||
result[batch * m_rows + row] += dot_prod;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void PortableSparseMatrixBatchVectorMultiplyAccumulate(
|
||||
const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
|
||||
int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
|
||||
float* __restrict__ result) {
|
||||
const int kBlockSize = 16;
|
||||
TFLITE_DCHECK_EQ( // NOLINT
|
||||
m_cols % kBlockSize, 0);
|
||||
for (int batch = 0; batch < n_batch; batch++) {
|
||||
const float* matrix_ptr = matrix;
|
||||
const uint8_t* ledger_ptr = ledger;
|
||||
for (int row = 0; row < m_rows; row++) {
|
||||
float dot_prod = 0.0f;
|
||||
int num_nonzero_blocks = *ledger_ptr++;
|
||||
if (num_nonzero_blocks > 0) {
|
||||
const float* vector_in_batch = vector + batch * m_cols;
|
||||
for (int i = 0; i < num_nonzero_blocks; i++) {
|
||||
const int block_start_index = *ledger_ptr++ * kBlockSize;
|
||||
const float* vector_block_in_batch_ptr =
|
||||
vector_in_batch + block_start_index;
|
||||
for (int c = 0; c < kBlockSize; c++) {
|
||||
dot_prod += *matrix_ptr++ * *vector_block_in_batch_ptr++;
|
||||
}
|
||||
}
|
||||
}
|
||||
result[batch * m_rows + row] += dot_prod;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void PortableSparseMatrixBatchVectorMultiplyAccumulate(
|
||||
const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
|
||||
const int m_cols, const int8_t* __restrict__ vectors,
|
||||
const float* scaling_factors, int n_batch, float* __restrict__ result) {
|
||||
static const int kBlockSize = 16;
|
||||
TFLITE_DCHECK_EQ( // NOLINT
|
||||
m_cols % kBlockSize, 0);
|
||||
for (int batch = 0; batch < n_batch; ++batch, vectors += m_cols) {
|
||||
const float batch_scaling_factor = scaling_factors[batch];
|
||||
const uint8_t* ledger_ptr = ledger;
|
||||
// Get the address of the first row.
|
||||
const int8_t* row_ptr = matrix;
|
||||
for (int row = 0; row < m_rows; ++row) {
|
||||
// Initialize the dot product sum for the row to 0.
|
||||
int32_t dotprod = 0;
|
||||
#if defined(__GNUC__)
|
||||
// Prefetch the row to cache.
|
||||
__builtin_prefetch(row_ptr, 0 /* prefetch for read */,
|
||||
3 /* temporal locality */);
|
||||
#endif
|
||||
int num_nonzero_blocks = *ledger_ptr++;
|
||||
for (int i = 0; i < num_nonzero_blocks; i++) {
|
||||
const int block_start_index = *ledger_ptr++ * kBlockSize;
|
||||
const int8_t* vector_block_ptr = vectors + block_start_index;
|
||||
for (int c = 0; c < kBlockSize; c++) {
|
||||
dotprod += (*row_ptr++) * (*vector_block_ptr++);
|
||||
} // for block
|
||||
} // for num_nonzero_blocks
|
||||
result[batch * m_rows + row] += dotprod * batch_scaling_factor;
|
||||
} // for row
|
||||
} // for batch
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void PortableMatrixBatchVectorMultiplyAccumulateImpl(
|
||||
const int8_t* input, const int32_t* bias,
|
||||
const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
|
||||
int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
|
||||
T* output) {
|
||||
const int16_t output_max = std::numeric_limits<T>::max();
|
||||
const int16_t output_min = std::numeric_limits<T>::min();
|
||||
for (int batch = 0; batch < n_batch; ++batch) {
|
||||
for (int row = 0; row < n_output; ++row) {
|
||||
int32_t acc = bias[row];
|
||||
for (int col = 0; col < n_input; ++col) {
|
||||
int8_t input_val = input[batch * n_input + col];
|
||||
int8_t weights_val = input_to_gate_weights[row * n_input + col];
|
||||
acc += input_val * weights_val;
|
||||
}
|
||||
acc = MultiplyByQuantizedMultiplier(acc, multiplier, shift);
|
||||
acc += output_zp;
|
||||
acc += output[batch * n_output + row];
|
||||
if (acc > output_max) {
|
||||
acc = output_max;
|
||||
}
|
||||
if (acc < output_min) {
|
||||
acc = output_min;
|
||||
}
|
||||
output[batch * n_output + row] = static_cast<T>(acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void PortableMatrixBatchVectorMultiplyAccumulate(
|
||||
const int8_t* input, const int32_t* bias,
|
||||
const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
|
||||
int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
|
||||
int32_t* scratch, int16_t* output, CpuBackendContext* context) {
|
||||
PortableMatrixBatchVectorMultiplyAccumulateImpl(
|
||||
input, bias, input_to_gate_weights, multiplier, shift, n_batch, n_input,
|
||||
n_output, output_zp, output);
|
||||
}
|
||||
|
||||
void PortableMatrixBatchVectorMultiplyAccumulate(
|
||||
const int8_t* input, const int32_t* bias,
|
||||
const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
|
||||
int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
|
||||
int32_t* scratch, int8_t* output, CpuBackendContext* context) {
|
||||
PortableMatrixBatchVectorMultiplyAccumulateImpl(
|
||||
input, bias, input_to_gate_weights, multiplier, shift, n_batch, n_input,
|
||||
n_output, output_zp, output);
|
||||
}
|
||||
|
||||
void PortableMatrixBatchVectorMultiply(const int8_t* input,
|
||||
int32_t input_zeropoint,
|
||||
const int8_t* input_to_gate_weights,
|
||||
int32_t input_to_gate_effective_scale_a,
|
||||
int32_t input_to_gate_effective_scale_b,
|
||||
int32_t n_batch, int32_t n_input,
|
||||
int32_t n_cell, int8_t* gate_output,
|
||||
int8_t gate_output_zp) {
|
||||
const int32_t int8_max = std::numeric_limits<int8_t>::max();
|
||||
const int32_t int8_min = std::numeric_limits<int8_t>::min();
|
||||
for (int batch = 0; batch < n_batch; ++batch) {
|
||||
for (int row = 0; row < n_cell; ++row) {
|
||||
int32_t acc = 0;
|
||||
for (int col = 0; col < n_input; ++col) {
|
||||
int32_t input_val = input[batch * n_input + col];
|
||||
int8_t weights_val = input_to_gate_weights[row * n_input + col];
|
||||
acc += (input_val - input_zeropoint) * weights_val;
|
||||
}
|
||||
acc = MultiplyByQuantizedMultiplier(acc, input_to_gate_effective_scale_a,
|
||||
input_to_gate_effective_scale_b);
|
||||
acc += gate_output_zp;
|
||||
if (acc > int8_max) {
|
||||
acc = int8_max;
|
||||
}
|
||||
if (acc < int8_min) {
|
||||
acc = int8_min;
|
||||
}
|
||||
gate_output[batch * n_cell + row] = static_cast<int8_t>(acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void PortableMatrixBatchVectorMultiply(
|
||||
const int16_t* hidden, const int8_t* hidden_to_output_weights,
|
||||
int32_t proj_effective_scale_a, int32_t proj_effective_scale_b,
|
||||
const int32_t* gate_bias, int32_t n_batch, int32_t n_hidden,
|
||||
int32_t n_output, int32_t output_zp, int8_t* proj_output) {
|
||||
const int16_t int8_max = std::numeric_limits<int8_t>::max();
|
||||
const int16_t int8_min = std::numeric_limits<int8_t>::min();
|
||||
for (int batch = 0; batch < n_batch; ++batch) {
|
||||
for (int row = 0; row < n_output; ++row) {
|
||||
int64_t acc = gate_bias[row];
|
||||
for (int col = 0; col < n_hidden; ++col) {
|
||||
int16_t input_val = hidden[batch * n_hidden + col];
|
||||
int8_t weights_val = hidden_to_output_weights[row * n_hidden + col];
|
||||
int64_t curr = acc;
|
||||
acc += input_val * weights_val;
|
||||
if (input_val * weights_val > 0 && acc < curr) {
|
||||
acc = std::numeric_limits<int32_t>::max();
|
||||
}
|
||||
if (input_val * weights_val < 0 && acc > curr) {
|
||||
acc = std::numeric_limits<int32_t>::min();
|
||||
}
|
||||
}
|
||||
acc = MultiplyByQuantizedMultiplier(acc, proj_effective_scale_a,
|
||||
proj_effective_scale_b);
|
||||
acc += output_zp;
|
||||
if (acc > int8_max) {
|
||||
acc = int8_max;
|
||||
}
|
||||
if (acc < int8_min) {
|
||||
acc = int8_min;
|
||||
}
|
||||
proj_output[batch * n_output + row] = acc;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void PortableApplyLayerNorm(const int16_t* input,
|
||||
const int16_t* layer_norm_weights,
|
||||
const int32_t* bias, int32_t layer_norm_scale_a,
|
||||
int32_t layer_norm_scale_b, int32_t variance_limit,
|
||||
int n_batch, int n_input, int16_t* output) {
|
||||
  // The square of std::pow(2, 10), which is the extra factor that makes sure
  // normalized values have enough resolution.
|
||||
static const int kTwoToPower20 = 1 << 20;
|
||||
for (int i = 0; i < n_batch; ++i) {
|
||||
int64_t sum = 0;
|
||||
int64_t sum_sq = 0;
|
||||
for (int j = 0; j < n_input; ++j) {
|
||||
const int32_t index = i * n_input + j;
|
||||
int32_t val = static_cast<int32_t>(input[index]);
|
||||
sum += val;
|
||||
sum_sq += val * val;
|
||||
}
|
||||
int32_t mean =
|
||||
static_cast<int32_t>(static_cast<int64_t>(sum) * 1024 / n_input);
|
||||
// TODO(b/173994730): Avoids overflow but only works for POT n_input.
|
||||
int32_t temp = kTwoToPower20 / n_input;
|
||||
int64_t variance =
|
||||
sum_sq * temp - static_cast<int64_t>(mean) * static_cast<int64_t>(mean);
|
||||
int32_t variance2 = static_cast<int32_t>(variance / kTwoToPower20);
|
||||
if (variance2 < 1) {
|
||||
variance2 = variance_limit;
|
||||
}
|
||||
int32_t stddev_inverse_a;
|
||||
int stddev_inverse_b;
|
||||
GetInvSqrtQuantizedMultiplierExp(variance2, /*reverse_shift*/ -1,
|
||||
&stddev_inverse_a, &stddev_inverse_b);
|
||||
|
||||
for (int j = 0; j < n_input; ++j) {
|
||||
const int32_t index = i * n_input + j;
|
||||
int32_t val = static_cast<int32_t>(input[index]);
|
||||
int32_t shifted = 1024 * val - mean;
|
||||
int32_t rescaled = MultiplyByQuantizedMultiplier(
|
||||
shifted, stddev_inverse_a, stddev_inverse_b);
|
||||
// TODO(jianlijianli): Saturate this.
|
||||
int64_t val3 = rescaled * layer_norm_weights[j] + bias[j];
|
||||
int32_t val4 =
|
||||
static_cast<int32_t>((val3 > 0 ? val3 + 512 : val3 - 512) / 1024);
|
||||
int32_t val5 = MultiplyByQuantizedMultiplier(val4, layer_norm_scale_a,
|
||||
layer_norm_scale_b + 12);
|
||||
val5 = std::min(std::max(kInt16Min, val5), kInt16Max);
|
||||
output[index] = static_cast<int16_t>(val5);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void PortableApplyLayerNormFloat(const int16_t* input,
|
||||
const int16_t* layer_norm_weights,
|
||||
int32_t layer_norm_scale_a,
|
||||
int32_t layer_norm_scale_b,
|
||||
const int32_t* bias, int n_batch, int n_input,
|
||||
int16_t* output) {
|
||||
const int32_t int16_max = std::numeric_limits<int16_t>::max();
|
||||
const int32_t int16_min = std::numeric_limits<int16_t>::min();
|
||||
const float layer_norm_scale =
|
||||
layer_norm_scale_a *
|
||||
std::pow(2.0, static_cast<double>(layer_norm_scale_b - 31));
|
||||
const float bias_scale =
|
||||
static_cast<float>(std::pow(2.0, -10)) * layer_norm_scale;
|
||||
|
||||
for (int batch = 0; batch < n_batch; ++batch) {
|
||||
float sum = 0.0f;
|
||||
float sum_sq = 0.0f;
|
||||
for (int i = 0; i < n_input; ++i) {
|
||||
const int index = batch * n_input + i;
|
||||
const float value = static_cast<float>(input[index]);
|
||||
sum += value;
|
||||
sum_sq += value * value;
|
||||
}
|
||||
const float mean = sum / n_input;
|
||||
float stddev_inv = 0.0f;
|
||||
const float variance = sum_sq / n_input - mean * mean;
|
||||
if (variance == 0) {
|
||||
stddev_inv = 1.0f / std::sqrt(1e-8f);
|
||||
} else {
|
||||
stddev_inv = 1.0f / std::sqrt(variance);
|
||||
}
|
||||
for (int i = 0; i < n_input; ++i) {
|
||||
const int index = batch * n_input + i;
|
||||
const float normalized_value =
|
||||
(static_cast<float>(input[index]) - mean) * stddev_inv;
|
||||
const float weighted_normalized_value =
|
||||
normalized_value * layer_norm_weights[i] * layer_norm_scale +
|
||||
bias[i] * bias_scale;
|
||||
const int32_t quant_output = static_cast<int32_t>(std::round(
|
||||
weighted_normalized_value * static_cast<float>(std::pow(2, 12))));
|
||||
output[index] = std::min(int16_max, std::max(int16_min, quant_output));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void PortableMatrixScalarMultiplyAccumulate(const int8_t* matrix,
|
||||
int32_t scalar, int32_t n_row,
|
||||
int32_t n_col, int32_t* output) {
|
||||
for (int i = 0; i < n_row; ++i) {
|
||||
int32_t row_sum = 0;
|
||||
for (int j = 0; j < n_col; ++j) {
|
||||
row_sum += *matrix++;
|
||||
}
|
||||
output[i] += row_sum * scalar;
|
||||
}
|
||||
}
|
||||
|
||||
void PortableApplySigmoid(const int16_t* input, int32_t n_batch,
|
||||
int32_t n_input, int16_t* output) {
|
||||
for (int batch = 0; batch < n_batch; ++batch) {
|
||||
for (int c = 0; c < n_input; c++) {
|
||||
using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
|
||||
using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
|
||||
const int index = batch * n_input + c;
|
||||
F3 sigmoid_input = F3::FromRaw(input[index]);
|
||||
F0 sigmoid_output = gemmlowp::logistic(sigmoid_input);
|
||||
output[index] = sigmoid_output.raw();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void PortableApplySigmoidFloat(const int16_t* input, int32_t n_batch,
|
||||
int32_t n_input, int16_t* output) {
|
||||
const int32_t int16_max = std::numeric_limits<int16_t>::max();
|
||||
const int32_t int16_min = std::numeric_limits<int16_t>::min();
|
||||
for (int batch = 0; batch < n_batch; ++batch) {
|
||||
for (int i = 0; i < n_input; ++i) {
|
||||
const int index = batch * n_input + i;
|
||||
const float float_input =
|
||||
input[index] * static_cast<float>(std::pow(2, -12));
|
||||
const float float_output = 1.0f / (1.0f + std::exp(-float_input));
|
||||
const int32_t quant_output = static_cast<int32_t>(
|
||||
float_output * static_cast<float>(std::pow(2, 15)));
|
||||
const int32_t quant_output_clamped =
|
||||
std::min(int16_max, std::max(int16_min, quant_output));
|
||||
output[index] = static_cast<int16_t>(quant_output_clamped);
|
||||
}
|
||||
}
|
||||
}
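PortableApplySigmoidFloat above reads the int16 input as Q3.12 (scale 2^-12) and writes the result as Q0.15 (scale 2^15), truncating on requantization. A hedged round-trip sketch of those conventions; the helper is illustrative, not a TFLite function.

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>

// Illustrative helper: Q3.12 -> sigmoid in float -> Q0.15, the same
// fixed-point conventions as the float fallback above.
int16_t SigmoidQ3_12ToQ0_15(int16_t x_q3_12) {
  const float x = x_q3_12 / 4096.f;                     // dequantize Q3.12
  const float y = 1.f / (1.f + std::exp(-x));           // sigmoid in float
  const int32_t q = static_cast<int32_t>(y * 32768.f);  // requantize Q0.15
  return static_cast<int16_t>(std::min(32767, std::max(-32768, q)));
}

int main() {
  assert(SigmoidQ3_12ToQ0_15(0) == 16384);  // sigmoid(0) == 0.5 == 16384/32768
  return 0;
}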
|
||||
|
||||
template <int IntegerBits>
|
||||
void PortableApplyTanhImpl(const int16_t* input, int32_t n_batch,
|
||||
int32_t n_input, int16_t* output) {
|
||||
using FX = gemmlowp::FixedPoint<std::int16_t, IntegerBits>;
|
||||
using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
|
||||
for (int batch = 0; batch < n_batch; ++batch) {
|
||||
for (int i = 0; i < n_input; ++i) {
|
||||
const int index = batch * n_input + i;
|
||||
FX tanh_input = FX::FromRaw(input[index]);
|
||||
F0 tanh_output = gemmlowp::tanh(tanh_input);
|
||||
output[index] = tanh_output.raw();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void PortableApplyTanh(int32_t integer_bits, const int16_t* input,
|
||||
int32_t n_batch, int32_t n_input, int16_t* output) {
|
||||
assert(integer_bits <= 6);
|
||||
#define DISPATCH_TANH(i) \
|
||||
case i: \
|
||||
PortableApplyTanhImpl<i>(input, n_batch, n_input, output); \
|
||||
break;
|
||||
switch (integer_bits) {
|
||||
DISPATCH_TANH(0);
|
||||
DISPATCH_TANH(1);
|
||||
DISPATCH_TANH(2);
|
||||
DISPATCH_TANH(3);
|
||||
DISPATCH_TANH(4);
|
||||
DISPATCH_TANH(5);
|
||||
DISPATCH_TANH(6);
|
||||
default:
|
||||
return;
|
||||
}
|
||||
#undef DISPATCH_TANH
|
||||
}
|
||||
|
||||
void PortableApplyTanhFloat(const int16_t* input, int32_t n_batch,
|
||||
int32_t n_input, int32_t integer_bits,
|
||||
int16_t* output) {
|
||||
const int32_t int16_max = std::numeric_limits<int16_t>::max();
|
||||
const int32_t int16_min = std::numeric_limits<int16_t>::min();
|
||||
const double two = 2.0;
|
||||
for (int batch = 0; batch < n_batch; ++batch) {
|
||||
for (int i = 0; i < n_input; ++i) {
|
||||
const int index = batch * n_input + i;
|
||||
const float float_input =
|
||||
input[index] * std::pow(two, static_cast<double>(integer_bits));
|
||||
const float float_output = std::tanh(float_input);
|
||||
const int32_t quant_output = static_cast<int32_t>(
|
||||
float_output * static_cast<float>(std::pow(2, 15)));
|
||||
const int32_t quant_output_clamped =
|
||||
std::min(int16_max, std::max(int16_min, quant_output));
|
||||
output[index] = static_cast<int16_t>(quant_output_clamped);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2,
|
||||
int n_batch, int n_input, int shift, int16_t* output) {
|
||||
for (int batch = 0; batch < n_batch; ++batch) {
|
||||
for (int i = 0; i < n_input; ++i) {
|
||||
const int index = batch * n_input + i;
|
||||
const int16_t a = input_1[index];
|
||||
const int16_t b = input_2[index];
|
||||
const int32_t value = static_cast<int32_t>(a) * static_cast<int32_t>(b);
|
||||
output[index] =
|
||||
static_cast<int16_t>(gemmlowp::RoundingDivideByPOT(value, shift));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2,
|
||||
int32_t multiplier, int32_t shift, int32_t n_batch,
|
||||
int32_t n_input, int32_t output_zp, int8_t* output) {
|
||||
for (int batch = 0; batch < n_batch; ++batch) {
|
||||
for (int i = 0; i < n_input; ++i) {
|
||||
const int index = batch * n_input + i;
|
||||
const int16_t a = input_1[index];
|
||||
const int16_t b = input_2[index];
|
||||
int32_t value = static_cast<int32_t>(a) * static_cast<int32_t>(b);
|
||||
value = MultiplyByQuantizedMultiplier(value, multiplier, shift);
|
||||
value -= output_zp;
|
||||
value = std::min(std::max(static_cast<int32_t>(-128), value),
|
||||
static_cast<int32_t>(127));
|
||||
|
||||
output[index] = static_cast<int8_t>(value);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void PortableCwiseAdd(const int16_t* input_1, const int16_t* input_2,
|
||||
int n_batch, int n_input, int16_t* output) {
|
||||
for (int batch = 0; batch < n_batch; ++batch) {
|
||||
for (int i = 0; i < n_input; ++i) {
|
||||
const int index = batch * n_input + i;
|
||||
int32_t sum = input_1[index] + input_2[index];
|
||||
const int32_t sum_clamped = std::min(kInt16Max, std::max(kInt16Min, sum));
|
||||
output[index] = static_cast<int16_t>(sum_clamped);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
float PortableVectorVectorDotProduct(const float* vector1, const float* vector2,
|
||||
int v_size) {
|
||||
float result = 0.0;
|
||||
for (int v = 0; v < v_size; v++) {
|
||||
result += *vector1++ * *vector2++;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
namespace {
|
||||
inline int32_t VectorVectorDotProduct(const int16_t* vector1,
|
||||
const int16_t* vector2, int v_size) {
|
||||
int32_t result = 0;
|
||||
for (int v = 0; v < v_size; v++) {
|
||||
result += *vector1++ * *vector2++;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
void PortableBatchVectorBatchVectorDotProduct(const int16_t* vector1,
|
||||
const int16_t* vector2,
|
||||
int v_size, int n_batch,
|
||||
int32_t* result) {
|
||||
for (int b = 0; b < n_batch; b++) {
|
||||
result[b] = VectorVectorDotProduct(vector1, vector2, v_size);
|
||||
vector1 += v_size;
|
||||
vector2 += v_size;
|
||||
}
|
||||
}
|
||||
|
||||
void PortableVectorBatchVectorCwiseProductAccumulate(
|
||||
const int16_t* vector, int v_size, const int16_t* batch_vector, int n_batch,
|
||||
int32_t multiplier, int shift, int16_t* result) {
|
||||
for (int b = 0; b < n_batch; b++) {
|
||||
for (int v = 0; v < v_size; v++) {
|
||||
int32_t prod = vector[v] * *batch_vector++;
|
||||
prod = MultiplyByQuantizedMultiplier(prod, multiplier, shift);
|
||||
int32_t output = prod + *result;
|
||||
output = std::max(std::min(static_cast<int32_t>(32767), output),
|
||||
static_cast<int32_t>(-32768));
|
||||
*result++ = output;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void PortableSub1Vector(const float* vector, int v_size, float* result) {
|
||||
for (int v = 0; v < v_size; v++) {
|
||||
*result++ = 1.0f - *vector++;
|
||||
}
|
||||
}
|
||||
|
||||
void PortableSub1Vector(const int16_t* vector, int v_size, int16_t* result) {
|
||||
static const int16_t kOne = 32767;
|
||||
for (int v = 0; v < v_size; v++) {
|
||||
*result++ = kOne - *vector++;
|
||||
}
|
||||
}
|
||||
|
||||
void PortableVectorScalarMultiply(const int8_t* vector, const int v_size,
|
||||
const float scale, float* result) {
|
||||
for (int v = 0; v < v_size; ++v) {
|
||||
*result++ = scale * *vector++;
|
||||
}
|
||||
}
|
||||
|
||||
void PortableMeanStddevNormalization(const float* __restrict__ input_vector,
|
||||
float* __restrict__ output_vector,
|
||||
int v_size, int n_batch) {
|
||||
for (int batch = 0; batch < n_batch; ++batch) {
|
||||
float sum = 0.0f;
|
||||
for (int i = 0; i < v_size; ++i) {
|
||||
sum += input_vector[i];
|
||||
}
|
||||
const float mean = sum / v_size;
|
||||
float sum_diff_sq = 0.0f;
|
||||
for (int i = 0; i < v_size; ++i) {
|
||||
const float diff = input_vector[i] - mean;
|
||||
sum_diff_sq += diff * diff;
|
||||
}
|
||||
const float variance = sum_diff_sq / v_size;
|
||||
constexpr float kNormalizationConstant = 1e-8f;
|
||||
const float stddev_inv =
|
||||
1.0f / std::sqrt(variance + kNormalizationConstant);
|
||||
for (int i = 0; i < v_size; ++i) {
|
||||
output_vector[i] = (input_vector[i] - mean) * stddev_inv;
|
||||
}
|
||||
input_vector += v_size;
|
||||
output_vector += v_size;
|
||||
}
|
||||
}
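A small worked example of the mean/stddev normalization above, re-derived by hand so the numbers can be checked: for the batch {1, 2, 3, 4}, mean = 2.5 and variance = 1.25, so the outputs are roughly {-1.342, -0.447, 0.447, 1.342}. The epsilon only matters for an all-constant batch. The snippet is illustrative and does not call the Portable* routine.

#include <cassert>
#include <cmath>

int main() {
  const float input[4] = {1.f, 2.f, 3.f, 4.f};
  float output[4];
  float sum = 0.f, sum_sq_diff = 0.f;
  for (float v : input) sum += v;
  const float mean = sum / 4;  // 2.5
  for (float v : input) sum_sq_diff += (v - mean) * (v - mean);
  const float stddev_inv = 1.f / std::sqrt(sum_sq_diff / 4 + 1e-8f);
  for (int i = 0; i < 4; ++i) output[i] = (input[i] - mean) * stddev_inv;
  assert(std::abs(output[3] - 1.342f) < 1e-3f);
  return 0;
}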
|
||||
|
||||
void PortableTwoGateSaturatingAdd(const int8_t* input, int8_t input_zp,
|
||||
const int8_t* recurrent, int8_t recurrent_zp,
|
||||
int32_t input_effective_scale_a,
|
||||
int32_t input_effective_scale_b,
|
||||
int32_t recurrent_effective_scale_a,
|
||||
int32_t recurrent_effective_scale_b,
|
||||
int32_t n_batch, int32_t n_cell,
|
||||
int16_t* output) {
|
||||
const int32_t int16_max = std::numeric_limits<int16_t>::max();
|
||||
const int32_t int16_min = std::numeric_limits<int16_t>::min();
|
||||
for (int i = 0; i < n_batch * n_cell; ++i) {
|
||||
int32_t x = static_cast<int32_t>(input[i]) - static_cast<int32_t>(input_zp);
|
||||
int32_t h =
|
||||
static_cast<int32_t>(recurrent[i]) - static_cast<int32_t>(recurrent_zp);
|
||||
int32_t x_scaled = MultiplyByQuantizedMultiplier(x, input_effective_scale_a,
|
||||
input_effective_scale_b);
|
||||
int32_t h_scaled = MultiplyByQuantizedMultiplier(
|
||||
h, recurrent_effective_scale_a, recurrent_effective_scale_b);
|
||||
int32_t y = h_scaled + x_scaled;
|
||||
if (y > int16_max) {
|
||||
y = int16_max;
|
||||
}
|
||||
if (y < int16_min) {
|
||||
y = int16_min;
|
||||
}
|
||||
output[i] = static_cast<int16_t>(y);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace tensor_utils
|
||||
} // namespace tflite
|
||||
@@ -0,0 +1,235 @@
|
||||
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_IMPL_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_IMPL_H_
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdint>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#define __restrict__ __restrict
|
||||
#endif
|
||||
|
||||
namespace tflite {
|
||||
|
||||
// Not all backends support CpuBackendContext usage, so forward declare to avoid
|
||||
// pulling in its implementation.
|
||||
class CpuBackendContext;
|
||||
|
||||
namespace tensor_utils {
|
||||
|
||||
template <typename T>
|
||||
bool PortableIsZeroVector(const T* vector, int v_size) {
|
||||
for (int i = 0; i < v_size; ++i) {
|
||||
if (vector[i] != 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void PortableSymmetricQuantizeFloats(const float* values, const int size,
|
||||
int8_t* quantized_values, float* min_value,
|
||||
float* max_value, float* scaling_factor);
|
||||
|
||||
void PortableSymmetricQuantizeFloats(const float* values, const int size,
|
||||
int8_t* quantized_values, float min_value,
|
||||
float max_value, float* scaling_factor);
|
||||
|
||||
void PortableAsymmetricQuantizeFloats(const float* values, const int size,
|
||||
int8_t* quantized_values,
|
||||
float* scaling_factor, int32_t* offset);
|
||||
|
||||
// Multiply a matrix by a batch vector, and store results in a batch-size
|
||||
// vector.
void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
                                                 int m_rows, int m_cols,
                                                 const float* vector,
                                                 int n_batch, float* result);

void PortableMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, float* __restrict__ result);

void PortableMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, float* __restrict__ result, const float* per_channel_scale,
    const int32_t* input_offset, int32_t* scratch, int32_t* row_sums,
    bool* compute_row_sums, CpuBackendContext* context);

void PortableMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vector, const float* scaling_factors,
    int n_batch, int32_t* scratch, float* __restrict__ result,
    CpuBackendContext* context);

void PortableSparseMatrixBatchVectorMultiplyAccumulate1x4(
    const float* __restrict__ matrix, const int32_t* __restrict__ segments,
    const int32_t* __restrict__ indices, int m_rows, int m_cols,
    const float* __restrict__ vector, int n_batch, float* __restrict__ result);

void PortableSparseMatrixBatchVectorMultiplyAccumulate(
    const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
    int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
    float* __restrict__ result);

void PortableSparseMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
    const int m_cols, const int8_t* __restrict__ vectors,
    const float* scaling_factors, int n_batch, float* __restrict__ result);

// Dot product of two vectors.
float PortableVectorVectorDotProduct(const float* vector1, const float* vector2,
                                     int v_size);

void PortableBatchVectorBatchVectorDotProduct(const int16_t* vector1,
                                              const int16_t* vector2,
                                              int v_size, int n_batch,
                                              int32_t* result);

void PortableVectorBatchVectorCwiseProductAccumulate(
    const int16_t* vector, int v_size, const int16_t* batch_vector, int n_batch,
    int32_t multiplier, int shift, int16_t* result);

void PortableMatrixBatchVectorMultiplyAccumulate(
    const int8_t* input, const int32_t* bias,
    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
    int32_t* scratch, int16_t* output, CpuBackendContext* context);

void PortableMatrixBatchVectorMultiplyAccumulate(
    const int8_t* input, const int32_t* bias,
    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
    int32_t* scratch, int8_t* output, CpuBackendContext* context);

void PortableMatrixBatchVectorMultiply(const int8_t* input,
                                       int32_t input_zeropoint,
                                       const int8_t* input_to_gate_weights,
                                       int32_t input_to_gate_effective_scale_a,
                                       int32_t input_to_gate_effective_scale_b,
                                       int32_t n_batch, int32_t n_input,
                                       int32_t n_cell, int8_t* gate_output,
                                       int8_t gate_output_zp);

void PortableMatrixBatchVectorMultiply(
    const int16_t* hidden, const int8_t* hidden_to_output_weights,
    int32_t proj_effective_scale_a, int32_t proj_effective_scale_b,
    const int32_t* gate_bias, int32_t n_batch, int32_t n_hidden,
    int32_t n_output, int32_t output_zp, int8_t* proj_output);

void PortableMatrixScalarMultiplyAccumulate(const int8_t* matrix,
                                            int32_t scalar, int32_t n_row,
                                            int32_t n_col, int32_t* output);

void PortableApplyLayerNorm(const int16_t* input,
                            const int16_t* layer_norm_weights,
                            const int32_t* bias, int32_t layer_norm_scale_a,
                            int32_t layer_norm_scale_b, int32_t variance_limit,
                            int n_batch, int n_input, int16_t* output);

void PortableApplyLayerNormFloat(const int16_t* input,
                                 const int16_t* layer_norm_weights,
                                 int32_t layer_norm_scale_a,
                                 int32_t layer_norm_scale_b,
                                 const int32_t* bias, int n_batch, int n_input,
                                 int16_t* output);

void PortableApplySigmoid(const int16_t* input, int32_t n_batch,
                          int32_t n_input, int16_t* output);

void PortableApplySigmoidFloat(const int16_t* input, int32_t n_batch,
                               int32_t n_input, int16_t* output);

void PortableApplyTanh(int32_t integer_bits, const int16_t* input,
                       int32_t n_batch, int32_t n_input, int16_t* output);

void PortableApplyTanhFloat(const int16_t* input, int32_t n_batch,
                            int32_t n_input, int32_t integer_bits,
                            int16_t* output);

void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2,
                      int n_batch, int n_input, int shift, int16_t* output);

void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2,
                      int32_t multiplier, int32_t shift, int32_t n_batch,
                      int32_t n_input, int32_t output_zp, int8_t* output);

void PortableCwiseAdd(const int16_t* input_1, const int16_t* input_2,
                      int n_batch, int n_input, int16_t* output);

template <typename T>
void PortableCwiseClipping(T* vector, const int v_size,
                           const T& clipping_value) {
  for (int i = 0; i < v_size; i++) {
    vector[i] = std::max(std::min(clipping_value, vector[i]),
                         static_cast<T>(-clipping_value));
  }
}
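// Illustrative only (not part of the TensorFlow Lite sources): a minimal,
// hedged sketch of how the inline PortableCwiseClipping template above
// saturates a buffer symmetrically around zero. The buffer contents and the
// clipping value below are made up for the example.
//
//   int16_t gate[4] = {-1200, 50, 900, -20};
//   // Clip every element into the range [-512, 512].
//   tflite::tensor_utils::PortableCwiseClipping<int16_t>(gate, 4, 512);
//   // gate is now {-512, 50, 512, -20}.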
// Batch vector initialization with another vector.
void PortableVectorBatchVectorAssign(const float* vector, int v_size,
                                     int n_batch, float* batch_vector);

// Compute "1.0f - elements of vector" (used in CIFG).
void PortableSub1Vector(const float* vector, int v_size, float* result);

void PortableSub1Vector(const int16_t* vector, int v_size, int16_t* result);

// Multiply all elements of vector with a scalar.
void PortableVectorScalarMultiply(const int8_t* vector, int v_size, float scale,
                                  float* result);

// Reduce-sum on a vector:
// input_vector: pointer to input vector.
// output_vector: pointer to vector.
// output_size: output vector size.
// reduction_size: number of consecutive elements from input vector which are
// added to get one element of output.
template <typename INPUT, typename OUTPUT>
void PortableReductionSumVector(const INPUT* input_vector,
                                OUTPUT* output_vector, int output_size,
                                int reduction_size) {
  for (int o = 0; o < output_size; o++) {
    OUTPUT result = 0;
    for (int r = 0; r < reduction_size; r++) {
      result += input_vector[r];
    }
    output_vector[o] = result;
    input_vector += reduction_size;
  }
}
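// Illustrative only: a hedged usage sketch for PortableReductionSumVector
// above. With reduction_size == 4, each consecutive group of four inputs is
// summed into one output element (values are made up).
//
//   int8_t in[8] = {1, 2, 3, 4, 10, 20, 30, 40};
//   int32_t out[2];
//   tflite::tensor_utils::PortableReductionSumVector(
//       in, out, /*output_size=*/2, /*reduction_size=*/4);
//   // out == {10, 100}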
// Layer norm for each batch.
void PortableMeanStddevNormalization(const float* __restrict__ input_vector,
                                     float* __restrict__ output_vector,
                                     int v_size, int n_batch);

// Saturate Add.
void PortableTwoGateSaturatingAdd(const int8_t* input, int8_t input_zp,
                                  const int8_t* recurrent, int8_t recurrent_zp,
                                  int32_t input_effective_scale_a,
                                  int32_t input_effective_scale_b,
                                  int32_t recurrent_effective_scale_a,
                                  int32_t recurrent_effective_scale_b,
                                  int32_t n_batch, int32_t n_cell,
                                  int16_t* output);

}  // namespace tensor_utils
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_IMPL_H_

@@ -23,6 +23,25 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/types.h"

// Check if the reduction at index is the first one along the dimensions given
// in axis.
inline bool IsFirstReduction(const int* index, const int num_axis,
                             const int* axis) {
  if (num_axis == 0) {
    return true;
  }

  TFLITE_DCHECK(index != nullptr);
  TFLITE_DCHECK(axis != nullptr);
  for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx) {
    if (index[axis[axis_idx]] != 0) {
      return false;
    }
  }

  return true;
}
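// Illustrative only: a hedged sketch of how IsFirstReduction above behaves.
// With axis = {1} on a rank-3 index, only indices whose axis-1 coordinate is
// zero start a new reduction (values are made up).
//
//   const int axis[] = {1};
//   int index_a[] = {2, 0, 3};  // IsFirstReduction(index_a, 1, axis) == true
//   int index_b[] = {2, 1, 3};  // IsFirstReduction(index_b, 1, axis) == false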
namespace tflite {

namespace reference_ops {

@@ -35,8 +54,7 @@ inline bool Reduce(const In* input_data, const int* input_dims,
                   const int* output_dims, const int input_num_dims,
                   const int output_num_dims, const int* axis,
                   const int num_axis, int* input_iter,
                   Out reducer(const Out current, const In in),
                   Out* output_data) {
                   Out reducer(Out current, const In in), Out* output_data) {
  // Reset input iterator.
  for (int idx = 0; idx < input_num_dims; ++idx) {
    input_iter[idx] = 0;
@@ -53,6 +71,37 @@ inline bool Reduce(const In* input_data, const int* input_dims,
  return true;
}

// Similar to above Reduce function but takes two reducer functions.
// The 'reducer_first' is called with the first value of the reduction,
// 'reducer_next' is then called for all the others.
template <typename In, typename Out>
inline bool Reduce(const In* input_data, const int* input_dims,
                   const int* output_dims, const int input_num_dims,
                   const int output_num_dims, const int* axis,
                   const int num_axis, int* input_iter,
                   const std::function<Out(In in)>& reducer_first,
                   const std::function<Out(Out current, In in)>& reducer_next,
                   Out* output_data) {
  // Reset input iterator.
  for (int idx = 0; idx < input_num_dims; ++idx) {
    input_iter[idx] = 0;
  }
  // Iterate through input_data.
  do {
    size_t input_offset =
        ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr);
    size_t output_offset = ReducedOutputOffset(input_num_dims, input_dims,
                                               input_iter, num_axis, axis);
    if (IsFirstReduction(input_iter, num_axis, axis)) {
      output_data[output_offset] = reducer_first(input_data[input_offset]);
    } else {
      output_data[output_offset] =
          reducer_next(output_data[output_offset], input_data[input_offset]);
    }
  } while (NextIndex(input_num_dims, input_dims, input_iter));
  return true;
}
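// Illustrative only: a hedged sketch of the two-reducer Reduce overload above,
// here wired up for a plain max reduction. The lambdas, shapes and buffers are
// made up for the example; error handling is omitted.
//
//   const std::function<int32_t(int8_t)> first =
//       [](int8_t in) -> int32_t { return in; };
//   const std::function<int32_t(int32_t, int8_t)> next =
//       [](int32_t current, int8_t in) -> int32_t {
//         return std::max(current, static_cast<int32_t>(in));
//       };
//   // Reduce<int8_t, int32_t>(input, input_dims, output_dims, 2, 1, axis, 1,
//   //                         temp_index, first, next, output);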
// This method parses the input 'axis' to remove duplicates and handle negative
// values, and returns a valid 'out_axis'
inline bool ResolveAxis(const int num_dims, const int* axis,
@@ -111,7 +160,8 @@ inline bool InitTensorDataForReduce(const int* dims, const int num_dims,
  for (int idx = 0; idx < num_dims; ++idx) {
    size_t current = static_cast<size_t>(dims[idx]);
    // Overflow prevention.
    if (num_elements > std::numeric_limits<size_t>::max() / current) {
    if (current > 0 &&
        num_elements > std::numeric_limits<size_t>::max() / current) {
      return false;
    }
    num_elements *= current;
@@ -132,17 +182,20 @@ inline bool ReduceGeneric(const T* input_data, const int* input_dims,
                          bool keep_dims, int* temp_index, int* resolved_axis,
                          T init_value,
                          T reducer(const T current, const T in)) {
  // Return early when input shape has zero dim.
  for (int i = 0; i < input_num_dims; ++i) {
    if (input_dims[i] == 0) return true;
  }

  // Reset output data.
  if (!InitTensorDataForReduce(output_dims, output_num_dims, init_value,
                               output_data)) {
    return false;
  }

  // Return early when input shape has zero dim. This is done after initializing
  // data for output tensor because there are cases that the input tensor is
  // empty but output tensor is not. In that case, output tensor should be
  // filled with init_value.
  for (int i = 0; i < input_num_dims; ++i) {
    if (input_dims[i] == 0) return true;
  }

  // Resolve axis.
  int num_resolved_axis = 0;
  if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis,
@@ -290,9 +343,9 @@ inline void Mean(const tflite::MeanParams& op_params,
  constexpr int32_t kMinValue = std::numeric_limits<uint8_t>::min();
  constexpr int32_t kMaxValue = std::numeric_limits<uint8_t>::max();

  int32_t bias =
      output_zero_point -
      static_cast<int32_t>(input_zero_point * input_scale / output_scale);
  float temp = input_zero_point * input_scale / output_scale;
  temp = temp > 0 ? temp + 0.5f : temp - 0.5f;
  int32_t bias = output_zero_point - static_cast<int32_t>(temp);
  double real_scale =
      static_cast<double>(input_scale / (num_elements_in_axis * output_scale));
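// Illustrative only: a hedged numeric check of the rounding change above.
// Suppose input_zero_point * input_scale / output_scale evaluates to -2.7f.
//   old code:  static_cast<int32_t>(-2.7f)           == -2 (truncated)
//   new code:  temp = -2.7f - 0.5f = -3.2f;
//              static_cast<int32_t>(-3.2f)           == -3 (nearest integer)
// so the bias is now computed with round-half-away-from-zero instead of
// truncation toward zero.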
@@ -353,6 +406,14 @@ inline bool QuantizedMeanOrSum(const T* input_data, int32_t input_zero_point,
    temp_sum[idx] = U();
  }

  // Return early when input shape has zero dim. This is done after initializing
  // data for output tensor because there are cases that the input tensor is
  // empty but output tensor is not. In that case, output tensor should be
  // filled with init_value.
  for (int i = 0; i < input_num_dims; ++i) {
    if (input_dims[i] == 0) return true;
  }

  // Resolve axis.
  int num_resolved_axis = 0;
  if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis,
@@ -405,6 +466,57 @@ inline bool QuantizedMeanOrSum(const T* input_data, int32_t input_zero_point,
  return true;
}

template <typename T>
inline bool QuantizedReduceProd(const T* input_data, int32_t input_zero_point,
                                const RuntimeShape& input_shape, T* output_data,
                                int32_t output_zero_point,
                                const RuntimeShape& output_shape,
                                const int* axis,
                                const int64_t num_axis_dimensions,
                                bool keep_dims, int* temp_index,
                                int* resolved_axis, int32_t* temp_prod,
                                int32_t scaling_multiplier, int scaling_shift) {
  const int32_t kMinValue = std::numeric_limits<T>::min();
  const int32_t kMaxValue = std::numeric_limits<T>::max();

  // Resolve axis.
  int num_resolved_axis = 0;
  if (!ResolveAxis(input_shape.DimensionsCount(), axis, num_axis_dimensions,
                   resolved_axis, &num_resolved_axis)) {
    return false;
  }

  // Calculate the reduced product by rescaling each multiplication step to
  // avoid an overflow.
  auto reducer_first = [&](T in) -> int32_t { return in - input_zero_point; };

  auto reducer_next = [&](int32_t current, T in) -> int32_t {
    const int64_t result =
        static_cast<int64_t>(current) * (in - input_zero_point);
    return MultiplyByQuantizedMultiplier(result, scaling_multiplier,
                                         scaling_shift);
  };

  if (!Reduce<T, int32_t>(
          input_data, input_shape.DimsData(), output_shape.DimsData(),
          input_shape.DimensionsCount(), output_shape.DimensionsCount(),
          resolved_axis, num_resolved_axis, temp_index, reducer_first,
          reducer_next, temp_prod)) {
    return false;
  }

  for (int i = 0; i < output_shape.FlatSize(); i++) {
    int32_t result =
        MultiplyByQuantizedMultiplier(static_cast<int64_t>(temp_prod[i]),
                                      scaling_multiplier, scaling_shift) +
        output_zero_point;
    result = std::min(std::max(result, kMinValue), kMaxValue);
    output_data[i] = static_cast<T>(result);
  }

  return true;
}

}  // namespace reference_ops

}  // namespace tflite
@@ -0,0 +1,228 @@
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_RESIZE_BILINEAR_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_RESIZE_BILINEAR_H_

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>

#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {
namespace reference_ops {

inline void ComputeInterpolationValues(const float value, const float scale,
                                       const bool half_pixel_centers,
                                       int32_t input_size, float* scaled_value,
                                       int32_t* lower_bound,
                                       int32_t* upper_bound) {
  if (half_pixel_centers) {
    *scaled_value = (value + 0.5f) * scale - 0.5f;
  } else {
    *scaled_value = value * scale;
  }
  float scaled_value_floor = std::floor(*scaled_value);
  *lower_bound = std::max(static_cast<int32_t>(scaled_value_floor),
                          static_cast<int32_t>(0));
  *upper_bound =
      std::min(static_cast<int32_t>(std::ceil(*scaled_value)), input_size - 1);
}

template <typename T>
inline void ResizeBilinear(const tflite::ResizeBilinearParams& op_params,
                           const RuntimeShape& unextended_input_shape,
                           const T* input_data,
                           const RuntimeShape& unextended_output_size_shape,
                           const int32_t* output_size_data,
                           const RuntimeShape& unextended_output_shape,
                           T* output_data) {
  // If half_pixel_centers is True, align_corners must be False.
  TFLITE_DCHECK(!op_params.half_pixel_centers || !op_params.align_corners);
  TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_LE(unextended_output_size_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
  const RuntimeShape input_shape =
      RuntimeShape::ExtendedShape(4, unextended_input_shape);
  const RuntimeShape output_size_shape =
      RuntimeShape::ExtendedShape(4, unextended_output_size_shape);
  const RuntimeShape output_shape =
      RuntimeShape::ExtendedShape(4, unextended_output_shape);

  int32_t batches = MatchingDim(input_shape, 0, output_shape, 0);
  int32_t input_height = input_shape.Dims(1);
  int32_t input_width = input_shape.Dims(2);
  int32_t depth = MatchingDim(input_shape, 3, output_shape, 3);

  TFLITE_DCHECK_EQ(output_size_shape.Dims(0), 1);
  TFLITE_DCHECK_EQ(output_size_shape.Dims(1), 1);
  TFLITE_DCHECK_EQ(output_size_shape.Dims(2), 1);
  TFLITE_DCHECK_EQ(output_size_shape.Dims(3), 2);
  int32_t output_height =
      output_size_data[Offset(output_size_shape, 0, 0, 0, 0)];
  int32_t output_width =
      output_size_data[Offset(output_size_shape, 0, 0, 0, 1)];

  float height_scale = static_cast<float>(input_height) / output_height;
  float width_scale = static_cast<float>(input_width) / output_width;
  if (op_params.align_corners && output_height > 1) {
    height_scale = static_cast<float>(input_height - 1) / (output_height - 1);
  }
  if (op_params.align_corners && output_width > 1) {
    width_scale = static_cast<float>(input_width - 1) / (output_width - 1);
  }
  const float rounding_offset = std::numeric_limits<T>::is_integer ? .5f : .0f;

  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < output_height; ++y) {
      float input_y;
      int32_t y0, y1;
      ComputeInterpolationValues(y, height_scale, op_params.half_pixel_centers,
                                 input_height, &input_y, &y0, &y1);
      for (int x = 0; x < output_width; ++x) {
        float input_x;
        int32_t x0, x1;
        ComputeInterpolationValues(x, width_scale, op_params.half_pixel_centers,
                                   input_width, &input_x, &x0, &x1);
        for (int c = 0; c < depth; ++c) {
          T interpolation =
              static_cast<T>(input_data[Offset(input_shape, b, y0, x0, c)] *
                                 (1 - (input_y - y0)) * (1 - (input_x - x0)) +
                             input_data[Offset(input_shape, b, y1, x0, c)] *
                                 (input_y - y0) * (1 - (input_x - x0)) +
                             input_data[Offset(input_shape, b, y0, x1, c)] *
                                 (1 - (input_y - y0)) * (input_x - x0) +
                             input_data[Offset(input_shape, b, y1, x1, c)] *
                                 (input_y - y0) * (input_x - x0) +
                             rounding_offset);
          output_data[Offset(output_shape, b, y, x, c)] = interpolation;
        }
      }
    }
  }
}
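// Illustrative only: a hedged numeric walk-through of the float interpolation
// above. Suppose ComputeInterpolationValues yields input_y = 1.25 (y0 = 1,
// y1 = 2) and input_x = 2.75 (x0 = 2, x1 = 3). The four corner weights are
//   (1 - 0.25) * (1 - 0.75) = 0.1875   for (y0, x0)
//   0.25       * (1 - 0.75) = 0.0625   for (y1, x0)
//   (1 - 0.25) * 0.75       = 0.5625   for (y0, x1)
//   0.25       * 0.75       = 0.1875   for (y1, x1)
// which sum to 1.0; integer output types additionally get the 0.5
// rounding_offset before the truncating cast.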
inline void ComputeInterpolationValuesInteger(
    const int32_t value, const int32_t scale_10, const bool half_pixel_centers,
    int32_t input_size, int32_t* scaled_value, int32_t* lower_bound,
    int32_t* upper_bound) {
  if (half_pixel_centers) {
    *scaled_value = value * scale_10 + scale_10 / 2 - (1 << 9);
  } else {
    *scaled_value = value * scale_10;
  }
  constexpr int32_t zero = 0;
  *lower_bound = std::max(*scaled_value / (1 << 10), zero);
  *upper_bound =
      std::min((*scaled_value + (1 << 10) - 1) / (1 << 10), input_size - 1);
}
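// Illustrative only: a hedged numeric walk-through of the Q10 fixed-point
// helper above (values are made up). With half_pixel_centers == false,
// scale_10 == 512 (i.e. 0.5 in Q10) and value == 5:
//   *scaled_value = 5 * 512 = 2560            // 2.5 in Q10
//   *lower_bound  = 2560 / 1024 = 2           // floor
//   *upper_bound  = (2560 + 1023) / 1024 = 3  // ceil, clamped to input_size-1
// mirroring the floor/ceil pair the float version computes in
// ComputeInterpolationValues.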
// Same as above but doesn't use any floating-point for the resize
template <typename T>
inline void ResizeBilinearInteger(
    const tflite::ResizeBilinearParams& op_params,
    const RuntimeShape& unextended_input_shape, const T* input_data,
    const RuntimeShape& unextended_output_size_shape,
    const int32_t* output_size_data,
    const RuntimeShape& unextended_output_shape, T* output_data) {
  // If half_pixel_centers is True, align_corners must be False.
  TFLITE_DCHECK(!op_params.half_pixel_centers || !op_params.align_corners);
  TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_LE(unextended_output_size_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
  const RuntimeShape input_shape =
      RuntimeShape::ExtendedShape(4, unextended_input_shape);
  const RuntimeShape output_size_shape =
      RuntimeShape::ExtendedShape(4, unextended_output_size_shape);
  const RuntimeShape output_shape =
      RuntimeShape::ExtendedShape(4, unextended_output_shape);

  const int32_t batches = MatchingDim(input_shape, 0, output_shape, 0);
  const int32_t input_height = input_shape.Dims(1);
  const int32_t input_width = input_shape.Dims(2);
  const int32_t depth = MatchingDim(input_shape, 3, output_shape, 3);

  TFLITE_DCHECK_EQ(output_size_shape.Dims(0), 1);
  TFLITE_DCHECK_EQ(output_size_shape.Dims(1), 1);
  TFLITE_DCHECK_EQ(output_size_shape.Dims(2), 1);
  TFLITE_DCHECK_EQ(output_size_shape.Dims(3), 2);
  const int32_t output_height =
      output_size_data[Offset(output_size_shape, 0, 0, 0, 0)];
  const int32_t output_width =
      output_size_data[Offset(output_size_shape, 0, 0, 0, 1)];

  int32_t height_scale_10 =
      ((1 << 10) * input_height + output_height / 2) / output_height;
  int32_t width_scale_10 =
      ((1 << 10) * input_width + output_width / 2) / output_width;
  if (op_params.align_corners && output_height > 1) {
    height_scale_10 =
        ((1 << 10) * (input_height - 1) + (output_height - 1) / 2) /
        (output_height - 1);
  }
  if (op_params.align_corners && output_width > 1) {
    width_scale_10 = ((1 << 10) * (input_width - 1) + (output_width - 1) / 2) /
                     (output_width - 1);
  }

  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < output_height; ++y) {
      int32_t input_y, y0, y1;
      ComputeInterpolationValuesInteger(y, height_scale_10,
                                        op_params.half_pixel_centers,
                                        input_height, &input_y, &y0, &y1);
      for (int x = 0; x < output_width; ++x) {
        int32_t input_x, x0, x1;
        ComputeInterpolationValuesInteger(x, width_scale_10,
                                          op_params.half_pixel_centers,
                                          input_width, &input_x, &x0, &x1);
        for (int c = 0; c < depth; ++c) {
          const int64_t output_20_ll =
              static_cast<int64_t>(
                  input_data[Offset(input_shape, b, y0, x0, c)]) *
              ((1 << 10) - (input_y - (1 << 10) * y0)) *
              ((1 << 10) - (input_x - (1 << 10) * x0));
          const int64_t output_20_lu =
              static_cast<int64_t>(
                  input_data[Offset(input_shape, b, y1, x0, c)]) *
              (input_y - (1 << 10) * y0) *
              ((1 << 10) - (input_x - (1 << 10) * x0));
          const int64_t output_20_rl =
              static_cast<int64_t>(
                  input_data[Offset(input_shape, b, y0, x1, c)]) *
              ((1 << 10) - (input_y - (1 << 10) * y0)) *
              (input_x - (1 << 10) * x0);
          const int64_t output_20_ru =
              static_cast<int64_t>(
                  input_data[Offset(input_shape, b, y1, x1, c)]) *
              (input_y - (1 << 10) * y0) * (input_x - (1 << 10) * x0);
          const int64_t output_20 =
              output_20_ll + output_20_lu + output_20_rl + output_20_ru;
          const int64_t round = (output_20 > 0) ? (1 << 19) : -(1 << 19);
          const T interpolation =
              static_cast<T>((output_20 + round) / (1 << 20));
          output_data[Offset(output_shape, b, y, x, c)] = interpolation;
        }
      }
    }
  }
}

}  // namespace reference_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_RESIZE_BILINEAR_H_
@@ -159,7 +159,7 @@ inline int16_t SoftMaxCalculateExp(const SoftmaxParams& params,
      std::min(std::max(sym_scaled_diff, static_cast<int32_t>(-32768)),
               static_cast<int32_t>(32767));
  // apply the exp() LUT activation function
  return generic_int16_table_lookup(sat_sym_scaled_diff, params.exp_lut);
  return lut_lookup(sat_sym_scaled_diff, params.exp_lut);
}
// Quantized softmax with int16_t input and int16_t output.
inline void SoftmaxInt16(const SoftmaxParams& params,
@@ -207,8 +207,8 @@ inline void SoftmaxInt16(const SoftmaxParams& params,
        std::min(std::max(sym_shifted_sum, static_cast<int32_t>(-32768)),
                 static_cast<int32_t>(32767)));
    // apply 1/(1 + x) LUT activation function
    int16_t reciprocal_scale_Q015 = generic_int16_table_lookup(
        sat_sym_shifted_sum, params.one_over_one_plus_x_lut);
    int16_t reciprocal_scale_Q015 =
        lut_lookup(sat_sym_shifted_sum, params.one_over_one_plus_x_lut);

    // Rescale the exp_result with reciprocal
    // range of output is [0, 32767], corresponding to [0.0, 1.0]
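// Illustrative only: the hunks above switch these call sites from
// generic_int16_table_lookup to lut_lookup. The sketch below is an assumption
// about what an interpolating int16 table lookup of this kind can look like;
// the actual lut_lookup introduced by this change lives elsewhere in the diff
// and may differ in its exact signature and rounding.
//
//   int16_t lut_lookup_sketch(int16_t value, const int16_t* lut) {
//     const int index = 256 + (value >> 7);   // upper bits pick the segment
//     const int16_t offset = value & 0x7f;    // lower 7 bits interpolate
//     const int32_t base = lut[index];
//     const int32_t slope = lut[index + 1] - lut[index];
//     return static_cast<int16_t>(base + ((slope * offset + 64) >> 7));
//   }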
@@ -0,0 +1,80 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SPACE_TO_DEPTH_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SPACE_TO_DEPTH_H_

#include <cstdint>

#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {
namespace reference_ops {

template <typename T>
inline void SpaceToDepth(const tflite::SpaceToDepthParams& op_params,
                         const RuntimeShape& unextended_input_shape,
                         const T* input_data,
                         const RuntimeShape& unextended_output_shape,
                         T* output_data) {
  TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
  const RuntimeShape input_shape =
      RuntimeShape::ExtendedShape(4, unextended_input_shape);
  const RuntimeShape output_shape =
      RuntimeShape::ExtendedShape(4, unextended_output_shape);

  const int input_depth = input_shape.Dims(3);
  const int input_width = input_shape.Dims(2);
  const int input_height = input_shape.Dims(1);
  const int input_batch = input_shape.Dims(0);

  const int output_depth = output_shape.Dims(3);
  const int output_width = output_shape.Dims(2);
  const int output_height = output_shape.Dims(1);
  const int output_batch = output_shape.Dims(0);

  const int32_t block_size = op_params.block_size;

  TFLITE_DCHECK_EQ(input_width, output_width * block_size);
  TFLITE_DCHECK_EQ(input_height, output_height * block_size);
  TFLITE_DCHECK_EQ(input_depth * block_size * block_size, output_depth);
  TFLITE_DCHECK_EQ(input_batch, output_batch);

  for (int in_b = 0; in_b < input_batch; ++in_b) {
    for (int in_h = 0; in_h < input_height; ++in_h) {
      for (int in_w = 0; in_w < input_width; ++in_w) {
        for (int in_d = 0; in_d < input_depth; ++in_d) {
          const int out_d =
              in_d + ((in_h % block_size) * block_size + in_w % block_size) *
                         input_depth;
          const int out_w = in_w / block_size;
          const int out_h = in_h / block_size;
          const int out_b = in_b;

          const int input_index = Offset(input_shape, in_b, in_h, in_w, in_d);
          const int output_index =
              Offset(output_shape, out_b, out_h, out_w, out_d);

          output_data[output_index] = input_data[input_index];
        }
      }
    }
  }
}
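// Illustrative only: a hedged walk-through of the index mapping above with a
// made-up 1x4x4x1 input and block_size == 2 (so the output is 1x2x2x4). The
// input pixel at (in_h, in_w, in_d) = (1, 3, 0) maps to
//   out_d = 0 + ((1 % 2) * 2 + 3 % 2) * 1 = 3
//   out_h = 1 / 2 = 0
//   out_w = 3 / 2 = 1
// i.e. it becomes channel 3 of output cell (0, 1): each 2x2 spatial block is
// folded into the channel dimension of a single output pixel.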
}  // namespace reference_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SPACE_TO_DEPTH_H_
@@ -0,0 +1,111 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TRANSPOSE_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TRANSPOSE_H_

#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {

namespace reference_ops {

template <typename T, int N>
void TransposeImpl(const TransposeParams& params,
                   const RuntimeShape& unextended_input_shape,
                   const T* input_data,
                   const RuntimeShape& unextended_output_shape,
                   T* output_data) {
  const int unextended_input_size = unextended_input_shape.DimensionsCount();
  const int unextended_output_size = unextended_output_shape.DimensionsCount();
  TFLITE_DCHECK_LE(unextended_input_size, N);
  TFLITE_DCHECK_LE(unextended_output_size, N);
  TFLITE_DCHECK_EQ(unextended_output_size, params.perm_count);
  const int input_ext_size = N - unextended_input_size;
  const int output_ext_size = N - unextended_output_size;
  NdArrayDesc<N> input_desc;
  NdArrayDesc<N> output_desc;
  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_input_shape),
                 &input_desc);
  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape),
                 &output_desc);

  // The perm data is extended to match the output, each index incremented by
  // the amount of front padding of the input shape.
  int extended_perm[N];
  for (int i = 0; i < N; ++i) {
    extended_perm[i] = i < output_ext_size
                           ? i
                           : params.perm[i - output_ext_size] + input_ext_size;
  }

  // Permutes the input shape so we don't need to permute the indexes inside
  // the loop. Check to make sure output_dims is matching input_dims.
  NdArrayDesc<N> perm_input_desc;
  for (int k = 0; k < N; ++k) {
    TFLITE_DCHECK_EQ(input_desc.extents[extended_perm[k]],
                     output_desc.extents[k]);
    perm_input_desc.extents[k] = input_desc.extents[extended_perm[k]];
    perm_input_desc.strides[k] = input_desc.strides[extended_perm[k]];
  }

  // Naive transpose loop (iterate on output index and compute input index).
  auto tranpose_func = [&](int indexes[N]) {
    output_data[SubscriptToIndex(output_desc, indexes)] =
        input_data[SubscriptToIndex(perm_input_desc, indexes)];
  };
  NDOpsHelper<N>(output_desc, tranpose_func);
}

template <typename T, int N = 5>
void Transpose(const TransposeParams& params,
               const RuntimeShape& unextended_input_shape, const T* input_data,
               const RuntimeShape& unextended_output_shape, T* output_data) {
  // Transpose kernel only does rearranging values not numeric evaluations on
  // each cell. It's safe to implement per size of scalar type and this trick
  // keeps the total code size in a reasonable range.
  switch (sizeof(T)) {
    case 1:
      TransposeImpl<int8_t, N>(params, unextended_input_shape,
                               reinterpret_cast<const int8_t*>(input_data),
                               unextended_output_shape,
                               reinterpret_cast<int8_t*>(output_data));
      break;
    case 2:
      TransposeImpl<int16_t, N>(params, unextended_input_shape,
                                reinterpret_cast<const int16_t*>(input_data),
                                unextended_output_shape,
                                reinterpret_cast<int16_t*>(output_data));
      break;

    case 4:
      TransposeImpl<int32_t, N>(params, unextended_input_shape,
                                reinterpret_cast<const int32_t*>(input_data),
                                unextended_output_shape,
                                reinterpret_cast<int32_t*>(output_data));
      break;
    case 8:
      TransposeImpl<int64_t, N>(params, unextended_input_shape,
                                reinterpret_cast<const int64_t*>(input_data),
                                unextended_output_shape,
                                reinterpret_cast<int64_t*>(output_data));
      break;
  }
}
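// Illustrative only: a hedged usage sketch of the size-based dispatch above.
// A float tensor (sizeof(float) == 4) is shuffled through the int32_t
// instantiation, which is fine because only bytes are moved, never
// interpreted. The shapes and permutation below are made up.
//
//   TransposeParams params;
//   params.perm_count = 2;
//   params.perm[0] = 1;
//   params.perm[1] = 0;
//   // With float buffers in[6] (2x3) and out[6] (3x2):
//   // Transpose<float>(params, RuntimeShape({2, 3}), in,
//   //                  RuntimeShape({3, 2}), out);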
}  // namespace reference_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TRANSPOSE_H_
@@ -400,13 +400,22 @@ inline size_t ReducedOutputOffset(const int num_dims, const int* dims,
  return offset;
}

// Since tensors with '0' in their shape are valid in TF, these offset functions
// allow that as long as the corresponding index is also 0. It is up to the
// calling ops to ensure that they perform verification checks on tensor shapes
// if they don't support a particular behavior.

inline int Offset(const RuntimeShape& shape, int i0, int i1, int i2, int i3) {
  TFLITE_DCHECK_EQ(shape.DimensionsCount(), 4);
  const int* dims_data = reinterpret_cast<const int*>(shape.DimsDataUpTo5D());
  TFLITE_DCHECK(i0 >= 0 && i0 < dims_data[0]);
  TFLITE_DCHECK(i1 >= 0 && i1 < dims_data[1]);
  TFLITE_DCHECK(i2 >= 0 && i2 < dims_data[2]);
  TFLITE_DCHECK(i3 >= 0 && i3 < dims_data[3]);
  TFLITE_DCHECK((dims_data[0] == 0 && i0 == 0) ||
                (i0 >= 0 && i0 < dims_data[0]));
  TFLITE_DCHECK((dims_data[1] == 0 && i1 == 0) ||
                (i1 >= 0 && i1 < dims_data[1]));
  TFLITE_DCHECK((dims_data[2] == 0 && i2 == 0) ||
                (i2 >= 0 && i2 < dims_data[2]));
  TFLITE_DCHECK((dims_data[3] == 0 && i3 == 0) ||
                (i3 >= 0 && i3 < dims_data[3]));
  return ((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3;
}
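// Illustrative only: a hedged arithmetic check of the row-major offset above.
// For a made-up NHWC shape {2, 3, 4, 5} and indices (1, 2, 3, 4):
//   ((1 * 3 + 2) * 4 + 3) * 5 + 4 = (5 * 4 + 3) * 5 + 4 = 23 * 5 + 4 = 119
// which is the last element of the 2*3*4*5 == 120-element buffer, as expected.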
@@ -414,21 +423,34 @@ inline int Offset(const RuntimeShape& shape, int i0, int i1, int i2, int i3,
                  int i4) {
  TFLITE_DCHECK_EQ(shape.DimensionsCount(), 5);
  const int* dims_data = reinterpret_cast<const int*>(shape.DimsDataUpTo5D());
  TFLITE_DCHECK(i0 >= 0 && i0 < dims_data[0]);
  TFLITE_DCHECK(i1 >= 0 && i1 < dims_data[1]);
  TFLITE_DCHECK(i2 >= 0 && i2 < dims_data[2]);
  TFLITE_DCHECK(i3 >= 0 && i3 < dims_data[3]);
  TFLITE_DCHECK(i4 >= 0 && i4 < dims_data[4]);
  TFLITE_DCHECK((dims_data[0] == 0 && i0 == 0) ||
                (i0 >= 0 && i0 < dims_data[0]));
  TFLITE_DCHECK((dims_data[1] == 0 && i1 == 0) ||
                (i1 >= 0 && i1 < dims_data[1]));
  TFLITE_DCHECK((dims_data[2] == 0 && i2 == 0) ||
                (i2 >= 0 && i2 < dims_data[2]));
  TFLITE_DCHECK((dims_data[3] == 0 && i3 == 0) ||
                (i3 >= 0 && i3 < dims_data[3]));
  TFLITE_DCHECK((dims_data[4] == 0 && i4 == 0) ||
                (i4 >= 0 && i4 < dims_data[4]));
  return (((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3) *
             dims_data[4] +
         i4;
}

inline int Offset(const RuntimeShape& shape, int* index) {
  return Offset(shape, index[0], index[1], index[2], index[3]);
}

inline int Offset(const Dims<4>& dims, int i0, int i1, int i2, int i3) {
  TFLITE_DCHECK(i0 >= 0 && i0 < dims.sizes[0]);
  TFLITE_DCHECK(i1 >= 0 && i1 < dims.sizes[1]);
  TFLITE_DCHECK(i2 >= 0 && i2 < dims.sizes[2]);
  TFLITE_DCHECK(i3 >= 0 && i3 < dims.sizes[3]);
  TFLITE_DCHECK((i0 == 0 && dims.sizes[0] == 0) ||
                (i0 >= 0 && i0 < dims.sizes[0]));
  TFLITE_DCHECK((i1 == 0 && dims.sizes[1] == 0) ||
                (i1 >= 0 && i1 < dims.sizes[1]));
  TFLITE_DCHECK((i2 == 0 && dims.sizes[2] == 0) ||
                (i2 >= 0 && i2 < dims.sizes[2]));
  TFLITE_DCHECK((i3 == 0 && dims.sizes[3] == 0) ||
                (i3 >= 0 && i3 < dims.sizes[3]));
  return i0 * dims.strides[0] + i1 * dims.strides[1] + i2 * dims.strides[2] +
         i3 * dims.strides[3];
}
@@ -437,10 +459,6 @@ inline int Offset(const Dims<4>& dims, int* index) {
  return Offset(dims, index[0], index[1], index[2], index[3]);
}

inline int Offset(const RuntimeShape& shape, int* index) {
  return Offset(shape, index[0], index[1], index[2], index[3]);
}

// Get array size, DCHECKing that the dim index is in range.
//
// Note that this will be phased out with Dims<4>, since RuntimeShape::Dims()
@@ -602,6 +620,58 @@ inline int MatchingFlatSize(const Dims<N>& dims, const Dims<N>& check_dims_0,
  return MatchingFlatSize(dims, check_dims_1, check_dims_2, check_dims_3);
}

// Flat size calculation, checking if their extended shapes match.
inline int MatchingExtendedShapeFlatSize(const RuntimeShape& shape,
                                         const RuntimeShape& check_shape_0) {
  const int shape_dims = shape.DimensionsCount();
  const int check_shape_0_dims = check_shape_0.DimensionsCount();
  const int min_dims = std::min(shape_dims, check_shape_0_dims);

  for (int i = 0; i < min_dims; ++i) {
    TFLITE_DCHECK_EQ(shape.Dims(shape_dims - 1 - i),
                     check_shape_0.Dims(check_shape_0_dims - 1 - i));
  }
  for (int i = min_dims; i < shape_dims; ++i) {
    TFLITE_DCHECK_EQ(shape.Dims(shape_dims - 1 - i), 1);
  }
  for (int i = min_dims; i < check_shape_0_dims; ++i) {
    TFLITE_DCHECK_EQ(check_shape_0.Dims(check_shape_0_dims - 1 - i), 1);
  }
  return shape.FlatSize();
}
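// Illustrative only: a hedged example of the extended-shape comparison above.
// Shapes {3, 4} and {1, 1, 3, 4} are compared right-aligned: the overlapping
// trailing dimensions (3 and 4) must match, and the extra leading dimensions
// of the longer shape must all be 1, so the check passes and the common flat
// size 12 is returned.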
inline int MatchingExtendedShapeFlatSize(const RuntimeShape& shape,
                                         const RuntimeShape& check_shape_0,
                                         const RuntimeShape& check_shape_1) {
  const int flat_size = MatchingExtendedShapeFlatSize(shape, check_shape_0);
  TFLITE_DCHECK_EQ(MatchingExtendedShapeFlatSize(shape, check_shape_1),
                   flat_size);
  return flat_size;
}

inline int MatchingExtendedShapeFlatSize(const RuntimeShape& shape,
                                         const RuntimeShape& check_shape_0,
                                         const RuntimeShape& check_shape_1,
                                         const RuntimeShape& check_shape_2) {
  const int flat_size = MatchingExtendedShapeFlatSize(shape, check_shape_0);
  TFLITE_DCHECK_EQ(
      MatchingExtendedShapeFlatSize(shape, check_shape_1, check_shape_2),
      flat_size);
  return flat_size;
}

inline int MatchingExtendedShapeFlatSize(const RuntimeShape& shape,
                                         const RuntimeShape& check_shape_0,
                                         const RuntimeShape& check_shape_1,
                                         const RuntimeShape& check_shape_2,
                                         const RuntimeShape& check_shape_3) {
  const int flat_size = MatchingExtendedShapeFlatSize(shape, check_shape_0);
  TFLITE_DCHECK_EQ(MatchingExtendedShapeFlatSize(shape, check_shape_1,
                                                 check_shape_2, check_shape_3),
                   flat_size);
  return flat_size;
}

// Data is required to be contiguous, and so many operators can use either the
// full array flat size or the flat size with one dimension skipped (commonly
// the depth).
@@ -885,6 +955,8 @@ struct Conv3DParams {
  float float_activation_max;
};

typedef Conv3DParams Conv3DTransposeParams;

struct DepthToSpaceParams {
  int32_t block_size;
};
@@ -1019,9 +1091,9 @@ struct PackParams {

struct PadParams {
  int8_t left_padding_count;
  int32_t left_padding[4];
  int32_t left_padding[5];
  int8_t right_padding_count;
  int32_t right_padding[4];
  int32_t right_padding[5];
  ResizingCategory resizing_category;
};

@@ -1196,6 +1268,23 @@ inline void GetActivationParams(const P& params, int64_t* min, int64_t* max) {
  *min = params.int64_activation_min;
  *max = params.int64_activation_max;
}

// Type trait to check if a given type has size smaller than 4 bytes.
template <typename T>
struct is_small_integer
    : public std::integral_constant<bool,
                                    std::is_same<T, int8_t>::value ||
                                        std::is_same<T, uint8_t>::value ||
                                        std::is_same<T, int16_t>::value ||
                                        std::is_same<T, uint16_t>::value> {};

// Type trait to check if a given type is int32 or int64.
template <typename T>
struct is_int32_or_int64
    : public std::integral_constant<bool, std::is_same<T, int32_t>::value ||
                                              std::is_same<T, int64_t>::value> {
};
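// Illustrative only: a hedged compile-time check of the type traits above.
//
//   static_assert(is_small_integer<int16_t>::value, "int16_t is < 4 bytes");
//   static_assert(!is_small_integer<int32_t>::value, "int32_t is not");
//   static_assert(is_int32_or_int64<int64_t>::value, "int64_t qualifies");
//   static_assert(!is_int32_or_int64<float>::value, "float does not");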
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_TYPES_H_