rolling 20210708

This commit is contained in:
jomjol
2021-08-07 15:25:27 +02:00
parent 6f06af1d5f
commit 32f15fc557
138 changed files with 8048 additions and 2292 deletions

View File

@@ -279,81 +279,125 @@ inline Integer FloorLog2(Integer n) {
}
}
// generate INT16 LUT for function(), e.g., table exp(x) and 1/(1+x) used in
// softmax
// func - the function to build the LUT for (e.g exp(x))
// min,max - table limits
// table - pointer to buffer
// num - number of elements in the LUT
inline void gen_lut(double (*func)(double), double min, double max,
int16_t* table, const int num) {
// size of table should equal to num + 1
// last element only for slope calculation
double step = (max - min) / (num - 1);
double half_step = step / 2.0;
for (int i = 0; i < num - 1; i++) {
double sample_val = TfLiteRound(func(min + i * step) * 32768.0);
double midpoint_interp_val =
TfLiteRound((func(min + (i + 1) * step) * 32768.0 +
TfLiteRound(func(min + i * step) * 32768.0)) /
2.0);
double midpoint_val =
TfLiteRound(func(min + i * step + half_step) * 32768.0);
double midpoint_err = midpoint_interp_val - midpoint_val;
double bias = TfLiteRound(midpoint_err / 2.0);
table[i] = std::min<double>(std::max<double>(sample_val - bias, -32768.0),
32767.0);
}
table[num - 1] = std::min<double>(
std::max<double>(TfLiteRound(func(max) * 32768.0), -32768.0), 32767.0);
// The size of the LUT depends on the type of input. For int8 inputs a simple
// 256 entries LUT is used. For int16 inputs the high 9 bits are used for
// indexing and the 7 remaining bits are used for interpolation. We thus use a
// 513-entries LUT for int16 cases, 512 for the 9-bit indexing and 1 extra entry
// to interpolate the last value.
template <typename LutInT>
constexpr int lut_size() {
static_assert(std::is_same<LutInT, int8_t>::value ||
std::is_same<LutInT, int16_t>::value,
"Only LUTs with int8 or int16 inputs are supported.");
return std::is_same<LutInT, int8_t>::value ? 256 : 513;
}
// generate INT16 LUT for function(), e.g., table exp(x) and 1/(1+x) used in
// softmax
// func - the function to build the LUT for (e.g exp(x))
// min,max - table limits
// table - pointer to buffer
// num - number of elements in the LUT
inline void gen_lut(float (*func)(float), float min, float max, int16_t* table,
const int num) {
// size of table should equal to num + 1
// last element only for slope calculation
float step = (max - min) / (num - 1);
float half_step = step / 2.0f;
for (int i = 0; i < num - 1; i++) {
float sample_val = TfLiteRound(func(min + i * step) * 32768.0f);
float midpoint_interp_val =
TfLiteRound((func(min + (i + 1) * step) * 32768.0f +
TfLiteRound(func(min + i * step) * 32768.0f)) /
2.0f);
float midpoint_val =
TfLiteRound(func(min + i * step + half_step) * 32768.0f);
float midpoint_err = midpoint_interp_val - midpoint_val;
float bias = TfLiteRound(midpoint_err / 2.0f);
table[i] = std::min<float>(std::max<float>(sample_val - bias, -32768.0f),
32767.0f);
// Generate a LUT for 'func' which can be used to approximate functions like
// exp, log, ...
//
// - func: the function to build the LUT for (e.g exp(x))
// - input_min, input_max: range of the func inputs
// - output_min, output_max: range of the func outputs
// - lut: pointer to the LUT table to fill, the table must be of size
// lut_size<LutInT>()
template <typename FloatT, typename LutInT, typename LutOutT>
inline void gen_lut(FloatT (*func)(FloatT), FloatT input_min, FloatT input_max,
FloatT output_min, FloatT output_max, LutOutT* lut) {
static_assert(std::is_same<LutInT, int8_t>::value ||
std::is_same<LutInT, int16_t>::value,
"Only LUTs with int8 or int16 inputs are supported.");
static_assert(std::is_same<LutOutT, int8_t>::value ||
std::is_same<LutOutT, int16_t>::value,
"Only LUTs with int8 or int16 outputs are supported.");
static_assert(std::is_floating_point<FloatT>::value,
"FloatT must be a floating-point type.");
const int nb_steps = std::is_same<LutInT, int8_t>::value ? 256 : 512;
const FloatT step = (input_max - input_min) / nb_steps;
const FloatT half_step = step / 2;
const FloatT output_scaling_inv =
static_cast<FloatT>(std::numeric_limits<LutOutT>::max() -
std::numeric_limits<LutOutT>::min() + 1) /
(output_max - output_min);
const FloatT table_min =
static_cast<FloatT>(std::numeric_limits<LutOutT>::min());
const FloatT table_max =
static_cast<FloatT>(std::numeric_limits<LutOutT>::max());
for (int i = 0; i < nb_steps; i++) {
const FloatT val = func(input_min + i * step);
const FloatT val_midpoint = func(input_min + i * step + half_step);
const FloatT val_next = func(input_min + (i + 1) * step);
const FloatT sample_val = TfLiteRound(val * output_scaling_inv);
const FloatT midpoint_interp_val =
TfLiteRound((val_next * output_scaling_inv +
TfLiteRound(val * output_scaling_inv)) /
2);
const FloatT midpoint_val = TfLiteRound(val_midpoint * output_scaling_inv);
const FloatT midpoint_err = midpoint_interp_val - midpoint_val;
const FloatT bias = TfLiteRound(midpoint_err / 2);
lut[i] = static_cast<LutOutT>(std::min<FloatT>(
std::max<FloatT>(sample_val - bias, table_min), table_max));
}
const bool with_extra_interpolation_value =
std::is_same<LutInT, int16_t>::value;
if (with_extra_interpolation_value) {
lut[nb_steps] = static_cast<LutOutT>(std::min<FloatT>(
std::max<FloatT>(TfLiteRound(func(input_max) * output_scaling_inv),
table_min),
table_max));
}
table[num - 1] = std::min<float>(
std::max<float>(TfLiteRound(func(max) * 32768.0f), -32768.0f), 32767.0f);
}
// int16_t func table lookup, e.g., lookup exp() and 1/(1+x) used in softmax
inline int16_t generic_int16_table_lookup(int16_t value, const int16_t* lut) {
// 512 base value, lut[513] only for calculate slope
uint16_t index = static_cast<uint16_t>(256 + (value >> 7));
// LUT must have 513 values
template <typename LutOutT>
inline LutOutT lut_lookup_with_interpolation(int16_t value,
const LutOutT* lut) {
static_assert(std::is_same<LutOutT, int8_t>::value ||
std::is_same<LutOutT, int16_t>::value,
"Only LUTs with int8 or int16 outputs are supported.");
// 512 base values, lut[513] is only used to calculate the slope
const uint16_t index = static_cast<uint16_t>(256 + (value >> 7));
assert(index < 512 && "LUT index out of range.");
int16_t offset = value & 0x7f;
const int16_t offset = value & 0x7f;
// base and slope are Q0.15
int16_t base = lut[index];
int16_t slope = lut[index + 1] - lut[index];
// Base and slope are Q0.x
const LutOutT base = lut[index];
const LutOutT slope = lut[index + 1] - lut[index];
// Q0.15 * Q0.7 = Q0.22
// Round and convert from Q0.22 to Q0.15
int32_t delta = (static_cast<int32_t>(slope) * offset + 64) >> 7;
// Q0.x * Q0.7 = Q0.(x + 7)
// Round and convert from Q0.(x + 7) to Q0.x
const int delta = (slope * offset + 64) >> 7;
// Q0.15 + Q0.15
return base + delta;
return static_cast<LutOutT>(base + delta);
}
// int16_t -> int16_t table lookup with interpolation
// LUT must have 513 values
inline int16_t lut_lookup(int16_t value, const int16_t* lut) {
return lut_lookup_with_interpolation(value, lut);
}
// int16_t -> int8_t table lookup with interpolation
// LUT must have 513 values
inline int8_t lut_lookup(int16_t value, const int8_t* lut) {
return lut_lookup_with_interpolation(value, lut);
}
// int8_t -> int8_t table lookup without interpolation
// LUT must have 256 values
inline int8_t lut_lookup(int8_t value, const int8_t* lut) {
return lut[128 + value];
}
// int8_t -> int16_t table lookup without interpolation
// LUT must have 256 values
inline int16_t lut_lookup(int8_t value, const int16_t* lut) {
return lut[128 + value];
}
// Table of sigmoid(i/24) at 0.16 format - 256 elements.
@@ -575,7 +619,8 @@ log_x_for_x_greater_than_or_equal_to_1_impl(
// InputIntegerBits - z_b_headroom - 0.25);
const FixedPointAccum z_a_pow_2_adj = SaturatingAddNonGemmlowp(
FixedPointAccum::FromRaw(SaturatingRoundingMultiplyByPOTParam(
InputIntegerBits - z_a_headroom_plus_1, 31 - kAccumIntegerBits)),
static_cast<int32_t>(InputIntegerBits - z_a_headroom_plus_1),
31 - kAccumIntegerBits)),
shifted_quarter);
// z_b is treated like z_a, but premultiplying by sqrt(0.5).
@@ -585,7 +630,8 @@ log_x_for_x_greater_than_or_equal_to_1_impl(
SaturatingRoundingMultiplyByPOTParam(z_a.raw(), z_b_headroom);
const FixedPointAccum z_b_pow_2_adj = SaturatingSub(
FixedPointAccum::FromRaw(SaturatingRoundingMultiplyByPOTParam(
InputIntegerBits - z_b_headroom, 31 - kAccumIntegerBits)),
static_cast<int32_t>(InputIntegerBits - z_b_headroom),
31 - kAccumIntegerBits)),
shifted_quarter);
const FixedPoint0 r = FixedPoint0::FromRaw(std::min(r_a_raw, r_b_raw));

View File

@@ -19,9 +19,8 @@ limitations under the License.
namespace tflite {
#if defined(TF_LITE_USE_GLOBAL_CMATH_FUNCTIONS) || \
(defined(__ANDROID__) && !defined(__NDK_MAJOR__)) || defined(ARDUINO) || \
defined(__ZEPHYR__)
#if defined(TF_LITE_USE_GLOBAL_CMATH_FUNCTIONS) || \
(defined(__ANDROID__) && !defined(__NDK_MAJOR__)) || defined(__ZEPHYR__)
#define TF_LITE_GLOBAL_STD_PREFIX
#else
#define TF_LITE_GLOBAL_STD_PREFIX std

View File

@@ -15,26 +15,6 @@ limitations under the License.
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_CHECK_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_CHECK_H_
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#define USE_NEON
#include <arm_neon.h>
#endif
#if defined __GNUC__ && defined __SSE4_1__ && !defined TF_LITE_DISABLE_X86_NEON
#define USE_NEON
#include "NEON_2_SSE.h"
#endif
// NEON_OR_PORTABLE(SomeFunc, args) calls NeonSomeFunc(args) if USE_NEON is
// defined, PortableSomeFunc(args) otherwise.
#ifdef USE_NEON
// Always use Neon code
#define NEON_OR_PORTABLE(funcname, ...) Neon##funcname(__VA_ARGS__)
#else
// No NEON available: Use Portable code
#define NEON_OR_PORTABLE(funcname, ...) Portable##funcname(__VA_ARGS__)
#endif // defined(USE_NEON)
// TFLM does not need to utilize any Neon optimizations.
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_CHECK_H_

View File

@@ -15,6 +15,8 @@ limitations under the License.
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_H_
#include <type_traits>
#include "fixedpoint/fixedpoint.h"
#include "tensorflow/lite/kernels/internal/common.h"
@@ -27,25 +29,14 @@ inline void Add(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const T* input1_data,
const RuntimeShape& input2_shape, const T* input2_data,
const RuntimeShape& output_shape, T* output_data) {
T activation_min, activation_max;
GetActivationParams(params, &activation_min, &activation_max);
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
for (int i = 0; i < flat_size; ++i) {
output_data[i] = ActivationFunctionWithMinMax(
input1_data[i] + input2_data[i], params.quantized_activation_min,
params.quantized_activation_max);
}
}
inline void Add(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const float* input1_data,
const RuntimeShape& input2_shape, const float* input2_data,
const RuntimeShape& output_shape, float* output_data) {
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
for (int i = 0; i < flat_size; i++) {
auto x = input1_data[i] + input2_data[i];
output_data[i] = ActivationFunctionWithMinMax(
x, params.float_activation_min, params.float_activation_max);
input1_data[i] + input2_data[i], activation_min, activation_max);
}
}
@@ -202,13 +193,12 @@ inline void Add(const ArithmeticParams& params,
}
}
inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const float* input1_data,
const RuntimeShape& input2_shape,
const float* input2_data,
const RuntimeShape& output_shape,
float* output_data) {
template <typename T>
inline typename std::enable_if<!is_small_integer<T>::value, void>::type
BroadcastAdd4DSlow(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const T* input1_data,
const RuntimeShape& input2_shape, const T* input2_data,
const RuntimeShape& output_shape, T* output_data) {
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
@@ -216,6 +206,9 @@ inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
const RuntimeShape extended_output_shape =
RuntimeShape::ExtendedShape(4, output_shape);
T activation_min, activation_max;
GetActivationParams(params, &activation_min, &activation_max);
// In Tensorflow, the dimensions are canonically named (batch_number, row,
// col, channel), with extents (batches, height, width, depth), with the
// trailing dimension changing most rapidly (channels has the smallest stride,
@@ -232,51 +225,10 @@ inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
output_data[Offset(extended_output_shape, b, y, x, c)] =
ActivationFunctionWithMinMax(
ActivationFunctionWithMinMax<T>(
input1_data[SubscriptToIndex(desc1, b, y, x, c)] +
input2_data[SubscriptToIndex(desc2, b, y, x, c)],
params.float_activation_min, params.float_activation_max);
}
}
}
}
}
inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const int32_t* input1_data,
const RuntimeShape& input2_shape,
const int32_t* input2_data,
const RuntimeShape& output_shape,
int32_t* output_data) {
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
&desc2);
const RuntimeShape extended_output_shape =
RuntimeShape::ExtendedShape(4, output_shape);
// In Tensorflow, the dimensions are canonically named (batch_number, row,
// col, channel), with extents (batches, height, width, depth), with the
// trailing dimension changing most rapidly (channels has the smallest stride,
// typically 1 element).
//
// In generated C code, we store arrays with the dimensions reversed. The
// first dimension has smallest stride.
//
// We name our variables by their Tensorflow convention, but generate C code
// nesting loops such that the innermost loop has the smallest stride for the
// best cache behavior.
for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
output_data[Offset(extended_output_shape, b, y, x, c)] =
ActivationFunctionWithMinMax(
input1_data[SubscriptToIndex(desc1, b, y, x, c)] +
input2_data[SubscriptToIndex(desc2, b, y, x, c)],
params.quantized_activation_min,
params.quantized_activation_max);
activation_min, activation_max);
}
}
}
@@ -287,10 +239,11 @@ inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
// is 32-bit for both cases. The overflow does not happen due to the
// choice of the shift (20 or 15, accordingly - see add.cc for more comments).
template <typename T>
inline void BroadcastAdd4DSlow(
const ArithmeticParams& params, const RuntimeShape& input1_shape,
const T* input1_data, const RuntimeShape& input2_shape,
const T* input2_data, const RuntimeShape& output_shape, T* output_data) {
inline typename std::enable_if<is_small_integer<T>::value, void>::type
BroadcastAdd4DSlow(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const T* input1_data,
const RuntimeShape& input2_shape, const T* input2_data,
const RuntimeShape& output_shape, T* output_data) {
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,

View File

@@ -15,7 +15,10 @@ limitations under the License.
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_N_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_N_H_
#include "tensorflow/lite/kernels/internal/types.h"
#include <algorithm>
#include <limits>
#include "tensorflow/lite/kernels/internal/common.h"
namespace tflite {
namespace reference_ops {
@@ -36,6 +39,47 @@ inline void AddN(const RuntimeShape& input_shape, const size_t num_inputs,
}
}
inline void AddN(const ArithmeticParams& params,
const RuntimeShape& input_shape, const size_t num_inputs,
const int8_t* const* input_data, int8_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
// Input offset is negative input zero point. Activation tensors are
// asymmetric quantized so they span the full int8 range.
// All inputs should have same zero-point and scale, this is checked during
// Prepare stage.
TFLITE_DCHECK_GE(-params.input1_offset, std::numeric_limits<int8_t>::min());
TFLITE_DCHECK_LE(-params.input1_offset, std::numeric_limits<int8_t>::max());
// All inputs and output should have the same shape, this is checked during
// Prepare stage.
const size_t size = input_shape.FlatSize();
for (size_t i = 0; i < size; ++i) {
// accumulate in scaled_x before clamping to avoid overflow
const int32_t x = params.input1_offset; // x = 0
const int32_t shifted_x = x * (1 << params.left_shift);
int32_t scaled_x = MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_x, params.input1_multiplier, params.input1_shift);
for (size_t j = 0; j < num_inputs; ++j) {
const int32_t y = params.input1_offset + input_data[j][i];
const int32_t shifted_y = y * (1 << params.left_shift);
int32_t scaled_y = MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_y, params.input1_multiplier, params.input1_shift);
scaled_x += scaled_y;
}
const int32_t raw_output =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
scaled_x, params.output_multiplier, params.output_shift) +
params.output_offset;
const int32_t clamped_output =
std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, raw_output));
output_data[i] = static_cast<int8_t>(clamped_output);
}
}
} // namespace reference_ops
} // namespace tflite

View File

@@ -0,0 +1,275 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BATCH_MATMUL_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BATCH_MATMUL_H_
#include <algorithm>
#include <cstdint>
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/tensor_utils_common.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
namespace batch_matmul {
// Determine which dimension is the broadcast dimension.
inline int broadcast_dim(int lhs_dim, int rhs_dim) {
if (lhs_dim == rhs_dim) return lhs_dim;
if (lhs_dim == 1) return rhs_dim;
TFLITE_DCHECK_EQ(rhs_dim, 1);
return lhs_dim;
}
// Compute the "extent" for iterating on this dimension.
// If we are broadcasting, then don't advance (i.e return 0).
inline int extent(const RuntimeShape& shape, int x) {
if (shape.Dims(x) == 1) {
return 0;
}
int prod = 1;
for (int i = x + 1; i < shape.DimensionsCount(); ++i) {
prod *= shape.Dims(i);
}
return prod;
}
} // namespace batch_matmul
template <typename Ta, typename Tb, typename Tout>
inline void BatchMatMul(const RuntimeShape& lhs_shape, const Ta* lhs_data,
const RuntimeShape& rhs_shape, const Tb* rhs_data,
const RuntimeShape& output_shape, Tout* output_data) {
const RuntimeShape extended_lhs_shape =
RuntimeShape::ExtendedShape(5, lhs_shape);
const RuntimeShape extended_rhs_shape =
RuntimeShape::ExtendedShape(5, rhs_shape);
const int batch_dim0 = batch_matmul::broadcast_dim(
extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0));
const int batch_dim1 = batch_matmul::broadcast_dim(
extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1));
const int batch_dim2 = batch_matmul::broadcast_dim(
extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2));
const int lhs_ext0 = batch_matmul::extent(extended_lhs_shape, 0);
const int lhs_ext1 = batch_matmul::extent(extended_lhs_shape, 1);
const int lhs_ext2 = batch_matmul::extent(extended_lhs_shape, 2);
const int rhs_ext0 = batch_matmul::extent(extended_rhs_shape, 0);
const int rhs_ext1 = batch_matmul::extent(extended_rhs_shape, 1);
const int rhs_ext2 = batch_matmul::extent(extended_rhs_shape, 2);
// Set params for each matrix multiply.
const int lhs_rows = extended_lhs_shape.Dims(3);
const int rhs_cols = extended_rhs_shape.Dims(4);
const int accum_depth = extended_lhs_shape.Dims(4);
for (int b0 = 0; b0 < batch_dim0; ++b0) {
const Ta* lhs_ptr0 = lhs_data + (b0 * lhs_ext0);
const Tb* rhs_ptr0 = rhs_data + (b0 * rhs_ext0);
for (int b1 = 0; b1 < batch_dim1; ++b1) {
const Ta* lhs_ptr1 = lhs_ptr0 + b1 * lhs_ext1;
const Tb* rhs_ptr1 = rhs_ptr0 + b1 * rhs_ext1;
for (int b2 = 0; b2 < batch_dim2; ++b2) {
const Ta* lhs_ptr2 = lhs_ptr1 + b2 * lhs_ext2;
const Tb* rhs_ptr2 = rhs_ptr1 + b2 * rhs_ext2;
Tout* out_ptr = output_data + ((b0 * batch_dim1 * batch_dim2) +
b1 * batch_dim2 + b2) *
lhs_rows * rhs_cols;
for (int j = 0; j < rhs_cols; ++j) {
for (int i = 0; i < lhs_rows; ++i) {
Tout total = 0;
for (int k = 0; k < accum_depth; ++k) {
total += static_cast<Tout>(lhs_ptr2[accum_depth * i + k]) *
static_cast<Tout>(rhs_ptr2[j * accum_depth + k]);
}
int idx = lhs_rows * j + i;
out_ptr[idx] = total;
}
}
}
}
}
}
inline void BatchMatMul(const RuntimeShape& lhs_shape, const int8_t* lhs_data,
const RuntimeShape& rhs_shape, const int8_t* rhs_data,
const float* scaling_factors,
const int32_t* input_offset, int32_t* row_sums,
const RuntimeShape& output_shape, float* output_data,
bool* compute_row_sums) {
const RuntimeShape extended_lhs_shape =
RuntimeShape::ExtendedShape(5, lhs_shape);
const RuntimeShape extended_rhs_shape =
RuntimeShape::ExtendedShape(5, rhs_shape);
const int batch_dim0 = batch_matmul::broadcast_dim(
extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0));
const int batch_dim1 = batch_matmul::broadcast_dim(
extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1));
const int batch_dim2 = batch_matmul::broadcast_dim(
extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2));
const int lhs_ext0 = batch_matmul::extent(extended_lhs_shape, 0);
const int lhs_ext1 = batch_matmul::extent(extended_lhs_shape, 1);
const int lhs_ext2 = batch_matmul::extent(extended_lhs_shape, 2);
const int rhs_ext0 = batch_matmul::extent(extended_rhs_shape, 0);
const int rhs_ext1 = batch_matmul::extent(extended_rhs_shape, 1);
const int rhs_ext2 = batch_matmul::extent(extended_rhs_shape, 2);
// Set params for each matrix multiply.
const int lhs_rows = extended_lhs_shape.Dims(3);
const int rhs_cols = extended_rhs_shape.Dims(4);
const int accum_depth = extended_lhs_shape.Dims(4);
const int ioff_ext0 = rhs_ext0 == 0 ? 0 : rhs_cols;
const int ioff_ext1 = rhs_ext1 == 0 ? 0 : rhs_cols;
const int ioff_ext2 = rhs_ext2 == 0 ? 0 : rhs_cols;
const int woff_ext0 = lhs_ext0 == 0 ? 0 : lhs_rows;
const int woff_ext1 = lhs_ext1 == 0 ? 0 : lhs_rows;
const int woff_ext2 = lhs_ext2 == 0 ? 0 : lhs_rows;
if (!compute_row_sums || *compute_row_sums) {
int num_weights_matrices = 1;
for (int i = 1; i < extended_lhs_shape.DimensionsCount() - 2; ++i) {
num_weights_matrices *= extended_lhs_shape.Dims(i);
}
tensor_utils::ReductionSumVector(
lhs_data, row_sums, num_weights_matrices * lhs_rows, accum_depth);
if (compute_row_sums) {
*compute_row_sums = false;
}
}
for (int b0 = 0; b0 < batch_dim0; ++b0) {
const int8_t* lhs_ptr0 = lhs_data + (b0 * lhs_ext0);
const int8_t* rhs_ptr0 = rhs_data + (b0 * rhs_ext0);
const int32_t* ioff_ptr0 = input_offset + (b0 * ioff_ext0);
const float* scale_ptr0 = scaling_factors + (b0 * ioff_ext0);
const int32_t* woff_ptr0 = row_sums + (b0 * woff_ext0);
for (int b1 = 0; b1 < batch_dim1; ++b1) {
const int8_t* lhs_ptr1 = lhs_ptr0 + b1 * lhs_ext1;
const int8_t* rhs_ptr1 = rhs_ptr0 + b1 * rhs_ext1;
const int32_t* ioff_ptr1 = ioff_ptr0 + (b1 * ioff_ext1);
const float* scale_ptr1 = scale_ptr0 + (b1 * ioff_ext1);
const int32_t* woff_ptr1 = woff_ptr0 + (b1 * woff_ext1);
for (int b2 = 0; b2 < batch_dim2; ++b2) {
const int8_t* lhs_ptr2 = lhs_ptr1 + b2 * lhs_ext2;
const int8_t* rhs_ptr2 = rhs_ptr1 + b2 * rhs_ext2;
const int32_t* ioff_ptr2 = ioff_ptr1 + (b2 * ioff_ext2);
const float* scale_ptr2 = scale_ptr1 + (b2 * ioff_ext2);
const int32_t* woff_ptr2 = woff_ptr1 + (b2 * woff_ext2);
float* out_ptr = output_data + ((b0 * batch_dim1 * batch_dim2) +
b1 * batch_dim2 + b2) *
lhs_rows * rhs_cols;
for (int j = 0; j < rhs_cols; ++j) {
const float batch_scaling_factor = scale_ptr2[j];
const float batch_offset = static_cast<float>(ioff_ptr2[j]);
for (int i = 0; i < lhs_rows; ++i) {
int32_t total = 0;
for (int k = 0; k < accum_depth; ++k) {
total +=
lhs_ptr2[accum_depth * i + k] * rhs_ptr2[j * accum_depth + k];
}
int32_t row_sum = woff_ptr2[i];
total -= row_sum * batch_offset;
int idx = lhs_rows * j + i;
out_ptr[idx] += batch_scaling_factor * total;
}
}
}
}
}
}
template <typename T, typename AccumT>
inline void BatchMatMul(const FullyConnectedParams& params,
const RuntimeShape& lhs_shape, const T* lhs_data,
const RuntimeShape& rhs_shape, const T* rhs_data,
const RuntimeShape& output_shape, T* output_data) {
const RuntimeShape extended_lhs_shape =
RuntimeShape::ExtendedShape(5, lhs_shape);
const RuntimeShape extended_rhs_shape =
RuntimeShape::ExtendedShape(5, rhs_shape);
const int batch_dim0 = batch_matmul::broadcast_dim(
extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0));
const int batch_dim1 = batch_matmul::broadcast_dim(
extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1));
const int batch_dim2 = batch_matmul::broadcast_dim(
extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2));
const int lhs_ext0 = batch_matmul::extent(extended_lhs_shape, 0);
const int lhs_ext1 = batch_matmul::extent(extended_lhs_shape, 1);
const int lhs_ext2 = batch_matmul::extent(extended_lhs_shape, 2);
const int rhs_ext0 = batch_matmul::extent(extended_rhs_shape, 0);
const int rhs_ext1 = batch_matmul::extent(extended_rhs_shape, 1);
const int rhs_ext2 = batch_matmul::extent(extended_rhs_shape, 2);
// Set params for each matrix multiply.
const int lhs_rows = extended_lhs_shape.Dims(3);
const int rhs_cols = extended_rhs_shape.Dims(4);
const int accum_depth = extended_lhs_shape.Dims(4);
const int32_t input_offset = params.input_offset;
const int32_t filter_offset = params.weights_offset;
const int32_t output_offset = params.output_offset;
const int32_t output_multiplier = params.output_multiplier;
const int output_shift = params.output_shift;
const int32_t output_activation_min = params.quantized_activation_min;
const int32_t output_activation_max = params.quantized_activation_max;
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
for (int b0 = 0; b0 < batch_dim0; ++b0) {
const T* lhs_ptr0 = lhs_data + (b0 * lhs_ext0);
const T* rhs_ptr0 = rhs_data + (b0 * rhs_ext0);
for (int b1 = 0; b1 < batch_dim1; ++b1) {
const T* lhs_ptr1 = lhs_ptr0 + b1 * lhs_ext1;
const T* rhs_ptr1 = rhs_ptr0 + b1 * rhs_ext1;
for (int b2 = 0; b2 < batch_dim2; ++b2) {
const T* lhs_ptr2 = lhs_ptr1 + b2 * lhs_ext2;
const T* rhs_ptr2 = rhs_ptr1 + b2 * rhs_ext2;
T* out_ptr = output_data +
((b0 * batch_dim1 * batch_dim2) + b1 * batch_dim2 + b2) *
lhs_rows * rhs_cols;
for (int j = 0; j < rhs_cols; ++j) {
for (int i = 0; i < lhs_rows; ++i) {
AccumT total = 0;
for (int k = 0; k < accum_depth; ++k) {
AccumT lhs_val = lhs_ptr2[accum_depth * i + k];
AccumT rhs_val = rhs_ptr2[accum_depth * j + k];
total += (lhs_val + filter_offset) * (rhs_val + input_offset);
}
int32_t total_scaled = MultiplyByQuantizedMultiplier(
total, output_multiplier, output_shift);
total_scaled += output_offset;
total_scaled = std::max(total_scaled, output_activation_min);
total_scaled = std::min(total_scaled, output_activation_max);
const int idx = lhs_rows * j + i;
out_ptr[idx] = static_cast<T>(total_scaled);
}
}
}
}
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BATCH_MATMUL_H_

View File

@@ -0,0 +1,175 @@
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CUMSUM_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CUMSUM_H_
#include <algorithm>
#include <cstdint>
#include <limits>
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
namespace tflite {
namespace reference_ops {
template <typename T>
inline void CumSum(const T* input_data, const RuntimeShape& shape, int32_t axis,
bool exclusive, bool reverse, T* output_data) {
const int32_t rank = shape.DimensionsCount();
TFLITE_DCHECK_GE(rank, 1);
TFLITE_DCHECK_GE(axis, 0);
TFLITE_DCHECK_LT(axis, rank);
size_t inner = 1;
size_t outer = 1;
size_t depth = 1;
for (int32_t i = 0; i < rank; i++) {
if (i < axis)
inner *= shape.Dims(i);
else if (i > axis)
outer *= shape.Dims(i);
else
depth = shape.Dims(i);
}
for (size_t outer_index = 0; outer_index < outer; outer_index++) {
size_t outer_index_adj;
if (reverse)
outer_index_adj = (outer - 1) - outer_index;
else
outer_index_adj = outer_index;
for (size_t inner_index = 0; inner_index < inner; inner_index++) {
T accumulator = 0;
size_t inner_index_adj;
if (reverse)
inner_index_adj = (inner - 1) - inner_index;
else
inner_index_adj = inner_index;
for (size_t depth_index = 0; depth_index < depth; depth_index++) {
size_t depth_index_adj;
if (reverse)
depth_index_adj = (depth - 1) - depth_index;
else
depth_index_adj = depth_index;
size_t index = outer_index_adj;
index += inner_index_adj * depth * outer;
index += depth_index_adj * outer;
if (exclusive) {
output_data[index] = accumulator;
accumulator += input_data[index];
} else {
accumulator += input_data[index];
output_data[index] = accumulator;
}
}
}
}
}
//
// Quantized INT8 CUMSUM
//
inline void CumSum(const ArithmeticParams& params, const int8_t* input_data,
const RuntimeShape& shape, int32_t axis, bool exclusive,
bool reverse, int8_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
// Input offset is negative input zero point. Activation tensors are
// asymmetric quantized so they span the full int8 range.
// All inputs should have same zero-point and scale, this is checked during
// Prepare stage.
TFLITE_DCHECK_GE(-params.input1_offset, std::numeric_limits<int8_t>::min());
TFLITE_DCHECK_LE(-params.input1_offset, std::numeric_limits<int8_t>::max());
const int32_t rank = shape.DimensionsCount();
TFLITE_DCHECK_GE(rank, 1);
TFLITE_DCHECK_GE(axis, 0);
TFLITE_DCHECK_LT(axis, rank);
size_t inner = 1;
size_t outer = 1;
size_t depth = 1;
for (int32_t i = 0; i < rank; i++) {
if (i < axis)
inner *= shape.Dims(i);
else if (i > axis)
outer *= shape.Dims(i);
else
depth = shape.Dims(i);
}
for (size_t outer_index = 0; outer_index < outer; outer_index++) {
size_t outer_index_adj;
if (reverse)
outer_index_adj = (outer - 1) - outer_index;
else
outer_index_adj = outer_index;
for (size_t inner_index = 0; inner_index < inner; inner_index++) {
int32_t accumulator = params.input1_offset; // accumulator = 0
accumulator *= (1 << params.left_shift);
accumulator = MultiplyByQuantizedMultiplierSmallerThanOneExp(
accumulator, params.input1_multiplier, params.input1_shift);
size_t inner_index_adj;
if (reverse)
inner_index_adj = (inner - 1) - inner_index;
else
inner_index_adj = inner_index;
for (size_t depth_index = 0; depth_index < depth; depth_index++) {
size_t depth_index_adj;
if (reverse)
depth_index_adj = (depth - 1) - depth_index;
else
depth_index_adj = depth_index;
size_t index = outer_index_adj;
index += inner_index_adj * depth * outer;
index += depth_index_adj * outer;
const int32_t y = params.input1_offset + input_data[index];
const int32_t shifted_y = y * (1 << params.left_shift);
const int32_t scaled_y = MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_y, params.input1_multiplier, params.input1_shift);
int32_t scaled_output;
if (exclusive) {
scaled_output = accumulator;
accumulator += scaled_y;
} else {
accumulator += scaled_y;
scaled_output = accumulator;
}
const int32_t raw_output =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
scaled_output, params.output_multiplier, params.output_shift) +
params.output_offset;
const int32_t clamped_output =
std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, raw_output));
output_data[index] = static_cast<int8_t>(clamped_output);
}
}
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CUMSUM_H_

View File

@@ -0,0 +1,79 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTH_TO_SPACE_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTH_TO_SPACE_H_
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
template <typename T>
inline void DepthToSpace(const tflite::DepthToSpaceParams& op_params,
const RuntimeShape& unextended_input_shape,
const T* input_data,
const RuntimeShape& unextended_output_shape,
T* output_data) {
TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
const RuntimeShape input_shape =
RuntimeShape::ExtendedShape(4, unextended_input_shape);
const RuntimeShape output_shape =
RuntimeShape::ExtendedShape(4, unextended_output_shape);
const int input_depth = input_shape.Dims(3);
const int input_width = input_shape.Dims(2);
const int input_height = input_shape.Dims(1);
const int input_batch = input_shape.Dims(0);
const int output_depth = output_shape.Dims(3);
const int output_width = output_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_batch = output_shape.Dims(0);
const int32_t block_size = op_params.block_size;
TFLITE_DCHECK_EQ(input_width * block_size, output_width);
TFLITE_DCHECK_EQ(input_height * block_size, output_height);
TFLITE_DCHECK_EQ(input_depth, output_depth * block_size * block_size);
TFLITE_DCHECK_EQ(input_batch, output_batch);
for (int out_b = 0; out_b < output_batch; ++out_b) {
for (int out_h = 0; out_h < output_height; ++out_h) {
for (int out_w = 0; out_w < output_width; ++out_w) {
for (int out_d = 0; out_d < output_depth; ++out_d) {
const int in_d =
out_d + ((out_h % block_size) * block_size + out_w % block_size) *
output_depth;
const int in_w = out_w / block_size;
const int in_h = out_h / block_size;
const int in_b = out_b;
const int input_index = Offset(input_shape, in_b, in_h, in_w, in_d);
const int output_index =
Offset(output_shape, out_b, out_h, out_w, out_d);
output_data[output_index] = input_data[input_index];
}
}
}
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTH_TO_SPACE_H_

View File

@@ -1,239 +0,0 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DIV_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DIV_H_
#include <algorithm>
#include "tensorflow/lite/kernels/internal/common.h"
namespace tflite {
namespace reference_ops {
template <typename T>
inline void DivCheckArithmeticParams(const ArithmeticParams& params) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
// Input offset is negative input zero point. Activation tensors are
// asymmetric quantized so they span the full int8 range.
constexpr int32_t max_value =
static_cast<int32_t>(std::numeric_limits<T>::max());
TFLITE_DCHECK_GE(params.input1_offset, -max_value);
TFLITE_DCHECK_LE(params.input1_offset, max_value);
TFLITE_DCHECK_GE(params.input2_offset, -max_value);
TFLITE_DCHECK_LE(params.input2_offset, max_value);
TFLITE_DCHECK_GE(params.output_offset, -max_value);
TFLITE_DCHECK_LE(params.output_offset, max_value);
}
// Element-wise div that can often be used for inner loop of broadcast Div as
// well as the non-broadcast Div.
template <typename T>
inline void DivElementwise(int size, const ArithmeticParams& params,
const T* input1_data, const T* input2_data,
T* output_data) {
DivCheckArithmeticParams<T>(params);
for (int i = 0; i < size; ++i) {
const int32_t input1_val = params.input1_offset + input1_data[i];
const int32_t input2_val = params.input2_offset + input2_data[i];
TFLITE_DCHECK_NE(input2_val, 0);
int recip_shift;
const int32_t input2_inv =
(input2_val > 0) ? GetReciprocal(input2_val, 31, &recip_shift)
: -GetReciprocal(-input2_val, 31, &recip_shift);
const int headroom = CountLeadingSignBits(input1_val);
const int32_t unscaled_quotient =
MultiplyByQuantizedMultiplierGreaterThanOne(input1_val, input2_inv,
headroom);
const int total_shift = params.output_shift - recip_shift - headroom;
const int32_t unclamped_result =
params.output_offset +
MultiplyByQuantizedMultiplierSmallerThanOneExp(
unscaled_quotient, params.output_multiplier, total_shift);
const int32_t clamped_output =
std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, unclamped_result));
output_data[i] = static_cast<T>(clamped_output);
}
}
inline void Div(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const uint8_t* input1_data,
const RuntimeShape& input2_shape, const uint8_t* input2_data,
const RuntimeShape& output_shape, uint8_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
DivElementwise(flat_size, params, input1_data, input2_data, output_data);
}
inline void Div(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const int8_t* input1_data,
const RuntimeShape& input2_shape, const int8_t* input2_data,
const RuntimeShape& output_shape, int8_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
DivElementwise(flat_size, params, input1_data, input2_data, output_data);
}
template <typename T, int N = 5>
inline void BroadcastDivSlowQuantized(
const ArithmeticParams& params, const RuntimeShape& unextended_input1_shape,
const T* input1_data, const RuntimeShape& unextended_input2_shape,
const T* input2_data, const RuntimeShape& unextended_output_shape,
T* output_data) {
TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), N);
TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), N);
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), N);
NdArrayDesc<N> desc1;
NdArrayDesc<N> desc2;
NdArrayDesc<N> output_desc;
NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
unextended_input2_shape, &desc1, &desc2);
CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape),
&output_desc);
DivCheckArithmeticParams<T>(params);
auto div_func = [&](int indexes[N]) {
const int32_t input1_val =
params.input1_offset + input1_data[SubscriptToIndex(desc1, indexes)];
const int32_t input2_val =
params.input2_offset + input2_data[SubscriptToIndex(desc2, indexes)];
TFLITE_DCHECK_NE(input2_val, 0);
int recip_shift;
const int32_t input2_inv =
(input2_val > 0) ? GetReciprocal(input2_val, 31, &recip_shift)
: -GetReciprocal(-input2_val, 31, &recip_shift);
const int headroom = CountLeadingSignBits(input1_val);
const int32_t unscaled_quotient =
MultiplyByQuantizedMultiplierGreaterThanOne(input1_val, input2_inv,
headroom);
const int total_shift = params.output_shift - recip_shift - headroom;
const int32_t unclamped_result =
params.output_offset +
MultiplyByQuantizedMultiplierSmallerThanOneExp(
unscaled_quotient, params.output_multiplier, total_shift);
const int32_t clamped_output =
std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, unclamped_result));
output_data[SubscriptToIndex(output_desc, indexes)] =
static_cast<T>(clamped_output);
};
NDOpsHelper<N>(output_desc, div_func);
}
template <int N = 5>
inline void BroadcastDivSlow(const ArithmeticParams& params,
const RuntimeShape& unextended_input1_shape,
const uint8_t* input1_data,
const RuntimeShape& unextended_input2_shape,
const uint8_t* input2_data,
const RuntimeShape& unextended_output_shape,
uint8_t* output_data) {
BroadcastDivSlowQuantized<uint8_t, N>(
params, unextended_input1_shape, input1_data, unextended_input2_shape,
input2_data, unextended_output_shape, output_data);
}
template <int N = 5>
inline void BroadcastDivSlow(const ArithmeticParams& params,
const RuntimeShape& unextended_input1_shape,
const int8_t* input1_data,
const RuntimeShape& unextended_input2_shape,
const int8_t* input2_data,
const RuntimeShape& unextended_output_shape,
int8_t* output_data) {
BroadcastDivSlowQuantized<int8_t, N>(
params, unextended_input1_shape, input1_data, unextended_input2_shape,
input2_data, unextended_output_shape, output_data);
}
// TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
// dimensionality if the runtime code does a single loop over one dimension
// that handles broadcasting as the base case. The code generator would then
// generate max(D1, D2) nested for loops.
template <typename T, int N = 5>
void BroadcastDivSlow(const ArithmeticParams& params,
const RuntimeShape& unextended_input1_shape,
const T* input1_data,
const RuntimeShape& unextended_input2_shape,
const T* input2_data,
const RuntimeShape& unextended_output_shape,
T* output_data) {
T output_activation_min;
T output_activation_max;
GetActivationParams(params, &output_activation_min, &output_activation_max);
TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), N);
TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), N);
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), N);
NdArrayDesc<N> desc1;
NdArrayDesc<N> desc2;
NdArrayDesc<N> output_desc;
NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
unextended_input2_shape, &desc1, &desc2);
CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape),
&output_desc);
// In Tensorflow, the dimensions are canonically named (batch_number, row,
// col, channel), with extents (batches, height, width, depth), with the
// trailing dimension changing most rapidly (channels has the smallest
// stride, typically 1 element).
//
// In generated C code, we store arrays with the dimensions reversed. The
// first dimension has smallest stride.
auto div_func = [&](int indexes[N]) {
output_data[SubscriptToIndex(output_desc, indexes)] =
ActivationFunctionWithMinMax(
input1_data[SubscriptToIndex(desc1, indexes)] /
input2_data[SubscriptToIndex(desc2, indexes)],
output_activation_min, output_activation_max);
};
NDOpsHelper<N>(output_desc, div_func);
}
template <typename T>
inline void Div(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const T* input1_data,
const RuntimeShape& input2_shape, const T* input2_data,
const RuntimeShape& output_shape, T* output_data) {
T output_activation_min;
T output_activation_max;
GetActivationParams(params, &output_activation_min, &output_activation_max);
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
for (int i = 0; i < flat_size; ++i) {
output_data[i] = ActivationFunctionWithMinMax(
input1_data[i] / input2_data[i], output_activation_min,
output_activation_max);
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DIV_H_

View File

@@ -0,0 +1,35 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FLOOR_DIV_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FLOOR_DIV_H_
#include <cmath>
#include <functional>
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
template <typename T>
T FloorDiv(T input1, T input2) {
return std::floor(std::divides<double>()(static_cast<double>(input1),
static_cast<double>(input2)));
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FLOOR_DIV_H_

View File

@@ -0,0 +1,44 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FLOOR_MOD_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FLOOR_MOD_H_
#include <cmath>
#include <functional>
namespace tflite {
namespace reference_ops {
template <typename T>
T FloorMod(T input1, T input2) {
struct FloatMod {
float operator()(const float lhs, const float rhs) const {
return std::fmod(lhs, rhs);
}
};
using ModFunc = typename std::conditional<std::is_integral<T>::value,
std::modulus<T>, FloatMod>::type;
ModFunc mod_func;
T trunc_mod = mod_func(input1, input2);
return (trunc_mod != 0) && ((input2 < 0) != (trunc_mod < 0))
? (trunc_mod + input2)
: trunc_mod;
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FLOOR_MOD_H_

View File

@@ -21,7 +21,7 @@ limitations under the License.
namespace tflite {
namespace reference_integer_ops {
inline void AveragePool(const PoolParams& params,
inline bool AveragePool(const PoolParams& params,
const RuntimeShape& input_shape,
const int8_t* input_data,
const RuntimeShape& output_shape, int8_t* output_data) {
@@ -66,6 +66,7 @@ inline void AveragePool(const PoolParams& params,
filter_count++;
}
}
if (filter_count == 0) return false;
// Round to the closest integer value.
acc = acc > 0 ? (acc + filter_count / 2) / filter_count
: (acc - filter_count / 2) / filter_count;
@@ -77,6 +78,7 @@ inline void AveragePool(const PoolParams& params,
}
}
}
return true;
}
inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
@@ -136,7 +138,7 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
}
}
inline void AveragePool(const PoolParams& params,
inline bool AveragePool(const PoolParams& params,
const RuntimeShape& input_shape,
const int16_t* input_data,
const RuntimeShape& output_shape,
@@ -182,6 +184,7 @@ inline void AveragePool(const PoolParams& params,
filter_count++;
}
}
if (filter_count == 0) return false;
// Round to the closest integer value.
acc = acc > 0 ? (acc + filter_count / 2) / filter_count
: (acc - filter_count / 2) / filter_count;
@@ -193,6 +196,7 @@ inline void AveragePool(const PoolParams& params,
}
}
}
return true;
}
inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,

View File

@@ -0,0 +1,256 @@
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LOG_SOFTMAX_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LOG_SOFTMAX_H_
#include <algorithm>
#include <cstddef>
#include <limits>
#include "fixedpoint/fixedpoint.h"
#include "tensorflow/lite/kernels/internal/common.h"
namespace tflite {
namespace reference_ops {
inline void LogSoftmax(const SoftmaxParams& params,
const RuntimeShape& input_shape, const float* input_data,
const RuntimeShape& output_shape, float* output_data) {
const int trailing_dim = input_shape.DimensionsCount() - 1;
const int outer_size =
MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
const int depth =
MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
for (int i = 0; i < outer_size; ++i) {
// Find max element value which we'll use to ensure numerical stability
// taking advantage of the following equality:
// log(exp(x[i])/sum(exp(x[i]))) == log(exp(x[i]+C)/sum(exp(x[i]+C)))
float max = std::numeric_limits<float>::lowest();
for (int c = 0; c < depth; ++c) {
max = std::max(max, input_data[i * depth + c]);
}
// Compute sum.
float sum = 0.f;
for (int c = 0; c < depth; ++c) {
sum += std::exp(input_data[i * depth + c] - max);
}
// Compute result.
const float log_sum = std::log(sum);
for (int c = 0; c < depth; ++c) {
output_data[i * depth + c] = input_data[i * depth + c] - max - log_sum;
}
}
}
inline void LogSoftmax(const SoftmaxParams& params,
const RuntimeShape& input_shape,
const uint8_t* input_data,
const RuntimeShape& output_shape, uint8_t* output_data) {
const int32_t input_multiplier = params.input_multiplier;
const int32_t input_left_shift = params.input_left_shift;
const int32_t reverse_scaling_divisor = params.reverse_scaling_divisor;
const int32_t reverse_scaling_right_shift =
params.reverse_scaling_right_shift;
const int diff_min = params.diff_min;
// The representation chosen for the input to the exp() function is Q5.26.
// We need to leave extra space since values that we skip might be as large
// as -32 before multiplying by input_beta_multiplier, and therefore as
// large as -16 afterwards. Note that exp(-8) is definitely not
// insignificant to accumulation, but exp(-16) definitely is.
static constexpr int kScaledDiffIntegerBits = 5;
static constexpr int kAccumulationIntegerBits = 12;
static constexpr int kOutputIntegerBits = 4;
using FixedPointScaledDiff =
gemmlowp::FixedPoint<int32_t, kScaledDiffIntegerBits>;
using FixedPointAccum =
gemmlowp::FixedPoint<int32_t, kAccumulationIntegerBits>;
const int trailing_dim = input_shape.DimensionsCount() - 1;
const int outer_size =
MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
const int depth =
MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
for (int i = 0; i < outer_size; ++i) {
uint8_t max_in_row = 0;
for (int c = 0; c < depth; ++c) {
max_in_row = std::max(max_in_row, input_data[i * depth + c]);
}
FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
for (int c = 0; c < depth; ++c) {
int32_t input_diff =
static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
if (input_diff >= diff_min) {
const int32_t input_diff_rescaled =
MultiplyByQuantizedMultiplierGreaterThanOne(
input_diff, input_multiplier, input_left_shift);
const FixedPointScaledDiff scaled_diff_f8 =
FixedPointScaledDiff::FromRaw(input_diff_rescaled);
sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
exp_on_negative_values(scaled_diff_f8));
}
}
const int32_t fixed_log_sum_of_exps =
log_x_for_x_greater_than_or_equal_to_1<kScaledDiffIntegerBits>(
sum_of_exps)
.raw();
// rescaled_diff_min is smallest representable in
// Q(kScaledDiffIntegerBits).(31-kScaledDiffIntegerBits) plus the
// log-sub-exps that will be subtracted in the loop.
//
// The thresholds diff_min, etc are negative.
const int rescaled_diff_min =
fixed_log_sum_of_exps + std::numeric_limits<int32_t>::lowest();
const int adjusted_diff_min =
std::max(static_cast<int32_t>(
diff_min - 1), // Note use of > below instead of >= above.
MultiplyByQuantizedMultiplierSmallerThanOneExp(
rescaled_diff_min, reverse_scaling_divisor,
-reverse_scaling_right_shift));
for (int c = 0; c < depth; ++c) {
int32_t input_diff =
static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
if (input_diff > adjusted_diff_min) {
const int32_t input_diff_rescaled =
MultiplyByQuantizedMultiplierGreaterThanOne(
input_diff, input_multiplier, input_left_shift);
int32_t unsat_output =
gemmlowp::RoundingDivideByPOT(
(input_diff_rescaled - fixed_log_sum_of_exps),
31 - kScaledDiffIntegerBits - kOutputIntegerBits) +
255;
output_data[i * depth + c] = static_cast<uint8_t>(
std::max(std::min(unsat_output, static_cast<int32_t>(255)),
static_cast<int32_t>(0)));
} else {
// Set output to smallest value.
output_data[i * depth + c] = 0;
}
}
}
}
template <typename T>
inline void LogSoftmaxQuantized(const SoftmaxParams& params,
const size_t outer_size, const size_t depth,
const RuntimeShape& input_shape,
const T* input_data,
const RuntimeShape& output_shape,
T* output_data) {
const int32_t input_multiplier = params.input_multiplier;
const int32_t input_left_shift = params.input_left_shift;
const int32_t reverse_scaling_divisor = params.reverse_scaling_divisor;
const int32_t reverse_scaling_right_shift =
params.reverse_scaling_right_shift;
const int diff_min = params.diff_min;
static constexpr T kMinT8 = std::numeric_limits<T>::min();
static constexpr T kMaxT8 = std::numeric_limits<T>::max();
static constexpr int32_t kMinInt32 = std::numeric_limits<int32_t>::min();
// All IntegerBits must agree with Prepare function.
// Input is chosen as Q5.26 so exp(-1 * 2^5 * 2^-1) = exp(-16) is negligible.
static constexpr int kInputIntegerBits = 5;
static constexpr int kAccumulationIntegerBits = 12;
static constexpr int kOutputIntegerBits = 4;
using F5 = gemmlowp::FixedPoint<int32_t, kInputIntegerBits>;
using F12 = gemmlowp::FixedPoint<int32_t, kAccumulationIntegerBits>;
for (size_t outer_index = 0; outer_index < outer_size; ++outer_index) {
T max_in_row = kMinT8;
for (size_t inner_index = 0; inner_index < depth; ++inner_index) {
max_in_row =
std::max(max_in_row, input_data[outer_index * depth + inner_index]);
}
// Accumulator "sum_of_exps_in_q12" is safe from overflowing in 2^12 steps.
F12 sum_of_exps_in_q12 = F12::FromRaw(0);
for (size_t inner_index = 0; inner_index < depth; ++inner_index) {
int32_t input_diff =
static_cast<int32_t>(input_data[outer_index * depth + inner_index]) -
max_in_row;
if (input_diff >= diff_min) {
const int32_t input_diff_in_q5 = MultiplyByQuantizedMultiplier(
input_diff, input_multiplier, input_left_shift);
sum_of_exps_in_q12 =
sum_of_exps_in_q12 +
gemmlowp::Rescale<kAccumulationIntegerBits>(
exp_on_negative_values(F5::FromRaw(input_diff_in_q5)));
}
}
const int32_t log_sum_of_exps_in_q5 =
log_x_for_x_greater_than_or_equal_to_1<kInputIntegerBits>(
sum_of_exps_in_q12)
.raw();
// Potentially reduced the valid range. shifted_log_sum_of_exps_in_q5 is
// smallest representable in Q5.26 plus the log_sum_of_exps.
const int32_t shifted_log_sum_of_exps_in_q5 =
log_sum_of_exps_in_q5 + kMinInt32;
const int32_t adjusted_diff_min =
std::max(static_cast<int32_t>(diff_min - 1),
MultiplyByQuantizedMultiplier(shifted_log_sum_of_exps_in_q5,
reverse_scaling_divisor,
-reverse_scaling_right_shift));
for (size_t inner_index = 0; inner_index < depth; ++inner_index) {
int32_t input_diff =
static_cast<int32_t>(input_data[outer_index * depth + inner_index]) -
max_in_row;
// Note use of > below instead of >= above.
if (input_diff > adjusted_diff_min) {
const int32_t input_diff_in_q5 = MultiplyByQuantizedMultiplier(
input_diff, input_multiplier, input_left_shift);
// Rescale and downcast.
int32_t output_in_q27 =
gemmlowp::RoundingDivideByPOT(
(input_diff_in_q5 - log_sum_of_exps_in_q5),
31 - kInputIntegerBits - kOutputIntegerBits) +
kMaxT8;
output_in_q27 =
std::max(std::min(output_in_q27, static_cast<int32_t>(kMaxT8)),
static_cast<int32_t>(kMinT8));
output_data[outer_index * depth + inner_index] =
static_cast<T>(output_in_q27);
} else {
output_data[outer_index * depth + inner_index] = kMinT8;
}
}
}
}
inline void LogSoftmax(const SoftmaxParams& params, const size_t outer_size,
const size_t depth, const RuntimeShape& input_shape,
const int8_t* input_data,
const RuntimeShape& output_shape, int8_t* output_data) {
LogSoftmaxQuantized(params, outer_size, depth, input_shape, input_data,
output_shape, output_data);
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LOG_SOFTMAX_H_

View File

@@ -51,7 +51,7 @@ inline void Mul(const ArithmeticParams& params,
GetActivationParams(params, &output_activation_min, &output_activation_max);
const int flat_size =
MatchingFlatSize(input1_shape, input2_shape, output_shape);
MatchingExtendedShapeFlatSize(input1_shape, input2_shape, output_shape);
for (int i = 0; i < flat_size; ++i) {
output_data[i] = ActivationFunctionWithMinMax(
input1_data[i] * input2_data[i], output_activation_min,
@@ -66,7 +66,7 @@ inline void Mul(const ArithmeticParams& params,
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
const int flat_size =
MatchingFlatSize(input1_shape, input2_shape, output_shape);
MatchingExtendedShapeFlatSize(input1_shape, input2_shape, output_shape);
MulElementwise(flat_size, params, input1_data, input2_data, output_data);
}

View File

@@ -24,8 +24,8 @@ namespace tflite {
namespace reference_ops {
// TFLite Pad supports activation tensors with up to 4 dimensions.
constexpr int PadKernelMaxDimensionCount() { return 4; }
// TFLite Pad supports activation tensors with up to 5 dimensions.
constexpr int PadKernelMaxDimensionCount() { return 5; }
// There are two versions of pad: Pad and PadV2. In PadV2 there is a second
// scalar input that provides the padding value. Therefore pad_value_ptr can be
@@ -46,8 +46,8 @@ inline void PadImpl(const tflite::PadParams& op_params,
TFLITE_DCHECK_LE(op_params.left_padding_count, PadKernelMaxDimensionCount());
TFLITE_DCHECK_LE(op_params.right_padding_count, PadKernelMaxDimensionCount());
// Runtime calls are currently fixed at 4 dimensions. Copy inputs so we can
// pad them to 4 dims (yes, we are "padding the padding").
// Runtime calls are currently fixed at 5 dimensions. Copy inputs so we can
// pad them to 5 dims (yes, we are "padding the padding").
int left_padding_copy[PadKernelMaxDimensionCount()];
for (int i = 0; i < PadKernelMaxDimensionCount(); i++) {
left_padding_copy[i] = 0;
@@ -67,39 +67,46 @@ inline void PadImpl(const tflite::PadParams& op_params,
}
const int output_batch = ext_output_shape.Dims(0);
const int output_height = ext_output_shape.Dims(1);
const int output_width = ext_output_shape.Dims(2);
const int output_depth = ext_output_shape.Dims(3);
const int output_plane = ext_output_shape.Dims(1);
const int output_height = ext_output_shape.Dims(2);
const int output_width = ext_output_shape.Dims(3);
const int output_depth = ext_output_shape.Dims(4);
const int left_b_padding = left_padding_copy[0];
const int left_h_padding = left_padding_copy[1];
const int left_w_padding = left_padding_copy[2];
const int left_d_padding = left_padding_copy[3];
const int left_p_padding = left_padding_copy[1];
const int left_h_padding = left_padding_copy[2];
const int left_w_padding = left_padding_copy[3];
const int left_d_padding = left_padding_copy[4];
const int right_b_padding = right_padding_copy[0];
const int right_h_padding = right_padding_copy[1];
const int right_w_padding = right_padding_copy[2];
const int right_d_padding = right_padding_copy[3];
const int right_p_padding = right_padding_copy[1];
const int right_h_padding = right_padding_copy[2];
const int right_w_padding = right_padding_copy[3];
const int right_d_padding = right_padding_copy[4];
const T pad_value = *pad_value_ptr;
const T* in_ptr = input_data;
T* out_ptr = output_data;
for (int out_b = 0; out_b < output_batch; ++out_b) {
for (int out_h = 0; out_h < output_height; ++out_h) {
for (int out_w = 0; out_w < output_width; ++out_w) {
for (int out_d = 0; out_d < output_depth; ++out_d) {
if (out_b < left_b_padding ||
out_b >= output_batch - right_b_padding ||
out_h < left_h_padding ||
out_h >= output_height - right_h_padding ||
out_w < left_w_padding ||
out_w >= output_width - right_w_padding ||
out_d < left_d_padding ||
out_d >= output_depth - right_d_padding) {
*out_ptr++ = pad_value;
} else {
*out_ptr++ = *in_ptr++;
for (int out_p = 0; out_p < output_plane; ++out_p) {
for (int out_h = 0; out_h < output_height; ++out_h) {
for (int out_w = 0; out_w < output_width; ++out_w) {
for (int out_d = 0; out_d < output_depth; ++out_d) {
if (out_b < left_b_padding ||
out_b >= output_batch - right_b_padding ||
out_p < left_p_padding ||
out_p >= output_plane - right_p_padding ||
out_h < left_h_padding ||
out_h >= output_height - right_h_padding ||
out_w < left_w_padding ||
out_w >= output_width - right_w_padding ||
out_d < left_d_padding ||
out_d >= output_depth - right_d_padding) {
*out_ptr++ = pad_value;
} else {
*out_ptr++ = *in_ptr++;
}
}
}
}

View File

@@ -23,7 +23,7 @@ limitations under the License.
namespace tflite {
namespace reference_ops {
inline void AveragePool(const PoolParams& params,
inline bool AveragePool(const PoolParams& params,
const RuntimeShape& input_shape,
const float* input_data,
const RuntimeShape& output_shape, float* output_data) {
@@ -66,6 +66,7 @@ inline void AveragePool(const PoolParams& params,
filter_count++;
}
}
if (filter_count == 0) return false;
const float average = total / filter_count;
output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
ActivationFunctionWithMinMax(average, params.float_activation_min,
@@ -74,9 +75,10 @@ inline void AveragePool(const PoolParams& params,
}
}
}
return true;
}
inline void AveragePool(const PoolParams& params,
inline bool AveragePool(const PoolParams& params,
const RuntimeShape& input_shape,
const uint8_t* input_data,
const RuntimeShape& output_shape,
@@ -122,6 +124,7 @@ inline void AveragePool(const PoolParams& params,
filter_count++;
}
}
if (filter_count == 0) return false;
acc = (acc + filter_count / 2) / filter_count;
acc = std::max(acc, params.quantized_activation_min);
acc = std::min(acc, params.quantized_activation_max);
@@ -131,6 +134,7 @@ inline void AveragePool(const PoolParams& params,
}
}
}
return true;
}
inline void L2Pool(const PoolParams& params, const RuntimeShape& input_shape,

View File

@@ -0,0 +1,774 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <limits>
#include <utility>
#include "fixedpoint/fixedpoint.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h"
#if defined(_MSC_VER)
#define __restrict__ __restrict
#endif
namespace tflite {
namespace tensor_utils {
namespace {
const int32_t kInt16Max = std::numeric_limits<int16_t>::max();
const int32_t kInt16Min = std::numeric_limits<int16_t>::min();
} // namespace
void PortableSymmetricQuantizeFloats(const float* values, const int size,
int8_t* quantized_values, float* min_value,
float* max_value, float* scaling_factor) {
auto minmax = std::minmax_element(values, values + size);
*min_value = *minmax.first;
*max_value = *minmax.second;
PortableSymmetricQuantizeFloats(values, size, quantized_values, *min_value,
*max_value, scaling_factor);
}
void PortableSymmetricQuantizeFloats(const float* values, const int size,
int8_t* quantized_values, float min_value,
float max_value, float* scaling_factor) {
const int32_t kScale = 127;
const float range = std::max(std::abs(min_value), std::abs(max_value));
if (range == 0) {
memset(quantized_values, 0, size * sizeof(int8_t));
*scaling_factor = 1;
return;
}
*scaling_factor = range / kScale;
const float scaling_factor_inv = kScale / range;
for (int i = 0; i < size; ++i) {
const int32_t quantized_value =
static_cast<int32_t>(TfLiteRound(values[i] * scaling_factor_inv));
// Clamp: just in case some odd numeric offset.
quantized_values[i] = static_cast<int8_t>(
std::min(kScale, std::max(-kScale, quantized_value)));
}
}
void PortableAsymmetricQuantizeFloats(const float* values, const int size,
int8_t* quantized_values,
float* scaling_factor, int32_t* offset) {
const int32_t kMinScale = -128;
const int32_t kMaxScale = 127;
const double qmin_double = kMinScale;
const double qmax_double = kMaxScale;
const auto minmax = std::minmax_element(values, values + size);
const double rmin = std::fmin(0, *minmax.first);
const double rmax = std::fmax(0, *minmax.second);
if (rmin == rmax) {
memset(quantized_values, 0, size * sizeof(int8_t));
*scaling_factor = 1;
*offset = 0;
return;
} else {
double scale = (rmax - rmin) / (qmax_double - qmin_double);
const double zero_point_from_min = qmin_double - rmin / scale;
const double zero_point_from_max = qmax_double - rmax / scale;
const double zero_point_from_min_error =
std::abs(qmin_double) + std::abs(rmin / scale);
const double zero_point_from_max_error =
std::abs(qmax_double) + std::abs(rmax / scale);
const double zero_point_double =
zero_point_from_min_error < zero_point_from_max_error
? zero_point_from_min
: zero_point_from_max;
int8_t nudged_zero_point = 0;
if (zero_point_double <= qmin_double) {
nudged_zero_point = kMinScale;
} else if (zero_point_double >= qmax_double) {
nudged_zero_point = kMaxScale;
} else {
nudged_zero_point = static_cast<int8_t>(round(zero_point_double));
}
*scaling_factor = scale;
*offset = nudged_zero_point;
}
const float scaling_factor_inv = 1.0f / *scaling_factor;
for (int i = 0; i < size; ++i) {
const int32_t quantized_value = static_cast<int32_t>(
TfLiteRound(*offset + values[i] * scaling_factor_inv));
quantized_values[i] =
std::min(kMaxScale, std::max(kMinScale, quantized_value));
}
}
void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
int m_rows, int m_cols,
const float* vector,
int n_batch, float* result) {
float* result_in_batch = result;
for (int b = 0; b < n_batch; b++) {
const float* matrix_ptr = matrix;
for (int r = 0; r < m_rows; r++) {
float dot_prod = 0.0f;
const float* vector_in_batch = vector + b * m_cols;
for (int c = 0; c < m_cols; c++) {
dot_prod += *matrix_ptr++ * *vector_in_batch++;
}
*result_in_batch += dot_prod;
++result_in_batch;
}
}
}
void PortableMatrixBatchVectorMultiplyAccumulate(
const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
const int8_t* __restrict__ vectors, const float* scaling_factors,
int n_batch, float* __restrict__ result) {
for (int batch = 0; batch < n_batch; ++batch, vectors += m_cols) {
const float batch_scaling_factor = scaling_factors[batch];
// Get the address of the first row.
const int8_t* row_ptr = matrix;
for (int row = 0; row < m_rows; ++row) {
// Initialize the dot product sum for the row to 0.
int32_t dotprod = 0;
#if defined(__GNUC__)
// Prefetch the row to cache.
__builtin_prefetch(row_ptr, 0 /* prefetch for read */,
3 /* temporal locality */);
#endif
for (int col = 0; col < m_cols; ++col, ++row_ptr) {
dotprod += (*row_ptr) * (vectors[col]);
} // for col
*result += dotprod * batch_scaling_factor;
++result;
} // for row
} // for batch
}
void PortableMatrixBatchVectorMultiplyAccumulate(
const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
const int8_t* __restrict__ vectors, const float* scaling_factors,
int n_batch, float* __restrict__ result, const float* per_channel_scale,
const int32_t* input_offset, int32_t* scratch, int32_t* row_sums,
bool* compute_row_sums, CpuBackendContext* context) {
if (input_offset == nullptr) {
PortableMatrixBatchVectorMultiplyAccumulate(
matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result);
return;
}
if (!compute_row_sums || *compute_row_sums) {
PortableReductionSumVector(matrix, row_sums, m_rows, m_cols);
if (compute_row_sums) {
*compute_row_sums = false;
}
}
for (int batch = 0; batch < n_batch; ++batch, vectors += m_cols) {
const float batch_scaling_factor = scaling_factors[batch];
const int32_t batch_offset = input_offset[batch];
const int8_t* row_ptr = matrix;
for (int row = 0; row < m_rows; ++row) {
int32_t dotprod = 0;
float scale = batch_scaling_factor;
if (per_channel_scale) {
scale *= per_channel_scale[row];
}
#if defined(__GNUC__)
// Prefetch the row to cache.
__builtin_prefetch(row_ptr, 0 /* prefetch for read */,
3 /* temporal locality */);
#endif
for (int col = 0; col < m_cols; ++col, ++row_ptr) {
dotprod += (*row_ptr) * vectors[col];
} // for col
dotprod -= row_sums[row] * batch_offset;
*result += dotprod * scale;
++result;
} // for row
} // for batch
}
void PortableSparseMatrixBatchVectorMultiplyAccumulate1x4(
const float* __restrict__ matrix, const int32_t* __restrict__ segments,
const int32_t* __restrict__ indices, int m_rows, int m_cols,
const float* __restrict__ vector, int n_batch, float* __restrict__ result) {
const int kBlockSize = 4;
TFLITE_DCHECK_EQ(m_cols % kBlockSize, 0);
for (int batch = 0; batch < n_batch; batch++) {
const float* matrix_ptr = matrix;
for (int row = 0; row < m_rows; row++) {
float dot_prod = 0.0f;
const float* vector_in_batch = vector + batch * m_cols;
for (int i = segments[row]; i < segments[row + 1]; i++) {
const int block_start_index = indices[i] * kBlockSize;
const float* vector_block_in_batch_ptr =
vector_in_batch + block_start_index;
for (int c = 0; c < kBlockSize; c++) {
dot_prod += *matrix_ptr++ * *vector_block_in_batch_ptr++;
}
}
result[batch * m_rows + row] += dot_prod;
}
}
}
void PortableSparseMatrixBatchVectorMultiplyAccumulate(
const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
float* __restrict__ result) {
const int kBlockSize = 16;
TFLITE_DCHECK_EQ( // NOLINT
m_cols % kBlockSize, 0);
for (int batch = 0; batch < n_batch; batch++) {
const float* matrix_ptr = matrix;
const uint8_t* ledger_ptr = ledger;
for (int row = 0; row < m_rows; row++) {
float dot_prod = 0.0f;
int num_nonzero_blocks = *ledger_ptr++;
if (num_nonzero_blocks > 0) {
const float* vector_in_batch = vector + batch * m_cols;
for (int i = 0; i < num_nonzero_blocks; i++) {
const int block_start_index = *ledger_ptr++ * kBlockSize;
const float* vector_block_in_batch_ptr =
vector_in_batch + block_start_index;
for (int c = 0; c < kBlockSize; c++) {
dot_prod += *matrix_ptr++ * *vector_block_in_batch_ptr++;
}
}
}
result[batch * m_rows + row] += dot_prod;
}
}
}
void PortableSparseMatrixBatchVectorMultiplyAccumulate(
const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
const int m_cols, const int8_t* __restrict__ vectors,
const float* scaling_factors, int n_batch, float* __restrict__ result) {
static const int kBlockSize = 16;
TFLITE_DCHECK_EQ( // NOLINT
m_cols % kBlockSize, 0);
for (int batch = 0; batch < n_batch; ++batch, vectors += m_cols) {
const float batch_scaling_factor = scaling_factors[batch];
const uint8_t* ledger_ptr = ledger;
// Get the address of the first row.
const int8_t* row_ptr = matrix;
for (int row = 0; row < m_rows; ++row) {
// Initialize the dot product sum for the row to 0.
int32_t dotprod = 0;
#if defined(__GNUC__)
// Prefetch the row to cache.
__builtin_prefetch(row_ptr, 0 /* prefetch for read */,
3 /* temporal locality */);
#endif
int num_nonzero_blocks = *ledger_ptr++;
for (int i = 0; i < num_nonzero_blocks; i++) {
const int block_start_index = *ledger_ptr++ * kBlockSize;
const int8_t* vector_block_ptr = vectors + block_start_index;
for (int c = 0; c < kBlockSize; c++) {
dotprod += (*row_ptr++) * (*vector_block_ptr++);
} // for block
} // for num_nonzero_blocks
result[batch * m_rows + row] += dotprod * batch_scaling_factor;
} // for row
} // for batch
}
template <typename T>
void PortableMatrixBatchVectorMultiplyAccumulateImpl(
const int8_t* input, const int32_t* bias,
const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
T* output) {
const int16_t output_max = std::numeric_limits<T>::max();
const int16_t output_min = std::numeric_limits<T>::min();
for (int batch = 0; batch < n_batch; ++batch) {
for (int row = 0; row < n_output; ++row) {
int32_t acc = bias[row];
for (int col = 0; col < n_input; ++col) {
int8_t input_val = input[batch * n_input + col];
int8_t weights_val = input_to_gate_weights[row * n_input + col];
acc += input_val * weights_val;
}
acc = MultiplyByQuantizedMultiplier(acc, multiplier, shift);
acc += output_zp;
acc += output[batch * n_output + row];
if (acc > output_max) {
acc = output_max;
}
if (acc < output_min) {
acc = output_min;
}
output[batch * n_output + row] = static_cast<T>(acc);
}
}
}
void PortableMatrixBatchVectorMultiplyAccumulate(
const int8_t* input, const int32_t* bias,
const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
int32_t* scratch, int16_t* output, CpuBackendContext* context) {
PortableMatrixBatchVectorMultiplyAccumulateImpl(
input, bias, input_to_gate_weights, multiplier, shift, n_batch, n_input,
n_output, output_zp, output);
}
void PortableMatrixBatchVectorMultiplyAccumulate(
const int8_t* input, const int32_t* bias,
const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
int32_t* scratch, int8_t* output, CpuBackendContext* context) {
PortableMatrixBatchVectorMultiplyAccumulateImpl(
input, bias, input_to_gate_weights, multiplier, shift, n_batch, n_input,
n_output, output_zp, output);
}
void PortableMatrixBatchVectorMultiply(const int8_t* input,
int32_t input_zeropoint,
const int8_t* input_to_gate_weights,
int32_t input_to_gate_effective_scale_a,
int32_t input_to_gate_effective_scale_b,
int32_t n_batch, int32_t n_input,
int32_t n_cell, int8_t* gate_output,
int8_t gate_output_zp) {
const int32_t int8_max = std::numeric_limits<int8_t>::max();
const int32_t int8_min = std::numeric_limits<int8_t>::min();
for (int batch = 0; batch < n_batch; ++batch) {
for (int row = 0; row < n_cell; ++row) {
int32_t acc = 0;
for (int col = 0; col < n_input; ++col) {
int32_t input_val = input[batch * n_input + col];
int8_t weights_val = input_to_gate_weights[row * n_input + col];
acc += (input_val - input_zeropoint) * weights_val;
}
acc = MultiplyByQuantizedMultiplier(acc, input_to_gate_effective_scale_a,
input_to_gate_effective_scale_b);
acc += gate_output_zp;
if (acc > int8_max) {
acc = int8_max;
}
if (acc < int8_min) {
acc = int8_min;
}
gate_output[batch * n_cell + row] = static_cast<int8_t>(acc);
}
}
}
void PortableMatrixBatchVectorMultiply(
const int16_t* hidden, const int8_t* hidden_to_output_weights,
int32_t proj_effective_scale_a, int32_t proj_effective_scale_b,
const int32_t* gate_bias, int32_t n_batch, int32_t n_hidden,
int32_t n_output, int32_t output_zp, int8_t* proj_output) {
const int16_t int8_max = std::numeric_limits<int8_t>::max();
const int16_t int8_min = std::numeric_limits<int8_t>::min();
for (int batch = 0; batch < n_batch; ++batch) {
for (int row = 0; row < n_output; ++row) {
int64_t acc = gate_bias[row];
for (int col = 0; col < n_hidden; ++col) {
int16_t input_val = hidden[batch * n_hidden + col];
int8_t weights_val = hidden_to_output_weights[row * n_hidden + col];
int64_t curr = acc;
acc += input_val * weights_val;
if (input_val * weights_val > 0 && acc < curr) {
acc = std::numeric_limits<int32_t>::max();
}
if (input_val * weights_val < 0 && acc > curr) {
acc = std::numeric_limits<int32_t>::min();
}
}
acc = MultiplyByQuantizedMultiplier(acc, proj_effective_scale_a,
proj_effective_scale_b);
acc += output_zp;
if (acc > int8_max) {
acc = int8_max;
}
if (acc < int8_min) {
acc = int8_min;
}
proj_output[batch * n_output + row] = acc;
}
}
}
void PortableApplyLayerNorm(const int16_t* input,
const int16_t* layer_norm_weights,
const int32_t* bias, int32_t layer_norm_scale_a,
int32_t layer_norm_scale_b, int32_t variance_limit,
int n_batch, int n_input, int16_t* output) {
// The square of std::pow(2, 10), which is the extra factor that makes sure
// normalized values has enough resolution.
static const int kTwoToPower20 = 1 << 20;
for (int i = 0; i < n_batch; ++i) {
int64_t sum = 0;
int64_t sum_sq = 0;
for (int j = 0; j < n_input; ++j) {
const int32_t index = i * n_input + j;
int32_t val = static_cast<int32_t>(input[index]);
sum += val;
sum_sq += val * val;
}
int32_t mean =
static_cast<int32_t>(static_cast<int64_t>(sum) * 1024 / n_input);
// TODO(b/173994730): Avoids overflow but only works for POT n_input.
int32_t temp = kTwoToPower20 / n_input;
int64_t variance =
sum_sq * temp - static_cast<int64_t>(mean) * static_cast<int64_t>(mean);
int32_t variance2 = static_cast<int32_t>(variance / kTwoToPower20);
if (variance2 < 1) {
variance2 = variance_limit;
}
int32_t stddev_inverse_a;
int stddev_inverse_b;
GetInvSqrtQuantizedMultiplierExp(variance2, /*reverse_shift*/ -1,
&stddev_inverse_a, &stddev_inverse_b);
for (int j = 0; j < n_input; ++j) {
const int32_t index = i * n_input + j;
int32_t val = static_cast<int32_t>(input[index]);
int32_t shifted = 1024 * val - mean;
int32_t rescaled = MultiplyByQuantizedMultiplier(
shifted, stddev_inverse_a, stddev_inverse_b);
// TODO(jianlijianli): Saturate this.
int64_t val3 = rescaled * layer_norm_weights[j] + bias[j];
int32_t val4 =
static_cast<int32_t>((val3 > 0 ? val3 + 512 : val3 - 512) / 1024);
int32_t val5 = MultiplyByQuantizedMultiplier(val4, layer_norm_scale_a,
layer_norm_scale_b + 12);
val5 = std::min(std::max(kInt16Min, val5), kInt16Max);
output[index] = static_cast<int16_t>(val5);
}
}
}
void PortableApplyLayerNormFloat(const int16_t* input,
const int16_t* layer_norm_weights,
int32_t layer_norm_scale_a,
int32_t layer_norm_scale_b,
const int32_t* bias, int n_batch, int n_input,
int16_t* output) {
const int32_t int16_max = std::numeric_limits<int16_t>::max();
const int32_t int16_min = std::numeric_limits<int16_t>::min();
const float layer_norm_scale =
layer_norm_scale_a *
std::pow(2.0, static_cast<double>(layer_norm_scale_b - 31));
const float bias_scale =
static_cast<float>(std::pow(2.0, -10)) * layer_norm_scale;
for (int batch = 0; batch < n_batch; ++batch) {
float sum = 0.0f;
float sum_sq = 0.0f;
for (int i = 0; i < n_input; ++i) {
const int index = batch * n_input + i;
const float value = static_cast<float>(input[index]);
sum += value;
sum_sq += value * value;
}
const float mean = sum / n_input;
float stddev_inv = 0.0f;
const float variance = sum_sq / n_input - mean * mean;
if (variance == 0) {
stddev_inv = 1.0f / std::sqrt(1e-8f);
} else {
stddev_inv = 1.0f / std::sqrt(variance);
}
for (int i = 0; i < n_input; ++i) {
const int index = batch * n_input + i;
const float normalized_value =
(static_cast<float>(input[index]) - mean) * stddev_inv;
const float weighted_normalized_value =
normalized_value * layer_norm_weights[i] * layer_norm_scale +
bias[i] * bias_scale;
const int32_t quant_output = static_cast<int32_t>(std::round(
weighted_normalized_value * static_cast<float>(std::pow(2, 12))));
output[index] = std::min(int16_max, std::max(int16_min, quant_output));
}
}
}
void PortableMatrixScalarMultiplyAccumulate(const int8_t* matrix,
int32_t scalar, int32_t n_row,
int32_t n_col, int32_t* output) {
for (int i = 0; i < n_row; ++i) {
int32_t row_sum = 0;
for (int j = 0; j < n_col; ++j) {
row_sum += *matrix++;
}
output[i] += row_sum * scalar;
}
}
void PortableApplySigmoid(const int16_t* input, int32_t n_batch,
int32_t n_input, int16_t* output) {
for (int batch = 0; batch < n_batch; ++batch) {
for (int c = 0; c < n_input; c++) {
using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
const int index = batch * n_input + c;
F3 sigmoid_input = F3::FromRaw(input[index]);
F0 sigmoid_output = gemmlowp::logistic(sigmoid_input);
output[index] = sigmoid_output.raw();
}
}
}
void PortableApplySigmoidFloat(const int16_t* input, int32_t n_batch,
int32_t n_input, int16_t* output) {
const int32_t int16_max = std::numeric_limits<int16_t>::max();
const int32_t int16_min = std::numeric_limits<int16_t>::min();
for (int batch = 0; batch < n_batch; ++batch) {
for (int i = 0; i < n_input; ++i) {
const int index = batch * n_input + i;
const float float_input =
input[index] * static_cast<float>(std::pow(2, -12));
const float float_output = 1.0f / (1.0f + std::exp(-float_input));
const int32_t quant_output = static_cast<int32_t>(
float_output * static_cast<float>(std::pow(2, 15)));
const int32_t quant_output_clamped =
std::min(int16_max, std::max(int16_min, quant_output));
output[index] = static_cast<int16_t>(quant_output_clamped);
}
}
}
template <int IntegerBits>
void PortableApplyTanhImpl(const int16_t* input, int32_t n_batch,
int32_t n_input, int16_t* output) {
using FX = gemmlowp::FixedPoint<std::int16_t, IntegerBits>;
using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
for (int batch = 0; batch < n_batch; ++batch) {
for (int i = 0; i < n_input; ++i) {
const int index = batch * n_input + i;
FX tanh_input = FX::FromRaw(input[index]);
F0 tanh_output = gemmlowp::tanh(tanh_input);
output[index] = tanh_output.raw();
}
}
}
void PortableApplyTanh(int32_t integer_bits, const int16_t* input,
int32_t n_batch, int32_t n_input, int16_t* output) {
assert(integer_bits <= 6);
#define DISPATCH_TANH(i) \
case i: \
PortableApplyTanhImpl<i>(input, n_batch, n_input, output); \
break;
switch (integer_bits) {
DISPATCH_TANH(0);
DISPATCH_TANH(1);
DISPATCH_TANH(2);
DISPATCH_TANH(3);
DISPATCH_TANH(4);
DISPATCH_TANH(5);
DISPATCH_TANH(6);
default:
return;
}
#undef DISPATCH_TANH
}
void PortableApplyTanhFloat(const int16_t* input, int32_t n_batch,
int32_t n_input, int32_t integer_bits,
int16_t* output) {
const int32_t int16_max = std::numeric_limits<int16_t>::max();
const int32_t int16_min = std::numeric_limits<int16_t>::min();
const double two = 2.0;
for (int batch = 0; batch < n_batch; ++batch) {
for (int i = 0; i < n_input; ++i) {
const int index = batch * n_input + i;
const float float_input =
input[index] * std::pow(two, static_cast<double>(integer_bits));
const float float_output = std::tanh(float_input);
const int32_t quant_output = static_cast<int32_t>(
float_output * static_cast<float>(std::pow(2, 15)));
const int32_t quant_output_clamped =
std::min(int16_max, std::max(int16_min, quant_output));
output[index] = static_cast<int16_t>(quant_output_clamped);
}
}
}
void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2,
int n_batch, int n_input, int shift, int16_t* output) {
for (int batch = 0; batch < n_batch; ++batch) {
for (int i = 0; i < n_input; ++i) {
const int index = batch * n_input + i;
const int16_t a = input_1[index];
const int16_t b = input_2[index];
const int32_t value = static_cast<int32_t>(a) * static_cast<int32_t>(b);
output[index] =
static_cast<int16_t>(gemmlowp::RoundingDivideByPOT(value, shift));
}
}
}
void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2,
int32_t multiplier, int32_t shift, int32_t n_batch,
int32_t n_input, int32_t output_zp, int8_t* output) {
for (int batch = 0; batch < n_batch; ++batch) {
for (int i = 0; i < n_input; ++i) {
const int index = batch * n_input + i;
const int16_t a = input_1[index];
const int16_t b = input_2[index];
int32_t value = static_cast<int32_t>(a) * static_cast<int32_t>(b);
value = MultiplyByQuantizedMultiplier(value, multiplier, shift);
value -= output_zp;
value = std::min(std::max(static_cast<int32_t>(-128), value),
static_cast<int32_t>(127));
output[index] = static_cast<int8_t>(value);
}
}
}
void PortableCwiseAdd(const int16_t* input_1, const int16_t* input_2,
int n_batch, int n_input, int16_t* output) {
for (int batch = 0; batch < n_batch; ++batch) {
for (int i = 0; i < n_input; ++i) {
const int index = batch * n_input + i;
int32_t sum = input_1[index] + input_2[index];
const int32_t sum_clamped = std::min(kInt16Max, std::max(kInt16Min, sum));
output[index] = static_cast<int16_t>(sum_clamped);
}
}
}
float PortableVectorVectorDotProduct(const float* vector1, const float* vector2,
int v_size) {
float result = 0.0;
for (int v = 0; v < v_size; v++) {
result += *vector1++ * *vector2++;
}
return result;
}
namespace {
inline int32_t VectorVectorDotProduct(const int16_t* vector1,
const int16_t* vector2, int v_size) {
int32_t result = 0;
for (int v = 0; v < v_size; v++) {
result += *vector1++ * *vector2++;
}
return result;
}
} // namespace
void PortableBatchVectorBatchVectorDotProduct(const int16_t* vector1,
const int16_t* vector2,
int v_size, int n_batch,
int32_t* result) {
for (int b = 0; b < n_batch; b++) {
result[b] = VectorVectorDotProduct(vector1, vector2, v_size);
vector1 += v_size;
vector2 += v_size;
}
}
void PortableVectorBatchVectorCwiseProductAccumulate(
const int16_t* vector, int v_size, const int16_t* batch_vector, int n_batch,
int32_t multiplier, int shift, int16_t* result) {
for (int b = 0; b < n_batch; b++) {
for (int v = 0; v < v_size; v++) {
int32_t prod = vector[v] * *batch_vector++;
prod = MultiplyByQuantizedMultiplier(prod, multiplier, shift);
int32_t output = prod + *result;
output = std::max(std::min(static_cast<int32_t>(32767), output),
static_cast<int32_t>(-32768));
*result++ = output;
}
}
}
void PortableSub1Vector(const float* vector, int v_size, float* result) {
for (int v = 0; v < v_size; v++) {
*result++ = 1.0f - *vector++;
}
}
void PortableSub1Vector(const int16_t* vector, int v_size, int16_t* result) {
static const int16_t kOne = 32767;
for (int v = 0; v < v_size; v++) {
*result++ = kOne - *vector++;
}
}
void PortableVectorScalarMultiply(const int8_t* vector, const int v_size,
const float scale, float* result) {
for (int v = 0; v < v_size; ++v) {
*result++ = scale * *vector++;
}
}
void PortableMeanStddevNormalization(const float* __restrict__ input_vector,
float* __restrict__ output_vector,
int v_size, int n_batch) {
for (int batch = 0; batch < n_batch; ++batch) {
float sum = 0.0f;
for (int i = 0; i < v_size; ++i) {
sum += input_vector[i];
}
const float mean = sum / v_size;
float sum_diff_sq = 0.0f;
for (int i = 0; i < v_size; ++i) {
const float diff = input_vector[i] - mean;
sum_diff_sq += diff * diff;
}
const float variance = sum_diff_sq / v_size;
constexpr float kNormalizationConstant = 1e-8f;
const float stddev_inv =
1.0f / std::sqrt(variance + kNormalizationConstant);
for (int i = 0; i < v_size; ++i) {
output_vector[i] = (input_vector[i] - mean) * stddev_inv;
}
input_vector += v_size;
output_vector += v_size;
}
}
void PortableTwoGateSaturatingAdd(const int8_t* input, int8_t input_zp,
const int8_t* recurrent, int8_t recurrent_zp,
int32_t input_effective_scale_a,
int32_t input_effective_scale_b,
int32_t recurrent_effective_scale_a,
int32_t recurrent_effective_scale_b,
int32_t n_batch, int32_t n_cell,
int16_t* output) {
const int32_t int16_max = std::numeric_limits<int16_t>::max();
const int32_t int16_min = std::numeric_limits<int16_t>::min();
for (int i = 0; i < n_batch * n_cell; ++i) {
int32_t x = static_cast<int32_t>(input[i]) - static_cast<int32_t>(input_zp);
int32_t h =
static_cast<int32_t>(recurrent[i]) - static_cast<int32_t>(recurrent_zp);
int32_t x_scaled = MultiplyByQuantizedMultiplier(x, input_effective_scale_a,
input_effective_scale_b);
int32_t h_scaled = MultiplyByQuantizedMultiplier(
h, recurrent_effective_scale_a, recurrent_effective_scale_b);
int32_t y = h_scaled + x_scaled;
if (y > int16_max) {
y = int16_max;
}
if (y < int16_min) {
y = int16_min;
}
output[i] = static_cast<int16_t>(y);
}
}
} // namespace tensor_utils
} // namespace tflite

View File

@@ -0,0 +1,235 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_IMPL_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_IMPL_H_
#include <algorithm>
#include <cstdint>
#if defined(_MSC_VER)
#define __restrict__ __restrict
#endif
namespace tflite {
// Not all backends support CpuBackendContext usage, so forward declare to avoid
// pulling in its implementation.
class CpuBackendContext;
namespace tensor_utils {
template <typename T>
bool PortableIsZeroVector(const T* vector, int v_size) {
for (int i = 0; i < v_size; ++i) {
if (vector[i] != 0) {
return false;
}
}
return true;
}
void PortableSymmetricQuantizeFloats(const float* values, const int size,
int8_t* quantized_values, float* min_value,
float* max_value, float* scaling_factor);
void PortableSymmetricQuantizeFloats(const float* values, const int size,
int8_t* quantized_values, float min_value,
float max_value, float* scaling_factor);
void PortableAsymmetricQuantizeFloats(const float* values, const int size,
int8_t* quantized_values,
float* scaling_factor, int32_t* offset);
// Multiply a matrix by a batch vector, and store results in a batch-size
// vector.
void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
int m_rows, int m_cols,
const float* vector,
int n_batch, float* result);
void PortableMatrixBatchVectorMultiplyAccumulate(
const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
const int8_t* __restrict__ vectors, const float* scaling_factors,
int n_batch, float* __restrict__ result);
void PortableMatrixBatchVectorMultiplyAccumulate(
const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
const int8_t* __restrict__ vectors, const float* scaling_factors,
int n_batch, float* __restrict__ result, const float* per_channel_scale,
const int32_t* input_offset, int32_t* scratch, int32_t* row_sums,
bool* compute_row_sums, CpuBackendContext* context);
void PortableMatrixBatchVectorMultiplyAccumulate(
const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
const int8_t* __restrict__ vector, const float* scaling_factors,
int n_batch, int32_t* scratch, float* __restrict__ result,
CpuBackendContext* context);
void PortableSparseMatrixBatchVectorMultiplyAccumulate1x4(
const float* __restrict__ matrix, const int32_t* __restrict__ segments,
const int32_t* __restrict__ indices, int m_rows, int m_cols,
const float* __restrict__ vector, int n_batch, float* __restrict__ result);
void PortableSparseMatrixBatchVectorMultiplyAccumulate(
const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
float* __restrict__ result);
void PortableSparseMatrixBatchVectorMultiplyAccumulate(
const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
const int m_cols, const int8_t* __restrict__ vectors,
const float* scaling_factors, int n_batch, float* __restrict__ result);
// Dot product of two vectors.
float PortableVectorVectorDotProduct(const float* vector1, const float* vector2,
int v_size);
void PortableBatchVectorBatchVectorDotProduct(const int16_t* vector1,
const int16_t* vector2,
int v_size, int n_batch,
int32_t* result);
void PortableVectorBatchVectorCwiseProductAccumulate(
const int16_t* vector, int v_size, const int16_t* batch_vector, int n_batch,
int32_t multiplier, int shift, int16_t* result);
void PortableMatrixBatchVectorMultiplyAccumulate(
const int8_t* input, const int32_t* bias,
const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
int32_t* scratch, int16_t* output, CpuBackendContext* context);
void PortableMatrixBatchVectorMultiplyAccumulate(
const int8_t* input, const int32_t* bias,
const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
int32_t* scratch, int8_t* output, CpuBackendContext* context);
void PortableMatrixBatchVectorMultiply(const int8_t* input,
int32_t input_zeropoint,
const int8_t* input_to_gate_weights,
int32_t input_to_gate_effective_scale_a,
int32_t input_to_gate_effective_scale_b,
int32_t n_batch, int32_t n_input,
int32_t n_cell, int8_t* gate_output,
int8_t gate_output_zp);
void PortableMatrixBatchVectorMultiply(
const int16_t* hidden, const int8_t* hidden_to_output_weights,
int32_t proj_effective_scale_a, int32_t proj_effective_scale_b,
const int32_t* gate_bias, int32_t n_batch, int32_t n_hidden,
int32_t n_output, int32_t output_zp, int8_t* proj_output);
void PortableMatrixScalarMultiplyAccumulate(const int8_t* matrix,
int32_t scalar, int32_t n_row,
int32_t n_col, int32_t* output);
void PortableApplyLayerNorm(const int16_t* input,
const int16_t* layer_norm_weights,
const int32_t* bias, int32_t layer_norm_scale_a,
int32_t layer_norm_scale_b, int32_t variance_limit,
int n_batch, int n_input, int16_t* output);
void PortableApplyLayerNormFloat(const int16_t* input,
const int16_t* layer_norm_weights,
int32_t layer_norm_scale_a,
int32_t layer_norm_scale_b,
const int32_t* bias, int n_batch, int n_input,
int16_t* output);
void PortableApplySigmoid(const int16_t* input, int32_t n_batch,
int32_t n_input, int16_t* output);
void PortableApplySigmoidFloat(const int16_t* input, int32_t n_batch,
int32_t n_input, int16_t* output);
void PortableApplyTanh(int32_t integer_bits, const int16_t* input,
int32_t n_batch, int32_t n_input, int16_t* output);
void PortableApplyTanhFloat(const int16_t* input, int32_t n_batch,
int32_t n_input, int32_t integer_bits,
int16_t* output);
void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2,
int n_batch, int n_input, int shift, int16_t* output);
void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2,
int32_t multiplier, int32_t shift, int32_t n_batch,
int32_t n_input, int32_t output_zp, int8_t* output);
void PortableCwiseAdd(const int16_t* input_1, const int16_t* input_2,
int n_batch, int n_input, int16_t* output);
template <typename T>
void PortableCwiseClipping(T* vector, const int v_size,
const T& clipping_value) {
for (int i = 0; i < v_size; i++) {
vector[i] = std::max(std::min(clipping_value, vector[i]),
static_cast<T>(-clipping_value));
}
}
// Batch vector initialization with another vector.
void PortableVectorBatchVectorAssign(const float* vector, int v_size,
int n_batch, float* batch_vector);
// Compute "1.0f - elements of vector" (used in CIFG).
void PortableSub1Vector(const float* vector, int v_size, float* result);
void PortableSub1Vector(const int16_t* vector, int v_size, int16_t* result);
// Multiply all elements of vector with a scalar.
void PortableVectorScalarMultiply(const int8_t* vector, int v_size, float scale,
float* result);
// Reduce-sum on a vector:
// input_vector: pointer to input vector.
// output_vector: pointer to vector.
// output_size: output vector size.
// reduction_size: number of consecutive elements from input vector which are
// added to get one element of output.
template <typename INPUT, typename OUTPUT>
void PortableReductionSumVector(const INPUT* input_vector,
OUTPUT* output_vector, int output_size,
int reduction_size) {
for (int o = 0; o < output_size; o++) {
OUTPUT result = 0;
for (int r = 0; r < reduction_size; r++) {
result += input_vector[r];
}
output_vector[o] = result;
input_vector += reduction_size;
}
}
// Layer norm for each batch.
void PortableMeanStddevNormalization(const float* __restrict__ input_vector,
float* __restrict__ output_vector,
int v_size, int n_batch);
// Saturate Add.
void PortableTwoGateSaturatingAdd(const int8_t* input, int8_t input_zp,
const int8_t* recurrent, int8_t recurrent_zp,
int32_t input_effective_scale_a,
int32_t input_effective_scale_b,
int32_t recurrent_effective_scale_a,
int32_t recurrent_effective_scale_b,
int32_t n_batch, int32_t n_cell,
int16_t* output);
} // namespace tensor_utils
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_IMPL_H_

View File

@@ -23,6 +23,25 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/types.h"
// Check if the reduction at index is the first one along the dimensions given
// in axis.
inline bool IsFirstReduction(const int* index, const int num_axis,
const int* axis) {
if (num_axis == 0) {
return true;
}
TFLITE_DCHECK(index != nullptr);
TFLITE_DCHECK(axis != nullptr);
for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx) {
if (index[axis[axis_idx]] != 0) {
return false;
}
}
return true;
}
namespace tflite {
namespace reference_ops {
@@ -35,8 +54,7 @@ inline bool Reduce(const In* input_data, const int* input_dims,
const int* output_dims, const int input_num_dims,
const int output_num_dims, const int* axis,
const int num_axis, int* input_iter,
Out reducer(const Out current, const In in),
Out* output_data) {
Out reducer(Out current, const In in), Out* output_data) {
// Reset input iterator.
for (int idx = 0; idx < input_num_dims; ++idx) {
input_iter[idx] = 0;
@@ -53,6 +71,37 @@ inline bool Reduce(const In* input_data, const int* input_dims,
return true;
}
// Similar to above Reduce function but takes two reducer functions.
// The 'reducer_first' is called with the first value of the reduction,
// 'reducer_next' is then called for all the others.
template <typename In, typename Out>
inline bool Reduce(const In* input_data, const int* input_dims,
const int* output_dims, const int input_num_dims,
const int output_num_dims, const int* axis,
const int num_axis, int* input_iter,
const std::function<Out(In in)>& reducer_first,
const std::function<Out(Out current, In in)>& reducer_next,
Out* output_data) {
// Reset input iterator.
for (int idx = 0; idx < input_num_dims; ++idx) {
input_iter[idx] = 0;
}
// Iterate through input_data.
do {
size_t input_offset =
ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr);
size_t output_offset = ReducedOutputOffset(input_num_dims, input_dims,
input_iter, num_axis, axis);
if (IsFirstReduction(input_iter, num_axis, axis)) {
output_data[output_offset] = reducer_first(input_data[input_offset]);
} else {
output_data[output_offset] =
reducer_next(output_data[output_offset], input_data[input_offset]);
}
} while (NextIndex(input_num_dims, input_dims, input_iter));
return true;
}
// This method parses the input 'axis' to remove duplicates and handle negative
// values, and returns a valid 'out_axis'
inline bool ResolveAxis(const int num_dims, const int* axis,
@@ -111,7 +160,8 @@ inline bool InitTensorDataForReduce(const int* dims, const int num_dims,
for (int idx = 0; idx < num_dims; ++idx) {
size_t current = static_cast<size_t>(dims[idx]);
// Overflow prevention.
if (num_elements > std::numeric_limits<size_t>::max() / current) {
if (current > 0 &&
num_elements > std::numeric_limits<size_t>::max() / current) {
return false;
}
num_elements *= current;
@@ -132,17 +182,20 @@ inline bool ReduceGeneric(const T* input_data, const int* input_dims,
bool keep_dims, int* temp_index, int* resolved_axis,
T init_value,
T reducer(const T current, const T in)) {
// Return early when input shape has zero dim.
for (int i = 0; i < input_num_dims; ++i) {
if (input_dims[i] == 0) return true;
}
// Reset output data.
if (!InitTensorDataForReduce(output_dims, output_num_dims, init_value,
output_data)) {
return false;
}
// Return early when input shape has zero dim. This is done after initializing
// data for output tensor because there are cases that the input tensor is
// empty but output tensor is not. In that case, output tensor should be
// filled with init_value.
for (int i = 0; i < input_num_dims; ++i) {
if (input_dims[i] == 0) return true;
}
// Resolve axis.
int num_resolved_axis = 0;
if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis,
@@ -290,9 +343,9 @@ inline void Mean(const tflite::MeanParams& op_params,
constexpr int32_t kMinValue = std::numeric_limits<uint8_t>::min();
constexpr int32_t kMaxValue = std::numeric_limits<uint8_t>::max();
int32_t bias =
output_zero_point -
static_cast<int32_t>(input_zero_point * input_scale / output_scale);
float temp = input_zero_point * input_scale / output_scale;
temp = temp > 0 ? temp + 0.5f : temp - 0.5f;
int32_t bias = output_zero_point - static_cast<int32_t>(temp);
double real_scale =
static_cast<double>(input_scale / (num_elements_in_axis * output_scale));
@@ -353,6 +406,14 @@ inline bool QuantizedMeanOrSum(const T* input_data, int32_t input_zero_point,
temp_sum[idx] = U();
}
// Return early when input shape has zero dim. This is done after initializing
// data for output tensor because there are cases that the input tensor is
// empty but output tensor is not. In that case, output tensor should be
// filled with init_value.
for (int i = 0; i < input_num_dims; ++i) {
if (input_dims[i] == 0) return true;
}
// Resolve axis.
int num_resolved_axis = 0;
if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis,
@@ -405,6 +466,57 @@ inline bool QuantizedMeanOrSum(const T* input_data, int32_t input_zero_point,
return true;
}
template <typename T>
inline bool QuantizedReduceProd(const T* input_data, int32_t input_zero_point,
const RuntimeShape& input_shape, T* output_data,
int32_t output_zero_point,
const RuntimeShape& output_shape,
const int* axis,
const int64_t num_axis_dimensions,
bool keep_dims, int* temp_index,
int* resolved_axis, int32_t* temp_prod,
int32_t scaling_multiplier, int scaling_shift) {
const int32_t kMinValue = std::numeric_limits<T>::min();
const int32_t kMaxValue = std::numeric_limits<T>::max();
// Resolve axis.
int num_resolved_axis = 0;
if (!ResolveAxis(input_shape.DimensionsCount(), axis, num_axis_dimensions,
resolved_axis, &num_resolved_axis)) {
return false;
}
// Calculate the reduced product by rescaling each multiplication step to
// avoid an overflow.
auto reducer_first = [&](T in) -> int32_t { return in - input_zero_point; };
auto reducer_next = [&](int32_t current, T in) -> int32_t {
const int64_t result =
static_cast<int64_t>(current) * (in - input_zero_point);
return MultiplyByQuantizedMultiplier(result, scaling_multiplier,
scaling_shift);
};
if (!Reduce<T, int32_t>(
input_data, input_shape.DimsData(), output_shape.DimsData(),
input_shape.DimensionsCount(), output_shape.DimensionsCount(),
resolved_axis, num_resolved_axis, temp_index, reducer_first,
reducer_next, temp_prod)) {
return false;
}
for (int i = 0; i < output_shape.FlatSize(); i++) {
int32_t result =
MultiplyByQuantizedMultiplier(static_cast<int64_t>(temp_prod[i]),
scaling_multiplier, scaling_shift) +
output_zero_point;
result = std::min(std::max(result, kMinValue), kMaxValue);
output_data[i] = static_cast<T>(result);
}
return true;
}
} // namespace reference_ops
} // namespace tflite

View File

@@ -0,0 +1,228 @@
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_RESIZE_BILINEAR_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_RESIZE_BILINEAR_H_
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
inline void ComputeInterpolationValues(const float value, const float scale,
const bool half_pixel_centers,
int32_t input_size, float* scaled_value,
int32_t* lower_bound,
int32_t* upper_bound) {
if (half_pixel_centers) {
*scaled_value = (value + 0.5f) * scale - 0.5f;
} else {
*scaled_value = value * scale;
}
float scaled_value_floor = std::floor(*scaled_value);
*lower_bound = std::max(static_cast<int32_t>(scaled_value_floor),
static_cast<int32_t>(0));
*upper_bound =
std::min(static_cast<int32_t>(std::ceil(*scaled_value)), input_size - 1);
}
template <typename T>
inline void ResizeBilinear(const tflite::ResizeBilinearParams& op_params,
const RuntimeShape& unextended_input_shape,
const T* input_data,
const RuntimeShape& unextended_output_size_shape,
const int32_t* output_size_data,
const RuntimeShape& unextended_output_shape,
T* output_data) {
// If half_pixel_centers is True, align_corners must be False.
TFLITE_DCHECK(!op_params.half_pixel_centers || !op_params.align_corners);
TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_output_size_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
const RuntimeShape input_shape =
RuntimeShape::ExtendedShape(4, unextended_input_shape);
const RuntimeShape output_size_shape =
RuntimeShape::ExtendedShape(4, unextended_output_size_shape);
const RuntimeShape output_shape =
RuntimeShape::ExtendedShape(4, unextended_output_shape);
int32_t batches = MatchingDim(input_shape, 0, output_shape, 0);
int32_t input_height = input_shape.Dims(1);
int32_t input_width = input_shape.Dims(2);
int32_t depth = MatchingDim(input_shape, 3, output_shape, 3);
TFLITE_DCHECK_EQ(output_size_shape.Dims(0), 1);
TFLITE_DCHECK_EQ(output_size_shape.Dims(1), 1);
TFLITE_DCHECK_EQ(output_size_shape.Dims(2), 1);
TFLITE_DCHECK_EQ(output_size_shape.Dims(3), 2);
int32_t output_height =
output_size_data[Offset(output_size_shape, 0, 0, 0, 0)];
int32_t output_width =
output_size_data[Offset(output_size_shape, 0, 0, 0, 1)];
float height_scale = static_cast<float>(input_height) / output_height;
float width_scale = static_cast<float>(input_width) / output_width;
if (op_params.align_corners && output_height > 1) {
height_scale = static_cast<float>(input_height - 1) / (output_height - 1);
}
if (op_params.align_corners && output_width > 1) {
width_scale = static_cast<float>(input_width - 1) / (output_width - 1);
}
const float rounding_offset = std::numeric_limits<T>::is_integer ? .5f : .0f;
for (int b = 0; b < batches; ++b) {
for (int y = 0; y < output_height; ++y) {
float input_y;
int32_t y0, y1;
ComputeInterpolationValues(y, height_scale, op_params.half_pixel_centers,
input_height, &input_y, &y0, &y1);
for (int x = 0; x < output_width; ++x) {
float input_x;
int32_t x0, x1;
ComputeInterpolationValues(x, width_scale, op_params.half_pixel_centers,
input_width, &input_x, &x0, &x1);
for (int c = 0; c < depth; ++c) {
T interpolation =
static_cast<T>(input_data[Offset(input_shape, b, y0, x0, c)] *
(1 - (input_y - y0)) * (1 - (input_x - x0)) +
input_data[Offset(input_shape, b, y1, x0, c)] *
(input_y - y0) * (1 - (input_x - x0)) +
input_data[Offset(input_shape, b, y0, x1, c)] *
(1 - (input_y - y0)) * (input_x - x0) +
input_data[Offset(input_shape, b, y1, x1, c)] *
(input_y - y0) * (input_x - x0) +
rounding_offset);
output_data[Offset(output_shape, b, y, x, c)] = interpolation;
}
}
}
}
}
inline void ComputeInterpolationValuesInteger(
const int32_t value, const int32_t scale_10, const bool half_pixel_centers,
int32_t input_size, int32_t* scaled_value, int32_t* lower_bound,
int32_t* upper_bound) {
if (half_pixel_centers) {
*scaled_value = value * scale_10 + scale_10 / 2 - (1 << 9);
} else {
*scaled_value = value * scale_10;
}
constexpr int32_t zero = 0;
*lower_bound = std::max(*scaled_value / (1 << 10), zero);
*upper_bound =
std::min((*scaled_value + (1 << 10) - 1) / (1 << 10), input_size - 1);
}
// Same as above but doesn't use any floating-point for the resize
template <typename T>
inline void ResizeBilinearInteger(
const tflite::ResizeBilinearParams& op_params,
const RuntimeShape& unextended_input_shape, const T* input_data,
const RuntimeShape& unextended_output_size_shape,
const int32_t* output_size_data,
const RuntimeShape& unextended_output_shape, T* output_data) {
// If half_pixel_centers is True, align_corners must be False.
TFLITE_DCHECK(!op_params.half_pixel_centers || !op_params.align_corners);
TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_output_size_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
const RuntimeShape input_shape =
RuntimeShape::ExtendedShape(4, unextended_input_shape);
const RuntimeShape output_size_shape =
RuntimeShape::ExtendedShape(4, unextended_output_size_shape);
const RuntimeShape output_shape =
RuntimeShape::ExtendedShape(4, unextended_output_shape);
const int32_t batches = MatchingDim(input_shape, 0, output_shape, 0);
const int32_t input_height = input_shape.Dims(1);
const int32_t input_width = input_shape.Dims(2);
const int32_t depth = MatchingDim(input_shape, 3, output_shape, 3);
TFLITE_DCHECK_EQ(output_size_shape.Dims(0), 1);
TFLITE_DCHECK_EQ(output_size_shape.Dims(1), 1);
TFLITE_DCHECK_EQ(output_size_shape.Dims(2), 1);
TFLITE_DCHECK_EQ(output_size_shape.Dims(3), 2);
const int32_t output_height =
output_size_data[Offset(output_size_shape, 0, 0, 0, 0)];
const int32_t output_width =
output_size_data[Offset(output_size_shape, 0, 0, 0, 1)];
int32_t height_scale_10 =
((1 << 10) * input_height + output_height / 2) / output_height;
int32_t width_scale_10 =
((1 << 10) * input_width + output_width / 2) / output_width;
if (op_params.align_corners && output_height > 1) {
height_scale_10 =
((1 << 10) * (input_height - 1) + (output_height - 1) / 2) /
(output_height - 1);
}
if (op_params.align_corners && output_width > 1) {
width_scale_10 = ((1 << 10) * (input_width - 1) + (output_width - 1) / 2) /
(output_width - 1);
}
for (int b = 0; b < batches; ++b) {
for (int y = 0; y < output_height; ++y) {
int32_t input_y, y0, y1;
ComputeInterpolationValuesInteger(y, height_scale_10,
op_params.half_pixel_centers,
input_height, &input_y, &y0, &y1);
for (int x = 0; x < output_width; ++x) {
int32_t input_x, x0, x1;
ComputeInterpolationValuesInteger(x, width_scale_10,
op_params.half_pixel_centers,
input_width, &input_x, &x0, &x1);
for (int c = 0; c < depth; ++c) {
const int64_t output_20_ll =
static_cast<int64_t>(
input_data[Offset(input_shape, b, y0, x0, c)]) *
((1 << 10) - (input_y - (1 << 10) * y0)) *
((1 << 10) - (input_x - (1 << 10) * x0));
const int64_t output_20_lu =
static_cast<int64_t>(
input_data[Offset(input_shape, b, y1, x0, c)]) *
(input_y - (1 << 10) * y0) *
((1 << 10) - (input_x - (1 << 10) * x0));
const int64_t output_20_rl =
static_cast<int64_t>(
input_data[Offset(input_shape, b, y0, x1, c)]) *
((1 << 10) - (input_y - (1 << 10) * y0)) *
(input_x - (1 << 10) * x0);
const int64_t output_20_ru =
static_cast<int64_t>(
input_data[Offset(input_shape, b, y1, x1, c)]) *
(input_y - (1 << 10) * y0) * (input_x - (1 << 10) * x0);
const int64_t output_20 =
output_20_ll + output_20_lu + output_20_rl + output_20_ru;
const int64_t round = (output_20 > 0) ? (1 << 19) : -(1 << 19);
const T interpolation =
static_cast<T>((output_20 + round) / (1 << 20));
output_data[Offset(output_shape, b, y, x, c)] = interpolation;
}
}
}
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_RESIZE_BILINEAR_H_

View File

@@ -159,7 +159,7 @@ inline int16_t SoftMaxCalculateExp(const SoftmaxParams& params,
std::min(std::max(sym_scaled_diff, static_cast<int32_t>(-32768)),
static_cast<int32_t>(32767));
// apply the exp() LUT activation function
return generic_int16_table_lookup(sat_sym_scaled_diff, params.exp_lut);
return lut_lookup(sat_sym_scaled_diff, params.exp_lut);
}
// Quantized softmax with int16_t input and int16_t output.
inline void SoftmaxInt16(const SoftmaxParams& params,
@@ -207,8 +207,8 @@ inline void SoftmaxInt16(const SoftmaxParams& params,
std::min(std::max(sym_shifted_sum, static_cast<int32_t>(-32768)),
static_cast<int32_t>(32767)));
// apply 1/(1 + x) LUT activation function
int16_t reciprocal_scale_Q015 = generic_int16_table_lookup(
sat_sym_shifted_sum, params.one_over_one_plus_x_lut);
int16_t reciprocal_scale_Q015 =
lut_lookup(sat_sym_shifted_sum, params.one_over_one_plus_x_lut);
// Rescale the exp_result with reciprocal
// range of output is [0, 32767] correspond to [0.0, 1.0]

View File

@@ -0,0 +1,80 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SPACE_TO_DEPTH_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SPACE_TO_DEPTH_H_
#include <cstdint>
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
template <typename T>
inline void SpaceToDepth(const tflite::SpaceToDepthParams& op_params,
const RuntimeShape& unextended_input_shape,
const T* input_data,
const RuntimeShape& unextended_output_shape,
T* output_data) {
TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
const RuntimeShape input_shape =
RuntimeShape::ExtendedShape(4, unextended_input_shape);
const RuntimeShape output_shape =
RuntimeShape::ExtendedShape(4, unextended_output_shape);
const int input_depth = input_shape.Dims(3);
const int input_width = input_shape.Dims(2);
const int input_height = input_shape.Dims(1);
const int input_batch = input_shape.Dims(0);
const int output_depth = output_shape.Dims(3);
const int output_width = output_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_batch = output_shape.Dims(0);
const int32_t block_size = op_params.block_size;
TFLITE_DCHECK_EQ(input_width, output_width * block_size);
TFLITE_DCHECK_EQ(input_height, output_height * block_size);
TFLITE_DCHECK_EQ(input_depth * block_size * block_size, output_depth);
TFLITE_DCHECK_EQ(input_batch, output_batch);
for (int in_b = 0; in_b < input_batch; ++in_b) {
for (int in_h = 0; in_h < input_height; ++in_h) {
for (int in_w = 0; in_w < input_width; ++in_w) {
for (int in_d = 0; in_d < input_depth; ++in_d) {
const int out_d =
in_d + ((in_h % block_size) * block_size + in_w % block_size) *
input_depth;
const int out_w = in_w / block_size;
const int out_h = in_h / block_size;
const int out_b = in_b;
const int input_index = Offset(input_shape, in_b, in_h, in_w, in_d);
const int output_index =
Offset(output_shape, out_b, out_h, out_w, out_d);
output_data[output_index] = input_data[input_index];
}
}
}
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SPACE_TO_DEPTH_H_

View File

@@ -0,0 +1,111 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TRANSPOSE_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TRANSPOSE_H_
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
template <typename T, int N>
void TransposeImpl(const TransposeParams& params,
const RuntimeShape& unextended_input_shape,
const T* input_data,
const RuntimeShape& unextended_output_shape,
T* output_data) {
const int unextended_input_size = unextended_input_shape.DimensionsCount();
const int unextended_output_size = unextended_output_shape.DimensionsCount();
TFLITE_DCHECK_LE(unextended_input_size, N);
TFLITE_DCHECK_LE(unextended_output_size, N);
TFLITE_DCHECK_EQ(unextended_output_size, params.perm_count);
const int input_ext_size = N - unextended_input_size;
const int output_ext_size = N - unextended_output_size;
NdArrayDesc<N> input_desc;
NdArrayDesc<N> output_desc;
CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_input_shape),
&input_desc);
CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape),
&output_desc);
// The perm data is extended to match the output, each index incremented by
// the amount of front padding of the input shape.
int extended_perm[N];
for (int i = 0; i < N; ++i) {
extended_perm[i] = i < output_ext_size
? i
: params.perm[i - output_ext_size] + input_ext_size;
}
// Permutes the input shape so we don't need to permute the indexes inside
// the loop. Check to make sure output_dims is matching input_dims.
NdArrayDesc<N> perm_input_desc;
for (int k = 0; k < N; ++k) {
TFLITE_DCHECK_EQ(input_desc.extents[extended_perm[k]],
output_desc.extents[k]);
perm_input_desc.extents[k] = input_desc.extents[extended_perm[k]];
perm_input_desc.strides[k] = input_desc.strides[extended_perm[k]];
}
// Naive transpose loop (iterate on output index and compute input index).
auto tranpose_func = [&](int indexes[N]) {
output_data[SubscriptToIndex(output_desc, indexes)] =
input_data[SubscriptToIndex(perm_input_desc, indexes)];
};
NDOpsHelper<N>(output_desc, tranpose_func);
}
template <typename T, int N = 5>
void Transpose(const TransposeParams& params,
const RuntimeShape& unextended_input_shape, const T* input_data,
const RuntimeShape& unextended_output_shape, T* output_data) {
// Transpose kernel only does rearranging values not numeric evaluations on
// each cell. It's safe to implement per size of scalar type and this trick
// keeps the total code size in a reasonable range.
switch (sizeof(T)) {
case 1:
TransposeImpl<int8_t, N>(params, unextended_input_shape,
reinterpret_cast<const int8_t*>(input_data),
unextended_output_shape,
reinterpret_cast<int8_t*>(output_data));
break;
case 2:
TransposeImpl<int16_t, N>(params, unextended_input_shape,
reinterpret_cast<const int16_t*>(input_data),
unextended_output_shape,
reinterpret_cast<int16_t*>(output_data));
break;
case 4:
TransposeImpl<int32_t, N>(params, unextended_input_shape,
reinterpret_cast<const int32_t*>(input_data),
unextended_output_shape,
reinterpret_cast<int32_t*>(output_data));
break;
case 8:
TransposeImpl<int64_t, N>(params, unextended_input_shape,
reinterpret_cast<const int64_t*>(input_data),
unextended_output_shape,
reinterpret_cast<int64_t*>(output_data));
break;
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TRANSPOSE_H_

View File

@@ -400,13 +400,22 @@ inline size_t ReducedOutputOffset(const int num_dims, const int* dims,
return offset;
}
// Since tensors with '0' in their shape are valid in TF, these offset functions
// allow that as long as the corresponding index is also 0. It is upto the
// calling ops to ensure that they perform verification checks on tensor shapes
// if they don't support a particular behavior.
inline int Offset(const RuntimeShape& shape, int i0, int i1, int i2, int i3) {
TFLITE_DCHECK_EQ(shape.DimensionsCount(), 4);
const int* dims_data = reinterpret_cast<const int*>(shape.DimsDataUpTo5D());
TFLITE_DCHECK(i0 >= 0 && i0 < dims_data[0]);
TFLITE_DCHECK(i1 >= 0 && i1 < dims_data[1]);
TFLITE_DCHECK(i2 >= 0 && i2 < dims_data[2]);
TFLITE_DCHECK(i3 >= 0 && i3 < dims_data[3]);
TFLITE_DCHECK((dims_data[0] == 0 && i0 == 0) ||
(i0 >= 0 && i0 < dims_data[0]));
TFLITE_DCHECK((dims_data[1] == 0 && i1 == 0) ||
(i1 >= 0 && i1 < dims_data[1]));
TFLITE_DCHECK((dims_data[2] == 0 && i2 == 0) ||
(i2 >= 0 && i2 < dims_data[2]));
TFLITE_DCHECK((dims_data[3] == 0 && i3 == 0) ||
(i3 >= 0 && i3 < dims_data[3]));
return ((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3;
}
@@ -414,21 +423,34 @@ inline int Offset(const RuntimeShape& shape, int i0, int i1, int i2, int i3,
int i4) {
TFLITE_DCHECK_EQ(shape.DimensionsCount(), 5);
const int* dims_data = reinterpret_cast<const int*>(shape.DimsDataUpTo5D());
TFLITE_DCHECK(i0 >= 0 && i0 < dims_data[0]);
TFLITE_DCHECK(i1 >= 0 && i1 < dims_data[1]);
TFLITE_DCHECK(i2 >= 0 && i2 < dims_data[2]);
TFLITE_DCHECK(i3 >= 0 && i3 < dims_data[3]);
TFLITE_DCHECK(i4 >= 0 && i4 < dims_data[4]);
TFLITE_DCHECK((dims_data[0] == 0 && i0 == 0) ||
(i0 >= 0 && i0 < dims_data[0]));
TFLITE_DCHECK((dims_data[1] == 0 && i1 == 0) ||
(i1 >= 0 && i1 < dims_data[1]));
TFLITE_DCHECK((dims_data[2] == 0 && i2 == 0) ||
(i2 >= 0 && i2 < dims_data[2]));
TFLITE_DCHECK((dims_data[3] == 0 && i3 == 0) ||
(i3 >= 0 && i3 < dims_data[3]));
TFLITE_DCHECK((dims_data[4] == 0 && i4 == 0) ||
(i4 >= 0 && i4 < dims_data[4]));
return (((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3) *
dims_data[4] +
i4;
}
inline int Offset(const RuntimeShape& shape, int* index) {
return Offset(shape, index[0], index[1], index[2], index[3]);
}
inline int Offset(const Dims<4>& dims, int i0, int i1, int i2, int i3) {
TFLITE_DCHECK(i0 >= 0 && i0 < dims.sizes[0]);
TFLITE_DCHECK(i1 >= 0 && i1 < dims.sizes[1]);
TFLITE_DCHECK(i2 >= 0 && i2 < dims.sizes[2]);
TFLITE_DCHECK(i3 >= 0 && i3 < dims.sizes[3]);
TFLITE_DCHECK((i0 == 0 && dims.sizes[0] == 0) ||
(i0 >= 0 && i0 < dims.sizes[0]));
TFLITE_DCHECK((i1 == 0 && dims.sizes[1] == 0) ||
(i1 >= 0 && i1 < dims.sizes[1]));
TFLITE_DCHECK((i2 == 0 && dims.sizes[2] == 0) ||
(i2 >= 0 && i2 < dims.sizes[2]));
TFLITE_DCHECK((i3 == 0 && dims.sizes[3] == 0) ||
(i3 >= 0 && i3 < dims.sizes[3]));
return i0 * dims.strides[0] + i1 * dims.strides[1] + i2 * dims.strides[2] +
i3 * dims.strides[3];
}
@@ -437,10 +459,6 @@ inline int Offset(const Dims<4>& dims, int* index) {
return Offset(dims, index[0], index[1], index[2], index[3]);
}
inline int Offset(const RuntimeShape& shape, int* index) {
return Offset(shape, index[0], index[1], index[2], index[3]);
}
// Get array size, DCHECKing that the dim index is in range.
//
// Note that this will be phased out with Dims<4>, since RuntimeShape::Dims()
@@ -602,6 +620,58 @@ inline int MatchingFlatSize(const Dims<N>& dims, const Dims<N>& check_dims_0,
return MatchingFlatSize(dims, check_dims_1, check_dims_2, check_dims_3);
}
// Flat size calculation, checking if their extended shapes match.
inline int MatchingExtendedShapeFlatSize(const RuntimeShape& shape,
const RuntimeShape& check_shape_0) {
const int shape_dims = shape.DimensionsCount();
const int check_shape_0_dims = check_shape_0.DimensionsCount();
const int min_dims = std::min(shape_dims, check_shape_0_dims);
for (int i = 0; i < min_dims; ++i) {
TFLITE_DCHECK_EQ(shape.Dims(shape_dims - 1 - i),
check_shape_0.Dims(check_shape_0_dims - 1 - i));
}
for (int i = min_dims; i < shape_dims; ++i) {
TFLITE_DCHECK_EQ(shape.Dims(shape_dims - 1 - i), 1);
}
for (int i = min_dims; i < check_shape_0_dims; ++i) {
TFLITE_DCHECK_EQ(check_shape_0.Dims(check_shape_0_dims - 1 - i), 1);
}
return shape.FlatSize();
}
inline int MatchingExtendedShapeFlatSize(const RuntimeShape& shape,
const RuntimeShape& check_shape_0,
const RuntimeShape& check_shape_1) {
const int flat_size = MatchingExtendedShapeFlatSize(shape, check_shape_0);
TFLITE_DCHECK_EQ(MatchingExtendedShapeFlatSize(shape, check_shape_1),
flat_size);
return flat_size;
}
inline int MatchingExtendedShapeFlatSize(const RuntimeShape& shape,
const RuntimeShape& check_shape_0,
const RuntimeShape& check_shape_1,
const RuntimeShape& check_shape_2) {
const int flat_size = MatchingExtendedShapeFlatSize(shape, check_shape_0);
TFLITE_DCHECK_EQ(
MatchingExtendedShapeFlatSize(shape, check_shape_1, check_shape_2),
flat_size);
return flat_size;
}
inline int MatchingExtendedShapeFlatSize(const RuntimeShape& shape,
const RuntimeShape& check_shape_0,
const RuntimeShape& check_shape_1,
const RuntimeShape& check_shape_2,
const RuntimeShape& check_shape_3) {
const int flat_size = MatchingExtendedShapeFlatSize(shape, check_shape_0);
TFLITE_DCHECK_EQ(MatchingExtendedShapeFlatSize(shape, check_shape_1,
check_shape_2, check_shape_3),
flat_size);
return flat_size;
}
// Data is required to be contiguous, and so many operators can use either the
// full array flat size or the flat size with one dimension skipped (commonly
// the depth).
@@ -885,6 +955,8 @@ struct Conv3DParams {
float float_activation_max;
};
typedef Conv3DParams Conv3DTransposeParams;
struct DepthToSpaceParams {
int32_t block_size;
};
@@ -1019,9 +1091,9 @@ struct PackParams {
struct PadParams {
int8_t left_padding_count;
int32_t left_padding[4];
int32_t left_padding[5];
int8_t right_padding_count;
int32_t right_padding[4];
int32_t right_padding[5];
ResizingCategory resizing_category;
};
@@ -1196,6 +1268,23 @@ inline void GetActivationParams(const P& params, int64_t* min, int64_t* max) {
*min = params.int64_activation_min;
*max = params.int64_activation_max;
}
// Type trait to check of given type has size smaller than 4 bytes.
template <typename T>
struct is_small_integer
: public std::integral_constant<bool,
std::is_same<T, int8_t>::value ||
std::is_same<T, uint8_t>::value ||
std::is_same<T, int16_t>::value ||
std::is_same<T, uint16_t>::value> {};
// Type trait to check of given type is int32 or int64.
template <typename T>
struct is_int32_or_int64
: public std::integral_constant<bool, std::is_same<T, int32_t>::value ||
std::is_same<T, int64_t>::value> {
};
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_TYPES_H_

View File

@@ -119,6 +119,7 @@ TfLiteStatus GetInputSafe(const TfLiteContext* context, const TfLiteNode* node,
TfLiteTensor* GetVariableInput(TfLiteContext* context, const TfLiteNode* node,
int index) {
TfLiteTensor* tensor = GetMutableInput(context, node, index);
if (tensor == nullptr) return nullptr;
return tensor->is_variable ? tensor : nullptr;
}
@@ -197,7 +198,7 @@ TfLiteStatus PopulateConvolutionQuantizationParams(
const TfLiteTensor* filter, const TfLiteTensor* bias, TfLiteTensor* output,
const TfLiteFusedActivation& activation, int32_t* multiplier, int* shift,
int32_t* output_activation_min, int32_t* output_activation_max,
int32_t* per_channel_multiplier, int* per_channel_shift) {
int32_t* per_channel_multiplier, int32_t* per_channel_shift) {
const auto* affine_quantization =
reinterpret_cast<TfLiteAffineQuantization*>(filter->quantization.params);
return PopulateConvolutionQuantizationParams(
@@ -212,7 +213,8 @@ TfLiteStatus PopulateConvolutionQuantizationParams(
const TfLiteTensor* filter, const TfLiteTensor* bias, TfLiteTensor* output,
const TfLiteFusedActivation& activation, int32_t* multiplier, int* shift,
int32_t* output_activation_min, int32_t* output_activation_max,
int32_t* per_channel_multiplier, int* per_channel_shift, int num_channels) {
int32_t* per_channel_multiplier, int32_t* per_channel_shift,
int num_channels) {
TF_LITE_ENSURE_EQ(context, input->quantization.type,
kTfLiteAffineQuantization);
TF_LITE_ENSURE_EQ(context, filter->quantization.type,
@@ -333,30 +335,49 @@ TfLiteStatus GetQuantizedConvolutionMultipler(TfLiteContext* context,
}
namespace {
void CalculateActivationRangeQuantizedImpl(TfLiteFusedActivation activation,
int32_t qmin, int32_t qmax,
TfLiteTensor* output,
int32_t* act_min, int32_t* act_max) {
inline TfLiteStatus Quantize(TfLiteContext* context, float scale,
int32_t zero_point, float f, int32_t& q) {
const float tmp = TfLiteRound(f / scale);
const bool no_integer_overflow_from_quantization =
(tmp >= static_cast<float>(std::numeric_limits<int32_t>::min()) &&
tmp <= static_cast<float>(std::numeric_limits<int32_t>::max()));
TF_LITE_ENSURE(context, no_integer_overflow_from_quantization);
q = zero_point + static_cast<int32_t>(tmp);
return kTfLiteOk;
}
TfLiteStatus CalculateActivationRangeQuantizedImpl(
TfLiteContext* context, TfLiteFusedActivation activation, int32_t qmin,
int32_t qmax, TfLiteTensor* output, int32_t* act_min, int32_t* act_max) {
const auto scale = output->params.scale;
const auto zero_point = output->params.zero_point;
auto quantize = [scale, zero_point](float f) {
return zero_point + static_cast<int32_t>(TfLiteRound(f / scale));
};
int32_t tmp_q;
if (activation == kTfLiteActRelu) {
*act_min = std::max(qmin, quantize(0.0));
TF_LITE_ENSURE_OK(context,
Quantize(context, scale, zero_point, 0.0, tmp_q));
*act_min = std::max(qmin, tmp_q);
*act_max = qmax;
} else if (activation == kTfLiteActRelu6) {
*act_min = std::max(qmin, quantize(0.0));
*act_max = std::min(qmax, quantize(6.0));
TF_LITE_ENSURE_OK(context,
Quantize(context, scale, zero_point, 0.0, tmp_q));
*act_min = std::max(qmin, tmp_q);
TF_LITE_ENSURE_OK(context,
Quantize(context, scale, zero_point, 6.0, tmp_q));
*act_max = std::min(qmax, tmp_q);
} else if (activation == kTfLiteActReluN1To1) {
*act_min = std::max(qmin, quantize(-1.0));
*act_max = std::min(qmax, quantize(1.0));
TF_LITE_ENSURE_OK(context,
Quantize(context, scale, zero_point, -1.0, tmp_q));
*act_min = std::max(qmin, tmp_q);
TF_LITE_ENSURE_OK(context,
Quantize(context, scale, zero_point, 1.0, tmp_q));
*act_max = std::min(qmax, tmp_q);
} else {
*act_min = qmin;
*act_max = qmax;
}
return kTfLiteOk;
}
} // namespace
@@ -380,9 +401,8 @@ TfLiteStatus CalculateActivationRangeQuantized(TfLiteContext* context,
TF_LITE_ENSURE(context, false);
}
CalculateActivationRangeQuantizedImpl(activation, qmin, qmax, output, act_min,
act_max);
return kTfLiteOk;
return CalculateActivationRangeQuantizedImpl(context, activation, qmin, qmax,
output, act_min, act_max);
}
bool HaveSameShapes(const TfLiteTensor* input1, const TfLiteTensor* input2) {
@@ -412,18 +432,15 @@ TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
const TfLiteTensor* input1,
const TfLiteTensor* input2,
TfLiteIntArray** output_shape) {
int dims1 = NumDimensions(input1);
int dims2 = NumDimensions(input2);
int out_dims = std::max(dims1, dims2);
if (NumElements(input1) == 0) {
*output_shape = TfLiteIntArrayCopy(input1->dims);
return kTfLiteOk;
}
const int dims1 = NumDimensions(input1);
const int dims2 = NumDimensions(input2);
const int out_dims = std::max(dims1, dims2);
std::unique_ptr<TfLiteIntArray, void (*)(TfLiteIntArray*)> shape(
TfLiteIntArrayCreate(out_dims), TfLiteIntArrayFree);
for (int i = 0; i < out_dims; ++i) {
int d1 = i >= dims1 ? 1 : SizeOfDimension(input1, dims1 - i - 1);
int d2 = i >= dims2 ? 1 : SizeOfDimension(input2, dims2 - i - 1);
const int d1 = i >= dims1 ? 1 : SizeOfDimension(input1, dims1 - i - 1);
const int d2 = i >= dims2 ? 1 : SizeOfDimension(input2, dims2 - i - 1);
if (!(d1 == d2 || d1 == 1 || d2 == 1)) {
context->ReportError(context,
"Given shapes, %s and %s, are not broadcastable.",
@@ -431,7 +448,12 @@ TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
GetShapeDebugString(input2->dims).c_str());
return kTfLiteError;
}
shape->data[out_dims - i - 1] = std::max(d1, d2);
if (d1 == 0 || d2 == 0) {
shape->data[out_dims - i - 1] = 0;
} else {
shape->data[out_dims - i - 1] = std::max(d1, d2);
}
}
*output_shape = shape.release();
return kTfLiteOk;
@@ -442,17 +464,20 @@ TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
const TfLiteTensor* input2,
const TfLiteTensor* input3,
TfLiteIntArray** output_shape) {
int dims1 = NumDimensions(input1);
int dims2 = NumDimensions(input2);
int dims3 = NumDimensions(input3);
int out_dims = std::max(std::max(dims1, dims2), dims3);
const int dims1 = NumDimensions(input1);
const int dims2 = NumDimensions(input2);
const int dims3 = NumDimensions(input3);
const int out_dims = std::max(std::max(dims1, dims2), dims3);
std::unique_ptr<TfLiteIntArray, void (*)(TfLiteIntArray*)> shape(
TfLiteIntArrayCreate(out_dims), TfLiteIntArrayFree);
for (int i = 0; i < out_dims; ++i) {
int d1 = i >= dims1 ? 1 : SizeOfDimension(input1, dims1 - i - 1);
int d2 = i >= dims2 ? 1 : SizeOfDimension(input2, dims2 - i - 1);
int d3 = i >= dims3 ? 1 : SizeOfDimension(input3, dims3 - i - 1);
const int d1 = i >= dims1 ? 1 : SizeOfDimension(input1, dims1 - i - 1);
const int d2 = i >= dims2 ? 1 : SizeOfDimension(input2, dims2 - i - 1);
const int d3 = i >= dims3 ? 1 : SizeOfDimension(input3, dims3 - i - 1);
const int min_value = std::min(std::min(d1, d2), d3);
int max_value = std::max(std::max(d1, d2), d3);
// If one dimention is 0, others must be 0 or 1.
if (min_value == 0) max_value = 0;
if (!(d1 == 1 || d1 == max_value) || !(d2 == 1 || d2 == max_value) ||
!(d3 == 1 || d3 == max_value)) {
context->ReportError(
@@ -473,42 +498,42 @@ TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
int TfLiteTypeGetSize(TfLiteType type) {
switch (type) {
case kTfLiteUInt8:
TF_LITE_ASSERT_EQ(sizeof(uint8_t), 1);
static_assert(sizeof(uint8_t) == 1, "");
return 1;
case kTfLiteInt8:
TF_LITE_ASSERT_EQ(sizeof(int8_t), 1);
static_assert(sizeof(int8_t) == 1, "");
return 1;
case kTfLiteBool:
return sizeof(bool);
case kTfLiteInt16:
TF_LITE_ASSERT_EQ(sizeof(int16_t), 2);
static_assert(sizeof(int16_t) == 2, "");
return 2;
case kTfLiteFloat16:
TF_LITE_ASSERT_EQ(sizeof(int16_t), 2);
static_assert(sizeof(int16_t) == 2, "");
return 2;
case kTfLiteFloat32:
TF_LITE_ASSERT_EQ(sizeof(float), 4);
static_assert(sizeof(float) == 4, "");
return 4;
case kTfLiteInt32:
TF_LITE_ASSERT_EQ(sizeof(int32_t), 4);
static_assert(sizeof(int32_t) == 4, "");
return 4;
case kTfLiteUInt32:
TF_LITE_ASSERT_EQ(sizeof(uint32_t), 4);
static_assert(sizeof(uint32_t) == 4, "");
return 4;
case kTfLiteInt64:
TF_LITE_ASSERT_EQ(sizeof(int64_t), 8);
static_assert(sizeof(int64_t) == 8, "");
return 8;
case kTfLiteUInt64:
TF_LITE_ASSERT_EQ(sizeof(uint64_t), 8);
static_assert(sizeof(uint64_t) == 8, "");
return 8;
case kTfLiteFloat64:
TF_LITE_ASSERT_EQ(sizeof(double), 8);
static_assert(sizeof(double) == 8, "");
return 8;
case kTfLiteComplex64:
TF_LITE_ASSERT_EQ(sizeof(std::complex<float>), 8);
static_assert(sizeof(std::complex<float>) == 8, "");
return 8;
case kTfLiteComplex128:
TF_LITE_ASSERT_EQ(sizeof(std::complex<double>), 16);
static_assert(sizeof(std::complex<double>) == 16, "");
return 16;
default:
return 0;

View File

@@ -214,14 +214,15 @@ TfLiteStatus PopulateConvolutionQuantizationParams(
const TfLiteTensor* filter, const TfLiteTensor* bias, TfLiteTensor* output,
const TfLiteFusedActivation& activation, int32_t* multiplier, int* shift,
int32_t* output_activation_min, int32_t* output_activation_max,
int32_t* per_channel_multiplier, int* per_channel_shift);
int32_t* per_channel_multiplier, int32_t* per_channel_shift);
TfLiteStatus PopulateConvolutionQuantizationParams(
TfLiteContext* context, const TfLiteTensor* input,
const TfLiteTensor* filter, const TfLiteTensor* bias, TfLiteTensor* output,
const TfLiteFusedActivation& activation, int32_t* multiplier, int* shift,
int32_t* output_activation_min, int32_t* output_activation_max,
int32_t* per_channel_multiplier, int* per_channel_shift, int num_channels);
int32_t* per_channel_multiplier, int32_t* per_channel_shift,
int num_channels);
// Calculates the multiplication factor for a quantized convolution (or
// quantized depthwise convolution) involving the given tensors. Returns an

View File

@@ -15,69 +15,24 @@ limitations under the License.
#ifndef TENSORFLOW_LITE_KERNELS_OP_MACROS_H_
#define TENSORFLOW_LITE_KERNELS_OP_MACROS_H_
// If we're on a platform without standard IO functions, fall back to a
// non-portable function.
#ifdef TF_LITE_MCU_DEBUG_LOG
#include "tensorflow/lite/micro/debug_log.h"
#define DEBUG_LOG(x) \
do { \
DebugLog(x); \
} while (0)
inline void InfiniteLoop() {
DEBUG_LOG("HALTED\n");
#if !defined(TF_LITE_MCU_DEBUG_LOG)
#include <cstdlib>
#define TFLITE_ABORT abort()
#else
inline void AbortImpl() {
DebugLog("HALTED\n");
while (1) {
}
}
#define TFLITE_ABORT AbortImpl();
#endif
#define TFLITE_ABORT InfiniteLoop();
#else // TF_LITE_MCU_DEBUG_LOG
#include <cstdio>
#include <cstdlib>
#define DEBUG_LOG(x) \
do { \
fprintf(stderr, "%s", (x)); \
} while (0)
// Report Error for unsupported type by op 'op_name' and returns kTfLiteError.
#define TF_LITE_UNSUPPORTED_TYPE(context, type, op_name) \
do { \
TF_LITE_KERNEL_LOG((context), "%s:%d Type %s is unsupported by op %s.", \
__FILE__, __LINE__, TfLiteTypeGetName(type), \
(op_name)); \
return kTfLiteError; \
} while (0)
#define TFLITE_ABORT abort()
#endif // TF_LITE_MCU_DEBUG_LOG
#if defined(NDEBUG) || defined(ARDUINO)
#if defined(NDEBUG)
#define TFLITE_ASSERT_FALSE (static_cast<void>(0))
#else
#define TFLITE_ASSERT_FALSE TFLITE_ABORT
#endif
#define TF_LITE_FATAL(msg) \
do { \
DEBUG_LOG(msg); \
DEBUG_LOG("\nFATAL\n"); \
TFLITE_ABORT; \
} while (0)
#define TF_LITE_ASSERT(x) \
do { \
if (!(x)) TF_LITE_FATAL(#x); \
} while (0)
#define TF_LITE_ASSERT_EQ(x, y) \
do { \
if ((x) != (y)) TF_LITE_FATAL(#x " didn't equal " #y); \
} while (0)
#endif // TENSORFLOW_LITE_KERNELS_OP_MACROS_H_

View File

@@ -20,7 +20,6 @@ limitations under the License.
namespace tflite {
// TODO(renjieliu): Migrate others to use ComputePaddingWithLeftover.
inline int ComputePadding(int stride, int dilation_rate, int in_size,
int filter_size, int out_size) {
int effective_filter_size = (filter_size - 1) * dilation_rate + 1;
@@ -45,6 +44,11 @@ inline int ComputePaddingWithOffset(int stride, int dilation_rate, int in_size,
inline int ComputeOutSize(TfLitePadding padding, int image_size,
int filter_size, int stride, int dilation_rate = 1) {
int effective_filter_size = (filter_size - 1) * dilation_rate + 1;
// TODO(b/186448822): This uses 0 since the function has no other way to
// report error case
if (stride == 0) return 0;
switch (padding) {
case kTfLitePaddingSame:
return (image_size + stride - 1) / stride;