Rolling 20210420

This commit is contained in:
jomjol
2021-04-20 19:44:16 +02:00
parent 520f818adc
commit ea2305de47
156 changed files with 11095 additions and 8601 deletions

View File

@@ -178,14 +178,54 @@ inline int32_t MultiplyByQuantizedMultiplier(int64_t x,
// - input x is in the range -(1<<47) <= x < (1<<47)
assert(quantized_multiplier >= 0);
assert(shift >= -31 && shift < 8);
assert(x >= -(static_cast<int64_t>(1) << 47) &&
x < (static_cast<int64_t>(1) << 47));
int32_t reduced_multiplier = (quantized_multiplier + (1 << 15)) >> 16;
int32_t reduced_multiplier = (quantized_multiplier < 0x7FFF0000)
? ((quantized_multiplier + (1 << 15)) >> 16)
: 0x7FFF;
int total_shift = 15 - shift;
x = (x * (int64_t)reduced_multiplier) + ((int64_t)1 << (total_shift - 1));
int32_t result = x >> total_shift;
return result;
}
#ifdef USE_NEON
// Applies the same quantized multiplier and shift to four int32x4 rows.
// Round uses ARM's rounding shift right.
//
// input_val:            four NEON vectors of four int32 lanes each.
// quantized_multiplier: multiplier broadcast to every lane.
// shift:                a positive value is applied as a left shift before the
//                       multiply; a negative value is handed to vrshlq_s32
//                       after the multiply.
inline int32x4x4_t MultiplyByQuantizedMultiplier4Rows(
    int32x4x4_t input_val, int32_t quantized_multiplier, int shift) {
  // Split the signed shift into its pre-multiply and post-multiply parts and
  // broadcast each scalar across a vector once, outside the per-row work.
  const int32x4_t pre_shift = vdupq_n_s32(std::max(shift, 0));
  const int32x4_t post_shift = vdupq_n_s32(std::min(shift, 0));
  const int32x4_t multiplier = vdupq_n_s32(quantized_multiplier);

  int32x4x4_t result;
  for (int row = 0; row < 4; ++row) {
    const int32x4_t shifted = vshlq_s32(input_val.val[row], pre_shift);
    const int32x4_t scaled = vqrdmulhq_s32(shifted, multiplier);
    result.val[row] = vrshlq_s32(scaled, post_shift);
  }
  return result;
}
#endif
template <typename T>
int CountLeadingZeros(T integer_input) {
static_assert(std::is_unsigned<T>::value,
@@ -261,10 +301,11 @@ inline void gen_lut(double (*func)(double), double min, double max,
TfLiteRound(func(min + i * step + half_step) * 32768.0);
double midpoint_err = midpoint_interp_val - midpoint_val;
double bias = TfLiteRound(midpoint_err / 2.0);
table[i] = std::min(std::max(sample_val - bias, -32768.0), 32767.0);
table[i] = std::min<double>(std::max<double>(sample_val - bias, -32768.0),
32767.0);
}
table[num - 1] =
std::min(std::max(TfLiteRound(func(max) * 32768.0), -32768.0), 32767.0);
table[num - 1] = std::min<double>(
std::max<double>(TfLiteRound(func(max) * 32768.0), -32768.0), 32767.0);
}
// generate INT16 LUT for function(), e.g., table exp(x) and 1/(1+x) used in
@@ -289,10 +330,11 @@ inline void gen_lut(float (*func)(float), float min, float max, int16_t* table,
TfLiteRound(func(min + i * step + half_step) * 32768.0f);
float midpoint_err = midpoint_interp_val - midpoint_val;
float bias = TfLiteRound(midpoint_err / 2.0f);
table[i] = std::min(std::max(sample_val - bias, -32768.0f), 32767.0f);
table[i] = std::min<float>(std::max<float>(sample_val - bias, -32768.0f),
32767.0f);
}
table[num - 1] = std::min(
std::max(TfLiteRound(func(max) * 32768.0f), -32768.0f), 32767.0f);
table[num - 1] = std::min<float>(
std::max<float>(TfLiteRound(func(max) * 32768.0f), -32768.0f), 32767.0f);
}
// int16_t func table lookup, e.g., lookup exp() and 1/(1+x) used in softmax

View File

@@ -34,6 +34,7 @@ namespace tflite {
}
DECLARE_STD_GLOBAL_SWITCH1(TfLiteRound, round);
DECLARE_STD_GLOBAL_SWITCH1(TfLiteExpm1, expm1);
} // namespace tflite

View File

@@ -15,7 +15,6 @@ limitations under the License.
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_PORTABLE_TENSOR_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_PORTABLE_TENSOR_H_
#include <complex>
#include <vector>
#include "tensorflow/lite/c/common.h"

View File

@@ -289,7 +289,7 @@ void PreprocessSoftmaxScaling(double beta, double input_scale,
input_beta_real_multiplier = (1ll << 31) - 1.0;
}
#else // TFLITE_EMULATE_FLOAT
const double input_beta_real_multiplier = std::min(
const double input_beta_real_multiplier = std::min<double>(
beta * input_scale * (1 << (31 - input_integer_bits)), (1ll << 31) - 1.0);
#endif // TFLITE_EMULATE_FLOAT

View File

@@ -202,14 +202,6 @@ inline void Add(const ArithmeticParams& params,
}
}
// TODO(jiawen): We can implement BroadcastAdd on buffers of arbitrary
// dimensionality if the runtime code does a single loop over one dimension
// that handles broadcasting as the base case. The code generator would then
// generate max(D1, D2) nested for loops.
// TODO(benoitjacob): BroadcastAdd is intentionally duplicated from
// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
// reference_ops.h.
inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const float* input1_data,

View File

@@ -0,0 +1,42 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_N_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_N_H_
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
// Writes the element-wise sum of `num_inputs` tensors into `output_data`.
// T is expected to be either float or int.
//
// input_shape: shape shared by every input and by the output.
// num_inputs:  number of tensors referenced by `input_data`.
// input_data:  array of `num_inputs` pointers, one per input tensor.
// output_data: destination buffer with FlatSize() elements.
template <typename T>
inline void AddN(const RuntimeShape& input_shape, const size_t num_inputs,
                 const T* const* input_data, T* output_data) {
  // Shape agreement between all inputs and the output is verified during the
  // Prepare stage, so only the flat element count is needed here.
  const size_t flat_size = input_shape.FlatSize();
  for (size_t elem = 0; elem < flat_size; ++elem) {
    T sum = 0;
    for (size_t tensor = 0; tensor < num_inputs; ++tensor) {
      sum += input_data[tensor][elem];
    }
    output_data[elem] = sum;
  }
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_N_H_

View File

@@ -15,12 +15,23 @@ limitations under the License.
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ARG_MIN_MAX_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ARG_MIN_MAX_H_
#include <functional>
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
// Returns the comparison used to pick the winning element: std::greater<T>
// when `is_arg_max` is true, std::less<T> otherwise.
template <typename T>
std::function<bool(T, T)> GetComparefunction(bool is_arg_max) {
  using Comparator = std::function<bool(T, T)>;
  return is_arg_max ? Comparator(std::greater<T>())
                    : Comparator(std::less<T>());
}
template <typename T1, typename T2, typename T3, typename Cmp>
void ArgMinMax(const RuntimeShape& input1_shape, const T1* input1_data,
const T3* input2_data, const RuntimeShape& output_shape,
@@ -62,6 +73,15 @@ void ArgMinMax(const RuntimeShape& input1_shape, const T1* input1_data,
}
}
}
// Convenience overload of ArgMinMax that takes a flag instead of a comparator.
// `is_arg_max` is turned into a comparator over T1 via GetComparefunction and
// every other argument is forwarded unchanged to the comparator-taking
// ArgMinMax overload.
template <typename T1, typename T2, typename T3>
void ArgMinMax(const RuntimeShape& input1_shape, const T1* input1_data,
const T3* input2_data, const RuntimeShape& output_shape,
T2* output_data, const bool is_arg_max) {
ArgMinMax(input1_shape, input1_data, input2_data, output_shape, output_data,
GetComparefunction<T1>(is_arg_max));
}
} // namespace reference_ops
} // namespace tflite

View File

@@ -0,0 +1,101 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BATCH_TO_SPACE_ND_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BATCH_TO_SPACE_ND_H_
#include <cmath>
#include "ruy/profiler/instrumentation.h" // from @ruy
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
// TODO(b/135760455): Move this method to an anonymous namespace in a cc file.
//
// Returns `shape` unchanged when it already has four dimensions. Otherwise
// builds a 4-D shape from the first three dimensions of `shape`, placing them
// at indices 0, 1 and 3; index 2 keeps the fill value (1) passed to the
// RuntimeShape constructor.
inline RuntimeShape ExtendShapeBatchToSpace(const RuntimeShape& shape) {
  if (shape.DimensionsCount() != 4) {
    RuntimeShape extended(4, 1);
    extended.SetDim(0, shape.Dims(0));
    extended.SetDim(1, shape.Dims(1));
    extended.SetDim(3, shape.Dims(2));
    return extended;
  }
  return shape;
}
// Copies each input pixel (a run of `depth` elements) to its destination in
// the output, where the destination batch and spatial position are derived
// from the input batch index, the block shape and the crops.
//
// unextended_input1_shape / input1_data: 3-D or 4-D input tensor.
// block_shape_data: [0] is the block height; [1] is the block width and is
//                   only read for 4-D input (width block is 1 otherwise).
// crops_data:       [0] is the top crop; [2] is the left crop and is only
//                   read for 4-D input (left crop is 0 otherwise).
// unextended_output_shape / output_data: output tensor with the same rank as
//                   the input.
// The block-shape and crops shapes are accepted but not read here.
template <typename T>
inline void BatchToSpaceND(const RuntimeShape& unextended_input1_shape,
                           const T* input1_data,
                           const RuntimeShape& unextended_input2_shape,
                           const int32_t* block_shape_data,
                           const RuntimeShape& unextended_input3_shape,
                           const int32_t* crops_data,
                           const RuntimeShape& unextended_output_shape,
                           T* output_data) {
  ruy::profiler::ScopeLabel label("BatchToSpaceND");
  TFLITE_DCHECK_GE(unextended_input1_shape.DimensionsCount(), 3);
  TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(unextended_input1_shape.DimensionsCount(),
                   unextended_output_shape.DimensionsCount());

  // Work on 4-D views of both tensors so a single loop nest covers both ranks.
  const RuntimeShape input1_shape =
      ExtendShapeBatchToSpace(unextended_input1_shape);
  const RuntimeShape output_shape =
      ExtendShapeBatchToSpace(unextended_output_shape);

  const int output_width = output_shape.Dims(2);
  const int output_height = output_shape.Dims(1);
  const int output_batch_size = output_shape.Dims(0);

  const int depth = input1_shape.Dims(3);
  const int input_width = input1_shape.Dims(2);
  const int input_height = input1_shape.Dims(1);
  const int input_batch_size = input1_shape.Dims(0);

  const int block_shape_height = block_shape_data[0];
  const int block_shape_width =
      unextended_input1_shape.DimensionsCount() == 4 ? block_shape_data[1] : 1;
  const int crops_top = crops_data[0];
  const int crops_left =
      unextended_input1_shape.DimensionsCount() == 4 ? crops_data[2] : 0;

  for (int in_batch = 0; in_batch < input_batch_size; ++in_batch) {
    // The input batch index encodes both the output batch and the position
    // of this sample inside its block.
    const int out_batch = in_batch % output_batch_size;
    const int spatial_offset = in_batch / output_batch_size;

    for (int in_h = 0; in_h < input_height; ++in_h) {
      const int out_h = in_h * block_shape_height +
                        spatial_offset / block_shape_width - crops_top;
      // Rows that land in the cropped region are dropped.
      if (out_h >= 0 && out_h < output_height) {
        for (int in_w = 0; in_w < input_width; ++in_w) {
          const int out_w = in_w * block_shape_width +
                            spatial_offset % block_shape_width - crops_left;
          // Columns that land in the cropped region are dropped.
          if (out_w >= 0 && out_w < output_width) {
            T* out =
                output_data + Offset(output_shape, out_batch, out_h, out_w, 0);
            const T* in =
                input1_data + Offset(input1_shape, in_batch, in_h, in_w, 0);
            memcpy(out, in, depth * sizeof(T));
          }
        }
      }
    }
  }
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BATCH_TO_SPACE_ND_H_

View File

@@ -23,9 +23,6 @@ namespace tflite {
namespace reference_ops {
// TODO(ycling): Refactoring. Remove BroadcastLogical and use the more
// generalized and efficient BroadcastBinaryFunction.
//
// Also appears to duplicate MinimumMaximum.
//
// R: Result type. T1: Input 1 type. T2: Input 2 type.
@@ -63,7 +60,6 @@ inline void BroadcastBinaryFunction4DSlow(
}
// R: Result type. T1: Input 1 type. T2: Input 2 type.
// TODO(renjieliu): Refactor other binary functions to use this one.
template <typename R, typename T1, typename T2>
inline void BinaryFunction(const RuntimeShape& input1_shape,
const T1* input1_data,

View File

@@ -68,8 +68,7 @@ inline void Concatenation(const ConcatenationParams& params,
}
}
// TODO(prabhumk): This is the same as the optimized implementation.
// TODO(prabhumk): The quantized implementation of concatentation isn't fully
// TODO(b/174275780): The quantized implementation of concatenation isn't fully
// quantized as it takes scale as a floating point value. This should be fixed
// when optimizing this routine further.
inline void ConcatenationWithScaling(const ConcatenationParams& params,

View File

@@ -15,16 +15,13 @@ limitations under the License.
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CONV_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CONV_H_
#include "tensorflow/lite/kernels/internal/types.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
const float* input_data, const RuntimeShape& filter_shape,
const float* filter_data, const RuntimeShape& bias_shape,
@@ -108,8 +105,8 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
uint8_t* output_data, const RuntimeShape& im2col_shape,
uint8_t* im2col_data, void* cpu_backend_context) {
(void)cpu_backend_context; // only used in optimized code.
(void)im2col_data; // only used in optimized code.
(void)im2col_shape; // only used in optimized code.
(void)im2col_data; // only used in optimized code.
(void)im2col_shape; // only used in optimized code.
const int stride_width = params.stride_width;
const int stride_height = params.stride_height;
const int dilation_width_factor = params.dilation_width_factor;

View File

@@ -0,0 +1,239 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DIV_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DIV_H_
#include <algorithm>
#include "tensorflow/lite/kernels/internal/common.h"
namespace tflite {
namespace reference_ops {
// Debug-checks the quantization parameters used by the quantized Div kernels
// for element type T: the activation range must be ordered, and each of the
// input1, input2 and output offsets must lie within
// [-numeric_limits<T>::max(), numeric_limits<T>::max()].
template <typename T>
inline void DivCheckArithmeticParams(const ArithmeticParams& params) {
  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);

  // Input offset is negative input zero point. Activation tensors are
  // asymmetric quantized so they span the full range of T.
  constexpr int32_t kOffsetBound =
      static_cast<int32_t>(std::numeric_limits<T>::max());

  TFLITE_DCHECK_GE(params.input1_offset, -kOffsetBound);
  TFLITE_DCHECK_LE(params.input1_offset, kOffsetBound);

  TFLITE_DCHECK_GE(params.input2_offset, -kOffsetBound);
  TFLITE_DCHECK_LE(params.input2_offset, kOffsetBound);

  TFLITE_DCHECK_GE(params.output_offset, -kOffsetBound);
  TFLITE_DCHECK_LE(params.output_offset, kOffsetBound);
}
// Element-wise div that can often be used for inner loop of broadcast Div as
// well as the non-broadcast Div.
//
// For each of the `size` elements the input offsets from `params` are added
// to the raw operands, the dividend is multiplied by a fixed-point reciprocal
// of the divisor, the product is rescaled with params.output_multiplier, and
// the result is offset by params.output_offset, clamped to
// [quantized_activation_min, quantized_activation_max] and stored as T.
// The offset divisor must be non-zero; this is only DCHECKed.
template <typename T>
inline void DivElementwise(int size, const ArithmeticParams& params,
                           const T* input1_data, const T* input2_data,
                           T* output_data) {
  DivCheckArithmeticParams<T>(params);

  for (int i = 0; i < size; ++i) {
    const int32_t dividend = params.input1_offset + input1_data[i];
    const int32_t divisor = params.input2_offset + input2_data[i];
    TFLITE_DCHECK_NE(divisor, 0);

    // GetReciprocal is always handed the magnitude of the divisor; the sign
    // is re-applied to the value it returns.
    int recip_shift;
    int32_t divisor_inv;
    if (divisor > 0) {
      divisor_inv = GetReciprocal(divisor, 31, &recip_shift);
    } else {
      divisor_inv = -GetReciprocal(-divisor, 31, &recip_shift);
    }

    // Shift the dividend up by its spare sign bits before multiplying; the
    // same amount is subtracted from the final shift below.
    const int headroom = CountLeadingSignBits(dividend);
    const int32_t unscaled_quotient =
        MultiplyByQuantizedMultiplierGreaterThanOne(dividend, divisor_inv,
                                                    headroom);
    const int total_shift = params.output_shift - recip_shift - headroom;

    const int32_t rescaled = MultiplyByQuantizedMultiplierSmallerThanOneExp(
        unscaled_quotient, params.output_multiplier, total_shift);
    const int32_t unclamped = params.output_offset + rescaled;

    const int32_t lower_bounded =
        std::max(params.quantized_activation_min, unclamped);
    const int32_t clamped =
        std::min(params.quantized_activation_max, lower_bounded);
    output_data[i] = static_cast<T>(clamped);
  }
}
// Quantized uint8 Div for operands whose shapes match element-for-element.
// Checks that the activation range is ordered, then runs DivElementwise over
// the element count shared by the two inputs and the output.
inline void Div(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const uint8_t* input1_data,
                const RuntimeShape& input2_shape, const uint8_t* input2_data,
                const RuntimeShape& output_shape, uint8_t* output_data) {
  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);
  DivElementwise(MatchingElementsSize(input1_shape, input2_shape, output_shape),
                 params, input1_data, input2_data, output_data);
}
// Quantized int8 Div for operands whose shapes match element-for-element.
// Checks that the activation range is ordered, then runs DivElementwise over
// the element count shared by the two inputs and the output.
inline void Div(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const int8_t* input1_data,
                const RuntimeShape& input2_shape, const int8_t* input2_data,
                const RuntimeShape& output_shape, int8_t* output_data) {
  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);
  DivElementwise(MatchingElementsSize(input1_shape, input2_shape, output_shape),
                 params, input1_data, input2_data, output_data);
}
// Quantized Div with broadcasting for tensors of up to N dimensions.
//
// Builds N-dimensional descriptors for both inputs (via
// NdArrayDescsForElementwiseBroadcast) and for the output, then lets
// NDOpsHelper drive a per-index functor that performs the same fixed-point
// division as the element-wise kernel: offset operands, multiply the dividend
// by the reciprocal of the divisor, rescale, add the output offset and clamp
// to the quantized activation range.
// The offset divisor must be non-zero; this is only DCHECKed.
template <typename T, int N = 5>
inline void BroadcastDivSlowQuantized(
    const ArithmeticParams& params, const RuntimeShape& unextended_input1_shape,
    const T* input1_data, const RuntimeShape& unextended_input2_shape,
    const T* input2_data, const RuntimeShape& unextended_output_shape,
    T* output_data) {
  TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), N);

  NdArrayDesc<N> desc1;
  NdArrayDesc<N> desc2;
  NdArrayDesc<N> output_desc;
  NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
                                      unextended_input2_shape, &desc1, &desc2);
  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape),
                 &output_desc);

  DivCheckArithmeticParams<T>(params);

  auto div_func = [&](int indexes[N]) {
    const int32_t dividend =
        params.input1_offset + input1_data[SubscriptToIndex(desc1, indexes)];
    const int32_t divisor =
        params.input2_offset + input2_data[SubscriptToIndex(desc2, indexes)];
    TFLITE_DCHECK_NE(divisor, 0);

    // GetReciprocal is always handed the magnitude of the divisor; the sign
    // is re-applied to the value it returns.
    int recip_shift;
    int32_t divisor_inv;
    if (divisor > 0) {
      divisor_inv = GetReciprocal(divisor, 31, &recip_shift);
    } else {
      divisor_inv = -GetReciprocal(-divisor, 31, &recip_shift);
    }

    // Shift the dividend up by its spare sign bits before multiplying; the
    // same amount is subtracted from the final shift below.
    const int headroom = CountLeadingSignBits(dividend);
    const int32_t unscaled_quotient =
        MultiplyByQuantizedMultiplierGreaterThanOne(dividend, divisor_inv,
                                                    headroom);
    const int total_shift = params.output_shift - recip_shift - headroom;

    const int32_t rescaled = MultiplyByQuantizedMultiplierSmallerThanOneExp(
        unscaled_quotient, params.output_multiplier, total_shift);
    const int32_t unclamped = params.output_offset + rescaled;

    const int32_t lower_bounded =
        std::max(params.quantized_activation_min, unclamped);
    const int32_t clamped =
        std::min(params.quantized_activation_max, lower_bounded);
    output_data[SubscriptToIndex(output_desc, indexes)] =
        static_cast<T>(clamped);
  };
  NDOpsHelper<N>(output_desc, div_func);
}
// uint8 entry point for broadcasting Div: forwards every argument to
// BroadcastDivSlowQuantized instantiated for uint8_t and N dimensions.
template <int N = 5>
inline void BroadcastDivSlow(const ArithmeticParams& params,
                             const RuntimeShape& unextended_input1_shape,
                             const uint8_t* input1_data,
                             const RuntimeShape& unextended_input2_shape,
                             const uint8_t* input2_data,
                             const RuntimeShape& unextended_output_shape,
                             uint8_t* output_data) {
  BroadcastDivSlowQuantized<uint8_t, N>(params,
                                        unextended_input1_shape, input1_data,
                                        unextended_input2_shape, input2_data,
                                        unextended_output_shape, output_data);
}
// int8 entry point for broadcasting Div: forwards every argument to
// BroadcastDivSlowQuantized instantiated for int8_t and N dimensions.
template <int N = 5>
inline void BroadcastDivSlow(const ArithmeticParams& params,
                             const RuntimeShape& unextended_input1_shape,
                             const int8_t* input1_data,
                             const RuntimeShape& unextended_input2_shape,
                             const int8_t* input2_data,
                             const RuntimeShape& unextended_output_shape,
                             int8_t* output_data) {
  BroadcastDivSlowQuantized<int8_t, N>(params,
                                       unextended_input1_shape, input1_data,
                                       unextended_input2_shape, input2_data,
                                       unextended_output_shape, output_data);
}
// TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
// dimensionality if the runtime code does a single loop over one dimension
// that handles broadcasting as the base case. The code generator would then
// generate max(D1, D2) nested for loops.
//
// Non-quantized Div with broadcasting for tensors of up to N dimensions.
// Each output element is input1 / input2 at the corresponding (broadcast)
// indices, passed through ActivationFunctionWithMinMax with the activation
// bounds read from `params`.
template <typename T, int N = 5>
void BroadcastDivSlow(const ArithmeticParams& params,
                      const RuntimeShape& unextended_input1_shape,
                      const T* input1_data,
                      const RuntimeShape& unextended_input2_shape,
                      const T* input2_data,
                      const RuntimeShape& unextended_output_shape,
                      T* output_data) {
  T activation_lo;
  T activation_hi;
  GetActivationParams(params, &activation_lo, &activation_hi);

  TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), N);

  NdArrayDesc<N> desc1;
  NdArrayDesc<N> desc2;
  NdArrayDesc<N> output_desc;
  NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
                                      unextended_input2_shape, &desc1, &desc2);
  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape),
                 &output_desc);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest
  // stride, typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  auto div_func = [&](int indexes[N]) {
    const auto quotient = input1_data[SubscriptToIndex(desc1, indexes)] /
                          input2_data[SubscriptToIndex(desc2, indexes)];
    output_data[SubscriptToIndex(output_desc, indexes)] =
        ActivationFunctionWithMinMax(quotient, activation_lo, activation_hi);
  };
  NDOpsHelper<N>(output_desc, div_func);
}
// Non-quantized Div for operands whose shapes match element-for-element.
// Each output element is input1[i] / input2[i] passed through
// ActivationFunctionWithMinMax with the activation bounds read from `params`.
template <typename T>
inline void Div(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const T* input1_data,
                const RuntimeShape& input2_shape, const T* input2_data,
                const RuntimeShape& output_shape, T* output_data) {
  T activation_lo;
  T activation_hi;
  GetActivationParams(params, &activation_lo, &activation_hi);

  const int element_count =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);
  for (int idx = 0; idx < element_count; ++idx) {
    const auto quotient = input1_data[idx] / input2_data[idx];
    output_data[idx] =
        ActivationFunctionWithMinMax(quotient, activation_lo, activation_hi);
  }
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DIV_H_

View File

@@ -0,0 +1,37 @@
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ELU_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ELU_H_
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
// Float ELU: negative inputs are mapped through TfLiteExpm1, all other inputs
// are copied through unchanged. The element count is taken from
// MatchingFlatSize(input_shape, output_shape).
inline void Elu(const RuntimeShape& input_shape, const float* input_data,
                const RuntimeShape& output_shape, float* output_data) {
  const int element_count = MatchingFlatSize(input_shape, output_shape);
  for (int idx = 0; idx < element_count; ++idx) {
    const float x = input_data[idx];
    if (x < 0.0f) {
      output_data[idx] = TfLiteExpm1(x);
    } else {
      output_data[idx] = x;
    }
  }
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ELU_H_

View File

@@ -0,0 +1,38 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_EXP_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_EXP_H_
#include <cmath>
#include "ruy/profiler/instrumentation.h" // from @ruy
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
// Writes std::exp of each of the `num_elements` inputs to the matching slot
// of `output_data`.
template <typename T>
inline void Exp(const T* input_data, const size_t num_elements,
                T* output_data) {
  ruy::profiler::ScopeLabel label("Exp");
  const T* const input_end = input_data + num_elements;
  for (const T* src = input_data; src != input_end; ++src, ++output_data) {
    *output_data = std::exp(*src);
  }
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_EXP_H_

View File

@@ -0,0 +1,38 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FILL_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FILL_H_
#include <cmath>
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
// Sets every element of the output tensor to the scalar at `value_data`.
// `value_shape` must describe a zero-dimensional tensor (DCHECKed).
template <typename T>
void Fill(const RuntimeShape& value_shape, const T* value_data,
          const RuntimeShape& output_shape, T* output_data) {
  TFLITE_DCHECK_EQ(value_shape.DimensionsCount(), 0);
  const int element_count = output_shape.FlatSize();
  // The scalar is re-read on every iteration rather than cached up front.
  for (int idx = 0; idx < element_count; ++idx) {
    output_data[idx] = *value_data;
  }
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FILL_H_

View File

@@ -31,7 +31,7 @@ inline void FullyConnected(
float* output_data) {
const float output_activation_min = params.float_activation_min;
const float output_activation_max = params.float_activation_max;
// TODO(benoitjacob): This really should be:
// TODO(b/62193649): This really should be:
// const int batches = ArraySize(output_dims, 1);
// but the current --variable_batch hack consists in overwriting the 3rd
// dimension with the runtime batch size, as we don't keep track for each
@@ -76,7 +76,7 @@ inline void FullyConnected(
TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
// TODO(benoitjacob): This really should be:
// TODO(b/62193649): This really should be:
// const int batches = ArraySize(output_dims, 1);
// but the current --variable_batch hack consists in overwriting the 3rd
// dimension with the runtime batch size, as we don't keep track for each
@@ -123,7 +123,7 @@ inline void FullyConnected(
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
TFLITE_DCHECK_EQ(output_offset, 0);
// TODO(benoitjacob): This really should be:
// TODO(b/62193649): This really should be:
// const int batches = ArraySize(output_dims, 1);
// but the current --variable_batch hack consists in overwriting the 3rd
// dimension with the runtime batch size, as we don't keep track for each
@@ -176,7 +176,7 @@ inline void ShuffledFullyConnected(
TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1);
TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
// TODO(benoitjacob): This really should be:
// TODO(b/62193649): This really should be:
// const int batches = ArraySize(output_dims, 1);
// but the current --variable_batch hack consists in overwriting the 3rd
// dimension with the runtime batch size, as we don't keep track for each

View File

@@ -34,55 +34,24 @@ inline void CheckArithmeticParams(const ArithmeticParams& params) {
TFLITE_DCHECK_LE(-params.input2_offset, std::numeric_limits<int8_t>::max());
}
// Element-wise add that can often be used for inner loop of broadcast add as
// well as the non-broadcast add.
inline void AddElementwise(int size, const ArithmeticParams& params,
const int8_t* input1_data, const int8_t* input2_data,
int8_t* output_data) {
inline void ElementWise(
int size, const ArithmeticParams& params, const int8_t* input1_data,
const int8_t* input2_data, int8_t* output_data,
void (*check_arithmetic_params)(const ArithmeticParams&),
int8_t (*binary_func)(int8_t, int8_t, const ArithmeticParams&)) {
CheckArithmeticParams(params);
for (int i = 0; i < size; ++i) {
const int32_t input1_val = params.input1_offset + input1_data[i];
const int32_t input2_val = params.input2_offset + input2_data[i];
const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
const int32_t scaled_input1_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input1_val, params.input1_multiplier, params.input1_shift);
const int32_t scaled_input2_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input2_val, params.input2_multiplier, params.input2_shift);
const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
const int32_t raw_output =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
raw_sum, params.output_multiplier, params.output_shift) +
params.output_offset;
const int32_t clamped_output =
std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, raw_output));
output_data[i] = static_cast<int8_t>(clamped_output);
output_data[i] = binary_func(input1_data[i], input2_data[i], params);
}
}
inline void Add(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const int8_t* input1_data,
const RuntimeShape& input2_shape, const int8_t* input2_data,
const RuntimeShape& output_shape, int8_t* output_data) {
CheckArithmeticParams(params);
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
AddElementwise(flat_size, params, input1_data, input2_data, output_data);
}
inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const int8_t* input1_data,
const RuntimeShape& input2_shape,
const int8_t* input2_data,
const RuntimeShape& output_shape,
int8_t* output_data) {
inline void BroadcastBinaryFunction4DSlow(
const ArithmeticParams& params, const RuntimeShape& input1_shape,
const int8_t* input1_data, const RuntimeShape& input2_shape,
const int8_t* input2_data, const RuntimeShape& output_shape,
int8_t* output_data,
void (*check_arithmetic_params)(const ArithmeticParams&),
int8_t (*binary_func)(int8_t, int8_t, const ArithmeticParams&)) {
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
@@ -105,40 +74,70 @@ inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
const int32_t input1_val =
params.input1_offset +
input1_data[SubscriptToIndex(desc1, b, y, x, c)];
const int32_t input2_val =
params.input2_offset +
input2_data[SubscriptToIndex(desc2, b, y, x, c)];
const int32_t shifted_input1_val =
input1_val * (1 << params.left_shift);
const int32_t shifted_input2_val =
input2_val * (1 << params.left_shift);
const int32_t scaled_input1_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input1_val, params.input1_multiplier,
params.input1_shift);
const int32_t scaled_input2_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input2_val, params.input2_multiplier,
params.input2_shift);
const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
const int32_t raw_output =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
raw_sum, params.output_multiplier, params.output_shift) +
params.output_offset;
const int32_t clamped_output =
std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, raw_output));
output_data[Offset(extended_output_shape, b, y, x, c)] =
static_cast<int8_t>(clamped_output);
output_data[Offset(extended_output_shape, b, y, x, c)] = binary_func(
input1_data[SubscriptToIndex(desc1, b, y, x, c)],
input2_data[SubscriptToIndex(desc2, b, y, x, c)], params);
}
}
}
}
}
// Quantized int8 addition of one element pair.
//
// Each operand has its input offset added, is multiplied by
// 2^params.left_shift and is then rescaled with its own multiplier/shift.
// The rescaled operands are summed, the sum is rescaled with the output
// multiplier/shift, params.output_offset is added, and the result is clamped
// to [quantized_activation_min, quantized_activation_max] before being
// narrowed to int8_t.
inline int8_t AddFunc(int8_t x, int8_t y, const ArithmeticParams& params) {
  // Operand 1: offset, pre-shift, rescale.
  const int32_t lhs_offset = params.input1_offset + x;
  const int32_t lhs_shifted = lhs_offset * (1 << params.left_shift);
  const int32_t lhs_scaled = MultiplyByQuantizedMultiplierSmallerThanOneExp(
      lhs_shifted, params.input1_multiplier, params.input1_shift);

  // Operand 2: offset, pre-shift, rescale.
  const int32_t rhs_offset = params.input2_offset + y;
  const int32_t rhs_shifted = rhs_offset * (1 << params.left_shift);
  const int32_t rhs_scaled = MultiplyByQuantizedMultiplierSmallerThanOneExp(
      rhs_shifted, params.input2_multiplier, params.input2_shift);

  const int32_t sum = lhs_scaled + rhs_scaled;
  int32_t result =
      MultiplyByQuantizedMultiplierSmallerThanOneExp(
          sum, params.output_multiplier, params.output_shift) +
      params.output_offset;

  // Clamp: lower bound first, then upper bound.
  result = std::max(params.quantized_activation_min, result);
  result = std::min(params.quantized_activation_max, result);
  return static_cast<int8_t>(result);
}
// Element-wise add that can often be used for inner loop of broadcast add as
// well as the non-broadcast add.
// Forwards `size`, `params` and the three int8 buffers to ElementWise, passing
// CheckArithmeticParams and AddFunc as its two function arguments.
inline void AddElementwise(int size, const ArithmeticParams& params,
const int8_t* input1_data, const int8_t* input2_data,
int8_t* output_data) {
ElementWise(size, params, input1_data, input2_data, output_data,
CheckArithmeticParams, AddFunc);
}
// Quantized int8 add over whole tensors (no broadcasting logic here).
// Validates `params` with CheckArithmeticParams, obtains the flat element
// count from MatchingElementsSize over the two input shapes and the output
// shape, and forwards the buffers to AddElementwise.
inline void Add(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const int8_t* input1_data,
const RuntimeShape& input2_shape, const int8_t* input2_data,
const RuntimeShape& output_shape, int8_t* output_data) {
CheckArithmeticParams(params);
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
AddElementwise(flat_size, params, input1_data, input2_data, output_data);
}
// Broadcasting int8 add. Forwards every argument unchanged to
// BroadcastBinaryFunction4DSlow, passing CheckArithmeticParams and AddFunc as
// its two function arguments.
inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const int8_t* input1_data,
const RuntimeShape& input2_shape,
const int8_t* input2_data,
const RuntimeShape& output_shape,
int8_t* output_data) {
BroadcastBinaryFunction4DSlow(params, input1_shape, input1_data, input2_shape,
input2_data, output_shape, output_data,
CheckArithmeticParams, AddFunc);
}
} // namespace reference_integer_ops
} // namespace tflite

View File

@@ -101,7 +101,7 @@ inline void ConvPerChannel(
// long as the filter size (filter_y * filter_x * in_channel)
// does not exceed 2^16, which is the case in all the models
// we have seen so far.
// TODO(jianlijianli): Add a check to make sure the
// TODO(b/174275578): Add a check to make sure the
// accumulator depth is smaller than 2^16.
acc += filter_val * (input_val + input_offset);
}

View File

@@ -95,7 +95,7 @@ inline void DepthwiseConvPerChannel(
// long as the filter size (filter_y * filter_x * in_channel)
// does not exceed 2^16, which is the case in all the models
// we have seen so far.
// TODO(jianlijianli): Add a check to make sure the
// TODO(b/174275578): Add a check to make sure the
// accumulator depth is smaller than 2^16.
acc += filter_val * (input_val + input_offset);
}

View File

@@ -58,23 +58,36 @@ inline void Logistic(int32_t input_zero_point, int32_t input_range_radius,
}
}
inline void Logistic(int32_t input_multiplier, int32_t input_size,
const int16_t* ptr_input_data, int16_t* ptr_output_data) {
inline void Logistic(int32_t input_multiplier, int32_t input_left_shift,
int32_t input_size, const int16_t* ptr_input_data,
int16_t* ptr_output_data) {
// We use the LUT for sigmoid and take into account, that
// tanh(x) = 2*sigmoid(2*x) - 1
int32_t input_data_mul = (input_multiplier > 0) ? input_multiplier : 1;
// We scale by 3/4 to expand range [-8,8]->[-10.7,10.7].
// In case of general parameter scale, multiplier 3 is taken into account
// in TanhPrepare function and it is included in
// input_multiplier already.
TFLITE_DCHECK_GE(input_left_shift, 0);
if (input_multiplier == 0) { // power of two case
input_multiplier = 3 << input_left_shift;
input_left_shift = 0;
}
int32_t round = (input_left_shift > 0) ? 1 << (input_left_shift - 1) : 0;
for (int i = 0; i < input_size; ++i, ptr_input_data++, ptr_output_data++) {
int32_t input_data = (*ptr_input_data) * input_data_mul;
int32_t input_data =
((*ptr_input_data) * input_multiplier + round) >> input_left_shift;
// Scale by 3/4 to expand range [-8,8]->[-10.7,10.7] and
// we do interpolation on unsigned values.
uint32_t abs_input_data = 3 * abs(input_data);
// We do interpolation on unsigned values.
uint32_t abs_input_data = abs(input_data);
// We divide by 2 power of 9, because
// we need to divide by 2 in power of 7 for
// the input conversion + 1/4 from the scale above.
// Define uh as uint32_t type not to make this function overflow.
uint32_t uh = abs_input_data >> 9;
uint32_t result;

View File

@@ -65,19 +65,25 @@ inline void Tanh(int32_t input_multiplier, int32_t input_left_shift,
// We use the LUT for sigmoid and take into account, that
// tanh(x) = 2*sigmoid(2*x) - 1
int32_t input_data_mul = (input_multiplier > 0) ? input_multiplier : 1;
// We scale by 3/4 to expand range [-8,8]->[-10.7,10.7].
// In case of general parameter scale, multiplier 3 is taken into account
// in TanhPrepare function and it is included in
// input_multiplier already.
if (input_multiplier == 0) { // power of two case
input_multiplier = 3 << input_left_shift;
input_left_shift = 0;
}
int32_t round = (input_left_shift > 0) ? 1 << (input_left_shift - 1) : 0;
int flat_size = MatchingFlatSize(input_shape, output_shape);
for (int i = 0; i < flat_size; ++i, ptr_input_data++, ptr_output_data++) {
int32_t input_data = (*ptr_input_data) * input_data_mul;
int32_t input_data =
((*ptr_input_data) * input_multiplier + round) >> input_left_shift;
if (input_left_shift == 1) {
input_data <<= 1;
}
// Scale by 3/4 to expand range [-8,8]->[-10.7,10.7].
uint32_t abs_input_data = 3 * abs(input_data);
uint32_t abs_input_data = abs(input_data);
uint32_t uh = abs_input_data >> 8;
int32_t result;

View File

@@ -0,0 +1,221 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TRANSPOSE_CONV_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TRANSPOSE_CONV_H_
#include "tensorflow/lite/kernels/internal/common.h"
namespace tflite {
namespace reference_integer_ops {
// Fixed-point per-channel-quantization transpose convolution reference kernel
// (int8 input, int8 filter, int32 accumulator).
//
// Works in two passes:
//   1. Scatter: every input element (plus params.input_offset) is multiplied
//      by each filter tap and accumulated into `scratch_buffer`, which is
//      indexed like the output tensor and zeroed first.
//   2. Requantize: each accumulator gets the optional bias added, is rescaled
//      with its output channel's multiplier/shift, has params.output_offset
//      added, is clamped to the int8 range and written to `output_data`.
//
// `im2col_shape` and `im2col_data` are not used by this implementation.
inline void TransposeConv(
    const ConvParams& params, const int32_t* output_multiplier,
    const int32_t* output_shift, const RuntimeShape& input_shape,
    const int8_t* input_data, const RuntimeShape& filter_shape,
    const int8_t* filter_data, const RuntimeShape& bias_shape,
    const int32_t* bias_data, const RuntimeShape& output_shape,
    int8_t* output_data, const RuntimeShape& im2col_shape, int8_t* im2col_data,
    int32_t* scratch_buffer) {
  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
  (void)im2col_data;   // only used in optimized code.
  (void)im2col_shape;  // only used in optimized code.

  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
  const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
  const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
  if (bias_data) {
    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
  }

  const int stride_w = params.stride_width;
  const int stride_h = params.stride_height;
  const int pad_w = params.padding_values.width;
  const int pad_h = params.padding_values.height;

  const int in_h = input_shape.Dims(1);
  const int in_w = input_shape.Dims(2);
  const int filt_h = filter_shape.Dims(1);
  const int filt_w = filter_shape.Dims(2);
  const int out_h = output_shape.Dims(1);
  const int out_w = output_shape.Dims(2);

  const int32_t input_offset = params.input_offset;
  const int32_t output_offset = params.output_offset;
  const int32_t act_min = std::numeric_limits<int8_t>::min();
  const int32_t act_max = std::numeric_limits<int8_t>::max();
  TFLITE_DCHECK_LE(act_min, act_max);

  // The scatter pass accumulates with +=, so the scratch area must start at 0.
  const int num_elements = output_shape.FlatSize();
  memset(scratch_buffer, 0, num_elements * sizeof(int32_t));

  // Pass 1: scatter each input element into the outputs it influences.
  for (int b = 0; b < batches; ++b) {
    for (int iy = 0; iy < in_h; ++iy) {
      const int oy_origin = (iy * stride_h) - pad_h;
      for (int ix = 0; ix < in_w; ++ix) {
        const int ox_origin = (ix * stride_w) - pad_w;
        for (int ic = 0; ic < input_depth; ++ic) {
          for (int fy = 0; fy < filt_h; ++fy) {
            const int oy = oy_origin + fy;
            // Rows outside the output receive no contribution.
            if (oy < 0 || oy >= out_h) continue;
            for (int fx = 0; fx < filt_w; ++fx) {
              const int ox = ox_origin + fx;
              // Columns outside the output receive no contribution.
              if (ox < 0 || ox >= out_w) continue;
              const int8_t input_value =
                  input_data[Offset(input_shape, b, iy, ix, ic)];
              for (int oc = 0; oc < output_depth; ++oc) {
                const int8_t filter_value =
                    filter_data[Offset(filter_shape, oc, fy, fx, ic)];
                scratch_buffer[Offset(output_shape, b, oy, ox, oc)] +=
                    (input_value + input_offset) * filter_value;
              }
            }
          }
        }
      }
    }
  }

  // Pass 2: bias, per-channel rescale, output offset, clamp, store.
  for (int b = 0; b < batches; ++b) {
    for (int oy = 0; oy < out_h; ++oy) {
      for (int ox = 0; ox < out_w; ++ox) {
        for (int oc = 0; oc < output_depth; ++oc) {
          const int out_index = Offset(output_shape, b, oy, ox, oc);
          int32_t acc = scratch_buffer[out_index];
          if (bias_data) {
            acc += bias_data[oc];
          }
          acc = MultiplyByQuantizedMultiplier(acc, output_multiplier[oc],
                                              output_shift[oc]);
          acc += output_offset;
          acc = std::min(std::max(acc, act_min), act_max);
          output_data[out_index] = static_cast<int8_t>(acc);
        }
      }
    }
  }
}
// int16_t input (zero_point=0), int8_t filter, int64 accumulator.
//
// Same two-pass structure as the int8 kernel: a scatter pass accumulates
// input * filter products into the int64 `scratch_buffer` (indexed like the
// output and zeroed first); a second pass adds the optional int64 bias,
// rescales with the per-output-channel multiplier/shift, clamps to the int16
// range and stores. No input or output offset is applied in this overload.
//
// `im2col_shape` and `im2col_data` are not used by this implementation.
inline void TransposeConv(
    const ConvParams& params, const int32_t* output_multiplier,
    const int32_t* output_shift, const RuntimeShape& input_shape,
    const int16_t* input_data, const RuntimeShape& filter_shape,
    const int8_t* filter_data, const RuntimeShape& bias_shape,
    const std::int64_t* bias_data, const RuntimeShape& output_shape,
    int16_t* output_data, const RuntimeShape& im2col_shape, int8_t* im2col_data,
    std::int64_t* scratch_buffer) {
  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
  (void)im2col_data;   // only used in optimized code.
  (void)im2col_shape;  // only used in optimized code.

  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
  const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
  const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
  if (bias_data) {
    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
  }

  const int stride_w = params.stride_width;
  const int stride_h = params.stride_height;
  const int pad_w = params.padding_values.width;
  const int pad_h = params.padding_values.height;

  const int in_h = input_shape.Dims(1);
  const int in_w = input_shape.Dims(2);
  const int filt_h = filter_shape.Dims(1);
  const int filt_w = filter_shape.Dims(2);
  const int out_h = output_shape.Dims(1);
  const int out_w = output_shape.Dims(2);

  const int32_t act_min = std::numeric_limits<int16_t>::min();
  const int32_t act_max = std::numeric_limits<int16_t>::max();
  TFLITE_DCHECK_LE(act_min, act_max);

  // The scatter pass accumulates with +=, so the scratch area must start at 0.
  const int num_elements = output_shape.FlatSize();
  memset(scratch_buffer, 0, num_elements * sizeof(std::int64_t));

  // Pass 1: scatter each input element into the outputs it influences.
  for (int b = 0; b < batches; ++b) {
    for (int iy = 0; iy < in_h; ++iy) {
      const int oy_origin = (iy * stride_h) - pad_h;
      for (int ix = 0; ix < in_w; ++ix) {
        const int ox_origin = (ix * stride_w) - pad_w;
        for (int ic = 0; ic < input_depth; ++ic) {
          for (int fy = 0; fy < filt_h; ++fy) {
            const int oy = oy_origin + fy;
            // Rows outside the output receive no contribution.
            if (oy < 0 || oy >= out_h) continue;
            for (int fx = 0; fx < filt_w; ++fx) {
              const int ox = ox_origin + fx;
              // Columns outside the output receive no contribution.
              if (ox < 0 || ox >= out_w) continue;
              const int32_t input_value =
                  input_data[Offset(input_shape, b, iy, ix, ic)];
              for (int oc = 0; oc < output_depth; ++oc) {
                const int32_t filter_value =
                    filter_data[Offset(filter_shape, oc, fy, fx, ic)];
                scratch_buffer[Offset(output_shape, b, oy, ox, oc)] +=
                    input_value * filter_value;
              }
            }
          }
        }
      }
    }
  }

  // Pass 2: bias, per-channel rescale, clamp, store.
  for (int b = 0; b < batches; ++b) {
    for (int oy = 0; oy < out_h; ++oy) {
      for (int ox = 0; ox < out_w; ++ox) {
        for (int oc = 0; oc < output_depth; ++oc) {
          const int out_index = Offset(output_shape, b, oy, ox, oc);
          std::int64_t acc = scratch_buffer[out_index];
          if (bias_data) {
            acc += bias_data[oc];
          }
          int32_t scaled_acc = MultiplyByQuantizedMultiplier(
              acc, output_multiplier[oc], output_shift[oc]);
          scaled_acc = std::min(std::max(scaled_acc, act_min), act_max);
          output_data[out_index] = static_cast<int16_t>(scaled_acc);
        }
      }
    }
  }
}
} // namespace reference_integer_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TRANSPOSE_CONV_H_

View File

@@ -0,0 +1,69 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LEAKY_RELU_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LEAKY_RELU_H_
#include <algorithm>
#include <limits>
#include "tensorflow/lite/kernels/internal/common.h"
namespace tflite {
namespace reference_ops {
// Float leaky ReLU: strictly positive inputs pass through unchanged, every
// other input is multiplied by params.alpha. The element count comes from
// MatchingFlatSize over the input and output shapes.
inline void LeakyRelu(const tflite::LeakyReluParams& params,
                      const RuntimeShape& input_shape, const float* input_data,
                      const RuntimeShape& output_shape, float* output_data) {
  const int flat_size = MatchingFlatSize(input_shape, output_shape);
  const float* src = input_data;
  float* dst = output_data;
  for (int remaining = flat_size; remaining > 0; --remaining, ++src, ++dst) {
    const float value = *src;
    // alpha may be > 1 or < 0, so this cannot be written as a max().
    if (value > 0) {
      *dst = value;
    } else {
      *dst = value * params.alpha;
    }
  }
}
// Quantized leaky ReLU for element type T.
//
// Each input has params.input_offset subtracted. Non-negative values are
// rescaled with the "identity" multiplier/shift, negative values with the
// "alpha" multiplier/shift. params.output_offset is then added and the result
// is clamped to the representable range of T.
template <typename T>
inline void QuantizeLeakyRelu(const LeakyReluParams& params,
                              const RuntimeShape& input_shape,
                              const T* input_data,
                              const RuntimeShape& output_shape,
                              T* output_data) {
  const int flat_size = MatchingFlatSize(input_shape, output_shape);
  static const int32_t quantized_min = std::numeric_limits<T>::min();
  static const int32_t quantized_max = std::numeric_limits<T>::max();
  for (int idx = 0; idx < flat_size; ++idx) {
    const int32_t centered = input_data[idx] - params.input_offset;
    // Pick the rescale parameters for the branch this value falls on.
    const bool use_identity = centered >= 0;
    const auto multiplier = use_identity ? params.output_multiplier_identity
                                         : params.output_multiplier_alpha;
    const auto shift = use_identity ? params.output_shift_identity
                                    : params.output_shift_alpha;
    const int32_t unclamped =
        params.output_offset +
        MultiplyByQuantizedMultiplier(centered, multiplier, shift);
    const T clamped =
        std::min(quantized_max, std::max(quantized_min, unclamped));
    output_data[idx] = static_cast<T>(clamped);
  }
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LEAKY_RELU_H_

View File

@@ -45,6 +45,7 @@ inline void Requantize(const input_type* input_data, int32_t size,
for (int i = 0; i < size; ++i) {
output_data[i] = input_data[i] ^ 0x80;
}
return;
}
}
static constexpr int32_t kMinOutput = std::numeric_limits<output_type>::min();

View File

@@ -0,0 +1,109 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SPACE_TO_BATCH_ND_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SPACE_TO_BATCH_ND_H_
#include <cmath>
#include "ruy/profiler/instrumentation.h" // from @ruy
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
// TODO(b/135760455): Move this method to an anonymous namespace in a cc file.
// Returns `shape` unchanged when it already has 4 dimensions. Otherwise
// builds a 4-D shape filled with 1s and copies source dims 0, 1 and 2 into
// destination dims 0, 1 and 3 respectively (dim 2 stays 1).
inline RuntimeShape ExtendShapeSpaceToBatch(const RuntimeShape& shape) {
  if (shape.DimensionsCount() != 4) {
    RuntimeShape extended(4, 1);
    extended.SetDim(0, shape.Dims(0));
    extended.SetDim(1, shape.Dims(1));
    extended.SetDim(3, shape.Dims(2));
    return extended;
  }
  return shape;
}
// Reference SpaceToBatchND for 3-D or 4-D tensors of element type T.
//
// Parameters:
//   params            - supplies output_offset, used as the padding value.
//   input1_data       - input tensor; its shape must have 3 or 4 dimensions
//                       and the same rank as the output shape.
//   block_shape_data  - [0] is the block height; [1] is the block width and
//                       is read only for 4-D input (width block is 1 for 3-D).
//   paddings_data     - [0] is the top padding; [2] is the left padding and
//                       is read only for 4-D input (left padding is 0 for
//                       3-D).
//   output_data       - output tensor, written one depth-row at a time.
//
// unextended_input2_shape and unextended_input3_shape are not read.
//
// For every output (batch, row, column), the corresponding position in the
// padded input is computed. Positions that fall into the padding are filled
// with the pad value; all others are copied from the input.
template <typename T>
inline void SpaceToBatchND(const SpaceToBatchParams& params,
                           const RuntimeShape& unextended_input1_shape,
                           const T* input1_data,
                           const RuntimeShape& unextended_input2_shape,
                           const int32_t* block_shape_data,
                           const RuntimeShape& unextended_input3_shape,
                           const int32_t* paddings_data,
                           const RuntimeShape& unextended_output_shape,
                           T* output_data) {
  ruy::profiler::ScopeLabel label("SpaceToBatchND");
  TFLITE_DCHECK_GE(unextended_input1_shape.DimensionsCount(), 3);
  TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(unextended_input1_shape.DimensionsCount(),
                   unextended_output_shape.DimensionsCount());

  // Extends the input/output shape from 3D to 4D if needed, NHC -> NH1C.
  const RuntimeShape input1_shape =
      ExtendShapeSpaceToBatch(unextended_input1_shape);
  const RuntimeShape output_shape =
      ExtendShapeSpaceToBatch(unextended_output_shape);

  const int depth = input1_shape.Dims(3);
  const int input_width = input1_shape.Dims(2);
  const int input_height = input1_shape.Dims(1);
  const int input_batch_size = input1_shape.Dims(0);

  const int output_width = output_shape.Dims(2);
  const int output_height = output_shape.Dims(1);
  const int output_batch_size = output_shape.Dims(0);

  const bool is_4d = unextended_input1_shape.DimensionsCount() == 4;
  const int block_shape_height = block_shape_data[0];
  const int block_shape_width = is_4d ? block_shape_data[1] : 1;
  const int padding_top = paddings_data[0];
  const int padding_left = is_4d ? paddings_data[2] : 0;

  // For uint8 quantized, the correct padding "zero value" is the output offset.
  const int32_t pad_value = params.output_offset;
  // Convert once to the element type so padding is written per element, not
  // per byte. A byte-wise memset is only correct for single-byte T or a zero
  // pad value.
  const T pad_element = static_cast<T>(pad_value);

  for (int out_b = 0; out_b < output_batch_size; ++out_b) {
    const int input_batch = out_b % input_batch_size;
    const int shift_w = (out_b / input_batch_size) % block_shape_width;
    const int shift_h = (out_b / input_batch_size) / block_shape_width;
    for (int out_h = 0; out_h < output_height; ++out_h) {
      // Row position in the padded input.
      const int padded_h = out_h * block_shape_height + shift_h;
      for (int out_w = 0; out_w < output_width; ++out_w) {
        // Column position in the padded input.
        const int padded_w = out_w * block_shape_width + shift_w;
        T* out = output_data + Offset(output_shape, out_b, out_h, out_w, 0);
        const bool in_padding = padded_h < padding_top ||
                                padded_h >= padding_top + input_height ||
                                padded_w < padding_left ||
                                padded_w >= padding_left + input_width;
        if (in_padding) {
          for (int c = 0; c < depth; ++c) {
            out[c] = pad_element;
          }
        } else {
          const T* in =
              input1_data + Offset(input1_shape, input_batch,
                                   padded_h - padding_top,
                                   padded_w - padding_left, 0);
          memcpy(out, in, depth * sizeof(T));
        }
      }
    }
  }
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SPACE_TO_BATCH_ND_H_

View File

@@ -15,23 +15,28 @@ limitations under the License.
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_STRIDED_SLICE_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_STRIDED_SLICE_H_
#include "ruy/profiler/instrumentation.h" // from @ruy
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/portable_tensor.h"
#include "tensorflow/lite/kernels/internal/strided_slice_logic.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
template <typename T>
inline void StridedSlice(const tflite::StridedSliceParams& op_params,
const RuntimeShape& unextended_input_shape,
const T* input_data,
const RuntimeShape& unextended_output_shape,
T* output_data) {
SequentialTensorWriter<T>* writer) {
using strided_slice::LoopCondition;
using strided_slice::StartForAxis;
using strided_slice::StopForAxis;
ruy::profiler::ScopeLabel label("StridedSlice");
// Note that the output_shape is not used herein.
tflite::StridedSliceParams params_copy = op_params;
@@ -57,7 +62,6 @@ inline void StridedSlice(const tflite::StridedSliceParams& op_params,
const int start_4 = StartForAxis(params_copy, input_shape, 4);
const int stop_4 = StopForAxis(params_copy, input_shape, 4, start_4);
T* out_ptr = output_data;
for (int offset_0 = start_0 * input_shape.Dims(1),
end_0 = stop_0 * input_shape.Dims(1),
step_0 = params_copy.strides[0] * input_shape.Dims(1);
@@ -81,13 +85,36 @@ inline void StridedSlice(const tflite::StridedSliceParams& op_params,
for (int offset_4 = offset_3 + start_4, end_4 = offset_3 + stop_4;
!LoopCondition(offset_4, end_4, params_copy.strides[4]);
offset_4 += params_copy.strides[4]) {
*out_ptr++ = input_data[offset_4];
writer->Write(offset_4);
}
}
}
}
}
}
// Raw-pointer overload of StridedSlice. Wraps `input_data` and `output_data`
// in a SequentialTensorWriter<T> and delegates to the overload that takes a
// writer pointer.
template <typename T>
inline void StridedSlice(const tflite::StridedSliceParams& op_params,
const RuntimeShape& unextended_input_shape,
const T* input_data,
const RuntimeShape& unextended_output_shape,
T* output_data) {
SequentialTensorWriter<T> writer(input_data, output_data);
StridedSlice<T>(op_params, unextended_input_shape, unextended_output_shape,
&writer);
}
// TfLiteTensor overload of StridedSlice. Wraps the `input` and `output`
// tensors in a SequentialTensorWriter<T> and delegates to the overload that
// takes a writer pointer.
template <typename T>
inline void StridedSlice(const tflite::StridedSliceParams& op_params,
const RuntimeShape& unextended_input_shape,
const TfLiteTensor* input,
const RuntimeShape& unextended_output_shape,
TfLiteTensor* output) {
SequentialTensorWriter<T> writer(input, output);
StridedSlice<T>(op_params, unextended_input_shape, unextended_output_shape,
&writer);
}
} // namespace reference_ops
} // namespace tflite

View File

@@ -65,10 +65,6 @@ inline void SubNonBroadcast(const ArithmeticParams& params,
// dimensionality if the runtime code does a single loop over one dimension
// that handles broadcasting as the base case. The code generator would then
// generate max(D1, D2) nested for loops.
// TODO(b/151345101): BroadcastSub is intentionally duplicated from
// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
// reference_ops.h.
template <int N = 5>
inline void BroadcastSubSlow(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
@@ -336,6 +332,50 @@ void BroadcastSubSlow(const ArithmeticParams& params,
NDOpsHelper<N>(output_desc, sub_func);
}
// Broadcasting int16 subtraction using power-of-two rescaling.
//
// Each input element is widened to int32 and passed through
// gemmlowp::RoundingDivideByPOT with exponent -params.inputK_shift. The two
// results are subtracted, clamped to
// [quantized_activation_min, quantized_activation_max] and stored as int16.
//
// Broadcasting is handled through N-dimensional array descriptors built from
// the two input shapes; the output descriptor is the output shape extended to
// N dimensions, and NDOpsHelper<N> drives the per-element callback over it.
template <int N = 5>
inline void BroadcastSub16POTSlow(const ArithmeticParams& params,
                                  const RuntimeShape& input1_shape,
                                  const int16_t* input1_data,
                                  const RuntimeShape& input2_shape,
                                  const int16_t* input2_data,
                                  const RuntimeShape& output_shape,
                                  int16_t* output_data) {
  ruy::profiler::ScopeLabel label("BroadcastSub16POTSlow/int16_t");

  NdArrayDesc<N> desc1;
  NdArrayDesc<N> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  NdArrayDesc<N> output_desc;
  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);

  // Dimensions follow the Tensorflow naming (batch, row, col, channel) with
  // the trailing dimension varying fastest. Generated C code stores arrays
  // with dimensions reversed, so loops are nested such that the innermost one
  // walks the smallest stride for the best cache behavior.
  auto subtract_at = [&](int indexes[N]) {
    // Keep the operands as int32_t so RoundingDivideByPOT works on 32 bits.
    const int32_t lhs = input1_data[SubscriptToIndex(desc1, indexes)];
    const int32_t rhs = input2_data[SubscriptToIndex(desc2, indexes)];
    const int32_t lhs_scaled =
        gemmlowp::RoundingDivideByPOT(lhs, -params.input1_shift);
    const int32_t rhs_scaled =
        gemmlowp::RoundingDivideByPOT(rhs, -params.input2_shift);
    int32_t difference = lhs_scaled - rhs_scaled;
    difference = std::max(params.quantized_activation_min, difference);
    difference = std::min(params.quantized_activation_max, difference);
    output_data[SubscriptToIndex(output_desc, indexes)] =
        static_cast<int16_t>(difference);
  };
  NDOpsHelper<N>(output_desc, subtract_at);
}
// Element-wise Sub that can often be used for inner loop of broadcast sub as
// well as the non-broadcast sub.
inline void SubElementwise(int size, const ArithmeticParams& params,

View File

@@ -0,0 +1,217 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TRANSPOSE_CONV_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TRANSPOSE_CONV_H_
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
// Float reference transpose convolution.
//
// Uses a "scatter" access pattern: instead of gathering inputs for each
// output element, it walks every input element and adds its contribution to
// each output element it influences. This keeps non-unit strides simple, but
// means the output must be zeroed before accumulation. When `bias_data` is
// non-null, the per-output-channel bias is added in a final pass.
//
// `im2col_shape` and `im2col_data` are not used by this implementation.
inline void TransposeConv(
    const ConvParams& params, const RuntimeShape& input_shape,
    const float* input_data, const RuntimeShape& filter_shape,
    const float* filter_data, const RuntimeShape& bias_shape,
    const float* bias_data, const RuntimeShape& output_shape,
    float* output_data, const RuntimeShape& im2col_shape, float* im2col_data) {
  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
  (void)im2col_data;   // only used in optimized code.
  (void)im2col_shape;  // only used in optimized code.

  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
  const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
  const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
  if (bias_data) {
    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
  }

  const int stride_w = params.stride_width;
  const int stride_h = params.stride_height;
  const int pad_w = params.padding_values.width;
  const int pad_h = params.padding_values.height;

  const int in_h = input_shape.Dims(1);
  const int in_w = input_shape.Dims(2);
  const int filt_h = filter_shape.Dims(1);
  const int filt_w = filter_shape.Dims(2);
  const int out_h = output_shape.Dims(1);
  const int out_w = output_shape.Dims(2);

  // Accumulation below uses +=, so clear the output first.
  const int num_elements = output_shape.FlatSize();
  for (int idx = 0; idx < num_elements; ++idx) {
    output_data[idx] = 0.0f;
  }

  // Scatter each input element into the outputs it influences.
  for (int b = 0; b < batches; ++b) {
    for (int iy = 0; iy < in_h; ++iy) {
      const int oy_origin = (iy * stride_h) - pad_h;
      for (int ix = 0; ix < in_w; ++ix) {
        const int ox_origin = (ix * stride_w) - pad_w;
        for (int ic = 0; ic < input_depth; ++ic) {
          for (int fy = 0; fy < filt_h; ++fy) {
            const int oy = oy_origin + fy;
            // Rows outside the output receive no contribution.
            if (oy < 0 || oy >= out_h) continue;
            for (int fx = 0; fx < filt_w; ++fx) {
              const int ox = ox_origin + fx;
              // Columns outside the output receive no contribution.
              if (ox < 0 || ox >= out_w) continue;
              const float input_value =
                  input_data[Offset(input_shape, b, iy, ix, ic)];
              for (int oc = 0; oc < output_depth; ++oc) {
                const float filter_value =
                    filter_data[Offset(filter_shape, oc, fy, fx, ic)];
                output_data[Offset(output_shape, b, oy, ox, oc)] +=
                    input_value * filter_value;
              }
            }
          }
        }
      }
    }
  }

  // Nothing more to do without a bias.
  if (!bias_data) {
    return;
  }

  // Add the per-output-channel bias to every output position.
  for (int b = 0; b < batches; ++b) {
    for (int oy = 0; oy < out_h; ++oy) {
      for (int ox = 0; ox < out_w; ++ox) {
        for (int oc = 0; oc < output_depth; ++oc) {
          output_data[Offset(output_shape, b, oy, ox, oc)] += bias_data[oc];
        }
      }
    }
  }
}
// Reference transposed convolution for uint8 quantized tensors.
//
// Works in two passes:
//   1. Scatter: every input element is multiplied with each filter tap and the
//      product is added to the output position it influences. The sums are
//      kept as int32 values in `scratch_buffer`, which is zeroed first.
//   2. Requantize: each accumulator gets the optional bias added, is rescaled
//      with MultiplyByQuantizedMultiplier, shifted by the output offset,
//      clamped to the quantized activation range and stored as uint8.
//
// `scratch_buffer` must hold at least output_shape.FlatSize() int32 values.
// `im2col_shape` and `im2col_data` are not used by this implementation.
inline void TransposeConv(
    const ConvParams& params, const RuntimeShape& input_shape,
    const uint8_t* input_data, const RuntimeShape& filter_shape,
    const uint8_t* filter_data, const RuntimeShape& bias_shape,
    const int32_t* bias_data, const RuntimeShape& output_shape,
    uint8_t* output_data, const RuntimeShape& im2col_shape,
    uint8_t* im2col_data, int32_t* scratch_buffer) {
  // Only the optimized code consumes the im2col arguments.
  (void)im2col_data;
  (void)im2col_shape;

  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);

  // Tensor extents.
  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
  const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
  const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);
  const int filter_height = filter_shape.Dims(1);
  const int filter_width = filter_shape.Dims(2);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);

  // Geometry of the convolution.
  const int stride_width = params.stride_width;
  const int stride_height = params.stride_height;
  const int pad_width = params.padding_values.width;
  const int pad_height = params.padding_values.height;

  // Quantization parameters.
  const int32_t input_offset = params.input_offset;
  const int32_t filter_offset = params.weights_offset;
  const int32_t output_offset = params.output_offset;
  const int32_t output_multiplier = params.output_multiplier;
  const int output_shift = params.output_shift;
  const int32_t output_activation_min = params.quantized_activation_min;
  const int32_t output_activation_max = params.quantized_activation_max;
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  if (bias_data != nullptr) {
    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
  }

  // The scatter pass only ever adds to the accumulators, so they have to
  // start out as zero.
  const int num_elements = output_shape.FlatSize();
  memset(scratch_buffer, 0, num_elements * sizeof(int32_t));

  // Pass 1: scatter each input element into the accumulators.
  for (int b = 0; b < batches; ++b) {
    for (int src_y = 0; src_y < input_height; ++src_y) {
      const int dst_y_base = (src_y * stride_height) - pad_height;
      for (int src_x = 0; src_x < input_width; ++src_x) {
        const int dst_x_base = (src_x * stride_width) - pad_width;
        for (int src_c = 0; src_c < input_depth; ++src_c) {
          for (int ky = 0; ky < filter_height; ++ky) {
            const int dst_y = dst_y_base + ky;
            // Rows outside the output cannot receive contributions.
            if (dst_y < 0 || dst_y >= output_height) continue;
            for (int kx = 0; kx < filter_width; ++kx) {
              const int dst_x = dst_x_base + kx;
              // Columns outside the output cannot receive contributions.
              if (dst_x < 0 || dst_x >= output_width) continue;
              for (int dst_c = 0; dst_c < output_depth; ++dst_c) {
                const uint8_t raw_input =
                    input_data[Offset(input_shape, b, src_y, src_x, src_c)];
                const uint8_t raw_filter =
                    filter_data[Offset(filter_shape, dst_c, ky, kx, src_c)];
                scratch_buffer[Offset(output_shape, b, dst_y, dst_x, dst_c)] +=
                    (raw_input + input_offset) * (raw_filter + filter_offset);
              }
            }
          }
        }
      }
    }
  }

  // Pass 2: bias, rescale, offset, clamp and narrow to uint8.
  for (int b = 0; b < batches; ++b) {
    for (int dst_y = 0; dst_y < output_height; ++dst_y) {
      for (int dst_x = 0; dst_x < output_width; ++dst_x) {
        for (int dst_c = 0; dst_c < output_depth; ++dst_c) {
          const int out_index = Offset(output_shape, b, dst_y, dst_x, dst_c);
          int32_t acc = scratch_buffer[out_index];
          if (bias_data != nullptr) {
            acc += bias_data[dst_c];
          }
          int32_t requantized = MultiplyByQuantizedMultiplier(
              acc, output_multiplier, output_shift);
          requantized += output_offset;
          requantized = std::min(std::max(requantized, output_activation_min),
                                 output_activation_max);
          output_data[out_index] = static_cast<uint8_t>(requantized);
        }
      }
    }
  }
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TRANSPOSE_CONV_H_

View File

@@ -140,7 +140,7 @@ inline int StopForAxis(const tflite::StridedSliceParams& params,
// start_for_axis + 1 to generate a length 1 slice, since start_for_axis has
// already been adjusted for negative indices.
if (shrink_axis) {
stop = start_for_axis + 1;
return start_for_axis + 1;
}
// end_mask override

View File

@@ -43,6 +43,20 @@ struct PaddingValues {
int16_t height_offset;
};
// Padding description with one entry per dimension (width, height, depth)
// and a matching per-dimension offset.
// NOTE(review): presumably the three-dimensional counterpart of
// PaddingValues -- confirm against the callers.
struct Padding3DValues {
// Padding amount over the width dimension.
int16_t width;
// Padding amount over the height dimension.
int16_t height;
// Padding amount over the depth dimension.
int16_t depth;
// offset is used for calculating "remaining" padding, for example, `width`
// is 1 and `width_offset` is 1, so padding_left is 1 while padding_right is
// 1 + 1 = 2.
int16_t width_offset;
// Same as width_offset except it's over the height dimension.
int16_t height_offset;
// Same as width_offset except it's over the depth dimension.
int16_t depth_offset;
};
// This enumeration allows for non-default formats for the weights array
// of a fully-connected operator, allowing the use of special optimized
// runtime paths.
@@ -170,7 +184,11 @@ class RuntimeShape {
// rolls out.
RuntimeShape(RuntimeShape const& other) : size_(other.DimensionsCount()) {
if (size_ > kMaxSmallSize) {
#ifdef TF_LITE_STATIC_MEMORY
TFLITE_CHECK(false && "No shape resizing supported on this platform");
#else
dims_pointer_ = new int32_t[size_];
#endif
}
std::memcpy(DimsData(), other.DimsData(), sizeof(int32_t) * size_);
}
@@ -392,6 +410,20 @@ inline int Offset(const RuntimeShape& shape, int i0, int i1, int i2, int i3) {
return ((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3;
}
// Returns the flat, row-major element index of (i0, i1, i2, i3, i4) in a
// 5-D shape, with i4 as the fastest-varying index. In debug builds every
// index is checked against the extent of its dimension.
inline int Offset(const RuntimeShape& shape, int i0, int i1, int i2, int i3,
                  int i4) {
  TFLITE_DCHECK_EQ(shape.DimensionsCount(), 5);
  const int* extents = reinterpret_cast<const int*>(shape.DimsDataUpTo5D());
  TFLITE_DCHECK(i0 >= 0 && i0 < extents[0]);
  TFLITE_DCHECK(i1 >= 0 && i1 < extents[1]);
  TFLITE_DCHECK(i2 >= 0 && i2 < extents[2]);
  TFLITE_DCHECK(i3 >= 0 && i3 < extents[3]);
  TFLITE_DCHECK(i4 >= 0 && i4 < extents[4]);
  // Fold in one dimension at a time, outermost first.
  int flat_index = i0;
  flat_index = flat_index * extents[1] + i1;
  flat_index = flat_index * extents[2] + i2;
  flat_index = flat_index * extents[3] + i3;
  flat_index = flat_index * extents[4] + i4;
  return flat_index;
}
inline int Offset(const Dims<4>& dims, int i0, int i1, int i2, int i3) {
TFLITE_DCHECK(i0 >= 0 && i0 < dims.sizes[0]);
TFLITE_DCHECK(i1 >= 0 && i1 < dims.sizes[1]);
@@ -840,6 +872,19 @@ struct ConvParams {
float float_activation_max;
};
// Parameter bundle holding padding, stride and dilation values for the
// width, height and depth dimensions together with a float activation range.
// NOTE(review): presumably consumed by a 3-D convolution kernel; no quantized
// fields are declared here -- confirm that only float is intended.
struct Conv3DParams {
// Padding amounts and offsets for the three dimensions.
Padding3DValues padding_values;
// Stride along each dimension.
int stride_width;
int stride_height;
int stride_depth;
// Dilation along each dimension.
int dilation_width;
int dilation_height;
int dilation_depth;
// float activation params.
float float_activation_min;
float float_activation_max;
};
// Parameter bundle carrying a single block size.
// NOTE(review): presumably for the depth-to-space op, going by the name --
// confirm against the kernel that reads it.
struct DepthToSpaceParams {
int32_t block_size;
};
@@ -907,6 +952,7 @@ struct FullyConnectedParams {
// Parameter bundle carrying an axis and a batch-dimension count.
// NOTE(review): presumably for the gather op, going by the name -- confirm
// the exact meaning of each field against the kernel that reads it.
struct GatherParams {
int16_t axis;
int16_t batch_dims;
};
struct L2NormalizationParams {
@@ -1025,9 +1071,9 @@ struct ResizeNearestNeighborParams {
// Parameter bundle for slicing: a list of begin values and a list of sizes,
// each stored in a fixed five-element array next to its own count field.
//
// `begin` and `size` are each declared exactly once, with room for five
// entries. Declaring them a second time with four entries would be a member
// redeclaration and would not compile.
// NOTE(review): presumably `begin_count` / `size_count` give the number of
// valid leading entries in the matching array -- confirm against the callers.
struct SliceParams {
  int8_t begin_count;
  int32_t begin[5];
  int8_t size_count;
  int32_t size[5];
};
struct SoftmaxParams {