Mirror of https://github.com/jomjol/AI-on-the-edge-device.git (synced 2025-12-08 20:46:52 +03:00)

Commit: Rolling 20210420
@@ -178,14 +178,54 @@ inline int32_t MultiplyByQuantizedMultiplier(int64_t x,
  // - input x is in the range -(1<<47) <= x < (1<<47)
  assert(quantized_multiplier >= 0);
  assert(shift >= -31 && shift < 8);
  assert(x >= -(static_cast<int64_t>(1) << 47) &&
         x < (static_cast<int64_t>(1) << 47));

  int32_t reduced_multiplier = (quantized_multiplier + (1 << 15)) >> 16;
  int32_t reduced_multiplier = (quantized_multiplier < 0x7FFF0000)
                                   ? ((quantized_multiplier + (1 << 15)) >> 16)
                                   : 0x7FFF;
  int total_shift = 15 - shift;
  x = (x * (int64_t)reduced_multiplier) + ((int64_t)1 << (total_shift - 1));
  int32_t result = x >> total_shift;
  return result;
}

#ifdef USE_NEON
// Round uses ARM's rounding shift right.
inline int32x4x4_t MultiplyByQuantizedMultiplier4Rows(
    int32x4x4_t input_val, int32_t quantized_multiplier, int shift) {
  const int left_shift = std::max(shift, 0);
  const int right_shift = std::min(shift, 0);
  int32x4x4_t result;

  int32x4_t multiplier_dup = vdupq_n_s32(quantized_multiplier);
  int32x4_t left_shift_dup = vdupq_n_s32(left_shift);
  int32x4_t right_shift_dup = vdupq_n_s32(right_shift);

  result.val[0] =
      vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[0], left_shift_dup),
                               multiplier_dup),
                 right_shift_dup);

  result.val[1] =
      vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[1], left_shift_dup),
                               multiplier_dup),
                 right_shift_dup);

  result.val[2] =
      vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[2], left_shift_dup),
                               multiplier_dup),
                 right_shift_dup);

  result.val[3] =
      vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[3], left_shift_dup),
                               multiplier_dup),
                 right_shift_dup);

  return result;
}
#endif

template <typename T>
int CountLeadingZeros(T integer_input) {
  static_assert(std::is_unsigned<T>::value,
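A minimal standalone sketch of the reduced-multiplier path introduced in the hunk above (illustrative values; the name MultiplyByQuantizedMultiplier64 and the main() driver are not part of the diff):

#include <cassert>
#include <cstdint>
#include <cstdio>

// Rescales a 64-bit accumulator by a Q31 multiplier plus shift, mirroring the
// hunk above: the multiplier is rounded down to Q15 (saturating near
// INT32_MAX) and the product is shifted with a rounding bias.
static int32_t MultiplyByQuantizedMultiplier64(int64_t x,
                                               int32_t quantized_multiplier,
                                               int shift) {
  assert(quantized_multiplier >= 0);
  assert(shift >= -31 && shift < 8);
  const int32_t reduced_multiplier =
      (quantized_multiplier < 0x7FFF0000)
          ? ((quantized_multiplier + (1 << 15)) >> 16)
          : 0x7FFF;
  const int total_shift = 15 - shift;
  x = x * static_cast<int64_t>(reduced_multiplier) +
      (static_cast<int64_t>(1) << (total_shift - 1));
  return static_cast<int32_t>(x >> total_shift);
}

int main() {
  // A Q31 multiplier of 1 << 30 is ~0.5, so an accumulator of 1000 becomes ~500.
  printf("%d\n", MultiplyByQuantizedMultiplier64(1000, 1 << 30, 0));
  return 0;
}
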
@@ -261,10 +301,11 @@ inline void gen_lut(double (*func)(double), double min, double max,
        TfLiteRound(func(min + i * step + half_step) * 32768.0);
    double midpoint_err = midpoint_interp_val - midpoint_val;
    double bias = TfLiteRound(midpoint_err / 2.0);
    table[i] = std::min(std::max(sample_val - bias, -32768.0), 32767.0);
    table[i] = std::min<double>(std::max<double>(sample_val - bias, -32768.0),
                                32767.0);
  }
  table[num - 1] =
      std::min(std::max(TfLiteRound(func(max) * 32768.0), -32768.0), 32767.0);
  table[num - 1] = std::min<double>(
      std::max<double>(TfLiteRound(func(max) * 32768.0), -32768.0), 32767.0);
}

// generate INT16 LUT for function(), e.g., table exp(x) and 1/(1+x) used in
@@ -289,10 +330,11 @@ inline void gen_lut(float (*func)(float), float min, float max, int16_t* table,
        TfLiteRound(func(min + i * step + half_step) * 32768.0f);
    float midpoint_err = midpoint_interp_val - midpoint_val;
    float bias = TfLiteRound(midpoint_err / 2.0f);
    table[i] = std::min(std::max(sample_val - bias, -32768.0f), 32767.0f);
    table[i] = std::min<float>(std::max<float>(sample_val - bias, -32768.0f),
                               32767.0f);
  }
  table[num - 1] = std::min(
      std::max(TfLiteRound(func(max) * 32768.0f), -32768.0f), 32767.0f);
  table[num - 1] = std::min<float>(
      std::max<float>(TfLiteRound(func(max) * 32768.0f), -32768.0f), 32767.0f);
}

// int16_t func table lookup, e.g., lookup exp() and 1/(1+x) used in softmax

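A hedged sketch of how such an int16 LUT can be produced with the explicit std::min<double>/std::max<double> clamps from this hunk (table size and range are caller-chosen; the midpoint bias correction that gen_lut performs is omitted here for brevity):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Fills `table` with round(sigmoid(x) * 32768) samples over [min, max],
// clamped into the int16_t range exactly as in the updated gen_lut code.
void BuildSigmoidLut(double min, double max, int16_t* table, int num) {
  const double step = (max - min) / (num - 1);
  for (int i = 0; i < num; ++i) {
    const double sample_val =
        std::round(1.0 / (1.0 + std::exp(-(min + i * step))) * 32768.0);
    table[i] = static_cast<int16_t>(
        std::min<double>(std::max<double>(sample_val, -32768.0), 32767.0));
  }
}
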
@@ -34,6 +34,7 @@ namespace tflite {
}

DECLARE_STD_GLOBAL_SWITCH1(TfLiteRound, round);
DECLARE_STD_GLOBAL_SWITCH1(TfLiteExpm1, expm1);

}  // namespace tflite


@@ -15,7 +15,6 @@ limitations under the License.
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_PORTABLE_TENSOR_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_PORTABLE_TENSOR_H_

#include <complex>
#include <vector>

#include "tensorflow/lite/c/common.h"

@@ -289,7 +289,7 @@ void PreprocessSoftmaxScaling(double beta, double input_scale,
    input_beta_real_multiplier = (1ll << 31) - 1.0;
  }
#else   // TFLITE_EMULATE_FLOAT
  const double input_beta_real_multiplier = std::min(
  const double input_beta_real_multiplier = std::min<double>(
      beta * input_scale * (1 << (31 - input_integer_bits)), (1ll << 31) - 1.0);
#endif  // TFLITE_EMULATE_FLOAT


@@ -202,14 +202,6 @@ inline void Add(const ArithmeticParams& params,
  }
}

// TODO(jiawen): We can implement BroadcastAdd on buffers of arbitrary
// dimensionality if the runtime code does a single loop over one dimension
// that handles broadcasting as the base case. The code generator would then
// generate max(D1, D2) nested for loops.
// TODO(benoitjacob): BroadcastAdd is intentionally duplicated from
// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
// reference_ops.h.
inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
                               const RuntimeShape& input1_shape,
                               const float* input1_data,

@@ -0,0 +1,42 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_N_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_N_H_

#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {
namespace reference_ops {

// T is expected to be either float or int.
template <typename T>
inline void AddN(const RuntimeShape& input_shape, const size_t num_inputs,
                 const T* const* input_data, T* output_data) {
  // All inputs and output should have the same shape, this is checked during
  // Prepare stage.
  const size_t size = input_shape.FlatSize();
  for (size_t i = 0; i < size; ++i) {
    T x = 0;
    for (size_t j = 0; j < num_inputs; ++j) {
      x += input_data[j][i];
    }
    output_data[i] = x;
  }
}

}  // namespace reference_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_N_H_
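A minimal usage sketch for the new AddN reference kernel (shapes and values are illustrative):

#include "tensorflow/lite/kernels/internal/reference/add_n.h"

void AddNExample() {
  const tflite::RuntimeShape shape({1, 2, 2, 1});  // four elements
  const float a[4] = {1.f, 2.f, 3.f, 4.f};
  const float b[4] = {10.f, 20.f, 30.f, 40.f};
  const float* const inputs[2] = {a, b};
  float out[4];
  // All inputs share one shape; the kernel sums element-wise across them.
  tflite::reference_ops::AddN(shape, /*num_inputs=*/2, inputs, out);
  // out is now {11, 22, 33, 44}.
}
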
@@ -15,12 +15,23 @@ limitations under the License.
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ARG_MIN_MAX_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ARG_MIN_MAX_H_

#include <functional>

#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {

namespace reference_ops {

template <typename T>
std::function<bool(T, T)> GetComparefunction(bool is_arg_max) {
  if (is_arg_max) {
    return std::greater<T>();
  } else {
    return std::less<T>();
  }
}

template <typename T1, typename T2, typename T3, typename Cmp>
void ArgMinMax(const RuntimeShape& input1_shape, const T1* input1_data,
               const T3* input2_data, const RuntimeShape& output_shape,
@@ -62,6 +73,15 @@ void ArgMinMax(const RuntimeShape& input1_shape, const T1* input1_data,
    }
  }
}

template <typename T1, typename T2, typename T3>
void ArgMinMax(const RuntimeShape& input1_shape, const T1* input1_data,
               const T3* input2_data, const RuntimeShape& output_shape,
               T2* output_data, const bool is_arg_max) {
  ArgMinMax(input1_shape, input1_data, input2_data, output_shape, output_data,
            GetComparefunction<T1>(is_arg_max));
}

}  // namespace reference_ops
}  // namespace tflite


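A usage sketch of the convenience overload added here, which forwards to the comparator-based ArgMinMax through GetComparefunction (this assumes the usual semantics, not shown in the hunk, where input2_data holds the reduction axis):

#include <cstdint>
#include "tensorflow/lite/kernels/internal/reference/arg_min_max.h"

void ArgMaxExample() {
  const tflite::RuntimeShape input_shape({1, 1, 1, 4});
  const float input[4] = {0.1f, 0.9f, 0.4f, 0.2f};
  const int32_t axis = 3;  // reduce over the innermost dimension
  const tflite::RuntimeShape output_shape({1, 1, 1});
  int32_t output[1];
  // is_arg_max = true selects std::greater<float>, i.e. arg max.
  tflite::reference_ops::ArgMinMax(input_shape, input, &axis, output_shape,
                                   output, /*is_arg_max=*/true);
  // output[0] == 1, the index of the largest input value.
}
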
@@ -0,0 +1,101 @@
|
||||
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BATCH_TO_SPACE_ND_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BATCH_TO_SPACE_ND_H_
|
||||
|
||||
#include <cmath>
|
||||
|
||||
#include "ruy/profiler/instrumentation.h" // from @ruy
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_ops {
|
||||
|
||||
// TODO(b/135760455): Move this method anonymous namespace in a cc file.
|
||||
inline RuntimeShape ExtendShapeBatchToSpace(const RuntimeShape& shape) {
|
||||
if (shape.DimensionsCount() == 4) {
|
||||
return shape;
|
||||
}
|
||||
RuntimeShape new_shape(4, 1);
|
||||
new_shape.SetDim(0, shape.Dims(0));
|
||||
new_shape.SetDim(1, shape.Dims(1));
|
||||
new_shape.SetDim(3, shape.Dims(2));
|
||||
return new_shape;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void BatchToSpaceND(const RuntimeShape& unextended_input1_shape,
|
||||
const T* input1_data,
|
||||
const RuntimeShape& unextended_input2_shape,
|
||||
const int32_t* block_shape_data,
|
||||
const RuntimeShape& unextended_input3_shape,
|
||||
const int32_t* crops_data,
|
||||
const RuntimeShape& unextended_output_shape,
|
||||
T* output_data) {
|
||||
ruy::profiler::ScopeLabel label("BatchToSpaceND");
|
||||
TFLITE_DCHECK_GE(unextended_input1_shape.DimensionsCount(), 3);
|
||||
TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(unextended_input1_shape.DimensionsCount(),
|
||||
unextended_output_shape.DimensionsCount());
|
||||
|
||||
const RuntimeShape input1_shape =
|
||||
ExtendShapeBatchToSpace(unextended_input1_shape);
|
||||
const RuntimeShape output_shape =
|
||||
ExtendShapeBatchToSpace(unextended_output_shape);
|
||||
|
||||
const int output_width = output_shape.Dims(2);
|
||||
const int output_height = output_shape.Dims(1);
|
||||
const int output_batch_size = output_shape.Dims(0);
|
||||
|
||||
const int depth = input1_shape.Dims(3);
|
||||
const int input_width = input1_shape.Dims(2);
|
||||
const int input_height = input1_shape.Dims(1);
|
||||
const int input_batch_size = input1_shape.Dims(0);
|
||||
|
||||
const int block_shape_height = block_shape_data[0];
|
||||
const int block_shape_width =
|
||||
unextended_input1_shape.DimensionsCount() == 4 ? block_shape_data[1] : 1;
|
||||
const int crops_top = crops_data[0];
|
||||
const int crops_left =
|
||||
unextended_input1_shape.DimensionsCount() == 4 ? crops_data[2] : 0;
|
||||
for (int in_batch = 0; in_batch < input_batch_size; ++in_batch) {
|
||||
const int out_batch = in_batch % output_batch_size;
|
||||
const int spatial_offset = in_batch / output_batch_size;
|
||||
for (int in_h = 0; in_h < input_height; ++in_h) {
|
||||
const int out_h = in_h * block_shape_height +
|
||||
spatial_offset / block_shape_width - crops_top;
|
||||
if (out_h < 0 || out_h >= output_height) {
|
||||
continue;
|
||||
}
|
||||
for (int in_w = 0; in_w < input_width; ++in_w) {
|
||||
const int out_w = in_w * block_shape_width +
|
||||
spatial_offset % block_shape_width - crops_left;
|
||||
|
||||
if (out_w < 0 || out_w >= output_width) {
|
||||
continue;
|
||||
}
|
||||
T* out = output_data + Offset(output_shape, out_batch, out_h, out_w, 0);
|
||||
const T* in =
|
||||
input1_data + Offset(input1_shape, in_batch, in_h, in_w, 0);
|
||||
memcpy(out, in, depth * sizeof(T));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BATCH_TO_SPACE_ND_H_
|
||||
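A small usage sketch for the BatchToSpaceND reference kernel above (4-D case, no cropping; shapes and values are illustrative): two batches of 1x1x1 are stitched back into a single batch of height 2.

#include <cstdint>
#include "tensorflow/lite/kernels/internal/reference/batch_to_space_nd.h"

void BatchToSpaceExample() {
  const tflite::RuntimeShape input_shape({2, 1, 1, 1});
  const float input[2] = {10.f, 20.f};
  const tflite::RuntimeShape block_shape_shape({2});
  const int32_t block_shape[2] = {2, 1};  // block of 2 along height, 1 along width
  const tflite::RuntimeShape crops_shape({2, 2});
  const int32_t crops[4] = {0, 0, 0, 0};
  const tflite::RuntimeShape output_shape({1, 2, 1, 1});
  float output[2];
  tflite::reference_ops::BatchToSpaceND(input_shape, input, block_shape_shape,
                                        block_shape, crops_shape, crops,
                                        output_shape, output);
  // output == {10, 20}
}
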
@@ -23,9 +23,6 @@ namespace tflite {
|
||||
|
||||
namespace reference_ops {
|
||||
|
||||
// TODO(ycling): Refactoring. Remove BroadcastLogical and use the more
|
||||
// generalized and efficient BroadcastBinaryFunction.
|
||||
//
|
||||
// Also appears to duplicate MinimumMaximum.
|
||||
//
|
||||
// R: Result type. T1: Input 1 type. T2: Input 2 type.
|
||||
@@ -63,7 +60,6 @@ inline void BroadcastBinaryFunction4DSlow(
|
||||
}
|
||||
|
||||
// R: Result type. T1: Input 1 type. T2: Input 2 type.
|
||||
// TODO(renjieliu): Refactor other binary functions to use this one.
|
||||
template <typename R, typename T1, typename T2>
|
||||
inline void BinaryFunction(const RuntimeShape& input1_shape,
|
||||
const T1* input1_data,
|
||||
|
||||
@@ -68,8 +68,7 @@ inline void Concatenation(const ConcatenationParams& params,
  }
}

// TODO(prabhumk): This is the same as the optimized implementation.
// TODO(prabhumk): The quantized implementation of concatenation isn't fully
// TODO(b/174275780): The quantized implementation of concatenation isn't fully
// quantized as it takes scale as a floating point value. This should be fixed
// when optimizing this routine further.
inline void ConcatenationWithScaling(const ConcatenationParams& params,

@@ -15,16 +15,13 @@ limitations under the License.
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CONV_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CONV_H_

#include "tensorflow/lite/kernels/internal/types.h"
#include "tensorflow/lite/kernels/internal/common.h"


#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {

namespace reference_ops {


inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
                 const float* input_data, const RuntimeShape& filter_shape,
                 const float* filter_data, const RuntimeShape& bias_shape,
@@ -108,8 +105,8 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
                 uint8_t* output_data, const RuntimeShape& im2col_shape,
                 uint8_t* im2col_data, void* cpu_backend_context) {
  (void)cpu_backend_context;  // only used in optimized code.
  (void)im2col_data;   // only used in optimized code.
  (void)im2col_shape;  // only used in optimized code.
  (void)im2col_data;   // only used in optimized code.
  (void)im2col_shape;  // only used in optimized code.
  const int stride_width = params.stride_width;
  const int stride_height = params.stride_height;
  const int dilation_width_factor = params.dilation_width_factor;

@@ -0,0 +1,239 @@
|
||||
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DIV_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DIV_H_
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
namespace reference_ops {
|
||||
|
||||
template <typename T>
|
||||
inline void DivCheckArithmeticParams(const ArithmeticParams& params) {
|
||||
TFLITE_DCHECK_LE(params.quantized_activation_min,
|
||||
params.quantized_activation_max);
|
||||
// Input offset is negative input zero point. Activation tensors are
|
||||
// asymmetric quantized so they span the full int8 range.
|
||||
constexpr int32_t max_value =
|
||||
static_cast<int32_t>(std::numeric_limits<T>::max());
|
||||
TFLITE_DCHECK_GE(params.input1_offset, -max_value);
|
||||
TFLITE_DCHECK_LE(params.input1_offset, max_value);
|
||||
TFLITE_DCHECK_GE(params.input2_offset, -max_value);
|
||||
TFLITE_DCHECK_LE(params.input2_offset, max_value);
|
||||
TFLITE_DCHECK_GE(params.output_offset, -max_value);
|
||||
TFLITE_DCHECK_LE(params.output_offset, max_value);
|
||||
}
|
||||
|
||||
// Element-wise div that can often be used for inner loop of broadcast Div as
|
||||
// well as the non-broadcast Div.
|
||||
template <typename T>
|
||||
inline void DivElementwise(int size, const ArithmeticParams& params,
|
||||
const T* input1_data, const T* input2_data,
|
||||
T* output_data) {
|
||||
DivCheckArithmeticParams<T>(params);
|
||||
|
||||
for (int i = 0; i < size; ++i) {
|
||||
const int32_t input1_val = params.input1_offset + input1_data[i];
|
||||
const int32_t input2_val = params.input2_offset + input2_data[i];
|
||||
TFLITE_DCHECK_NE(input2_val, 0);
|
||||
int recip_shift;
|
||||
const int32_t input2_inv =
|
||||
(input2_val > 0) ? GetReciprocal(input2_val, 31, &recip_shift)
|
||||
: -GetReciprocal(-input2_val, 31, &recip_shift);
|
||||
const int headroom = CountLeadingSignBits(input1_val);
|
||||
const int32_t unscaled_quotient =
|
||||
MultiplyByQuantizedMultiplierGreaterThanOne(input1_val, input2_inv,
|
||||
headroom);
|
||||
const int total_shift = params.output_shift - recip_shift - headroom;
|
||||
const int32_t unclamped_result =
|
||||
params.output_offset +
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
unscaled_quotient, params.output_multiplier, total_shift);
|
||||
const int32_t clamped_output =
|
||||
std::min(params.quantized_activation_max,
|
||||
std::max(params.quantized_activation_min, unclamped_result));
|
||||
output_data[i] = static_cast<T>(clamped_output);
|
||||
}
|
||||
}
|
||||
|
||||
inline void Div(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape, const uint8_t* input1_data,
|
||||
const RuntimeShape& input2_shape, const uint8_t* input2_data,
|
||||
const RuntimeShape& output_shape, uint8_t* output_data) {
|
||||
TFLITE_DCHECK_LE(params.quantized_activation_min,
|
||||
params.quantized_activation_max);
|
||||
const int flat_size =
|
||||
MatchingElementsSize(input1_shape, input2_shape, output_shape);
|
||||
|
||||
DivElementwise(flat_size, params, input1_data, input2_data, output_data);
|
||||
}
|
||||
|
||||
inline void Div(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape, const int8_t* input1_data,
|
||||
const RuntimeShape& input2_shape, const int8_t* input2_data,
|
||||
const RuntimeShape& output_shape, int8_t* output_data) {
|
||||
TFLITE_DCHECK_LE(params.quantized_activation_min,
|
||||
params.quantized_activation_max);
|
||||
const int flat_size =
|
||||
MatchingElementsSize(input1_shape, input2_shape, output_shape);
|
||||
|
||||
DivElementwise(flat_size, params, input1_data, input2_data, output_data);
|
||||
}
|
||||
|
||||
template <typename T, int N = 5>
|
||||
inline void BroadcastDivSlowQuantized(
|
||||
const ArithmeticParams& params, const RuntimeShape& unextended_input1_shape,
|
||||
const T* input1_data, const RuntimeShape& unextended_input2_shape,
|
||||
const T* input2_data, const RuntimeShape& unextended_output_shape,
|
||||
T* output_data) {
|
||||
TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), N);
|
||||
TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), N);
|
||||
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), N);
|
||||
|
||||
NdArrayDesc<N> desc1;
|
||||
NdArrayDesc<N> desc2;
|
||||
NdArrayDesc<N> output_desc;
|
||||
NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
|
||||
unextended_input2_shape, &desc1, &desc2);
|
||||
CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape),
|
||||
&output_desc);
|
||||
|
||||
DivCheckArithmeticParams<T>(params);
|
||||
|
||||
auto div_func = [&](int indexes[N]) {
|
||||
const int32_t input1_val =
|
||||
params.input1_offset + input1_data[SubscriptToIndex(desc1, indexes)];
|
||||
const int32_t input2_val =
|
||||
params.input2_offset + input2_data[SubscriptToIndex(desc2, indexes)];
|
||||
TFLITE_DCHECK_NE(input2_val, 0);
|
||||
int recip_shift;
|
||||
const int32_t input2_inv =
|
||||
(input2_val > 0) ? GetReciprocal(input2_val, 31, &recip_shift)
|
||||
: -GetReciprocal(-input2_val, 31, &recip_shift);
|
||||
const int headroom = CountLeadingSignBits(input1_val);
|
||||
const int32_t unscaled_quotient =
|
||||
MultiplyByQuantizedMultiplierGreaterThanOne(input1_val, input2_inv,
|
||||
headroom);
|
||||
const int total_shift = params.output_shift - recip_shift - headroom;
|
||||
const int32_t unclamped_result =
|
||||
params.output_offset +
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
unscaled_quotient, params.output_multiplier, total_shift);
|
||||
const int32_t clamped_output =
|
||||
std::min(params.quantized_activation_max,
|
||||
std::max(params.quantized_activation_min, unclamped_result));
|
||||
output_data[SubscriptToIndex(output_desc, indexes)] =
|
||||
static_cast<T>(clamped_output);
|
||||
};
|
||||
NDOpsHelper<N>(output_desc, div_func);
|
||||
}
|
||||
|
||||
template <int N = 5>
|
||||
inline void BroadcastDivSlow(const ArithmeticParams& params,
|
||||
const RuntimeShape& unextended_input1_shape,
|
||||
const uint8_t* input1_data,
|
||||
const RuntimeShape& unextended_input2_shape,
|
||||
const uint8_t* input2_data,
|
||||
const RuntimeShape& unextended_output_shape,
|
||||
uint8_t* output_data) {
|
||||
BroadcastDivSlowQuantized<uint8_t, N>(
|
||||
params, unextended_input1_shape, input1_data, unextended_input2_shape,
|
||||
input2_data, unextended_output_shape, output_data);
|
||||
}
|
||||
|
||||
template <int N = 5>
|
||||
inline void BroadcastDivSlow(const ArithmeticParams& params,
|
||||
const RuntimeShape& unextended_input1_shape,
|
||||
const int8_t* input1_data,
|
||||
const RuntimeShape& unextended_input2_shape,
|
||||
const int8_t* input2_data,
|
||||
const RuntimeShape& unextended_output_shape,
|
||||
int8_t* output_data) {
|
||||
BroadcastDivSlowQuantized<int8_t, N>(
|
||||
params, unextended_input1_shape, input1_data, unextended_input2_shape,
|
||||
input2_data, unextended_output_shape, output_data);
|
||||
}
|
||||
|
||||
// TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
|
||||
// dimensionality if the runtime code does a single loop over one dimension
|
||||
// that handles broadcasting as the base case. The code generator would then
|
||||
// generate max(D1, D2) nested for loops.
|
||||
template <typename T, int N = 5>
|
||||
void BroadcastDivSlow(const ArithmeticParams& params,
|
||||
const RuntimeShape& unextended_input1_shape,
|
||||
const T* input1_data,
|
||||
const RuntimeShape& unextended_input2_shape,
|
||||
const T* input2_data,
|
||||
const RuntimeShape& unextended_output_shape,
|
||||
T* output_data) {
|
||||
T output_activation_min;
|
||||
T output_activation_max;
|
||||
GetActivationParams(params, &output_activation_min, &output_activation_max);
|
||||
|
||||
TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), N);
|
||||
TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), N);
|
||||
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), N);
|
||||
|
||||
NdArrayDesc<N> desc1;
|
||||
NdArrayDesc<N> desc2;
|
||||
NdArrayDesc<N> output_desc;
|
||||
NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
|
||||
unextended_input2_shape, &desc1, &desc2);
|
||||
CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape),
|
||||
&output_desc);
|
||||
|
||||
// In Tensorflow, the dimensions are canonically named (batch_number, row,
|
||||
// col, channel), with extents (batches, height, width, depth), with the
|
||||
// trailing dimension changing most rapidly (channels has the smallest
|
||||
// stride, typically 1 element).
|
||||
//
|
||||
// In generated C code, we store arrays with the dimensions reversed. The
|
||||
// first dimension has smallest stride.
|
||||
|
||||
auto div_func = [&](int indexes[N]) {
|
||||
output_data[SubscriptToIndex(output_desc, indexes)] =
|
||||
ActivationFunctionWithMinMax(
|
||||
input1_data[SubscriptToIndex(desc1, indexes)] /
|
||||
input2_data[SubscriptToIndex(desc2, indexes)],
|
||||
output_activation_min, output_activation_max);
|
||||
};
|
||||
NDOpsHelper<N>(output_desc, div_func);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void Div(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape, const T* input1_data,
|
||||
const RuntimeShape& input2_shape, const T* input2_data,
|
||||
const RuntimeShape& output_shape, T* output_data) {
|
||||
T output_activation_min;
|
||||
T output_activation_max;
|
||||
GetActivationParams(params, &output_activation_min, &output_activation_max);
|
||||
|
||||
const int flat_size =
|
||||
MatchingElementsSize(input1_shape, input2_shape, output_shape);
|
||||
for (int i = 0; i < flat_size; ++i) {
|
||||
output_data[i] = ActivationFunctionWithMinMax(
|
||||
input1_data[i] / input2_data[i], output_activation_min,
|
||||
output_activation_max);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DIV_H_
|
||||
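A float usage sketch for the new Div reference kernel (only the float activation bounds are read on this path; other ArithmeticParams fields are left untouched here):

#include <limits>
#include "tensorflow/lite/kernels/internal/reference/div.h"

void DivFloatExample() {
  tflite::ArithmeticParams params;
  params.float_activation_min = std::numeric_limits<float>::lowest();
  params.float_activation_max = std::numeric_limits<float>::max();
  const tflite::RuntimeShape shape({1, 1, 1, 4});
  const float numer[4] = {8.f, 6.f, 4.f, 2.f};
  const float denom[4] = {2.f, 3.f, 4.f, 2.f};
  float out[4];
  tflite::reference_ops::Div(params, shape, numer, shape, denom, shape, out);
  // out == {4, 2, 1, 1}
}
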
@@ -0,0 +1,37 @@
|
||||
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ELU_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ELU_H_
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/cppmath.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
namespace reference_ops {
|
||||
|
||||
inline void Elu(const RuntimeShape& input_shape, const float* input_data,
|
||||
const RuntimeShape& output_shape, float* output_data) {
|
||||
const int flat_size = MatchingFlatSize(input_shape, output_shape);
|
||||
for (int i = 0; i < flat_size; ++i) {
|
||||
const float val = input_data[i];
|
||||
output_data[i] = val < 0.0f ? TfLiteExpm1(val) : val;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ELU_H_
|
||||
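A tiny usage sketch for the Elu kernel above; for negative inputs it returns expm1(x) = e^x - 1 via the TfLiteExpm1 switch declared earlier in this commit (values are illustrative):

#include "tensorflow/lite/kernels/internal/reference/elu.h"

void EluExample() {
  const tflite::RuntimeShape shape({1, 1, 1, 3});
  const float input[3] = {-1.f, 0.f, 2.f};
  float output[3];
  tflite::reference_ops::Elu(shape, input, shape, output);
  // output ~= {-0.632, 0, 2}
}
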
@@ -0,0 +1,38 @@
|
||||
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_EXP_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_EXP_H_
|
||||
|
||||
#include <cmath>
|
||||
|
||||
#include "ruy/profiler/instrumentation.h" // from @ruy
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_ops {
|
||||
|
||||
template <typename T>
|
||||
inline void Exp(const T* input_data, const size_t num_elements,
|
||||
T* output_data) {
|
||||
ruy::profiler::ScopeLabel label("Exp");
|
||||
for (size_t idx = 0; idx < num_elements; ++idx) {
|
||||
output_data[idx] = std::exp(input_data[idx]);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_EXP_H_
|
||||
@@ -0,0 +1,38 @@
|
||||
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FILL_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FILL_H_
|
||||
|
||||
#include <cmath>
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_ops {
|
||||
|
||||
template <typename T>
|
||||
void Fill(const RuntimeShape& value_shape, const T* value_data,
|
||||
const RuntimeShape& output_shape, T* output_data) {
|
||||
TFLITE_DCHECK_EQ(value_shape.DimensionsCount(), 0);
|
||||
const int flat_size = output_shape.FlatSize();
|
||||
for (int i = 0; i < flat_size; ++i) {
|
||||
output_data[i] = *value_data;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FILL_H_
|
||||
@@ -31,7 +31,7 @@ inline void FullyConnected(
|
||||
float* output_data) {
|
||||
const float output_activation_min = params.float_activation_min;
|
||||
const float output_activation_max = params.float_activation_max;
|
||||
// TODO(benoitjacob): This really should be:
|
||||
// TODO(b/62193649): This really should be:
|
||||
// const int batches = ArraySize(output_dims, 1);
|
||||
// but the current --variable_batch hack consists in overwriting the 3rd
|
||||
// dimension with the runtime batch size, as we don't keep track for each
|
||||
@@ -76,7 +76,7 @@ inline void FullyConnected(
|
||||
TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
|
||||
|
||||
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
|
||||
// TODO(benoitjacob): This really should be:
|
||||
// TODO(b/62193649): This really should be:
|
||||
// const int batches = ArraySize(output_dims, 1);
|
||||
// but the current --variable_batch hack consists in overwriting the 3rd
|
||||
// dimension with the runtime batch size, as we don't keep track for each
|
||||
@@ -123,7 +123,7 @@ inline void FullyConnected(
|
||||
|
||||
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
|
||||
TFLITE_DCHECK_EQ(output_offset, 0);
|
||||
// TODO(benoitjacob): This really should be:
|
||||
// TODO(b/62193649): This really should be:
|
||||
// const int batches = ArraySize(output_dims, 1);
|
||||
// but the current --variable_batch hack consists in overwriting the 3rd
|
||||
// dimension with the runtime batch size, as we don't keep track for each
|
||||
@@ -176,7 +176,7 @@ inline void ShuffledFullyConnected(
|
||||
TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1);
|
||||
TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
|
||||
TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
|
||||
// TODO(benoitjacob): This really should be:
|
||||
// TODO(b/62193649): This really should be:
|
||||
// const int batches = ArraySize(output_dims, 1);
|
||||
// but the current --variable_batch hack consists in overwriting the 3rd
|
||||
// dimension with the runtime batch size, as we don't keep track for each
|
||||
|
||||
@@ -34,55 +34,24 @@ inline void CheckArithmeticParams(const ArithmeticParams& params) {
|
||||
TFLITE_DCHECK_LE(-params.input2_offset, std::numeric_limits<int8_t>::max());
|
||||
}
|
||||
|
||||
// Element-wise add that can often be used for inner loop of broadcast add as
|
||||
// well as the non-broadcast add.
|
||||
inline void AddElementwise(int size, const ArithmeticParams& params,
|
||||
const int8_t* input1_data, const int8_t* input2_data,
|
||||
int8_t* output_data) {
|
||||
inline void ElementWise(
|
||||
int size, const ArithmeticParams& params, const int8_t* input1_data,
|
||||
const int8_t* input2_data, int8_t* output_data,
|
||||
void (*check_arithmetic_params)(const ArithmeticParams&),
|
||||
int8_t (*binary_func)(int8_t, int8_t, const ArithmeticParams&)) {
|
||||
CheckArithmeticParams(params);
|
||||
|
||||
for (int i = 0; i < size; ++i) {
|
||||
const int32_t input1_val = params.input1_offset + input1_data[i];
|
||||
const int32_t input2_val = params.input2_offset + input2_data[i];
|
||||
const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
|
||||
const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
|
||||
const int32_t scaled_input1_val =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_input1_val, params.input1_multiplier, params.input1_shift);
|
||||
const int32_t scaled_input2_val =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_input2_val, params.input2_multiplier, params.input2_shift);
|
||||
const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
|
||||
const int32_t raw_output =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
raw_sum, params.output_multiplier, params.output_shift) +
|
||||
params.output_offset;
|
||||
const int32_t clamped_output =
|
||||
std::min(params.quantized_activation_max,
|
||||
std::max(params.quantized_activation_min, raw_output));
|
||||
output_data[i] = static_cast<int8_t>(clamped_output);
|
||||
output_data[i] = binary_func(input1_data[i], input2_data[i], params);
|
||||
}
|
||||
}
|
||||
|
||||
inline void Add(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape, const int8_t* input1_data,
|
||||
const RuntimeShape& input2_shape, const int8_t* input2_data,
|
||||
const RuntimeShape& output_shape, int8_t* output_data) {
|
||||
CheckArithmeticParams(params);
|
||||
|
||||
const int flat_size =
|
||||
MatchingElementsSize(input1_shape, input2_shape, output_shape);
|
||||
|
||||
AddElementwise(flat_size, params, input1_data, input2_data, output_data);
|
||||
}
|
||||
|
||||
inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape,
|
||||
const int8_t* input1_data,
|
||||
const RuntimeShape& input2_shape,
|
||||
const int8_t* input2_data,
|
||||
const RuntimeShape& output_shape,
|
||||
int8_t* output_data) {
|
||||
inline void BroadcastBinaryFunction4DSlow(
|
||||
const ArithmeticParams& params, const RuntimeShape& input1_shape,
|
||||
const int8_t* input1_data, const RuntimeShape& input2_shape,
|
||||
const int8_t* input2_data, const RuntimeShape& output_shape,
|
||||
int8_t* output_data,
|
||||
void (*check_arithmetic_params)(const ArithmeticParams&),
|
||||
int8_t (*binary_func)(int8_t, int8_t, const ArithmeticParams&)) {
|
||||
NdArrayDesc<4> desc1;
|
||||
NdArrayDesc<4> desc2;
|
||||
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
|
||||
@@ -105,40 +74,70 @@ inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
|
||||
for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
|
||||
for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
|
||||
for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
|
||||
const int32_t input1_val =
|
||||
params.input1_offset +
|
||||
input1_data[SubscriptToIndex(desc1, b, y, x, c)];
|
||||
const int32_t input2_val =
|
||||
params.input2_offset +
|
||||
input2_data[SubscriptToIndex(desc2, b, y, x, c)];
|
||||
const int32_t shifted_input1_val =
|
||||
input1_val * (1 << params.left_shift);
|
||||
const int32_t shifted_input2_val =
|
||||
input2_val * (1 << params.left_shift);
|
||||
const int32_t scaled_input1_val =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_input1_val, params.input1_multiplier,
|
||||
params.input1_shift);
|
||||
const int32_t scaled_input2_val =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_input2_val, params.input2_multiplier,
|
||||
params.input2_shift);
|
||||
const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
|
||||
const int32_t raw_output =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
raw_sum, params.output_multiplier, params.output_shift) +
|
||||
params.output_offset;
|
||||
const int32_t clamped_output =
|
||||
std::min(params.quantized_activation_max,
|
||||
std::max(params.quantized_activation_min, raw_output));
|
||||
output_data[Offset(extended_output_shape, b, y, x, c)] =
|
||||
static_cast<int8_t>(clamped_output);
|
||||
output_data[Offset(extended_output_shape, b, y, x, c)] = binary_func(
|
||||
input1_data[SubscriptToIndex(desc1, b, y, x, c)],
|
||||
input2_data[SubscriptToIndex(desc2, b, y, x, c)], params);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline int8_t AddFunc(int8_t x, int8_t y, const ArithmeticParams& params) {
  const int32_t input1_val = params.input1_offset + x;
  const int32_t input2_val = params.input2_offset + y;
  const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
  const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
  const int32_t scaled_input1_val =
      MultiplyByQuantizedMultiplierSmallerThanOneExp(
          shifted_input1_val, params.input1_multiplier, params.input1_shift);
  const int32_t scaled_input2_val =
      MultiplyByQuantizedMultiplierSmallerThanOneExp(
          shifted_input2_val, params.input2_multiplier, params.input2_shift);
  const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
  const int32_t raw_output =
      MultiplyByQuantizedMultiplierSmallerThanOneExp(
          raw_sum, params.output_multiplier, params.output_shift) +
      params.output_offset;
  const int32_t clamped_output =
      std::min(params.quantized_activation_max,
               std::max(params.quantized_activation_min, raw_output));
  return static_cast<int8_t>(clamped_output);
}

// Element-wise add that can often be used for inner loop of broadcast add as
// well as the non-broadcast add.
inline void AddElementwise(int size, const ArithmeticParams& params,
                           const int8_t* input1_data, const int8_t* input2_data,
                           int8_t* output_data) {
  ElementWise(size, params, input1_data, input2_data, output_data,
              CheckArithmeticParams, AddFunc);
}

inline void Add(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const int8_t* input1_data,
                const RuntimeShape& input2_shape, const int8_t* input2_data,
                const RuntimeShape& output_shape, int8_t* output_data) {
  CheckArithmeticParams(params);

  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);

  AddElementwise(flat_size, params, input1_data, input2_data, output_data);
}

inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
                               const RuntimeShape& input1_shape,
                               const int8_t* input1_data,
                               const RuntimeShape& input2_shape,
                               const int8_t* input2_data,
                               const RuntimeShape& output_shape,
                               int8_t* output_data) {
  BroadcastBinaryFunction4DSlow(params, input1_shape, input1_data, input2_shape,
                                input2_data, output_shape, output_data,
                                CheckArithmeticParams, AddFunc);
}

}  // namespace reference_integer_ops
}  // namespace tflite


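The refactor above splits the int8 add into a per-element AddFunc plus generic ElementWise / BroadcastBinaryFunction4DSlow drivers, so other binary ops can reuse the same plumbing. A hypothetical sketch (SubFunc and SubElementwise are not part of this commit) that plugs a subtract into the same drivers:

inline int8_t SubFunc(int8_t x, int8_t y, const ArithmeticParams& params) {
  // Identical requantization pipeline to AddFunc, except the second scaled
  // operand is subtracted instead of added.
  const int32_t input1_val = params.input1_offset + x;
  const int32_t input2_val = params.input2_offset + y;
  const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
  const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
  const int32_t scaled_input1_val =
      MultiplyByQuantizedMultiplierSmallerThanOneExp(
          shifted_input1_val, params.input1_multiplier, params.input1_shift);
  const int32_t scaled_input2_val =
      MultiplyByQuantizedMultiplierSmallerThanOneExp(
          shifted_input2_val, params.input2_multiplier, params.input2_shift);
  const int32_t raw_diff = scaled_input1_val - scaled_input2_val;
  const int32_t raw_output =
      MultiplyByQuantizedMultiplierSmallerThanOneExp(
          raw_diff, params.output_multiplier, params.output_shift) +
      params.output_offset;
  const int32_t clamped_output =
      std::min(params.quantized_activation_max,
               std::max(params.quantized_activation_min, raw_output));
  return static_cast<int8_t>(clamped_output);
}

inline void SubElementwise(int size, const ArithmeticParams& params,
                           const int8_t* input1_data, const int8_t* input2_data,
                           int8_t* output_data) {
  ElementWise(size, params, input1_data, input2_data, output_data,
              CheckArithmeticParams, SubFunc);
}
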
@@ -101,7 +101,7 @@ inline void ConvPerChannel(
|
||||
// long as the filter size (filter_y * filter_x * in_channel)
|
||||
// does not exceed 2^16, which is the case in all the models
|
||||
// we have seen so far.
|
||||
// TODO(jianlijianli): Add a check to make sure the
|
||||
// TODO(b/174275578): Add a check to make sure the
|
||||
// accumulator depth is smaller than 2^16.
|
||||
acc += filter_val * (input_val + input_offset);
|
||||
}
|
||||
|
||||
@@ -95,7 +95,7 @@ inline void DepthwiseConvPerChannel(
|
||||
// long as the filter size (filter_y * filter_x * in_channel)
|
||||
// does not exceed 2^16, which is the case in all the models
|
||||
// we have seen so far.
|
||||
// TODO(jianlijianli): Add a check to make sure the
|
||||
// TODO(b/174275578): Add a check to make sure the
|
||||
// accumulator depth is smaller than 2^16.
|
||||
acc += filter_val * (input_val + input_offset);
|
||||
}
|
||||
|
||||
@@ -58,23 +58,36 @@ inline void Logistic(int32_t input_zero_point, int32_t input_range_radius,
|
||||
}
|
||||
}
|
||||
|
||||
inline void Logistic(int32_t input_multiplier, int32_t input_size,
|
||||
const int16_t* ptr_input_data, int16_t* ptr_output_data) {
|
||||
inline void Logistic(int32_t input_multiplier, int32_t input_left_shift,
|
||||
int32_t input_size, const int16_t* ptr_input_data,
|
||||
int16_t* ptr_output_data) {
|
||||
// We use the LUT for sigmoid and take into account, that
|
||||
// tanh(x) = 2*sigmoid(2*x) - 1
|
||||
|
||||
int32_t input_data_mul = (input_multiplier > 0) ? input_multiplier : 1;
|
||||
// We scale by 3/4 to expand range [-8,8]->[-10.7,10.7].
|
||||
// In case of general parameter scale, multiplier 3 is taken into account
|
||||
// in TanhPrepare function and it is included in
|
||||
// input_multiplier already.
|
||||
|
||||
TFLITE_DCHECK_GE(input_left_shift, 0);
|
||||
if (input_multiplier == 0) { // power of two case
|
||||
input_multiplier = 3 << input_left_shift;
|
||||
input_left_shift = 0;
|
||||
}
|
||||
|
||||
int32_t round = (input_left_shift > 0) ? 1 << (input_left_shift - 1) : 0;
|
||||
|
||||
for (int i = 0; i < input_size; ++i, ptr_input_data++, ptr_output_data++) {
|
||||
int32_t input_data = (*ptr_input_data) * input_data_mul;
|
||||
int32_t input_data =
|
||||
((*ptr_input_data) * input_multiplier + round) >> input_left_shift;
|
||||
|
||||
// Scale by 3/4 to expand range [-8,8]->[-10.7,10.7] and
|
||||
// we do interpolation on unsigned values.
|
||||
uint32_t abs_input_data = 3 * abs(input_data);
|
||||
// We do interpolation on unsigned values.
|
||||
uint32_t abs_input_data = abs(input_data);
|
||||
|
||||
// We divide by 2 power of 9, because
|
||||
// we need to divide by 2 in power of 7 for
|
||||
// the input conversion + 1/4 from the scale above.
|
||||
|
||||
// Define uh as uint32_t type not to make this function overflow.
|
||||
uint32_t uh = abs_input_data >> 9;
|
||||
uint32_t result;
|
||||
|
||||
@@ -65,19 +65,25 @@ inline void Tanh(int32_t input_multiplier, int32_t input_left_shift,
|
||||
// We use the LUT for sigmoid and take into account, that
|
||||
// tanh(x) = 2*sigmoid(2*x) - 1
|
||||
|
||||
int32_t input_data_mul = (input_multiplier > 0) ? input_multiplier : 1;
|
||||
// We scale by 3/4 to expand range [-8,8]->[-10.7,10.7].
|
||||
// In case of general parameter scale, multiplier 3 is taken into account
|
||||
// in TanhPrepare function and it is included in
|
||||
// input_multiplier already.
|
||||
|
||||
if (input_multiplier == 0) { // power of two case
|
||||
input_multiplier = 3 << input_left_shift;
|
||||
input_left_shift = 0;
|
||||
}
|
||||
|
||||
int32_t round = (input_left_shift > 0) ? 1 << (input_left_shift - 1) : 0;
|
||||
|
||||
int flat_size = MatchingFlatSize(input_shape, output_shape);
|
||||
|
||||
for (int i = 0; i < flat_size; ++i, ptr_input_data++, ptr_output_data++) {
|
||||
int32_t input_data = (*ptr_input_data) * input_data_mul;
|
||||
int32_t input_data =
|
||||
((*ptr_input_data) * input_multiplier + round) >> input_left_shift;
|
||||
|
||||
if (input_left_shift == 1) {
|
||||
input_data <<= 1;
|
||||
}
|
||||
|
||||
// Scale by 3/4 to expand range [-8,8]->[-10.7,10.7].
|
||||
uint32_t abs_input_data = 3 * abs(input_data);
|
||||
uint32_t abs_input_data = abs(input_data);
|
||||
uint32_t uh = abs_input_data >> 8;
|
||||
int32_t result;
|
||||
|
||||
|
||||
@@ -0,0 +1,221 @@
|
||||
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TRANSPOSE_CONV_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TRANSPOSE_CONV_H_
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_integer_ops {
|
||||
|
||||
// Fixed-point per-channel-quantization transpose convolution reference kernel.
|
||||
inline void TransposeConv(
|
||||
const ConvParams& params, const int32_t* output_multiplier,
|
||||
const int32_t* output_shift, const RuntimeShape& input_shape,
|
||||
const int8_t* input_data, const RuntimeShape& filter_shape,
|
||||
const int8_t* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32_t* bias_data, const RuntimeShape& output_shape,
|
||||
int8_t* output_data, const RuntimeShape& im2col_shape, int8_t* im2col_data,
|
||||
int32_t* scratch_buffer) {
|
||||
const int stride_width = params.stride_width;
|
||||
const int stride_height = params.stride_height;
|
||||
const int pad_width = params.padding_values.width;
|
||||
const int pad_height = params.padding_values.height;
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
|
||||
(void)im2col_data; // only used in optimized code.
|
||||
(void)im2col_shape; // only used in optimized code.
|
||||
|
||||
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
|
||||
const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
|
||||
const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
|
||||
if (bias_data) {
|
||||
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
|
||||
}
|
||||
const int input_height = input_shape.Dims(1);
|
||||
const int input_width = input_shape.Dims(2);
|
||||
const int filter_height = filter_shape.Dims(1);
|
||||
const int filter_width = filter_shape.Dims(2);
|
||||
const int output_height = output_shape.Dims(1);
|
||||
const int output_width = output_shape.Dims(2);
|
||||
const int32_t input_offset = params.input_offset;
|
||||
const int32_t output_offset = params.output_offset;
|
||||
const int32_t output_activation_min = std::numeric_limits<int8_t>::min();
|
||||
const int32_t output_activation_max = std::numeric_limits<int8_t>::max();
|
||||
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
|
||||
|
||||
const int num_elements = output_shape.FlatSize();
|
||||
// We need to initialize scratch_buffer to all 0s, as we apply the same
|
||||
// 'scatter' based trick as in float version.
|
||||
memset(scratch_buffer, 0, num_elements * sizeof(int32_t));
|
||||
|
||||
// Loop through input elements one at a time.
|
||||
for (int batch = 0; batch < batches; ++batch) {
|
||||
for (int in_y = 0; in_y < input_height; ++in_y) {
|
||||
for (int in_x = 0; in_x < input_width; ++in_x) {
|
||||
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
|
||||
// Loop through the output elements it will influence.
|
||||
const int out_x_origin = (in_x * stride_width) - pad_width;
|
||||
const int out_y_origin = (in_y * stride_height) - pad_height;
|
||||
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
|
||||
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
|
||||
for (int out_channel = 0; out_channel < output_depth;
|
||||
++out_channel) {
|
||||
// Compute output element location.
|
||||
const int out_x = out_x_origin + filter_x;
|
||||
const int out_y = out_y_origin + filter_y;
|
||||
// We cannot accumulate out of bounds.
|
||||
if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) &&
|
||||
(out_y < output_height)) {
|
||||
const int8_t input_value = input_data[Offset(
|
||||
input_shape, batch, in_y, in_x, in_channel)];
|
||||
const int8_t filter_value =
|
||||
filter_data[Offset(filter_shape, out_channel, filter_y,
|
||||
filter_x, in_channel)];
|
||||
scratch_buffer[Offset(output_shape, batch, out_y, out_x,
|
||||
out_channel)] +=
|
||||
(input_value + input_offset) * filter_value;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int batch = 0; batch < batches; ++batch) {
|
||||
for (int out_y = 0; out_y < output_height; ++out_y) {
|
||||
for (int out_x = 0; out_x < output_width; ++out_x) {
|
||||
for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
|
||||
int32_t acc = scratch_buffer[Offset(output_shape, batch, out_y, out_x,
|
||||
out_channel)];
|
||||
if (bias_data) {
|
||||
acc += bias_data[out_channel];
|
||||
}
|
||||
acc = MultiplyByQuantizedMultiplier(
|
||||
acc, output_multiplier[out_channel], output_shift[out_channel]);
|
||||
acc += output_offset;
|
||||
acc = std::max(acc, output_activation_min);
|
||||
acc = std::min(acc, output_activation_max);
|
||||
output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
|
||||
static_cast<int8_t>(acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// int16_t input (zero_point=0), int8_t filter, int64 accumulator
|
||||
inline void TransposeConv(
|
||||
const ConvParams& params, const int32_t* output_multiplier,
|
||||
const int32_t* output_shift, const RuntimeShape& input_shape,
|
||||
const int16_t* input_data, const RuntimeShape& filter_shape,
|
||||
const int8_t* filter_data, const RuntimeShape& bias_shape,
|
||||
const std::int64_t* bias_data, const RuntimeShape& output_shape,
|
||||
int16_t* output_data, const RuntimeShape& im2col_shape, int8_t* im2col_data,
|
||||
std::int64_t* scratch_buffer) {
|
||||
const int stride_width = params.stride_width;
|
||||
const int stride_height = params.stride_height;
|
||||
const int pad_width = params.padding_values.width;
|
||||
const int pad_height = params.padding_values.height;
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
|
||||
(void)im2col_data; // only used in optimized code.
|
||||
(void)im2col_shape; // only used in optimized code.
|
||||
|
||||
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
|
||||
const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
|
||||
const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
|
||||
if (bias_data) {
|
||||
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
|
||||
}
|
||||
const int input_height = input_shape.Dims(1);
|
||||
const int input_width = input_shape.Dims(2);
|
||||
const int filter_height = filter_shape.Dims(1);
|
||||
const int filter_width = filter_shape.Dims(2);
|
||||
const int output_height = output_shape.Dims(1);
|
||||
const int output_width = output_shape.Dims(2);
|
||||
const int32_t output_activation_min = std::numeric_limits<int16_t>::min();
|
||||
const int32_t output_activation_max = std::numeric_limits<int16_t>::max();
|
||||
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
|
||||
|
||||
const int num_elements = output_shape.FlatSize();
|
||||
// We need to initialize scratch_buffer to all 0s, as we apply the same
|
||||
// 'scatter' based trick as in float version.
|
||||
memset(scratch_buffer, 0, num_elements * sizeof(std::int64_t));
|
||||
|
||||
// Loop through input elements one at a time.
|
||||
for (int batch = 0; batch < batches; ++batch) {
|
||||
for (int in_y = 0; in_y < input_height; ++in_y) {
|
||||
for (int in_x = 0; in_x < input_width; ++in_x) {
|
||||
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
|
||||
// Loop through the output elements it will influence.
|
||||
const int out_x_origin = (in_x * stride_width) - pad_width;
|
||||
const int out_y_origin = (in_y * stride_height) - pad_height;
|
||||
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
|
||||
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
|
||||
for (int out_channel = 0; out_channel < output_depth;
|
||||
++out_channel) {
|
||||
// Compute output element location.
|
||||
const int out_x = out_x_origin + filter_x;
|
||||
const int out_y = out_y_origin + filter_y;
|
||||
// We cannot accumulate out of bounds.
|
||||
if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) &&
|
||||
(out_y < output_height)) {
|
||||
const int32_t input_value = input_data[Offset(
|
||||
input_shape, batch, in_y, in_x, in_channel)];
|
||||
const int32_t filter_value =
|
||||
filter_data[Offset(filter_shape, out_channel, filter_y,
|
||||
filter_x, in_channel)];
|
||||
scratch_buffer[Offset(output_shape, batch, out_y, out_x,
|
||||
out_channel)] +=
|
||||
input_value * filter_value;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int batch = 0; batch < batches; ++batch) {
|
||||
for (int out_y = 0; out_y < output_height; ++out_y) {
|
||||
for (int out_x = 0; out_x < output_width; ++out_x) {
|
||||
for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
|
||||
std::int64_t acc = scratch_buffer[Offset(output_shape, batch, out_y,
|
||||
out_x, out_channel)];
|
||||
if (bias_data) {
|
||||
acc += bias_data[out_channel];
|
||||
}
|
||||
int32_t scaled_acc = MultiplyByQuantizedMultiplier(
|
||||
acc, output_multiplier[out_channel], output_shift[out_channel]);
|
||||
scaled_acc = std::max(scaled_acc, output_activation_min);
|
||||
scaled_acc = std::min(scaled_acc, output_activation_max);
|
||||
output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
|
||||
static_cast<int16_t>(scaled_acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_integer_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TRANSPOSE_CONV_H_
|
||||
@@ -0,0 +1,69 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LEAKY_RELU_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LEAKY_RELU_H_

#include <algorithm>
#include <limits>

#include "tensorflow/lite/kernels/internal/common.h"

namespace tflite {
namespace reference_ops {

inline void LeakyRelu(const tflite::LeakyReluParams& params,
                      const RuntimeShape& input_shape, const float* input_data,
                      const RuntimeShape& output_shape, float* output_data) {
  const int flat_size = MatchingFlatSize(input_shape, output_shape);
  for (int i = 0; i < flat_size; ++i) {
    const float val = input_data[i];
    // Note that alpha might be > 1 or < 0, so we don't use std::max here.
    output_data[i] = val > 0 ? val : val * params.alpha;
  }
}
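// Illustrative example (not part of the upstream kernel): with
// params.alpha = 0.1f, an input of {-2.0f, 0.0f, 3.0f} produces
// {-0.2f, 0.0f, 3.0f}; negative values are scaled by alpha, non-negative
// values pass through unchanged.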

template <typename T>
inline void QuantizeLeakyRelu(const LeakyReluParams& params,
                              const RuntimeShape& input_shape,
                              const T* input_data,
                              const RuntimeShape& output_shape,
                              T* output_data) {
  const int flat_size = MatchingFlatSize(input_shape, output_shape);
  static const int32_t quantized_min = std::numeric_limits<T>::min();
  static const int32_t quantized_max = std::numeric_limits<T>::max();
  for (int i = 0; i < flat_size; ++i) {
    const int32_t input_value = input_data[i] - params.input_offset;
    int32_t unclamped_output;
    if (input_value >= 0) {
      unclamped_output = params.output_offset +
                         MultiplyByQuantizedMultiplier(
                             input_value, params.output_multiplier_identity,
                             params.output_shift_identity);
    } else {
      unclamped_output = params.output_offset +
                         MultiplyByQuantizedMultiplier(
                             input_value, params.output_multiplier_alpha,
                             params.output_shift_alpha);
    }
    const T clamped_output =
        std::min(quantized_max, std::max(quantized_min, unclamped_output));
    output_data[i] = static_cast<T>(clamped_output);
  }
}
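// Illustrative note (not part of the upstream kernel): values are first
// re-centered around the input zero point, then rescaled with one of two
// fixed-point multipliers depending on the sign. E.g. if alpha = 0.5 is
// encoded in output_multiplier_alpha / output_shift_alpha, an input_value of
// -40 (relative to the zero point) becomes roughly -20 before the output
// offset and clamping are applied; non-negative values go through the
// identity multiplier, which only compensates for the input/output scale
// ratio.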

}  // namespace reference_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LEAKY_RELU_H_
@@ -45,6 +45,7 @@ inline void Requantize(const input_type* input_data, int32_t size,
      for (int i = 0; i < size; ++i) {
        output_data[i] = input_data[i] ^ 0x80;
      }
      return;
    }
  }
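  // Illustrative note (not part of the upstream kernel): the XOR with 0x80
  // above is the fast path for requantizing between int8 and uint8 when the
  // scales match and the zero points differ by exactly 128. Flipping the most
  // significant bit adds or subtracts 128 modulo 256, e.g. int8 -3 (0xFD)
  // becomes uint8 125 (0x7D), which represents the same real value under the
  // shifted zero point.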
  static constexpr int32_t kMinOutput = std::numeric_limits<output_type>::min();

@@ -0,0 +1,109 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SPACE_TO_BATCH_ND_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SPACE_TO_BATCH_ND_H_

#include <cmath>

#include "ruy/profiler/instrumentation.h"  // from @ruy
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {
namespace reference_ops {

// TODO(b/135760455): Move this method anonymous namespace in a cc file.
inline RuntimeShape ExtendShapeSpaceToBatch(const RuntimeShape& shape) {
  if (shape.DimensionsCount() == 4) {
    return shape;
  }
  RuntimeShape new_shape(4, 1);
  new_shape.SetDim(0, shape.Dims(0));
  new_shape.SetDim(1, shape.Dims(1));
  new_shape.SetDim(3, shape.Dims(2));
  return new_shape;
}

template <typename T>
inline void SpaceToBatchND(const SpaceToBatchParams& params,
                           const RuntimeShape& unextended_input1_shape,
                           const T* input1_data,
                           const RuntimeShape& unextended_input2_shape,
                           const int32_t* block_shape_data,
                           const RuntimeShape& unextended_input3_shape,
                           const int32_t* paddings_data,
                           const RuntimeShape& unextended_output_shape,
                           T* output_data) {
  ruy::profiler::ScopeLabel label("SpaceToBatchND");
  TFLITE_DCHECK_GE(unextended_input1_shape.DimensionsCount(), 3);
  TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(unextended_input1_shape.DimensionsCount(),
                   unextended_output_shape.DimensionsCount());

  // Extends the input/output shape from 3D to 4D if needed, NHC -> NH1C.
  const RuntimeShape input1_shape =
      ExtendShapeSpaceToBatch(unextended_input1_shape);
  const RuntimeShape output_shape =
      ExtendShapeSpaceToBatch(unextended_output_shape);

  const int depth = input1_shape.Dims(3);
  const int input_width = input1_shape.Dims(2);
  const int input_height = input1_shape.Dims(1);
  const int input_batch_size = input1_shape.Dims(0);

  const int output_width = output_shape.Dims(2);
  const int output_height = output_shape.Dims(1);
  const int output_batch_size = output_shape.Dims(0);

  const int block_shape_height = block_shape_data[0];
  const int block_shape_width =
      unextended_input1_shape.DimensionsCount() == 4 ? block_shape_data[1] : 1;
  const int padding_top = paddings_data[0];
  const int padding_left =
      unextended_input1_shape.DimensionsCount() == 4 ? paddings_data[2] : 0;

  // For uint8 quantized, the correct padding "zero value" is the output offset.
  const int32_t pad_value = params.output_offset;
  for (int out_b = 0; out_b < output_batch_size; ++out_b) {
    int input_batch = out_b % input_batch_size;
    int shift_w = (out_b / input_batch_size) % block_shape_width;
    int shift_h = (out_b / input_batch_size) / block_shape_width;
    for (int out_h = 0; out_h < output_height; ++out_h) {
      for (int out_w = 0; out_w < output_width; ++out_w) {
        T* out = output_data + Offset(output_shape, out_b, out_h, out_w, 0);
        if (out_h * block_shape_height + shift_h < padding_top ||
            out_h * block_shape_height + shift_h >=
                padding_top + input_height ||
            out_w * block_shape_width + shift_w < padding_left ||
            out_w * block_shape_width + shift_w >= padding_left + input_width) {
          // This may not execute correctly when pad_value != 0 and T != uint8.
          memset(out, pad_value, depth * sizeof(T));
        } else {
          const T* in =
              input1_data +
              Offset(input1_shape, input_batch,
                     (out_h * block_shape_height + shift_h) - padding_top,
                     (out_w * block_shape_width + shift_w) - padding_left, 0);
          memcpy(out, in, depth * sizeof(T));
        }
      }
    }
  }
}
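// Illustrative example (not part of the upstream kernel): for a 1x4x4x1
// input, block_shape = {2, 2} and zero padding, output_batch_size becomes
// 1 * 2 * 2 = 4 and the output shape is 4x2x2x1. Output batch out_b = 3 then
// decodes to input_batch = 0, shift_w = 1, shift_h = 1, i.e. that batch
// gathers the input pixels at odd rows and odd columns.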

}  // namespace reference_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SPACE_TO_BATCH_ND_H_
@@ -15,23 +15,28 @@ limitations under the License.
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_STRIDED_SLICE_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_STRIDED_SLICE_H_

#include "ruy/profiler/instrumentation.h"  // from @ruy
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/portable_tensor.h"
#include "tensorflow/lite/kernels/internal/strided_slice_logic.h"
#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {

namespace reference_ops {

template <typename T>
inline void StridedSlice(const tflite::StridedSliceParams& op_params,
                         const RuntimeShape& unextended_input_shape,
                         const T* input_data,
                         const RuntimeShape& unextended_output_shape,
                         T* output_data) {
                         SequentialTensorWriter<T>* writer) {
  using strided_slice::LoopCondition;
  using strided_slice::StartForAxis;
  using strided_slice::StopForAxis;

  ruy::profiler::ScopeLabel label("StridedSlice");

  // Note that the output_shape is not used herein.
  tflite::StridedSliceParams params_copy = op_params;

@@ -57,7 +62,6 @@ inline void StridedSlice(const tflite::StridedSliceParams& op_params,
  const int start_4 = StartForAxis(params_copy, input_shape, 4);
  const int stop_4 = StopForAxis(params_copy, input_shape, 4, start_4);

  T* out_ptr = output_data;
  for (int offset_0 = start_0 * input_shape.Dims(1),
           end_0 = stop_0 * input_shape.Dims(1),
           step_0 = params_copy.strides[0] * input_shape.Dims(1);
@@ -81,13 +85,36 @@ inline void StridedSlice(const tflite::StridedSliceParams& op_params,
          for (int offset_4 = offset_3 + start_4, end_4 = offset_3 + stop_4;
               !LoopCondition(offset_4, end_4, params_copy.strides[4]);
               offset_4 += params_copy.strides[4]) {
            *out_ptr++ = input_data[offset_4];
            writer->Write(offset_4);
          }
        }
      }
    }
  }
}

template <typename T>
inline void StridedSlice(const tflite::StridedSliceParams& op_params,
                         const RuntimeShape& unextended_input_shape,
                         const T* input_data,
                         const RuntimeShape& unextended_output_shape,
                         T* output_data) {
  SequentialTensorWriter<T> writer(input_data, output_data);
  StridedSlice<T>(op_params, unextended_input_shape, unextended_output_shape,
                  &writer);
}

template <typename T>
inline void StridedSlice(const tflite::StridedSliceParams& op_params,
                         const RuntimeShape& unextended_input_shape,
                         const TfLiteTensor* input,
                         const RuntimeShape& unextended_output_shape,
                         TfLiteTensor* output) {
  SequentialTensorWriter<T> writer(input, output);
  StridedSlice<T>(op_params, unextended_input_shape, unextended_output_shape,
                  &writer);
}
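// Illustrative note (not part of the upstream kernel): both overloads above
// delegate to the writer-based core loop. SequentialTensorWriter appends the
// element at each visited input offset to the output in visit order, so the
// raw-pointer overload reproduces the old *out_ptr++ = input_data[offset]
// behaviour, while the TfLiteTensor overload lets the same loop write through
// a tensor, which is useful for element types that cannot be copied as raw
// memory.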

}  // namespace reference_ops
}  // namespace tflite

@@ -65,10 +65,6 @@ inline void SubNonBroadcast(const ArithmeticParams& params,
// dimensionality if the runtime code does a single loop over one dimension
// that handles broadcasting as the base case. The code generator would then
// generate max(D1, D2) nested for loops.
// TODO(b/151345101): BroadcastSub is intentionally duplicated from
// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
// reference_ops.h.
template <int N = 5>
inline void BroadcastSubSlow(const ArithmeticParams& params,
                             const RuntimeShape& input1_shape,
@@ -336,6 +332,50 @@ void BroadcastSubSlow(const ArithmeticParams& params,
  NDOpsHelper<N>(output_desc, sub_func);
}

template <int N = 5>
inline void BroadcastSub16POTSlow(const ArithmeticParams& params,
                                  const RuntimeShape& input1_shape,
                                  const int16_t* input1_data,
                                  const RuntimeShape& input2_shape,
                                  const int16_t* input2_data,
                                  const RuntimeShape& output_shape,
                                  int16_t* output_data) {
  ruy::profiler::ScopeLabel label("BroadcastSub16POTSlow/int16_t");
  NdArrayDesc<N> desc1;
  NdArrayDesc<N> desc2;
  NdArrayDesc<N> output_desc;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  auto sub_func = [&](int indexes[N]) {
    const int32_t input1_val = input1_data[SubscriptToIndex(desc1, indexes)];
    const int32_t input2_val = input2_data[SubscriptToIndex(desc2, indexes)];
    const int32_t scaled_input1_val =
        gemmlowp::RoundingDivideByPOT(input1_val, -params.input1_shift);
    const int32_t scaled_input2_val =
        gemmlowp::RoundingDivideByPOT(input2_val, -params.input2_shift);
    const int32_t raw_output = scaled_input1_val - scaled_input2_val;
    const int32_t clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, raw_output));
    output_data[SubscriptToIndex(output_desc, indexes)] =
        static_cast<int16_t>(clamped_output);
  };
  NDOpsHelper<N>(output_desc, sub_func);
}
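// Illustrative note (not part of the upstream kernel): this "POT" variant
// only supports power-of-two rescaling, so each operand is aligned with
// gemmlowp::RoundingDivideByPOT instead of a full fixed-point multiply. With
// params.input1_shift == -1 the first operand is divided by 2 with rounding
// to nearest (e.g. 7 -> 4), and a shift of 0 leaves it unchanged; the
// subtraction and activation clamping then happen in plain int32.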

// Element-wise Sub that can often be used for inner loop of broadcast sub as
// well as the non-broadcast sub.
inline void SubElementwise(int size, const ArithmeticParams& params,

@@ -0,0 +1,217 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TRANSPOSE_CONV_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TRANSPOSE_CONV_H_

#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {

namespace reference_ops {

inline void TransposeConv(
    const ConvParams& params, const RuntimeShape& input_shape,
    const float* input_data, const RuntimeShape& filter_shape,
    const float* filter_data, const RuntimeShape& bias_shape,
    const float* bias_data, const RuntimeShape& output_shape,
    float* output_data, const RuntimeShape& im2col_shape, float* im2col_data) {
  const int stride_width = params.stride_width;
  const int stride_height = params.stride_height;
  const int pad_width = params.padding_values.width;
  const int pad_height = params.padding_values.height;
  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
  (void)im2col_data;   // only used in optimized code.
  (void)im2col_shape;  // only used in optimized code.

  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
  const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
  const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);
  const int filter_height = filter_shape.Dims(1);
  const int filter_width = filter_shape.Dims(2);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  if (bias_data) {
    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
  }

  // Although transpose convolution simplifies to convolution with transposed
  // weights for strides of 1, non-unitary striding complicates matters. To
  // keep this reference implementation as clear as possible, we use a
  // "scatter" access pattern, where we loop through all the input elements,
  // computing their influence on the output, rather than looping through the
  // output elements in the typical "gather" access pattern of a conv. We
  // therefore must initialize the output array to zero.
  const int num_elements = output_shape.FlatSize();
  for (int i = 0; i < num_elements; i++) {
    output_data[i] = 0.0f;
  }

  // Loop through input elements one at a time.
  for (int batch = 0; batch < batches; ++batch) {
    for (int in_y = 0; in_y < input_height; ++in_y) {
      for (int in_x = 0; in_x < input_width; ++in_x) {
        for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
          // Loop through the output elements it will influence
          const int out_x_origin = (in_x * stride_width) - pad_width;
          const int out_y_origin = (in_y * stride_height) - pad_height;
          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
              for (int out_channel = 0; out_channel < output_depth;
                   ++out_channel) {
                // Compute output element location
                const int out_x = out_x_origin + filter_x;
                const int out_y = out_y_origin + filter_y;
                // We cannot accumulate out of bounds
                if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) &&
                    (out_y < output_height)) {
                  float input_value = input_data[Offset(
                      input_shape, batch, in_y, in_x, in_channel)];
                  float filter_value =
                      filter_data[Offset(filter_shape, out_channel, filter_y,
                                         filter_x, in_channel)];
                  output_data[Offset(output_shape, batch, out_y, out_x,
                                     out_channel)] +=
                      input_value * filter_value;
                }
              }
            }
          }
        }
      }
    }
  }
  if (bias_data) {
    for (int batch = 0; batch < batches; ++batch) {
      for (int out_y = 0; out_y < output_height; ++out_y) {
        for (int out_x = 0; out_x < output_width; ++out_x) {
          for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
            output_data[Offset(output_shape, batch, out_y, out_x,
                               out_channel)] += bias_data[out_channel];
          }
        }
      }
    }
  }
}
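// Illustrative example (not part of the upstream kernel): with a 1x2x2x1
// input, a 1x3x3x1 filter, stride 2 and zero padding, each input pixel
// scatters a full 3x3 copy of the filter (scaled by that pixel) into the
// output, and the four copies land at output origins (0,0), (0,2), (2,0) and
// (2,2). Overlapping positions simply accumulate, which is why the output
// buffer has to be zeroed first.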

inline void TransposeConv(
    const ConvParams& params, const RuntimeShape& input_shape,
    const uint8_t* input_data, const RuntimeShape& filter_shape,
    const uint8_t* filter_data, const RuntimeShape& bias_shape,
    const int32_t* bias_data, const RuntimeShape& output_shape,
    uint8_t* output_data, const RuntimeShape& im2col_shape,
    uint8_t* im2col_data, int32_t* scratch_buffer) {
  const int stride_width = params.stride_width;
  const int stride_height = params.stride_height;
  const int pad_width = params.padding_values.width;
  const int pad_height = params.padding_values.height;
  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
  (void)im2col_data;   // only used in optimized code.
  (void)im2col_shape;  // only used in optimized code.

  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
  const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
  const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);
  const int filter_height = filter_shape.Dims(1);
  const int filter_width = filter_shape.Dims(2);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  const int32_t input_offset = params.input_offset;
  const int32_t filter_offset = params.weights_offset;
  const int32_t output_offset = params.output_offset;
  const int32_t output_multiplier = params.output_multiplier;
  const int output_shift = params.output_shift;
  const int32_t output_activation_min = params.quantized_activation_min;
  const int32_t output_activation_max = params.quantized_activation_max;
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  if (bias_data) {
    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
  }

  const int num_elements = output_shape.FlatSize();
  // We need to initialize scratch_buffer to all 0s, as we apply the same
  // 'scatter' based trick as in float version.
  memset(scratch_buffer, 0, num_elements * sizeof(int32_t));

  // Loop through input elements one at a time.
  for (int batch = 0; batch < batches; ++batch) {
    for (int in_y = 0; in_y < input_height; ++in_y) {
      for (int in_x = 0; in_x < input_width; ++in_x) {
        for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
          // Loop through the output elements it will influence.
          const int out_x_origin = (in_x * stride_width) - pad_width;
          const int out_y_origin = (in_y * stride_height) - pad_height;
          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
              for (int out_channel = 0; out_channel < output_depth;
                   ++out_channel) {
                // Compute output element location.
                const int out_x = out_x_origin + filter_x;
                const int out_y = out_y_origin + filter_y;
                // We cannot accumulate out of bounds.
                if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) &&
                    (out_y < output_height)) {
                  uint8_t input_value = input_data[Offset(
                      input_shape, batch, in_y, in_x, in_channel)];
                  uint8_t filter_value =
                      filter_data[Offset(filter_shape, out_channel, filter_y,
                                         filter_x, in_channel)];
                  scratch_buffer[Offset(output_shape, batch, out_y, out_x,
                                        out_channel)] +=
                      (input_value + input_offset) *
                      (filter_value + filter_offset);
                }
              }
            }
          }
        }
      }
    }
  }
  for (int batch = 0; batch < batches; ++batch) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      for (int out_x = 0; out_x < output_width; ++out_x) {
        for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
          int32_t acc = scratch_buffer[Offset(output_shape, batch, out_y, out_x,
                                              out_channel)];
          if (bias_data) {
            acc += bias_data[out_channel];
          }
          int32_t scaled_acc = MultiplyByQuantizedMultiplier(
              acc, output_multiplier, output_shift);
          scaled_acc += output_offset;
          scaled_acc = std::max(scaled_acc, output_activation_min);
          scaled_acc = std::min(scaled_acc, output_activation_max);
          output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
              static_cast<uint8_t>(scaled_acc);
        }
      }
    }
  }
}
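// Illustrative note (not part of the upstream kernel): in the uint8 path the
// zero points are folded in as additive offsets, so each accumulation is
// (input_value + input_offset) * (filter_value + filter_offset) in int32, and
// the final value gets the fixed-point rescale, the output_offset and the
// activation clamp. E.g. with input_offset = -128 and filter_offset = -128,
// stored bytes 130 and 131 contribute 2 * 3 = 6 to the accumulator.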

}  // namespace reference_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TRANSPOSE_CONV_H_
@@ -140,7 +140,7 @@ inline int StopForAxis(const tflite::StridedSliceParams& params,
  // start_for_axis + 1 to generate a length 1 slice, since start_for_axis has
  // already been adjusted for negative indices.
  if (shrink_axis) {
    stop = start_for_axis + 1;
    return start_for_axis + 1;
  }

  // end_mask override

@@ -43,6 +43,20 @@ struct PaddingValues {
  int16_t height_offset;
};

struct Padding3DValues {
  int16_t width;
  int16_t height;
  int16_t depth;
  // offset is used for calculating "remaining" padding, for example, `width`
  // is 1 and `width_offset` is 1, so padding_left is 1 while padding_right is
  // 1 + 1 = 2.
  int16_t width_offset;
  // Same as width_offset except it's over the height dimension.
  int16_t height_offset;
  // Same as width_offset except it's over the depth dimension.
  int16_t depth_offset;
};

// This enumeration allows for non-default formats for the weights array
// of a fully-connected operator, allowing the use of special optimized
// runtime paths.
@@ -170,7 +184,11 @@ class RuntimeShape {
  // rolls out.
  RuntimeShape(RuntimeShape const& other) : size_(other.DimensionsCount()) {
    if (size_ > kMaxSmallSize) {
#ifdef TF_LITE_STATIC_MEMORY
      TFLITE_CHECK(false && "No shape resizing supported on this platform");
#else
      dims_pointer_ = new int32_t[size_];
#endif
    }
    std::memcpy(DimsData(), other.DimsData(), sizeof(int32_t) * size_);
  }
@@ -392,6 +410,20 @@ inline int Offset(const RuntimeShape& shape, int i0, int i1, int i2, int i3) {
  return ((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3;
}

inline int Offset(const RuntimeShape& shape, int i0, int i1, int i2, int i3,
                  int i4) {
  TFLITE_DCHECK_EQ(shape.DimensionsCount(), 5);
  const int* dims_data = reinterpret_cast<const int*>(shape.DimsDataUpTo5D());
  TFLITE_DCHECK(i0 >= 0 && i0 < dims_data[0]);
  TFLITE_DCHECK(i1 >= 0 && i1 < dims_data[1]);
  TFLITE_DCHECK(i2 >= 0 && i2 < dims_data[2]);
  TFLITE_DCHECK(i3 >= 0 && i3 < dims_data[3]);
  TFLITE_DCHECK(i4 >= 0 && i4 < dims_data[4]);
  return (((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3) *
             dims_data[4] +
         i4;
}
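// Illustrative example (not part of the upstream header): the 5-D overload
// follows the same row-major (last dimension fastest) rule as the 4-D one.
// For a shape {2, 3, 4, 5, 6} and indices (1, 2, 3, 4, 5), the flat offset is
// (((1 * 3 + 2) * 4 + 3) * 5 + 4) * 6 + 5 = 719, i.e. the last element of the
// 720-element tensor.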

inline int Offset(const Dims<4>& dims, int i0, int i1, int i2, int i3) {
  TFLITE_DCHECK(i0 >= 0 && i0 < dims.sizes[0]);
  TFLITE_DCHECK(i1 >= 0 && i1 < dims.sizes[1]);
@@ -840,6 +872,19 @@ struct ConvParams {
  float float_activation_max;
};

struct Conv3DParams {
  Padding3DValues padding_values;
  int stride_width;
  int stride_height;
  int stride_depth;
  int dilation_width;
  int dilation_height;
  int dilation_depth;
  // float activation params.
  float float_activation_min;
  float float_activation_max;
};

struct DepthToSpaceParams {
  int32_t block_size;
};
@@ -907,6 +952,7 @@ struct FullyConnectedParams {

struct GatherParams {
  int16_t axis;
  int16_t batch_dims;
};

struct L2NormalizationParams {
@@ -1025,9 +1071,9 @@ struct ResizeNearestNeighborParams {

struct SliceParams {
  int8_t begin_count;
  int32_t begin[4];
  int32_t begin[5];
  int8_t size_count;
  int32_t size[4];
  int32_t size[5];
};

struct SoftmaxParams {