Rolling 20220103

This commit is contained in:
jomjol
2022-01-03 17:32:01 +01:00
parent 8e8897c70d
commit 8dd3a92671
358 changed files with 12460 additions and 5646 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,112 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_
#include <cstdint>
#include "tensorflow/lite/kernels/op_macros.h"
#ifndef TFLITE_DCHECK
#define TFLITE_DCHECK(condition) (condition) ? (void)0 : TFLITE_ASSERT_FALSE
#endif
#ifndef TFLITE_DCHECK_EQ
#define TFLITE_DCHECK_EQ(x, y) ((x) == (y)) ? (void)0 : TFLITE_ASSERT_FALSE
#endif
#ifndef TFLITE_DCHECK_NE
#define TFLITE_DCHECK_NE(x, y) ((x) != (y)) ? (void)0 : TFLITE_ASSERT_FALSE
#endif
#ifndef TFLITE_DCHECK_GE
#define TFLITE_DCHECK_GE(x, y) ((x) >= (y)) ? (void)0 : TFLITE_ASSERT_FALSE
#endif
#ifndef TFLITE_DCHECK_GT
#define TFLITE_DCHECK_GT(x, y) ((x) > (y)) ? (void)0 : TFLITE_ASSERT_FALSE
#endif
#ifndef TFLITE_DCHECK_LE
#define TFLITE_DCHECK_LE(x, y) ((x) <= (y)) ? (void)0 : TFLITE_ASSERT_FALSE
#endif
#ifndef TFLITE_DCHECK_LT
#define TFLITE_DCHECK_LT(x, y) ((x) < (y)) ? (void)0 : TFLITE_ASSERT_FALSE
#endif
// TODO(ahentz): Clean up: We should stick to the DCHECK versions.
#ifndef TFLITE_CHECK
#define TFLITE_CHECK(condition) (condition) ? (void)0 : TFLITE_ABORT
#endif
#ifndef TFLITE_CHECK_EQ
#define TFLITE_CHECK_EQ(x, y) ((x) == (y)) ? (void)0 : TFLITE_ABORT
#endif
#ifndef TFLITE_CHECK_NE
#define TFLITE_CHECK_NE(x, y) ((x) != (y)) ? (void)0 : TFLITE_ABORT
#endif
#ifndef TFLITE_CHECK_GE
#define TFLITE_CHECK_GE(x, y) ((x) >= (y)) ? (void)0 : TFLITE_ABORT
#endif
#ifndef TFLITE_CHECK_GT
#define TFLITE_CHECK_GT(x, y) ((x) > (y)) ? (void)0 : TFLITE_ABORT
#endif
#ifndef TFLITE_CHECK_LE
#define TFLITE_CHECK_LE(x, y) ((x) <= (y)) ? (void)0 : TFLITE_ABORT
#endif
#ifndef TFLITE_CHECK_LT
#define TFLITE_CHECK_LT(x, y) ((x) < (y)) ? (void)0 : TFLITE_ABORT
#endif
#ifndef TF_LITE_STATIC_MEMORY
// TODO(b/162019032): Consider removing these type-aliases.
using int8 = std::int8_t;
using uint8 = std::uint8_t;
using int16 = std::int16_t;
using uint16 = std::uint16_t;
using int32 = std::int32_t;
using uint32 = std::uint32_t;
#endif // !defined(TF_LITE_STATIC_MEMORY)
// TFLITE_DEPRECATED()
//
// Duplicated from absl/base/macros.h to avoid pulling in that library.
// Marks a deprecated class, struct, enum, function, method and variable
// declarations. The macro argument is used as a custom diagnostic message (e.g.
// suggestion of a better alternative).
//
// Example:
//
// class TFLITE_DEPRECATED("Use Bar instead") Foo {...};
// TFLITE_DEPRECATED("Use Baz instead") void Bar() {...}
//
// Every usage of a deprecated entity will trigger a warning when compiled with
// clang's `-Wdeprecated-declarations` option. This option is turned off by
// default, but the warnings will be reported by clang-tidy.
#if defined(__clang__) && __cplusplus >= 201103L
#define TFLITE_DEPRECATED(message) __attribute__((deprecated(message)))
#endif
#ifndef TFLITE_DEPRECATED
#define TFLITE_DEPRECATED(message)
#endif
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_

View File

@@ -0,0 +1,40 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_CPPMATH_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_CPPMATH_H_
#include <cmath>
namespace tflite {
#if defined(TF_LITE_USE_GLOBAL_CMATH_FUNCTIONS) || \
(defined(__ANDROID__) && !defined(__NDK_MAJOR__)) || defined(__ZEPHYR__)
#define TF_LITE_GLOBAL_STD_PREFIX
#else
#define TF_LITE_GLOBAL_STD_PREFIX std
#endif
#define DECLARE_STD_GLOBAL_SWITCH1(tf_name, std_name) \
template <class T> \
inline T tf_name(const T x) { \
return TF_LITE_GLOBAL_STD_PREFIX::std_name(x); \
}
DECLARE_STD_GLOBAL_SWITCH1(TfLiteRound, round);
DECLARE_STD_GLOBAL_SWITCH1(TfLiteExpm1, expm1);
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_CPPMATH_H_

View File

@@ -0,0 +1,35 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_MAX_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_MAX_H_
#include <cmath>
namespace tflite {
#if defined(TF_LITE_USE_GLOBAL_MAX) || defined(__ZEPHYR__)
inline float TfLiteMax(const float& x, const float& y) {
return std::max(x, y);
}
#else
template <class T>
inline T TfLiteMax(const T& x, const T& y) {
return std::fmax(x, y);
}
#endif
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_MAX_H_

View File

@@ -0,0 +1,35 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_MIN_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_MIN_H_
#include <cmath>
namespace tflite {
#if defined(TF_LITE_USE_GLOBAL_MIN) || defined(__ZEPHYR__)
inline float TfLiteMin(const float& x, const float& y) {
return std::min(x, y);
}
#else
template <class T>
inline T TfLiteMin(const T& x, const T& y) {
return std::fmin(x, y);
}
#endif
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_MIN_H_

View File

@@ -0,0 +1,20 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_CHECK_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_CHECK_H_
// TFLM does not need to utilize any Neon optimizations.
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_CHECK_H_

View File

@@ -0,0 +1,122 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_PORTABLE_TENSOR_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_PORTABLE_TENSOR_H_
#include <vector>
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
inline RuntimeShape GetTensorShape(std::vector<int32_t> data) {
return RuntimeShape(data.size(), data.data());
}
// A list of tensors in a format that can be used by kernels like split and
// concatenation.
template <typename T>
class VectorOfTensors {
public:
// Build with the tensors in 'tensor_list'.
VectorOfTensors(const TfLiteContext& context,
const TfLiteIntArray& tensor_list) {
int num_tensors = tensor_list.size;
all_data_.reserve(num_tensors);
all_shape_.reserve(num_tensors);
all_shape_ptr_.reserve(num_tensors);
for (int i = 0; i < num_tensors; ++i) {
TfLiteTensor* t = &context.tensors[tensor_list.data[i]];
all_data_.push_back(GetTensorData<T>(t));
all_shape_.push_back(GetTensorShape(t));
}
// Taking the pointer from inside a std::vector is only OK if the vector is
// never modified, so we populate all_shape in the previous loop and then we
// are free to grab iterators here.
for (int i = 0; i < num_tensors; ++i) {
all_shape_ptr_.push_back(&all_shape_[i]);
}
}
// Return a pointer to the data pointers of all tensors in the list. For
// example:
// float* const* f = v.data();
// f[0][1] is the second element of the first tensor.
T* const* data() const { return all_data_.data(); }
// Return a pointer the shape pointers of all tensors in the list. For
// example:
// const RuntimeShape* const* d = v.dims();
// dims[1] are the dimensions of the second tensor in the list.
const RuntimeShape* const* shapes() const { return all_shape_ptr_.data(); }
private:
std::vector<T*> all_data_;
std::vector<RuntimeShape> all_shape_;
std::vector<RuntimeShape*> all_shape_ptr_;
};
// A list of quantized tensors in a format that can be used by kernels like
// split and concatenation.
class VectorOfQuantizedTensors : public VectorOfTensors<uint8_t> {
public:
// Build with the tensors in 'tensor_list'.
VectorOfQuantizedTensors(const TfLiteContext& context,
const TfLiteIntArray& tensor_list)
: VectorOfTensors<uint8_t>(context, tensor_list) {
for (int i = 0; i < tensor_list.size; ++i) {
TfLiteTensor* t = &context.tensors[tensor_list.data[i]];
zero_point_.push_back(t->params.zero_point);
scale_.push_back(t->params.scale);
}
}
const float* scale() const { return scale_.data(); }
const int32_t* zero_point() const { return zero_point_.data(); }
private:
std::vector<int32_t> zero_point_;
std::vector<float> scale_;
};
// Writes randomly accessed values from `input` sequentially into `output`.
template <typename T>
class SequentialTensorWriter {
public:
SequentialTensorWriter(const TfLiteTensor* input, TfLiteTensor* output) {
input_data_ = GetTensorData<T>(input);
output_ptr_ = GetTensorData<T>(output);
}
SequentialTensorWriter(const T* input_data, T* output_data)
: input_data_(input_data), output_ptr_(output_data) {}
void Write(int position) { *output_ptr_++ = input_data_[position]; }
void WriteN(int position, int len) {
memcpy(output_ptr_, &input_data_[position], sizeof(T) * len);
output_ptr_ += len;
}
private:
const T* input_data_;
T* output_ptr_;
};
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_PORTABLE_TENSOR_H_

View File

@@ -0,0 +1,124 @@
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_PORTABLE_TENSOR_UTILS_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_PORTABLE_TENSOR_UTILS_H_
#include <algorithm>
#include <cmath>
#include <cstdint>
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#if defined(_MSC_VER)
#define __restrict__ __restrict
#endif
namespace tflite {
namespace tensor_utils {
// Multiplies a matrix with a scalar and reduce the result on each row to a
// scalar.
// Parameters:
// - matrix: matrix of size n_row * n_col
// - scalar: the scalar that is multiplied to each element in the matrix
// - n_row: the row count of the matrix
// - n_col: the column count of the matrix
// - output: the 32bit output
// Note: We do not need saturation because the int8 * int8 is safe from overflow
// in (2^31-1) / (2^14) = 131072, which is bigger than the n_row. Non-zero
// initial output value is not exceptionally large.
void MatrixScalarMultiplyAccumulate(const int8_t* matrix, int32_t scalar,
int32_t n_row, int32_t n_col,
int32_t* output);
// Add another vector for each batch in the batch vector.
template <typename T>
void VectorBatchVectorAdd(const T* vector, int v_size, int n_batch,
T* batch_vector) {
for (int b = 0; b < n_batch; b++) {
for (int i = 0; i < v_size; ++i) {
batch_vector[i] += vector[i];
}
batch_vector += v_size;
}
}
// Cwise product of two vectors.
template <typename T>
inline void VectorVectorCwiseProduct(const T* __restrict__ vector1,
const T* __restrict__ vector2, int v_size,
T* __restrict__ result) {
for (int v = 0; v < v_size; v++) {
*result++ = *vector1++ * *vector2++;
}
}
// Cwise product of a vector and a batch-vector.
template <typename T>
inline void VectorBatchVectorCwiseProduct(const T* vector, int v_size,
const T* batch_vector, int n_batch,
T* result) {
for (int b = 0; b < n_batch; b++) {
VectorVectorCwiseProduct(vector, batch_vector, v_size, result);
// Update the pointers.
result += v_size;
batch_vector += v_size;
}
}
// Cwise product and accumulate of two vectors. Since it's a MAC operation, the
// assumption here is that result array is initialized to valid values.
template <typename T>
inline void VectorVectorCwiseProductAccumulate(const T* __restrict__ vector1,
const T* __restrict__ vector2,
int v_size,
T* __restrict__ result) {
for (int v = 0; v < v_size; v++) {
*result++ += *vector1++ * *vector2++;
}
}
// Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC
// operation, the assumption here is that result array is initialized to valid
// values.
template <typename T>
inline void VectorBatchVectorCwiseProductAccumulate(const T* vector, int v_size,
const T* batch_vector,
int n_batch, T* result) {
for (int b = 0; b < n_batch; b++) {
VectorVectorCwiseProductAccumulate(vector, batch_vector, v_size, result);
// Update the pointers.
result += v_size;
batch_vector += v_size;
}
}
// Batch vector initialization with another vector.
template <typename T>
void VectorBatchVectorAssign(const T* vector, int v_size, int n_batch,
T* batch_vector) {
for (int b = 0; b < n_batch; b++) {
std::copy_n(vector, v_size, batch_vector + b * v_size);
}
}
} // namespace tensor_utils
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_PORTABLE_TENSOR_UTILS_H_

View File

@@ -0,0 +1,416 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include <algorithm>
#include <cmath>
#include <limits>
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
namespace tflite {
namespace {
// These constants are used to manipulate the binary representation of doubles.
// Double-precision binary64 floating point format is:
// Bit | 63 | 62-52 | 51-0 |
// | Sign | Exponent | Fraction |
// To avoid 64-bit integers as much as possible, I break this into high and
// low 32-bit chunks. High is:
// Bit | 31 | 30-20 | 19-0 |
// | Sign | Exponent | High Fraction |
// Low is:
// Bit | 31-0 |
// | Low Fraction |
// We then access the components through logical bit-wise operations to
// extract the parts needed, with the positions and masks derived from the
// layout shown above.
constexpr uint64_t kSignMask = 0x8000000000000000LL;
constexpr uint64_t kExponentMask = 0x7ff0000000000000LL;
constexpr int32_t kExponentShift = 52;
constexpr int32_t kExponentBias = 1023;
constexpr uint32_t kExponentIsBadNum = 0x7ff;
constexpr uint64_t kFractionMask = 0x000fffffffc00000LL;
constexpr uint32_t kFractionShift = 22;
constexpr uint32_t kFractionRoundingMask = 0x003fffff;
constexpr uint32_t kFractionRoundingThreshold = 0x00200000;
} // namespace
void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier,
int* shift) {
#if TFLITE_SINGLE_ROUNDING
// Single-rounding MultiplyByQuantizedMultiplier only supports positive
// multipliers.
// TFLITE_DCHECK(double_multiplier >= 0);
#endif
if (double_multiplier == 0.) {
*quantized_multiplier = 0;
*shift = 0;
return;
}
#ifdef TFLITE_EMULATE_FLOAT
// If we're trying to avoid the use of floating-point instructions (for
// example on microcontrollers) then use an alternative implementation
// that only requires integer and bitwise operations. To enable this, you
// need to set the define during the build process for your platform.
int64_t q_fixed = IntegerFrExp(double_multiplier, shift);
#else // TFLITE_EMULATE_FLOAT
const double q = std::frexp(double_multiplier, shift);
auto q_fixed = static_cast<int64_t>(TfLiteRound(q * (1LL << 31)));
#endif // TFLITE_EMULATE_FLOAT
TFLITE_CHECK(q_fixed <= (1LL << 31));
if (q_fixed == (1LL << 31)) {
q_fixed /= 2;
++*shift;
}
TFLITE_CHECK_LE(q_fixed, std::numeric_limits<int32_t>::max());
// A shift amount smaller than -31 would cause all bits to be shifted out
// and thus all results would be zero. We implement that instead with
// q_fixed==0, so as to avoid hitting issues with right-shift
// operations with shift amounts greater than 31. Note that this happens
// roughly when abs(double_multiplier) < 2^-31 and the present handling means
// that we're effectively flushing tiny double_multiplier's to zero.
// We could conceivably handle values in the range (roughly) [32, 63]
// as 'denormals' i.e. (shift==0, q_fixed < 2^30). In that point of view
// the present handling is just doing 'flush denormals to zero'. We could
// reconsider and actually generate nonzero denormals if a need arises.
if (*shift < -31) {
*shift = 0;
q_fixed = 0;
}
#if TFLITE_SINGLE_ROUNDING
// Single-rounding MultiplyByQuantizedMultiplier doesn't support a shift > 30,
// saturate it.
if (*shift > 30) {
*shift = 30;
q_fixed = (1LL << 31) - 1;
}
#endif
*quantized_multiplier = static_cast<int32_t>(q_fixed);
}
void QuantizeMultiplierGreaterThanOne(double double_multiplier,
int32_t* quantized_multiplier,
int* left_shift) {
TFLITE_CHECK_GT(double_multiplier, 1.);
QuantizeMultiplier(double_multiplier, quantized_multiplier, left_shift);
TFLITE_CHECK_GE(*left_shift, 0);
}
void QuantizeMultiplierSmallerThanOneExp(double double_multiplier,
int32_t* quantized_multiplier,
int* left_shift) {
TFLITE_CHECK_LT(double_multiplier, 1.);
TFLITE_CHECK_GT(double_multiplier, 0.);
int shift;
QuantizeMultiplier(double_multiplier, quantized_multiplier, &shift);
TFLITE_CHECK_LE(shift, 0);
*left_shift = shift;
}
int64_t IntegerFrExp(double input, int* shift) {
// Make sure our assumptions about the double layout hold.
TFLITE_CHECK_EQ(8, sizeof(double));
// We want to access the bits of the input double value directly, which is
// tricky to do safely, so use a union to handle the casting.
union {
double double_value;
uint64_t double_as_uint;
} cast_union;
cast_union.double_value = input;
const uint64_t u = cast_union.double_as_uint;
// If the bitfield is all zeros apart from the sign bit, this is a normalized
// zero value, so return standard values for this special case.
if ((u & ~kSignMask) == 0) {
*shift = 0;
return 0;
}
// Deal with NaNs and Infs, which are always indicated with a fixed pattern in
// the exponent, and distinguished by whether the fractions are zero or
// non-zero.
const uint32_t exponent_part = ((u & kExponentMask) >> kExponentShift);
if (exponent_part == kExponentIsBadNum) {
*shift = std::numeric_limits<int>::max();
if (u & kFractionMask) {
// NaN, so just return zero (with the exponent set to INT_MAX).
return 0;
} else {
// Infinity, so return +/- INT_MAX.
if (u & kSignMask) {
return std::numeric_limits<int64_t>::min();
} else {
return std::numeric_limits<int64_t>::max();
}
}
}
// The shift is fairly easy to extract from the high bits of the double value,
// just by masking it out and applying a bias. The std::frexp() implementation
// always returns values between 0.5 and 1.0 though, whereas the exponent
// assumes 1.0 to 2.0 is the standard range, so I add on one to match that
// interface.
*shift = (exponent_part - kExponentBias) + 1;
// There's an implicit high bit in the double format definition, so make sure
// we include that at the top, and then reconstruct the rest of the fractional
// value from the remaining fragments.
int64_t fraction = 0x40000000 + ((u & kFractionMask) >> kFractionShift);
// We're cutting off some bits at the bottom, so to exactly match the standard
// frexp implementation here we'll apply rounding by adding one to the least
// significant bit of the result if the discarded portion is over half of the
// maximum.
if ((u & kFractionRoundingMask) > kFractionRoundingThreshold) {
fraction += 1;
}
// Negate the fraction if the sign bit was set.
if (u & kSignMask) {
fraction *= -1;
}
return fraction;
}
double DoubleFromFractionAndShift(int64_t fraction, int shift) {
union {
double double_value;
uint64_t double_as_uint;
} result;
// Detect NaNs and infinities.
if (shift == std::numeric_limits<int>::max()) {
if (fraction == 0) {
return std::numeric_limits<double>::quiet_NaN();
} else if (fraction > 0) {
return std::numeric_limits<double>::infinity();
} else {
return -std::numeric_limits<double>::infinity();
}
}
// Return a normalized zero for a zero fraction.
if (fraction == 0) {
result.double_as_uint = 0;
return result.double_value;
}
bool is_negative = (fraction < 0);
int64_t encoded_fraction = is_negative ? -fraction : fraction;
int64_t encoded_shift = (shift - 1);
while (encoded_fraction < 0x40000000) {
encoded_fraction *= 2;
encoded_shift -= 1;
}
while (encoded_fraction > 0x80000000) {
encoded_fraction /= 2;
encoded_shift += 1;
}
encoded_fraction -= 0x40000000;
if (encoded_shift < -1022) {
encoded_shift = -1023;
} else if (encoded_shift > 1022) {
encoded_shift = 1023;
}
encoded_shift += kExponentBias;
uint64_t encoded_sign = is_negative ? kSignMask : 0;
result.double_as_uint = encoded_sign | (encoded_shift << kExponentShift) |
(encoded_fraction << kFractionShift);
return result.double_value;
}
double IntegerDoubleMultiply(double a, double b) {
int a_shift;
const int64_t a_fraction = IntegerFrExp(a, &a_shift);
int b_shift;
const int64_t b_fraction = IntegerFrExp(b, &b_shift);
// Detect NaNs and infinities.
if (a_shift == std::numeric_limits<int>::max() ||
(b_shift == std::numeric_limits<int>::max())) {
return std::numeric_limits<double>::quiet_NaN();
}
const int result_shift = a_shift + b_shift + 1;
const int64_t result_fraction = (a_fraction * b_fraction) >> 32;
return DoubleFromFractionAndShift(result_fraction, result_shift);
}
int IntegerDoubleCompare(double a, double b) {
int a_shift;
const int64_t a_fraction = IntegerFrExp(a, &a_shift);
int b_shift;
const int64_t b_fraction = IntegerFrExp(b, &b_shift);
// Detect NaNs and infinities.
if (a_shift == std::numeric_limits<int>::max() ||
(b_shift == std::numeric_limits<int>::max())) {
return 1;
}
if ((a_fraction == 0) && (b_fraction < 0)) {
return 1;
} else if ((a_fraction < 0) && (b_fraction == 0)) {
return -1;
} else if (a_shift < b_shift) {
return -1;
} else if (a_shift > b_shift) {
return 1;
} else if (a_fraction < b_fraction) {
return -1;
} else if (a_fraction > b_fraction) {
return 1;
} else {
return 0;
}
}
void PreprocessSoftmaxScaling(double beta, double input_scale,
int input_integer_bits,
int32_t* quantized_multiplier, int* left_shift) {
// If the overall multiplier (input and beta) is large, then exp() of an
// input difference of 1 scaled by this will be large. In other words, we
// can cap the multiplier and know that, when it is used, the output will be
// (round to) zero wherever the input is not at the maximum value.
// If the overall scale is less than one, and input_integer_bits=0, then the
// result is double equivalent of Q0.31 (actually with more precision). Thus
// this generates a Q(input_integer_bits).(31-input_integer_bits)
// representation.
#if TFLITE_SINGLE_ROUNDING
const double max_real_multiplier = (1LL << 30) - 1.0;
#else
const double max_real_multiplier = (1LL << 31) - 1.0;
#endif
#ifdef TFLITE_EMULATE_FLOAT
const double input_beta = IntegerDoubleMultiply(beta, input_scale);
int shift;
int64_t fraction = IntegerFrExp(input_beta, &shift);
shift += (31 - input_integer_bits);
double input_beta_real_multiplier =
DoubleFromFractionAndShift(fraction, shift);
if (IntegerDoubleCompare(input_beta_real_multiplier, max_real_multiplier) >
0) {
input_beta_real_multiplier = max_real_multiplier;
}
#else // TFLITE_EMULATE_FLOAT
const double input_beta_real_multiplier =
std::min<double>(beta * input_scale * (1 << (31 - input_integer_bits)),
max_real_multiplier);
#endif // TFLITE_EMULATE_FLOAT
QuantizeMultiplierGreaterThanOne(input_beta_real_multiplier,
quantized_multiplier, left_shift);
}
void PreprocessLogSoftmaxScalingExp(double beta, double input_scale,
int input_integer_bits,
int32_t* quantized_multiplier,
int* left_shift,
int32_t* reverse_scaling_divisor,
int* reverse_scaling_left_shift) {
PreprocessSoftmaxScaling(beta, input_scale, input_integer_bits,
quantized_multiplier, left_shift);
// Also calculate what amounts to the inverse scaling factor for the input.
const double real_reverse_scaling_divisor =
(1 << (31 - *left_shift)) / static_cast<double>(*quantized_multiplier);
tflite::QuantizeMultiplierSmallerThanOneExp(real_reverse_scaling_divisor,
reverse_scaling_divisor,
reverse_scaling_left_shift);
}
int CalculateInputRadius(int input_integer_bits, int input_left_shift,
int total_signed_bits) {
#ifdef TFLITE_EMULATE_FLOAT
int64_t result = (1 << input_integer_bits) - 1;
result <<= (total_signed_bits - input_integer_bits);
result >>= input_left_shift;
return result;
#else // TFLITE_EMULATE_FLOAT
const double max_input_rescaled =
1.0 * ((1 << input_integer_bits) - 1) *
(1LL << (total_signed_bits - input_integer_bits)) /
(1LL << input_left_shift);
// Tighten bound using floor. Suppose that we could use the exact value.
// After scaling the difference, the result would be at the maximum. Thus we
// must ensure that our value has lower magnitude.
return static_cast<int>(std::floor(max_input_rescaled));
#endif // TFLITE_EMULATE_FLOAT
}
void NudgeQuantizationRange(const float min, const float max,
const int quant_min, const int quant_max,
float* nudged_min, float* nudged_max,
float* nudged_scale) {
// This code originates from tensorflow/core/kernels/fake_quant_ops_functor.h.
const float quant_min_float = static_cast<float>(quant_min);
const float quant_max_float = static_cast<float>(quant_max);
*nudged_scale = (max - min) / (quant_max_float - quant_min_float);
const float zero_point_from_min = quant_min_float - min / *nudged_scale;
uint16_t nudged_zero_point;
if (zero_point_from_min < quant_min_float) {
nudged_zero_point = static_cast<uint16_t>(quant_min);
} else if (zero_point_from_min > quant_max_float) {
nudged_zero_point = static_cast<uint16_t>(quant_max);
} else {
nudged_zero_point = static_cast<uint16_t>(TfLiteRound(zero_point_from_min));
}
*nudged_min = (quant_min_float - nudged_zero_point) * (*nudged_scale);
*nudged_max = (quant_max_float - nudged_zero_point) * (*nudged_scale);
}
void FakeQuantizeArray(const float nudged_scale, const float nudged_min,
const float nudged_max, const float* input_data,
float* output_data, const float size) {
// This code originates from tensorflow/core/kernels/fake_quant_ops_functor.h.
const float inv_nudged_scale = 1.0f / nudged_scale;
for (int i = 0; i < size; i++) {
const float src_val = input_data[i];
const float clamped = std::min(nudged_max, std::max(nudged_min, src_val));
const float clamped_shifted = clamped - nudged_min;
const float dst_val =
TfLiteRound(clamped_shifted * inv_nudged_scale) * nudged_scale +
nudged_min;
output_data[i] = dst_val;
}
}
bool CheckedLog2(const float x, int* log2_result) {
// Using TfLiteRound instead of std::round and std::log instead of
// std::log2 to work around these functions being missing in a toolchain
// used in some TensorFlow tests as of May 2018.
const float x_log2 = std::log(x) * (1.0f / std::log(2.0f));
const float x_log2_rounded = TfLiteRound(x_log2);
const float x_log2_fracpart = x_log2 - x_log2_rounded;
*log2_result = static_cast<int>(x_log2_rounded);
return std::abs(x_log2_fracpart) < 1e-3f;
}
void QuantizeMultiplierArray(const double* effective_scales, size_t size,
int32_t* effective_scale_significand,
int* effective_shift) {
for (size_t i = 0; i < size; ++i) {
QuantizeMultiplier(effective_scales[i], &effective_scale_significand[i],
&effective_shift[i]);
}
}
} // namespace tflite

View File

@@ -0,0 +1,292 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_
#include <cmath>
#include <cstdint>
#include <limits>
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
// Given the min and max values of a float array, return
// reasonable quantization parameters to use for this array.
template <typename T>
QuantizationParams ChooseQuantizationParams(double rmin, double rmax,
bool narrow_range) {
const T qmin = std::numeric_limits<T>::min() + (narrow_range ? 1 : 0);
const T qmax = std::numeric_limits<T>::max();
const double qmin_double = qmin;
const double qmax_double = qmax;
// 0 should always be a representable value. Let's assume that the initial
// min,max range contains 0.
TFLITE_CHECK_LE(rmin, 0.);
TFLITE_CHECK_GE(rmax, 0.);
if (rmin == rmax) {
// Special case where the min,max range is a point. Should be {0}.
TFLITE_CHECK_EQ(rmin, 0.);
TFLITE_CHECK_EQ(rmax, 0.);
QuantizationParams quantization_params;
quantization_params.zero_point = 0;
quantization_params.scale = 0.;
return quantization_params;
}
// General case.
//
// First determine the scale.
const double scale = (rmax - rmin) / (qmax_double - qmin_double);
// Zero-point computation.
// First the initial floating-point computation. The zero-point can be
// determined from solving an affine equation for any known pair
// (real value, corresponding quantized value).
// We know two such pairs: (rmin, qmin) and (rmax, qmax).
// The arithmetic error on the zero point computed from either pair
// will be roughly machine_epsilon * (sum of absolute values of terms)
// so we want to use the variant that adds the smaller terms.
const double zero_point_from_min = qmin_double - rmin / scale;
const double zero_point_from_max = qmax_double - rmax / scale;
const double zero_point_from_min_error =
std::abs(qmin_double) + std::abs(rmin / scale);
const double zero_point_from_max_error =
std::abs(qmax_double) + std::abs(rmax / scale);
const double zero_point_double =
zero_point_from_min_error < zero_point_from_max_error
? zero_point_from_min
: zero_point_from_max;
// Now we need to nudge the zero point to be an integer
// (our zero points are integer, and this is motivated by the requirement
// to be able to represent the real value "0" exactly as a quantized value,
// which is required in multiple places, for example in Im2col with SAME
// padding).
T nudged_zero_point = 0;
if (zero_point_double < qmin_double) {
nudged_zero_point = qmin;
} else if (zero_point_double > qmax_double) {
nudged_zero_point = qmax;
} else {
nudged_zero_point = static_cast<T>(round(zero_point_double));
}
// The zero point should always be in the range of quantized value,
// [qmin, qmax].
TFLITE_CHECK_GE(nudged_zero_point, qmin);
TFLITE_CHECK_LE(nudged_zero_point, qmax);
// Finally, store the result nudged quantization params.
QuantizationParams quantization_params;
quantization_params.zero_point = nudged_zero_point;
quantization_params.scale = scale;
return quantization_params;
}
template <typename T>
QuantizationParams ChooseQuantizationParams(double rmin, double rmax) {
return ChooseQuantizationParams<T>(rmin, rmax, false);
}
// Converts a floating-point number to an integer. For all inputs x where
// static_cast<IntOut>(x) is legal according to the C++ standard, the result
// is identical to that cast (i.e. the result is x with its fractional part
// truncated whenever that is representable as IntOut).
//
// static_cast would cause undefined behavior for the following cases, which
// have well-defined behavior for this function:
//
// 1. If x is NaN, the result is zero.
//
// 2. If the truncated form of x is above the representable range of IntOut,
// the result is std::numeric_limits<IntOut>::max().
//
// 3. If the truncated form of x is below the representable range of IntOut,
// the result is std::numeric_limits<IntOut>::min().
//
// Note that cases #2 and #3 cover infinities as well as finite numbers.
//
// The range of FloatIn must include the range of IntOut, otherwise
// the results are undefined.
// TODO(sfeuz): Replace by absl::SafeCast once available.
template <class IntOut, class FloatIn>
IntOut SafeCast(FloatIn x) {
static_assert(!std::numeric_limits<FloatIn>::is_integer,
"FloatIn is integer");
static_assert(std::numeric_limits<IntOut>::is_integer,
"IntOut is not integer");
static_assert(std::numeric_limits<IntOut>::radix == 2, "IntOut is base 2");
// Special case NaN, for which the logic below doesn't work.
if (std::isnan(x)) {
return 0;
}
// Negative values all clip to zero for unsigned results.
if (!std::numeric_limits<IntOut>::is_signed && x < 0) {
return 0;
}
// Handle infinities.
if (std::isinf(x)) {
return x < 0 ? std::numeric_limits<IntOut>::min()
: std::numeric_limits<IntOut>::max();
}
// Set exp such that x == f * 2^exp for some f with |f| in [0.5, 1.0),
// unless x is zero in which case exp == 0. Note that this implies that the
// magnitude of x is strictly less than 2^exp.
int exp = 0;
std::frexp(x, &exp);
// Let N be the number of non-sign bits in the representation of IntOut. If
// the magnitude of x is strictly less than 2^N, the truncated version of x
// is representable as IntOut. The only representable integer for which this
// is not the case is kMin for signed types (i.e. -2^N), but that is covered
// by the fall-through below.
if (exp <= std::numeric_limits<IntOut>::digits) {
return x;
}
// Handle numbers with magnitude >= 2^N.
return x < 0 ? std::numeric_limits<IntOut>::min()
: std::numeric_limits<IntOut>::max();
}
// Decompose a double multiplier into a Q0.31 int32 representation of its
// significand, and shift representation of NEGATIVE its exponent ---
// this is intended as a RIGHT-shift.
//
// Restricted to the case where the multiplier < 1 (and non-negative).
void QuantizeMultiplierSmallerThanOneExp(double double_multiplier,
int32_t* quantized_multiplier,
int* left_shift);
// Decompose a double multiplier into a Q0.31 int32 representation of its
// significand, and shift representation of its exponent.
//
// Restricted to the case where the multiplier > 1.
void QuantizeMultiplierGreaterThanOne(double double_multiplier,
int32_t* quantized_multiplier,
int* left_shift);
// Decompose a double multiplier into a Q0.31 int32 representation of its
// significand, and shift representation of its exponent.
//
// Handles an arbitrary positive multiplier. The 'shift' output-value is
// basically the 'floating-point exponent' of the multiplier:
// Negative for a right-shift (when the multiplier is <1), positive for a
// left-shift (when the multiplier is >1)
void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier,
int* shift);
// Splits a double input value into a returned fraction, and a shift value from
// the exponent, using only bitwise and integer operations to support
// microcontrollers and other environments without floating-point support.
//
// This is designed to be a replacement for how std::frexp() is used within the
// QuantizeMultiplier() function, and so has a different signature than the
// standard version, returning a 64-bit integer rather than a double. This
// result has a maximum value of 1<<31, with the fraction expressed as a
// proportion of that maximum.
//
// std::frexp() returns NaNs and infinities unmodified, but since we're
// returning integers that can't represent those values, instead we return
// a shift of std::numeric_limits<int>::max() for all bad numbers, with an int64
// result of 0 for NaNs, std:numeric_limits<int64_t>::max() for +INFINITY, and
// std::numeric_limits<int64_t>::min() for -INFINITY. Denormalized inputs will
// result in return values that end up truncating some bits at the end,
// reflecting the loss of precision inherent in denormalization.
int64_t IntegerFrExp(double input, int* shift);
// Converts an integer fraction in the format produced by IntegerFrExp (where
// 0x40000000 is 1.0) and an exponent shift (between -1022 and +1022) into an
// IEEE binary64 double format result. The implementation uses only integer and
// bitwise operators, so no floating point hardware support or emulation is
// needed. This is here so quantized operations can run non-time-critical
// preparation calculations on microcontrollers and other platforms without
// float support.
double DoubleFromFractionAndShift(int64_t fraction, int shift);
// Performs a multiplication of two numbers in double format, using only integer
// and bitwise instructions. This is aimed at supporting housekeeping functions
// for quantized operations on microcontrollers without floating-point hardware.
double IntegerDoubleMultiply(double a, double b);
// Returns -1 if a is less than b, 0 if a and b are equal, and +1 if a is
// greater than b. It is implemented using only integer and logical instructions
// so that it can be easily run on microcontrollers for quantized operations.
int IntegerDoubleCompare(double a, double b);
// This first creates a multiplier in a double equivalent of
// Q(input_integer_bits).(31-input_integer_bits) representation, with extra
// precision in the double's fractional bits. It then splits the result into
// significand and exponent.
void PreprocessSoftmaxScaling(double beta, double input_scale,
int input_integer_bits,
int32_t* quantized_multiplier, int* left_shift);
// Like PreprocessSoftmaxScaling, but inverse scaling factors also calculated.
void PreprocessLogSoftmaxScalingExp(double beta, double input_scale,
int input_integer_bits,
int32_t* quantized_multiplier,
int* left_shift,
int32_t* reverse_scaling_divisor,
int* reverse_scaling_left_shift);
// Calculate the largest input that will result in a within-bounds intermediate
// result within MultiplyByQuantizedMultiplierGreaterThanOne. In other words,
// it must not overflow before we reduce the value by multiplication by the
// input multiplier. The negative radius is used as the minimum difference in
// Softmax.
int CalculateInputRadius(int input_integer_bits, int input_left_shift,
int total_signed_bits = 31);
// Nudges a min/max quantization range to ensure zero is zero.
// Gymnastics with nudged zero point is to ensure that real zero maps to
// an integer, which is required for e.g. zero-padding in convolutional layers.
// Outputs nudged_min, nudged_max, nudged_scale.
void NudgeQuantizationRange(const float min, const float max,
const int quant_min, const int quant_max,
float* nudged_min, float* nudged_max,
float* nudged_scale);
// Fake quantizes (quantizes and dequantizes) input_data using the scale,
// nudged_min, and nudged_max from NudgeQuantizationRange. This matches the code
// in TensorFlow's FakeQuantizeWithMinMaxVarsFunctor.
void FakeQuantizeArray(const float nudged_scale, const float nudged_min,
const float nudged_max, const float* input_data,
float* output_data, const float size);
// If x is approximately a power of two (with any positive or negative
// exponent), stores that exponent (i.e. log2(x)) in *log2_result, otherwise
// returns false.
bool CheckedLog2(const float x, int* log2_result);
// Decomposes an array of double multipliers into a Q0.31 int32 representation
// of its significand, and shift representation of its exponent.
//
// Handles an arbitrary multiplier. The 'shift' output-value is
// basically the 'floating-point exponent' of the multiplier:
// Negative for a right-shift (when the multiplier is <1), positive for a
// left-shift (when the multiplier is >1)
void QuantizeMultiplierArray(const double* effective_scales, size_t size,
int32_t* effective_scale_significand,
int* effective_shift);
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_

View File

@@ -0,0 +1,399 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_H_
#include <type_traits>
#include "fixedpoint/fixedpoint.h"
#include "tensorflow/lite/kernels/internal/common.h"
namespace tflite {
namespace reference_ops {
template <typename T>
inline void Add(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const T* input1_data,
const RuntimeShape& input2_shape, const T* input2_data,
const RuntimeShape& output_shape, T* output_data) {
T activation_min, activation_max;
GetActivationParams(params, &activation_min, &activation_max);
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
for (int i = 0; i < flat_size; ++i) {
output_data[i] = ActivationFunctionWithMinMax(
input1_data[i] + input2_data[i], activation_min, activation_max);
}
}
// Element-wise add that can often be used for inner loop of broadcast add as
// well as the non-broadcast add.
// This function is used for 8-bit as well as for 16-bit, but the accumulator
// is 32-bit for both cases. The overflow does not happen due to the
// choice of the shift (20 or 15, accordingly - see add.cc for more comments).
template <typename T>
inline void AddElementwise(int size, const ArithmeticParams& params,
const T* input1_data, const T* input2_data,
T* output_data) {
TFLITE_DCHECK_GT(params.input1_offset, -std::numeric_limits<T>::max());
TFLITE_DCHECK_GT(params.input2_offset, -std::numeric_limits<T>::max());
TFLITE_DCHECK_LT(params.input1_offset, std::numeric_limits<T>::max());
TFLITE_DCHECK_LT(params.input2_offset, std::numeric_limits<T>::max());
for (int i = 0; i < size; ++i) {
const int32_t input1_val = params.input1_offset + input1_data[i];
const int32_t input2_val = params.input2_offset + input2_data[i];
const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
const int32_t scaled_input1_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input1_val, params.input1_multiplier, params.input1_shift);
const int32_t scaled_input2_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input2_val, params.input2_multiplier, params.input2_shift);
const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
const int32_t raw_output =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
raw_sum, params.output_multiplier, params.output_shift) +
params.output_offset;
const int32_t clamped_output =
std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, raw_output));
output_data[i] = static_cast<T>(clamped_output);
}
}
// Scalar-broadcast add that can be used for inner loop of more general
// broadcast add, so that, for example, scalar-broadcast with batch will still
// be fast.
inline void AddScalarBroadcast(int size, const ArithmeticParams& params,
uint8_t input1_data, const uint8_t* input2_data,
uint8_t* output_data) {
TFLITE_DCHECK_GT(params.input1_offset, -256);
TFLITE_DCHECK_GT(params.input2_offset, -256);
TFLITE_DCHECK_LT(params.input1_offset, 256);
TFLITE_DCHECK_LT(params.input2_offset, 256);
const int32_t input1_val = params.input1_offset + input1_data;
const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
const int32_t scaled_input1_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input1_val, params.input1_multiplier, params.input1_shift);
for (int i = 0; i < size; ++i) {
const int32_t input2_val = params.input2_offset + input2_data[i];
const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
const int32_t scaled_input2_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input2_val, params.input2_multiplier, params.input2_shift);
const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
const int32_t raw_output =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
raw_sum, params.output_multiplier, params.output_shift) +
params.output_offset;
const int32_t clamped_output =
std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, raw_output));
output_data[i] = static_cast<uint8_t>(clamped_output);
}
}
inline void Add(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const uint8_t* input1_data,
const RuntimeShape& input2_shape, const uint8_t* input2_data,
const RuntimeShape& output_shape, uint8_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
TFLITE_DCHECK_GT(params.input1_offset, -256);
TFLITE_DCHECK_GT(params.input2_offset, -256);
TFLITE_DCHECK_LT(params.input1_offset, 256);
TFLITE_DCHECK_LT(params.input2_offset, 256);
AddElementwise(flat_size, params, input1_data, input2_data, output_data);
}
inline void AddGeneralParamScale(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const int16_t* input1_data,
const RuntimeShape& input2_shape,
const int16_t* input2_data,
const RuntimeShape& output_shape,
int16_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
int max_value = std::numeric_limits<int16_t>::max();
TFLITE_DCHECK_GT(params.input1_offset, -max_value);
TFLITE_DCHECK_GT(params.input2_offset, -max_value);
TFLITE_DCHECK_LT(params.input1_offset, max_value);
TFLITE_DCHECK_LT(params.input2_offset, max_value);
AddElementwise(flat_size, params, input1_data, input2_data, output_data);
}
inline void Add(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const int16_t* input1_data,
const RuntimeShape& input2_shape, const int16_t* input2_data,
const RuntimeShape& output_shape, int16_t* output_data,
bool pot_scale = true) {
if (!pot_scale) {
AddGeneralParamScale(params, input1_shape, input1_data, input2_shape,
input2_data, output_shape, output_data);
return;
}
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
const int input1_shift = params.input1_shift;
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
const int16_t output_activation_min = params.quantized_activation_min;
const int16_t output_activation_max = params.quantized_activation_max;
TFLITE_DCHECK(input1_shift == 0 || params.input2_shift == 0);
TFLITE_DCHECK_LE(input1_shift, 0);
TFLITE_DCHECK_LE(params.input2_shift, 0);
const int16_t* not_shift_input =
input1_shift == 0 ? input1_data : input2_data;
const int16_t* shift_input = input1_shift == 0 ? input2_data : input1_data;
const int input_right_shift =
input1_shift == 0 ? -params.input2_shift : -input1_shift;
for (int i = 0; i < flat_size; i++) {
// F0 uses 0 integer bits, range [-1, 1].
using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
F0 input_ready_scaled = F0::FromRaw(not_shift_input[i]);
F0 scaled_input = F0::FromRaw(
gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift));
F0 result = gemmlowp::SaturatingAdd(scaled_input, input_ready_scaled);
const int16_t raw_output = result.raw();
const int16_t clamped_output = std::min(
output_activation_max, std::max(output_activation_min, raw_output));
output_data[i] = clamped_output;
}
}
template <typename T>
inline typename std::enable_if<!is_small_integer<T>::value, void>::type
BroadcastAdd4DSlow(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const T* input1_data,
const RuntimeShape& input2_shape, const T* input2_data,
const RuntimeShape& output_shape, T* output_data) {
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
&desc2);
const RuntimeShape extended_output_shape =
RuntimeShape::ExtendedShape(4, output_shape);
T activation_min, activation_max;
GetActivationParams(params, &activation_min, &activation_max);
// In Tensorflow, the dimensions are canonically named (batch_number, row,
// col, channel), with extents (batches, height, width, depth), with the
// trailing dimension changing most rapidly (channels has the smallest stride,
// typically 1 element).
//
// In generated C code, we store arrays with the dimensions reversed. The
// first dimension has smallest stride.
//
// We name our variables by their Tensorflow convention, but generate C code
// nesting loops such that the innermost loop has the smallest stride for the
// best cache behavior.
for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
output_data[Offset(extended_output_shape, b, y, x, c)] =
ActivationFunctionWithMinMax<T>(
input1_data[SubscriptToIndex(desc1, b, y, x, c)] +
input2_data[SubscriptToIndex(desc2, b, y, x, c)],
activation_min, activation_max);
}
}
}
}
}
// This function is used for 8-bit as well as for 16-bit, but the accumulator
// is 32-bit for both cases. The overflow does not happen due to the
// choice of the shift (20 or 15, accordingly - see add.cc for more comments).
template <typename T>
inline typename std::enable_if<is_small_integer<T>::value, void>::type
BroadcastAdd4DSlow(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const T* input1_data,
const RuntimeShape& input2_shape, const T* input2_data,
const RuntimeShape& output_shape, T* output_data) {
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
&desc2);
const RuntimeShape extended_output_shape =
RuntimeShape::ExtendedShape(4, output_shape);
// In Tensorflow, the dimensions are canonically named (batch_number, row,
// col, channel), with extents (batches, height, width, depth), with the
// trailing dimension changing most rapidly (channels has the smallest stride,
// typically 1 element).
//
// In generated C code, we store arrays with the dimensions reversed. The
// first dimension has smallest stride.
//
// We name our variables by their Tensorflow convention, but generate C code
// nesting loops such that the innermost loop has the smallest stride for the
// best cache behavior.
for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
const int32_t input1_val =
params.input1_offset +
input1_data[SubscriptToIndex(desc1, b, y, x, c)];
const int32_t input2_val =
params.input2_offset +
input2_data[SubscriptToIndex(desc2, b, y, x, c)];
const int32_t shifted_input1_val =
input1_val * (1 << params.left_shift);
const int32_t shifted_input2_val =
input2_val * (1 << params.left_shift);
const int32_t scaled_input1_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input1_val, params.input1_multiplier,
params.input1_shift);
const int32_t scaled_input2_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input2_val, params.input2_multiplier,
params.input2_shift);
const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
const int32_t raw_output =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
raw_sum, params.output_multiplier, params.output_shift) +
params.output_offset;
const int32_t clamped_output =
std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, raw_output));
output_data[Offset(extended_output_shape, b, y, x, c)] =
static_cast<T>(clamped_output);
}
}
}
}
}
inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
const RuntimeShape& unswitched_input1_shape,
const uint8_t* unswitched_input1_data,
const RuntimeShape& unswitched_input2_shape,
const uint8_t* unswitched_input2_data,
const RuntimeShape& output_shape,
uint8_t* output_data) {
ArithmeticParams switched_params = unswitched_params;
switched_params.input1_offset = unswitched_params.input2_offset;
switched_params.input1_multiplier = unswitched_params.input2_multiplier;
switched_params.input1_shift = unswitched_params.input2_shift;
switched_params.input2_offset = unswitched_params.input1_offset;
switched_params.input2_multiplier = unswitched_params.input1_multiplier;
switched_params.input2_shift = unswitched_params.input1_shift;
const bool use_unswitched =
unswitched_params.broadcast_category ==
tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;
const ArithmeticParams& params =
use_unswitched ? unswitched_params : switched_params;
const uint8_t* input1_data =
use_unswitched ? unswitched_input1_data : unswitched_input2_data;
const uint8_t* input2_data =
use_unswitched ? unswitched_input2_data : unswitched_input1_data;
// Fivefold nested loops. The second input resets its position for each
// iteration of the second loop. The first input resets its position at the
// beginning of the fourth loop. The innermost loop is an elementwise add of
// sections of the arrays.
uint8_t* output_data_ptr = output_data;
const uint8_t* input1_data_ptr = input1_data;
const uint8_t* input2_data_reset = input2_data;
// In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared
// between input shapes. y3 for input 1 is always broadcast, and so the
// dimension there is 1, whereas optionally y1 might be broadcast for input 2.
// Put another way,
// input1.shape.FlatSize = y0 * y1 * y2 * y4,
// input2.shape.FlatSize = y0 * y2 * y3 * y4.
int y0 = params.broadcast_shape[0];
int y1 = params.broadcast_shape[1];
int y2 = params.broadcast_shape[2];
int y3 = params.broadcast_shape[3];
int y4 = params.broadcast_shape[4];
if (y4 > 1) {
// General fivefold pattern, with y4 > 1 so there is a non-broadcast inner
// dimension.
for (int i0 = 0; i0 < y0; ++i0) {
const uint8_t* input2_data_ptr;
for (int i1 = 0; i1 < y1; ++i1) {
input2_data_ptr = input2_data_reset;
for (int i2 = 0; i2 < y2; ++i2) {
for (int i3 = 0; i3 < y3; ++i3) {
AddElementwise(y4, params, input1_data_ptr, input2_data_ptr,
output_data_ptr);
input2_data_ptr += y4;
output_data_ptr += y4;
}
// We have broadcast y4 of input1 data y3 times, and now move on.
input1_data_ptr += y4;
}
}
// We have broadcast y2*y3*y4 of input2 data y1 times, and now move on.
input2_data_reset = input2_data_ptr;
}
} else {
// Special case of y4 == 1, in which the innermost loop is a single element
// and can be combined with the next (y3) as an inner broadcast.
//
// Note that this handles the case of pure scalar broadcast when
// y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar
// broadcast with batch (as y2 > 1).
//
// NOTE The process is the same as the above general case except simplified
// for y4 == 1 and the loop over y3 is contained within the
// AddScalarBroadcast function.
for (int i0 = 0; i0 < y0; ++i0) {
const uint8_t* input2_data_ptr;
for (int i1 = 0; i1 < y1; ++i1) {
input2_data_ptr = input2_data_reset;
for (int i2 = 0; i2 < y2; ++i2) {
AddScalarBroadcast(y3, params, *input1_data_ptr, input2_data_ptr,
output_data_ptr);
input2_data_ptr += y3;
output_data_ptr += y3;
input1_data_ptr += 1;
}
}
input2_data_reset = input2_data_ptr;
}
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_H_

View File

@@ -0,0 +1,86 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_N_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_N_H_
#include <algorithm>
#include <limits>
#include "tensorflow/lite/kernels/internal/common.h"
namespace tflite {
namespace reference_ops {
// T is expected to be either float or int.
template <typename T>
inline void AddN(const RuntimeShape& input_shape, const size_t num_inputs,
const T* const* input_data, T* output_data) {
// All inputs and output should have the same shape, this is checked during
// Prepare stage.
const size_t size = input_shape.FlatSize();
for (size_t i = 0; i < size; ++i) {
T x = 0;
for (size_t j = 0; j < num_inputs; ++j) {
x += input_data[j][i];
}
output_data[i] = x;
}
}
inline void AddN(const ArithmeticParams& params,
const RuntimeShape& input_shape, const size_t num_inputs,
const int8_t* const* input_data, int8_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
// Input offset is negative input zero point. Activation tensors are
// asymmetric quantized so they span the full int8 range.
// All inputs should have same zero-point and scale, this is checked during
// Prepare stage.
TFLITE_DCHECK_GE(-params.input1_offset, std::numeric_limits<int8_t>::min());
TFLITE_DCHECK_LE(-params.input1_offset, std::numeric_limits<int8_t>::max());
// All inputs and output should have the same shape, this is checked during
// Prepare stage.
const size_t size = input_shape.FlatSize();
for (size_t i = 0; i < size; ++i) {
// accumulate in scaled_x before clamping to avoid overflow
const int32_t x = params.input1_offset; // x = 0
const int32_t shifted_x = x * (1 << params.left_shift);
int32_t scaled_x = MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_x, params.input1_multiplier, params.input1_shift);
for (size_t j = 0; j < num_inputs; ++j) {
const int32_t y = params.input1_offset + input_data[j][i];
const int32_t shifted_y = y * (1 << params.left_shift);
int32_t scaled_y = MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_y, params.input1_multiplier, params.input1_shift);
scaled_x += scaled_y;
}
const int32_t raw_output =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
scaled_x, params.output_multiplier, params.output_shift) +
params.output_offset;
const int32_t clamped_output =
std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, raw_output));
output_data[i] = static_cast<int8_t>(clamped_output);
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_N_H_

View File

@@ -0,0 +1,88 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ARG_MIN_MAX_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ARG_MIN_MAX_H_
#include <functional>
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
template <typename T>
std::function<bool(T, T)> GetComparefunction(bool is_arg_max) {
if (is_arg_max) {
return std::greater<T>();
} else {
return std::less<T>();
}
}
template <typename T1, typename T2, typename T3, typename Cmp>
void ArgMinMax(const RuntimeShape& input1_shape, const T1* input1_data,
const T3* input2_data, const RuntimeShape& output_shape,
T2* output_data, const Cmp& cmp) {
TFLITE_DCHECK_GT(input1_shape.DimensionsCount(), 0);
TFLITE_DCHECK_EQ(input1_shape.DimensionsCount() - 1,
output_shape.DimensionsCount());
int axis = input2_data[0];
if (axis < 0) {
axis += input1_shape.DimensionsCount();
}
const int axis_size = input1_shape.Dims(axis);
int outer_size = 1;
for (int i = 0; i < axis; ++i) {
TFLITE_DCHECK_EQ(input1_shape.Dims(i), output_shape.Dims(i));
outer_size *= input1_shape.Dims(i);
}
int inner_size = 1;
const int dims_count = input1_shape.DimensionsCount();
for (int i = axis + 1; i < dims_count; ++i) {
TFLITE_DCHECK_EQ(input1_shape.Dims(i), output_shape.Dims(i - 1));
inner_size *= input1_shape.Dims(i);
}
for (int outer = 0; outer < outer_size; ++outer) {
for (int inner = 0; inner < inner_size; ++inner) {
auto min_max_value = input1_data[outer * axis_size * inner_size + inner];
T2 min_max_index = 0;
for (int i = 1; i < axis_size; ++i) {
const auto& curr_value =
input1_data[(outer * axis_size + i) * inner_size + inner];
if (cmp(curr_value, min_max_value)) {
min_max_value = curr_value;
min_max_index = static_cast<T2>(i);
}
}
output_data[outer * inner_size + inner] = min_max_index;
}
}
}
template <typename T1, typename T2, typename T3>
void ArgMinMax(const RuntimeShape& input1_shape, const T1* input1_data,
const T3* input2_data, const RuntimeShape& output_shape,
T2* output_data, const bool is_arg_max) {
ArgMinMax(input1_shape, input1_data, input2_data, output_shape, output_data,
GetComparefunction<T1>(is_arg_max));
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ARG_MIN_MAX_H_

View File

@@ -0,0 +1,275 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BATCH_MATMUL_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BATCH_MATMUL_H_
#include <algorithm>
#include <cstdint>
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/tensor_utils_common.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
namespace batch_matmul {
// Determine which dimension is the broadcast dimension.
inline int broadcast_dim(int lhs_dim, int rhs_dim) {
if (lhs_dim == rhs_dim) return lhs_dim;
if (lhs_dim == 1) return rhs_dim;
TFLITE_DCHECK_EQ(rhs_dim, 1);
return lhs_dim;
}
// Compute the "extent" for iterating on this dimension.
// If we are broadcasting, then don't advance (i.e return 0).
inline int extent(const RuntimeShape& shape, int x) {
if (shape.Dims(x) == 1) {
return 0;
}
int prod = 1;
for (int i = x + 1; i < shape.DimensionsCount(); ++i) {
prod *= shape.Dims(i);
}
return prod;
}
} // namespace batch_matmul
template <typename Ta, typename Tb, typename Tout>
inline void BatchMatMul(const RuntimeShape& lhs_shape, const Ta* lhs_data,
const RuntimeShape& rhs_shape, const Tb* rhs_data,
const RuntimeShape& output_shape, Tout* output_data) {
const RuntimeShape extended_lhs_shape =
RuntimeShape::ExtendedShape(5, lhs_shape);
const RuntimeShape extended_rhs_shape =
RuntimeShape::ExtendedShape(5, rhs_shape);
const int batch_dim0 = batch_matmul::broadcast_dim(
extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0));
const int batch_dim1 = batch_matmul::broadcast_dim(
extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1));
const int batch_dim2 = batch_matmul::broadcast_dim(
extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2));
const int lhs_ext0 = batch_matmul::extent(extended_lhs_shape, 0);
const int lhs_ext1 = batch_matmul::extent(extended_lhs_shape, 1);
const int lhs_ext2 = batch_matmul::extent(extended_lhs_shape, 2);
const int rhs_ext0 = batch_matmul::extent(extended_rhs_shape, 0);
const int rhs_ext1 = batch_matmul::extent(extended_rhs_shape, 1);
const int rhs_ext2 = batch_matmul::extent(extended_rhs_shape, 2);
// Set params for each matrix multiply.
const int lhs_rows = extended_lhs_shape.Dims(3);
const int rhs_cols = extended_rhs_shape.Dims(4);
const int accum_depth = extended_lhs_shape.Dims(4);
for (int b0 = 0; b0 < batch_dim0; ++b0) {
const Ta* lhs_ptr0 = lhs_data + (b0 * lhs_ext0);
const Tb* rhs_ptr0 = rhs_data + (b0 * rhs_ext0);
for (int b1 = 0; b1 < batch_dim1; ++b1) {
const Ta* lhs_ptr1 = lhs_ptr0 + b1 * lhs_ext1;
const Tb* rhs_ptr1 = rhs_ptr0 + b1 * rhs_ext1;
for (int b2 = 0; b2 < batch_dim2; ++b2) {
const Ta* lhs_ptr2 = lhs_ptr1 + b2 * lhs_ext2;
const Tb* rhs_ptr2 = rhs_ptr1 + b2 * rhs_ext2;
Tout* out_ptr = output_data + ((b0 * batch_dim1 * batch_dim2) +
b1 * batch_dim2 + b2) *
lhs_rows * rhs_cols;
for (int j = 0; j < rhs_cols; ++j) {
for (int i = 0; i < lhs_rows; ++i) {
Tout total = 0;
for (int k = 0; k < accum_depth; ++k) {
total += static_cast<Tout>(lhs_ptr2[accum_depth * i + k]) *
static_cast<Tout>(rhs_ptr2[j * accum_depth + k]);
}
int idx = lhs_rows * j + i;
out_ptr[idx] = total;
}
}
}
}
}
}
inline void BatchMatMul(const RuntimeShape& lhs_shape, const int8_t* lhs_data,
const RuntimeShape& rhs_shape, const int8_t* rhs_data,
const float* scaling_factors,
const int32_t* input_offset, int32_t* row_sums,
const RuntimeShape& output_shape, float* output_data,
bool* compute_row_sums) {
const RuntimeShape extended_lhs_shape =
RuntimeShape::ExtendedShape(5, lhs_shape);
const RuntimeShape extended_rhs_shape =
RuntimeShape::ExtendedShape(5, rhs_shape);
const int batch_dim0 = batch_matmul::broadcast_dim(
extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0));
const int batch_dim1 = batch_matmul::broadcast_dim(
extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1));
const int batch_dim2 = batch_matmul::broadcast_dim(
extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2));
const int lhs_ext0 = batch_matmul::extent(extended_lhs_shape, 0);
const int lhs_ext1 = batch_matmul::extent(extended_lhs_shape, 1);
const int lhs_ext2 = batch_matmul::extent(extended_lhs_shape, 2);
const int rhs_ext0 = batch_matmul::extent(extended_rhs_shape, 0);
const int rhs_ext1 = batch_matmul::extent(extended_rhs_shape, 1);
const int rhs_ext2 = batch_matmul::extent(extended_rhs_shape, 2);
// Set params for each matrix multiply.
const int lhs_rows = extended_lhs_shape.Dims(3);
const int rhs_cols = extended_rhs_shape.Dims(4);
const int accum_depth = extended_lhs_shape.Dims(4);
const int ioff_ext0 = rhs_ext0 == 0 ? 0 : rhs_cols;
const int ioff_ext1 = rhs_ext1 == 0 ? 0 : rhs_cols;
const int ioff_ext2 = rhs_ext2 == 0 ? 0 : rhs_cols;
const int woff_ext0 = lhs_ext0 == 0 ? 0 : lhs_rows;
const int woff_ext1 = lhs_ext1 == 0 ? 0 : lhs_rows;
const int woff_ext2 = lhs_ext2 == 0 ? 0 : lhs_rows;
if (!compute_row_sums || *compute_row_sums) {
int num_weights_matrices = 1;
for (int i = 1; i < extended_lhs_shape.DimensionsCount() - 2; ++i) {
num_weights_matrices *= extended_lhs_shape.Dims(i);
}
tensor_utils::ReductionSumVector(
lhs_data, row_sums, num_weights_matrices * lhs_rows, accum_depth);
if (compute_row_sums) {
*compute_row_sums = false;
}
}
for (int b0 = 0; b0 < batch_dim0; ++b0) {
const int8_t* lhs_ptr0 = lhs_data + (b0 * lhs_ext0);
const int8_t* rhs_ptr0 = rhs_data + (b0 * rhs_ext0);
const int32_t* ioff_ptr0 = input_offset + (b0 * ioff_ext0);
const float* scale_ptr0 = scaling_factors + (b0 * ioff_ext0);
const int32_t* woff_ptr0 = row_sums + (b0 * woff_ext0);
for (int b1 = 0; b1 < batch_dim1; ++b1) {
const int8_t* lhs_ptr1 = lhs_ptr0 + b1 * lhs_ext1;
const int8_t* rhs_ptr1 = rhs_ptr0 + b1 * rhs_ext1;
const int32_t* ioff_ptr1 = ioff_ptr0 + (b1 * ioff_ext1);
const float* scale_ptr1 = scale_ptr0 + (b1 * ioff_ext1);
const int32_t* woff_ptr1 = woff_ptr0 + (b1 * woff_ext1);
for (int b2 = 0; b2 < batch_dim2; ++b2) {
const int8_t* lhs_ptr2 = lhs_ptr1 + b2 * lhs_ext2;
const int8_t* rhs_ptr2 = rhs_ptr1 + b2 * rhs_ext2;
const int32_t* ioff_ptr2 = ioff_ptr1 + (b2 * ioff_ext2);
const float* scale_ptr2 = scale_ptr1 + (b2 * ioff_ext2);
const int32_t* woff_ptr2 = woff_ptr1 + (b2 * woff_ext2);
float* out_ptr = output_data + ((b0 * batch_dim1 * batch_dim2) +
b1 * batch_dim2 + b2) *
lhs_rows * rhs_cols;
for (int j = 0; j < rhs_cols; ++j) {
const float batch_scaling_factor = scale_ptr2[j];
const float batch_offset = static_cast<float>(ioff_ptr2[j]);
for (int i = 0; i < lhs_rows; ++i) {
int32_t total = 0;
for (int k = 0; k < accum_depth; ++k) {
total +=
lhs_ptr2[accum_depth * i + k] * rhs_ptr2[j * accum_depth + k];
}
int32_t row_sum = woff_ptr2[i];
total -= row_sum * batch_offset;
int idx = lhs_rows * j + i;
out_ptr[idx] += batch_scaling_factor * total;
}
}
}
}
}
}
template <typename T, typename AccumT>
inline void BatchMatMul(const FullyConnectedParams& params,
const RuntimeShape& lhs_shape, const T* lhs_data,
const RuntimeShape& rhs_shape, const T* rhs_data,
const RuntimeShape& output_shape, T* output_data) {
const RuntimeShape extended_lhs_shape =
RuntimeShape::ExtendedShape(5, lhs_shape);
const RuntimeShape extended_rhs_shape =
RuntimeShape::ExtendedShape(5, rhs_shape);
const int batch_dim0 = batch_matmul::broadcast_dim(
extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0));
const int batch_dim1 = batch_matmul::broadcast_dim(
extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1));
const int batch_dim2 = batch_matmul::broadcast_dim(
extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2));
const int lhs_ext0 = batch_matmul::extent(extended_lhs_shape, 0);
const int lhs_ext1 = batch_matmul::extent(extended_lhs_shape, 1);
const int lhs_ext2 = batch_matmul::extent(extended_lhs_shape, 2);
const int rhs_ext0 = batch_matmul::extent(extended_rhs_shape, 0);
const int rhs_ext1 = batch_matmul::extent(extended_rhs_shape, 1);
const int rhs_ext2 = batch_matmul::extent(extended_rhs_shape, 2);
// Set params for each matrix multiply.
const int lhs_rows = extended_lhs_shape.Dims(3);
const int rhs_cols = extended_rhs_shape.Dims(4);
const int accum_depth = extended_lhs_shape.Dims(4);
const int32_t input_offset = params.input_offset;
const int32_t filter_offset = params.weights_offset;
const int32_t output_offset = params.output_offset;
const int32_t output_multiplier = params.output_multiplier;
const int output_shift = params.output_shift;
const int32_t output_activation_min = params.quantized_activation_min;
const int32_t output_activation_max = params.quantized_activation_max;
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
for (int b0 = 0; b0 < batch_dim0; ++b0) {
const T* lhs_ptr0 = lhs_data + (b0 * lhs_ext0);
const T* rhs_ptr0 = rhs_data + (b0 * rhs_ext0);
for (int b1 = 0; b1 < batch_dim1; ++b1) {
const T* lhs_ptr1 = lhs_ptr0 + b1 * lhs_ext1;
const T* rhs_ptr1 = rhs_ptr0 + b1 * rhs_ext1;
for (int b2 = 0; b2 < batch_dim2; ++b2) {
const T* lhs_ptr2 = lhs_ptr1 + b2 * lhs_ext2;
const T* rhs_ptr2 = rhs_ptr1 + b2 * rhs_ext2;
T* out_ptr = output_data +
((b0 * batch_dim1 * batch_dim2) + b1 * batch_dim2 + b2) *
lhs_rows * rhs_cols;
for (int j = 0; j < rhs_cols; ++j) {
for (int i = 0; i < lhs_rows; ++i) {
AccumT total = 0;
for (int k = 0; k < accum_depth; ++k) {
AccumT lhs_val = lhs_ptr2[accum_depth * i + k];
AccumT rhs_val = rhs_ptr2[accum_depth * j + k];
total += (lhs_val + filter_offset) * (rhs_val + input_offset);
}
int32_t total_scaled = MultiplyByQuantizedMultiplier(
total, output_multiplier, output_shift);
total_scaled += output_offset;
total_scaled = std::max(total_scaled, output_activation_min);
total_scaled = std::min(total_scaled, output_activation_max);
const int idx = lhs_rows * j + i;
out_ptr[idx] = static_cast<T>(total_scaled);
}
}
}
}
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BATCH_MATMUL_H_

View File

@@ -0,0 +1,101 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BATCH_TO_SPACE_ND_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BATCH_TO_SPACE_ND_H_
#include <cmath>
#include "ruy/profiler/instrumentation.h" // from @ruy
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
// TODO(b/135760455): Move this method anonymous namespace in a cc file.
inline RuntimeShape ExtendShapeBatchToSpace(const RuntimeShape& shape) {
if (shape.DimensionsCount() == 4) {
return shape;
}
RuntimeShape new_shape(4, 1);
new_shape.SetDim(0, shape.Dims(0));
new_shape.SetDim(1, shape.Dims(1));
new_shape.SetDim(3, shape.Dims(2));
return new_shape;
}
template <typename T>
inline void BatchToSpaceND(const RuntimeShape& unextended_input1_shape,
const T* input1_data,
const RuntimeShape& unextended_input2_shape,
const int32_t* block_shape_data,
const RuntimeShape& unextended_input3_shape,
const int32_t* crops_data,
const RuntimeShape& unextended_output_shape,
T* output_data) {
ruy::profiler::ScopeLabel label("BatchToSpaceND");
TFLITE_DCHECK_GE(unextended_input1_shape.DimensionsCount(), 3);
TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(unextended_input1_shape.DimensionsCount(),
unextended_output_shape.DimensionsCount());
const RuntimeShape input1_shape =
ExtendShapeBatchToSpace(unextended_input1_shape);
const RuntimeShape output_shape =
ExtendShapeBatchToSpace(unextended_output_shape);
const int output_width = output_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_batch_size = output_shape.Dims(0);
const int depth = input1_shape.Dims(3);
const int input_width = input1_shape.Dims(2);
const int input_height = input1_shape.Dims(1);
const int input_batch_size = input1_shape.Dims(0);
const int block_shape_height = block_shape_data[0];
const int block_shape_width =
unextended_input1_shape.DimensionsCount() == 4 ? block_shape_data[1] : 1;
const int crops_top = crops_data[0];
const int crops_left =
unextended_input1_shape.DimensionsCount() == 4 ? crops_data[2] : 0;
for (int in_batch = 0; in_batch < input_batch_size; ++in_batch) {
const int out_batch = in_batch % output_batch_size;
const int spatial_offset = in_batch / output_batch_size;
for (int in_h = 0; in_h < input_height; ++in_h) {
const int out_h = in_h * block_shape_height +
spatial_offset / block_shape_width - crops_top;
if (out_h < 0 || out_h >= output_height) {
continue;
}
for (int in_w = 0; in_w < input_width; ++in_w) {
const int out_w = in_w * block_shape_width +
spatial_offset % block_shape_width - crops_left;
if (out_w < 0 || out_w >= output_width) {
continue;
}
T* out = output_data + Offset(output_shape, out_batch, out_h, out_w, 0);
const T* in =
input1_data + Offset(input1_shape, in_batch, in_h, in_w, 0);
memcpy(out, in, depth * sizeof(T));
}
}
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BATCH_TO_SPACE_ND_H_

View File

@@ -0,0 +1,91 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BINARY_FUNCTION_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BINARY_FUNCTION_H_
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
// Also appears to duplicate MinimumMaximum.
//
// R: Result type. T1: Input 1 type. T2: Input 2 type.
template <typename R, typename T1, typename T2>
inline void BroadcastBinaryFunction4DSlow(
const RuntimeShape& unextended_input1_shape, const T1* input1_data,
const RuntimeShape& unextended_input2_shape, const T2* input2_data,
const RuntimeShape& unextended_output_shape, R* output_data,
R (*func)(T1, T2)) {
TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
const RuntimeShape output_shape =
RuntimeShape::ExtendedShape(4, unextended_output_shape);
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
unextended_input2_shape, &desc1, &desc2);
const int* dims_data =
reinterpret_cast<const int*>(output_shape.DimsDataUpTo5D());
for (int b = 0; b < output_shape.Dims(0); ++b) {
int out_idx_b = b * dims_data[1];
int in_idx1_b = desc1.strides[0] * b;
int in_idx2_b = desc2.strides[0] * b;
for (int y = 0; y < output_shape.Dims(1); ++y) {
int out_idx_y = (out_idx_b + y) * dims_data[2];
int in_idx1_y = in_idx1_b + desc1.strides[1] * y;
int in_idx2_y = in_idx2_b + desc2.strides[1] * y;
for (int x = 0; x < output_shape.Dims(2); ++x) {
int out_idx_x = (out_idx_y + x) * dims_data[3];
int in1_idx = in_idx1_y + desc1.strides[2] * x;
int in2_idx = in_idx2_y + desc2.strides[2] * x;
for (int c = 0; c < output_shape.Dims(3); ++c) {
auto out_idx = out_idx_x + c;
auto in1_val = input1_data[in1_idx];
auto in2_val = input2_data[in2_idx];
output_data[out_idx] = func(in1_val, in2_val);
in1_idx += desc1.strides[3];
in2_idx += desc2.strides[3];
}
}
}
}
}
// R: Result type. T1: Input 1 type. T2: Input 2 type.
template <typename R, typename T1, typename T2>
inline void BinaryFunction(const RuntimeShape& input1_shape,
const T1* input1_data,
const RuntimeShape& input2_shape,
const T2* input2_data,
const RuntimeShape& output_shape, R* output_data,
R (*func)(T1, T2)) {
const int flat_size =
MatchingFlatSize(input1_shape, input2_shape, output_shape);
for (int i = 0; i < flat_size; ++i) {
output_data[i] = func(input1_data[i], input2_data[i]);
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BINARY_FUNCTION_H_

View File

@@ -0,0 +1,37 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CEIL_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CEIL_H_
#include <cmath>
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
inline void Ceil(const RuntimeShape& input_shape, const float* input_data,
const RuntimeShape& output_shape, float* output_data) {
const int flat_size = MatchingFlatSize(input_shape, output_shape);
for (int i = 0; i < flat_size; ++i) {
output_data[i] = std::ceil(input_data[i]);
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CEIL_H_

View File

@@ -0,0 +1,280 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_COMPARISONS_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_COMPARISONS_H_
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
template <typename T>
inline bool EqualFn(T lhs, T rhs) {
return lhs == rhs;
}
template <typename T>
inline bool NotEqualFn(T lhs, T rhs) {
return lhs != rhs;
}
template <typename T>
inline bool GreaterFn(T lhs, T rhs) {
return lhs > rhs;
}
template <typename T>
inline bool GreaterEqualFn(T lhs, T rhs) {
return lhs >= rhs;
}
template <typename T>
inline bool LessFn(T lhs, T rhs) {
return lhs < rhs;
}
template <typename T>
inline bool LessEqualFn(T lhs, T rhs) {
return lhs <= rhs;
}
template <typename T>
using ComparisonFn = bool (*)(T, T);
template <typename T, ComparisonFn<T> F>
inline void ComparisonImpl(
const ComparisonParams& op_params, const RuntimeShape& input1_shape,
const T* input1_data, const RuntimeShape& input2_shape,
const T* input2_data, const RuntimeShape& output_shape, bool* output_data) {
const int64_t flatsize =
MatchingFlatSize(input1_shape, input2_shape, output_shape);
for (int64_t i = 0; i < flatsize; ++i) {
output_data[i] = F(input1_data[i], input2_data[i]);
}
}
template <ComparisonFn<float> F>
inline void Comparison(const ComparisonParams& op_params,
const RuntimeShape& input1_shape,
const float* input1_data,
const RuntimeShape& input2_shape,
const float* input2_data,
const RuntimeShape& output_shape, bool* output_data) {
ComparisonImpl<float, F>(op_params, input1_shape, input1_data, input2_shape,
input2_data, output_shape, output_data);
}
template <typename T, ComparisonFn<int32_t> F>
inline void ComparisonWithScaling(
const ComparisonParams& op_params, const RuntimeShape& input1_shape,
const T* input1_data, const RuntimeShape& input2_shape,
const T* input2_data, const RuntimeShape& output_shape, bool* output_data) {
int left_shift = op_params.left_shift;
int32_t input1_offset = op_params.input1_offset;
int32_t input1_multiplier = op_params.input1_multiplier;
int input1_shift = op_params.input1_shift;
int32_t input2_offset = op_params.input2_offset;
int32_t input2_multiplier = op_params.input2_multiplier;
int input2_shift = op_params.input2_shift;
const int64_t flatsize =
MatchingFlatSize(input1_shape, input2_shape, output_shape);
for (int64_t i = 0; i < flatsize; ++i) {
const int32_t input1_val = input1_offset + input1_data[i];
const int32_t input2_val = input2_offset + input2_data[i];
const int32_t shifted_input1_val = input1_val * (1 << left_shift);
const int32_t shifted_input2_val = input2_val * (1 << left_shift);
const int32_t scaled_input1_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input1_val, input1_multiplier, input1_shift);
const int32_t scaled_input2_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input2_val, input2_multiplier, input2_shift);
output_data[i] = F(scaled_input1_val, scaled_input2_val);
}
}
struct BroadcastComparison4DSlowCommon {
const RuntimeShape output_shape;
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
};
inline BroadcastComparison4DSlowCommon BroadcastComparison4DSlowPreprocess(
const RuntimeShape& unextended_input1_shape,
const RuntimeShape& unextended_input2_shape,
const RuntimeShape& unextended_output_shape) {
TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
unextended_input2_shape, &desc1, &desc2);
return {RuntimeShape::ExtendedShape(4, unextended_output_shape), desc1,
desc2};
}
template <typename T, ComparisonFn<T> F>
inline void BroadcastComparison4DSlowImpl(
const ComparisonParams& op_params,
const RuntimeShape& unextended_input1_shape, const T* input1_data,
const RuntimeShape& unextended_input2_shape, const T* input2_data,
const RuntimeShape& unextended_output_shape, bool* output_data) {
const BroadcastComparison4DSlowCommon dims =
BroadcastComparison4DSlowPreprocess(unextended_input1_shape,
unextended_input2_shape,
unextended_output_shape);
for (int b = 0; b < dims.output_shape.Dims(0); ++b) {
for (int y = 0; y < dims.output_shape.Dims(1); ++y) {
for (int x = 0; x < dims.output_shape.Dims(2); ++x) {
for (int c = 0; c < dims.output_shape.Dims(3); ++c) {
output_data[Offset(dims.output_shape, b, y, x, c)] =
F(input1_data[SubscriptToIndex(dims.desc1, b, y, x, c)],
input2_data[SubscriptToIndex(dims.desc2, b, y, x, c)]);
}
}
}
}
}
template <ComparisonFn<float> F>
inline void BroadcastComparison4DSlow(const ComparisonParams& op_params,
const RuntimeShape& input1_shape,
const float* input1_data,
const RuntimeShape& input2_shape,
const float* input2_data,
const RuntimeShape& output_shape,
bool* output_data) {
BroadcastComparison4DSlowImpl<float, F>(op_params, input1_shape, input1_data,
input2_shape, input2_data,
output_shape, output_data);
}
template <typename T, ComparisonFn<int32_t> F>
inline void BroadcastComparison4DSlowWithScaling(
const ComparisonParams& op_params,
const RuntimeShape& unextended_input1_shape, const T* input1_data,
const RuntimeShape& unextended_input2_shape, const T* input2_data,
const RuntimeShape& unextended_output_shape, bool* output_data) {
const BroadcastComparison4DSlowCommon dims =
BroadcastComparison4DSlowPreprocess(unextended_input1_shape,
unextended_input2_shape,
unextended_output_shape);
int left_shift = op_params.left_shift;
int32_t input1_offset = op_params.input1_offset;
int32_t input1_multiplier = op_params.input1_multiplier;
int input1_shift = op_params.input1_shift;
int32_t input2_offset = op_params.input2_offset;
int32_t input2_multiplier = op_params.input2_multiplier;
int input2_shift = op_params.input2_shift;
for (int b = 0; b < dims.output_shape.Dims(0); ++b) {
for (int y = 0; y < dims.output_shape.Dims(1); ++y) {
for (int x = 0; x < dims.output_shape.Dims(2); ++x) {
for (int c = 0; c < dims.output_shape.Dims(3); ++c) {
const int32_t input1_val =
input1_offset +
input1_data[SubscriptToIndex(dims.desc1, b, y, x, c)];
const int32_t input2_val =
input2_offset +
input2_data[SubscriptToIndex(dims.desc2, b, y, x, c)];
const int32_t shifted_input1_val = input1_val * (1 << left_shift);
const int32_t shifted_input2_val = input2_val * (1 << left_shift);
const int32_t scaled_input1_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input1_val, input1_multiplier, input1_shift);
const int32_t scaled_input2_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input2_val, input2_multiplier, input2_shift);
output_data[Offset(dims.output_shape, b, y, x, c)] =
F(scaled_input1_val, scaled_input2_val);
}
}
}
}
}
#define TFLITE_COMPARISON_OP(name) \
inline void name(const ComparisonParams& op_params, \
const RuntimeShape& input1_shape, const float* input1_data, \
const RuntimeShape& input2_shape, const float* input2_data, \
const RuntimeShape& output_shape, bool* output_data) { \
Comparison<name##Fn>(op_params, input1_shape, input1_data, input2_shape, \
input2_data, output_shape, output_data); \
} \
template <typename T> \
inline void name##NoScaling( \
const ComparisonParams& op_params, const RuntimeShape& input1_shape, \
const T* input1_data, const RuntimeShape& input2_shape, \
const T* input2_data, const RuntimeShape& output_shape, \
bool* output_data) { \
ComparisonImpl<T, name##Fn>(op_params, input1_shape, input1_data, \
input2_shape, input2_data, output_shape, \
output_data); \
} \
template <typename T> \
inline void name##WithScaling( \
const ComparisonParams& op_params, const RuntimeShape& input1_shape, \
const T* input1_data, const RuntimeShape& input2_shape, \
const T* input2_data, const RuntimeShape& output_shape, \
bool* output_data) { \
ComparisonWithScaling<T, name##Fn>(op_params, input1_shape, input1_data, \
input2_shape, input2_data, \
output_shape, output_data); \
} \
template <typename T> \
inline void Broadcast4DSlow##name##NoScaling( \
const ComparisonParams& op_params, const RuntimeShape& input1_shape, \
const T* input1_data, const RuntimeShape& input2_shape, \
const T* input2_data, const RuntimeShape& output_shape, \
bool* output_data) { \
BroadcastComparison4DSlowImpl<T, name##Fn>( \
op_params, input1_shape, input1_data, input2_shape, input2_data, \
output_shape, output_data); \
} \
inline void Broadcast4DSlow##name( \
const ComparisonParams& op_params, const RuntimeShape& input1_shape, \
const float* input1_data, const RuntimeShape& input2_shape, \
const float* input2_data, const RuntimeShape& output_shape, \
bool* output_data) { \
BroadcastComparison4DSlow<name##Fn>(op_params, input1_shape, input1_data, \
input2_shape, input2_data, \
output_shape, output_data); \
} \
template <typename T> \
inline void Broadcast4DSlow##name##WithScaling( \
const ComparisonParams& op_params, const RuntimeShape& input1_shape, \
const T* input1_data, const RuntimeShape& input2_shape, \
const T* input2_data, const RuntimeShape& output_shape, \
bool* output_data) { \
BroadcastComparison4DSlowWithScaling<T, name##Fn>( \
op_params, input1_shape, input1_data, input2_shape, input2_data, \
output_shape, output_data); \
}
TFLITE_COMPARISON_OP(Equal);
TFLITE_COMPARISON_OP(NotEqual);
TFLITE_COMPARISON_OP(Greater);
TFLITE_COMPARISON_OP(GreaterEqual);
TFLITE_COMPARISON_OP(Less);
TFLITE_COMPARISON_OP(LessEqual);
#undef TFLITE_COMPARISON_OP
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_COMPARISONS_H_

View File

@@ -0,0 +1,139 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CONCATENATION_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CONCATENATION_H_
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
template <typename Scalar>
inline void Concatenation(const ConcatenationParams& params,
const RuntimeShape* const* input_shapes,
const Scalar* const* input_data,
const RuntimeShape& output_shape,
Scalar* output_data) {
int axis = params.axis;
int inputs_count = params.inputs_count;
const int concat_dimensions = output_shape.DimensionsCount();
TFLITE_DCHECK_LT(axis, concat_dimensions);
int64_t concat_size = 0;
for (int i = 0; i < inputs_count; i++) {
TFLITE_DCHECK_EQ(input_shapes[i]->DimensionsCount(), concat_dimensions);
for (int j = 0; j < concat_dimensions; j++) {
if (j != axis) {
MatchingDim(*input_shapes[i], j, output_shape, j);
}
}
concat_size += input_shapes[i]->Dims(axis);
}
TFLITE_DCHECK_EQ(concat_size, output_shape.Dims(axis));
int64_t outer_size = 1;
for (int i = 0; i < axis; ++i) {
outer_size *= output_shape.Dims(i);
}
// For all input arrays,
// FlatSize() = outer_size * Dims(axis) * base_inner_size;
int64_t base_inner_size = 1;
for (int i = axis + 1; i < concat_dimensions; ++i) {
base_inner_size *= output_shape.Dims(i);
}
Scalar* output_ptr = output_data;
for (int k = 0; k < outer_size; k++) {
for (int i = 0; i < inputs_count; ++i) {
const int copy_size = input_shapes[i]->Dims(axis) * base_inner_size;
const Scalar* input_ptr = input_data[i] + k * copy_size;
memcpy(output_ptr, input_ptr, copy_size * sizeof(Scalar));
output_ptr += copy_size;
}
}
}
// TODO(b/174275780): The quantized implementation of concatentation isn't fully
// quantized as it takes scale as a floating point value. This should be fixed
// when optimizng this routine further.
inline void ConcatenationWithScaling(const ConcatenationParams& params,
const RuntimeShape* const* input_shapes,
const uint8_t* const* input_data,
const RuntimeShape& output_shape,
uint8_t* output_data) {
int axis = params.axis;
const int32_t* input_zeropoint = params.input_zeropoint;
const float* input_scale = params.input_scale;
int inputs_count = params.inputs_count;
const int32_t output_zeropoint = params.output_zeropoint;
const float output_scale = params.output_scale;
const int concat_dimensions = output_shape.DimensionsCount();
TFLITE_DCHECK_LT(axis, concat_dimensions);
int64_t concat_size = 0;
for (int i = 0; i < inputs_count; i++) {
TFLITE_DCHECK_EQ(input_shapes[i]->DimensionsCount(), concat_dimensions);
for (int j = 0; j < concat_dimensions; j++) {
if (j != axis) {
MatchingDim(*input_shapes[i], j, output_shape, j);
}
}
concat_size += input_shapes[i]->Dims(axis);
}
TFLITE_DCHECK_EQ(concat_size, output_shape.Dims(axis));
int64_t outer_size = 1;
for (int i = 0; i < axis; ++i) {
outer_size *= output_shape.Dims(i);
}
// For all input arrays,
// FlatSize() = outer_size * Dims(axis) * base_inner_size;
int64_t base_inner_size = 1;
for (int i = axis + 1; i < concat_dimensions; ++i) {
base_inner_size *= output_shape.Dims(i);
}
const float inverse_output_scale = 1.f / output_scale;
uint8_t* output_ptr = output_data;
for (int k = 0; k < outer_size; k++) {
for (int i = 0; i < inputs_count; ++i) {
const int copy_size = input_shapes[i]->Dims(axis) * base_inner_size;
const uint8_t* input_ptr = input_data[i] + k * copy_size;
if (input_zeropoint[i] == output_zeropoint &&
input_scale[i] == output_scale) {
memcpy(output_ptr, input_ptr, copy_size);
} else {
const float scale = input_scale[i] * inverse_output_scale;
const float bias = -input_zeropoint[i] * scale;
for (int j = 0; j < copy_size; ++j) {
const int32_t value = static_cast<int32_t>(tflite::TfLiteRound(
input_ptr[j] * scale + bias)) +
output_zeropoint;
output_ptr[j] = static_cast<uint8_t>(
std::max<int32_t>(std::min<int32_t>(255, value), 0));
}
}
output_ptr += copy_size;
}
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CONCATENATION_H_

View File

@@ -0,0 +1,264 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CONV_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CONV_H_
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
const float* input_data, const RuntimeShape& filter_shape,
const float* filter_data, const RuntimeShape& bias_shape,
const float* bias_data, const RuntimeShape& output_shape,
float* output_data, const RuntimeShape& im2col_shape,
float* im2col_data) {
const int stride_width = params.stride_width;
const int stride_height = params.stride_height;
const int dilation_width_factor = params.dilation_width_factor;
const int dilation_height_factor = params.dilation_height_factor;
const int pad_width = params.padding_values.width;
const int pad_height = params.padding_values.height;
const float output_activation_min = params.float_activation_min;
const float output_activation_max = params.float_activation_max;
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
(void)im2col_data; // only used in optimized code.
(void)im2col_shape; // only used in optimized code.
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
if (bias_data) {
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
}
const int input_height = input_shape.Dims(1);
const int input_width = input_shape.Dims(2);
const int filter_height = filter_shape.Dims(1);
const int filter_width = filter_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
for (int batch = 0; batch < batches; ++batch) {
for (int out_y = 0; out_y < output_height; ++out_y) {
const int in_y_origin = (out_y * stride_height) - pad_height;
for (int out_x = 0; out_x < output_width; ++out_x) {
const int in_x_origin = (out_x * stride_width) - pad_width;
for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
float total = 0.f;
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
const int in_y = in_y_origin + dilation_height_factor * filter_y;
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
const int in_x = in_x_origin + dilation_width_factor * filter_x;
// Zero padding by omitting the areas outside the image.
const bool is_point_inside_image =
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
(in_y < input_height);
if (!is_point_inside_image) {
continue;
}
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
float input_value = input_data[Offset(input_shape, batch, in_y,
in_x, in_channel)];
float filter_value = filter_data[Offset(
filter_shape, out_channel, filter_y, filter_x, in_channel)];
total += (input_value * filter_value);
}
}
}
float bias_value = 0.0f;
if (bias_data) {
bias_value = bias_data[out_channel];
}
output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
ActivationFunctionWithMinMax(total + bias_value,
output_activation_min,
output_activation_max);
}
}
}
}
}
inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
const uint8_t* input_data, const RuntimeShape& filter_shape,
const uint8_t* filter_data, const RuntimeShape& bias_shape,
const int32_t* bias_data, const RuntimeShape& output_shape,
uint8_t* output_data, const RuntimeShape& im2col_shape,
uint8_t* im2col_data, void* cpu_backend_context) {
(void)cpu_backend_context; // only used in optimized code.
(void)im2col_data; // only used in optimized code.
(void)im2col_shape; // only used in optimized code.
const int stride_width = params.stride_width;
const int stride_height = params.stride_height;
const int dilation_width_factor = params.dilation_width_factor;
const int dilation_height_factor = params.dilation_height_factor;
const int pad_width = params.padding_values.width;
const int pad_height = params.padding_values.height;
const int32_t input_offset = params.input_offset;
const int32_t filter_offset = params.weights_offset;
const int32_t output_offset = params.output_offset;
const int32_t output_multiplier = params.output_multiplier;
const int output_shift = params.output_shift;
const int32_t output_activation_min = params.quantized_activation_min;
const int32_t output_activation_max = params.quantized_activation_max;
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
if (bias_data) {
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
}
const int input_height = input_shape.Dims(1);
const int input_width = input_shape.Dims(2);
const int filter_height = filter_shape.Dims(1);
const int filter_width = filter_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
for (int batch = 0; batch < batches; ++batch) {
for (int out_y = 0; out_y < output_height; ++out_y) {
const int in_y_origin = (out_y * stride_height) - pad_height;
for (int out_x = 0; out_x < output_width; ++out_x) {
const int in_x_origin = (out_x * stride_width) - pad_width;
for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
int32_t acc = 0;
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
const int in_y = in_y_origin + dilation_height_factor * filter_y;
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
const int in_x = in_x_origin + dilation_width_factor * filter_x;
// Zero padding by omitting the areas outside the image.
const bool is_point_inside_image =
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
(in_y < input_height);
if (!is_point_inside_image) {
continue;
}
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
int32_t input_val = input_data[Offset(input_shape, batch, in_y,
in_x, in_channel)];
int32_t filter_val = filter_data[Offset(
filter_shape, out_channel, filter_y, filter_x, in_channel)];
acc +=
(filter_val + filter_offset) * (input_val + input_offset);
}
}
}
if (bias_data) {
acc += bias_data[out_channel];
}
acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
output_shift);
acc += output_offset;
acc = std::max(acc, output_activation_min);
acc = std::min(acc, output_activation_max);
output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
static_cast<uint8_t>(acc);
}
}
}
}
}
inline void HybridConvPerChannel(
const ConvParams& params, float* scaling_factors_ptr,
const RuntimeShape& input_shape, const int8_t* input_data,
const RuntimeShape& filter_shape, const int8_t* filter_data,
const RuntimeShape& bias_shape, const float* bias_data,
const RuntimeShape& output_shape, float* output_data,
const RuntimeShape& im2col_shape, int8_t* im2col_data,
const float* per_channel_scale, int32_t* input_offset) {
(void)im2col_data; // only used in optimized code.
(void)im2col_shape; // only used in optimized code.
const int stride_width = params.stride_width;
const int stride_height = params.stride_height;
const int dilation_width_factor = params.dilation_width_factor;
const int dilation_height_factor = params.dilation_height_factor;
const int pad_width = params.padding_values.width;
const int pad_height = params.padding_values.height;
const float output_activation_min = params.float_activation_min;
const float output_activation_max = params.float_activation_max;
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
if (bias_data) {
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
}
const int input_height = input_shape.Dims(1);
const int input_width = input_shape.Dims(2);
const int filter_height = filter_shape.Dims(1);
const int filter_width = filter_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
for (int batch = 0; batch < batches; ++batch) {
for (int out_y = 0; out_y < output_height; ++out_y) {
for (int out_x = 0; out_x < output_width; ++out_x) {
for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
const int in_x_origin = (out_x * stride_width) - pad_width;
const int in_y_origin = (out_y * stride_height) - pad_height;
int32_t acc = 0;
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
const int in_x = in_x_origin + dilation_width_factor * filter_x;
const int in_y =
in_y_origin + dilation_height_factor * filter_y;
// If the location is outside the bounds of the input image,
// use zero as a default value.
if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
(in_y < input_height)) {
int32_t input_val = input_data[Offset(
input_shape, batch, in_y, in_x, in_channel)];
int32_t filter_val =
filter_data[Offset(filter_shape, out_channel, filter_y,
filter_x, in_channel)];
acc += filter_val * (input_val - input_offset[batch]);
}
}
}
}
float acc_float =
acc * per_channel_scale[out_channel] * scaling_factors_ptr[batch];
if (bias_data) {
acc_float += bias_data[out_channel];
}
output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
ActivationFunctionWithMinMax(acc_float, output_activation_min,
output_activation_max);
}
}
}
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CONV_H_

View File

@@ -0,0 +1,175 @@
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CUMSUM_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CUMSUM_H_
#include <algorithm>
#include <cstdint>
#include <limits>
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
namespace tflite {
namespace reference_ops {
template <typename T>
inline void CumSum(const T* input_data, const RuntimeShape& shape, int32_t axis,
bool exclusive, bool reverse, T* output_data) {
const int32_t rank = shape.DimensionsCount();
TFLITE_DCHECK_GE(rank, 1);
TFLITE_DCHECK_GE(axis, 0);
TFLITE_DCHECK_LT(axis, rank);
size_t inner = 1;
size_t outer = 1;
size_t depth = 1;
for (int32_t i = 0; i < rank; i++) {
if (i < axis)
inner *= shape.Dims(i);
else if (i > axis)
outer *= shape.Dims(i);
else
depth = shape.Dims(i);
}
for (size_t outer_index = 0; outer_index < outer; outer_index++) {
size_t outer_index_adj;
if (reverse)
outer_index_adj = (outer - 1) - outer_index;
else
outer_index_adj = outer_index;
for (size_t inner_index = 0; inner_index < inner; inner_index++) {
T accumulator = 0;
size_t inner_index_adj;
if (reverse)
inner_index_adj = (inner - 1) - inner_index;
else
inner_index_adj = inner_index;
for (size_t depth_index = 0; depth_index < depth; depth_index++) {
size_t depth_index_adj;
if (reverse)
depth_index_adj = (depth - 1) - depth_index;
else
depth_index_adj = depth_index;
size_t index = outer_index_adj;
index += inner_index_adj * depth * outer;
index += depth_index_adj * outer;
if (exclusive) {
output_data[index] = accumulator;
accumulator += input_data[index];
} else {
accumulator += input_data[index];
output_data[index] = accumulator;
}
}
}
}
}
//
// Quantized INT8 CUMSUM
//
inline void CumSum(const ArithmeticParams& params, const int8_t* input_data,
const RuntimeShape& shape, int32_t axis, bool exclusive,
bool reverse, int8_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
// Input offset is negative input zero point. Activation tensors are
// asymmetric quantized so they span the full int8 range.
// All inputs should have same zero-point and scale, this is checked during
// Prepare stage.
TFLITE_DCHECK_GE(-params.input1_offset, std::numeric_limits<int8_t>::min());
TFLITE_DCHECK_LE(-params.input1_offset, std::numeric_limits<int8_t>::max());
const int32_t rank = shape.DimensionsCount();
TFLITE_DCHECK_GE(rank, 1);
TFLITE_DCHECK_GE(axis, 0);
TFLITE_DCHECK_LT(axis, rank);
size_t inner = 1;
size_t outer = 1;
size_t depth = 1;
for (int32_t i = 0; i < rank; i++) {
if (i < axis)
inner *= shape.Dims(i);
else if (i > axis)
outer *= shape.Dims(i);
else
depth = shape.Dims(i);
}
for (size_t outer_index = 0; outer_index < outer; outer_index++) {
size_t outer_index_adj;
if (reverse)
outer_index_adj = (outer - 1) - outer_index;
else
outer_index_adj = outer_index;
for (size_t inner_index = 0; inner_index < inner; inner_index++) {
int32_t accumulator = params.input1_offset; // accumulator = 0
accumulator *= (1 << params.left_shift);
accumulator = MultiplyByQuantizedMultiplierSmallerThanOneExp(
accumulator, params.input1_multiplier, params.input1_shift);
size_t inner_index_adj;
if (reverse)
inner_index_adj = (inner - 1) - inner_index;
else
inner_index_adj = inner_index;
for (size_t depth_index = 0; depth_index < depth; depth_index++) {
size_t depth_index_adj;
if (reverse)
depth_index_adj = (depth - 1) - depth_index;
else
depth_index_adj = depth_index;
size_t index = outer_index_adj;
index += inner_index_adj * depth * outer;
index += depth_index_adj * outer;
const int32_t y = params.input1_offset + input_data[index];
const int32_t shifted_y = y * (1 << params.left_shift);
const int32_t scaled_y = MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_y, params.input1_multiplier, params.input1_shift);
int32_t scaled_output;
if (exclusive) {
scaled_output = accumulator;
accumulator += scaled_y;
} else {
accumulator += scaled_y;
scaled_output = accumulator;
}
const int32_t raw_output =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
scaled_output, params.output_multiplier, params.output_shift) +
params.output_offset;
const int32_t clamped_output =
std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, raw_output));
output_data[index] = static_cast<int8_t>(clamped_output);
}
}
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CUMSUM_H_

View File

@@ -0,0 +1,79 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTH_TO_SPACE_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTH_TO_SPACE_H_
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
template <typename T>
inline void DepthToSpace(const tflite::DepthToSpaceParams& op_params,
const RuntimeShape& unextended_input_shape,
const T* input_data,
const RuntimeShape& unextended_output_shape,
T* output_data) {
TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
const RuntimeShape input_shape =
RuntimeShape::ExtendedShape(4, unextended_input_shape);
const RuntimeShape output_shape =
RuntimeShape::ExtendedShape(4, unextended_output_shape);
const int input_depth = input_shape.Dims(3);
const int input_width = input_shape.Dims(2);
const int input_height = input_shape.Dims(1);
const int input_batch = input_shape.Dims(0);
const int output_depth = output_shape.Dims(3);
const int output_width = output_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_batch = output_shape.Dims(0);
const int32_t block_size = op_params.block_size;
TFLITE_DCHECK_EQ(input_width * block_size, output_width);
TFLITE_DCHECK_EQ(input_height * block_size, output_height);
TFLITE_DCHECK_EQ(input_depth, output_depth * block_size * block_size);
TFLITE_DCHECK_EQ(input_batch, output_batch);
for (int out_b = 0; out_b < output_batch; ++out_b) {
for (int out_h = 0; out_h < output_height; ++out_h) {
for (int out_w = 0; out_w < output_width; ++out_w) {
for (int out_d = 0; out_d < output_depth; ++out_d) {
const int in_d =
out_d + ((out_h % block_size) * block_size + out_w % block_size) *
output_depth;
const int in_w = out_w / block_size;
const int in_h = out_h / block_size;
const int in_b = out_b;
const int input_index = Offset(input_shape, in_b, in_h, in_w, in_d);
const int output_index =
Offset(output_shape, out_b, out_h, out_w, out_d);
output_data[output_index] = input_data[input_index];
}
}
}
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTH_TO_SPACE_H_

View File

@@ -0,0 +1,100 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_FLOAT_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_FLOAT_H_
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
inline void DepthwiseConv(
const DepthwiseParams& params, const RuntimeShape& input_shape,
const float* input_data, const RuntimeShape& filter_shape,
const float* filter_data, const RuntimeShape& bias_shape,
const float* bias_data, const RuntimeShape& output_shape,
float* output_data) {
const int stride_width = params.stride_width;
const int stride_height = params.stride_height;
const int dilation_width_factor = params.dilation_width_factor;
const int dilation_height_factor = params.dilation_height_factor;
const int pad_width = params.padding_values.width;
const int pad_height = params.padding_values.height;
const int depth_multiplier = params.depth_multiplier;
const float output_activation_min = params.float_activation_min;
const float output_activation_max = params.float_activation_max;
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
const int input_height = input_shape.Dims(1);
const int input_width = input_shape.Dims(2);
const int input_depth = input_shape.Dims(3);
const int filter_height = filter_shape.Dims(1);
const int filter_width = filter_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
for (int b = 0; b < batches; ++b) {
for (int out_y = 0; out_y < output_height; ++out_y) {
for (int out_x = 0; out_x < output_width; ++out_x) {
for (int ic = 0; ic < input_depth; ++ic) {
for (int m = 0; m < depth_multiplier; m++) {
const int oc = m + ic * depth_multiplier;
const int in_x_origin = (out_x * stride_width) - pad_width;
const int in_y_origin = (out_y * stride_height) - pad_height;
float total = 0.f;
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
const int in_x = in_x_origin + dilation_width_factor * filter_x;
const int in_y =
in_y_origin + dilation_height_factor * filter_y;
// If the location is outside the bounds of the input image,
// use zero as a default value.
if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
(in_y < input_height)) {
float input_value =
input_data[Offset(input_shape, b, in_y, in_x, ic)];
float filter_value = filter_data[Offset(
filter_shape, 0, filter_y, filter_x, oc)];
total += (input_value * filter_value);
}
}
}
float bias_value = 0.0f;
if (bias_data) {
bias_value = bias_data[oc];
}
output_data[Offset(output_shape, b, out_y, out_x, oc)] =
ActivationFunctionWithMinMax(total + bias_value,
output_activation_min,
output_activation_max);
}
}
}
}
}
}
} // end namespace reference_ops
} // end namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_FLOAT_H_

View File

@@ -0,0 +1,319 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
#include <algorithm>
#include "fixedpoint/fixedpoint.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
// Used in tests and template parameters to control which version of depthwise
// convolution is called. Primarily for reference code, and specializations
// forced in tests.
enum class DepthwiseConvImplementation {
// Run all tests against kUseStandardEntry even if also testing another
// kernel, since we need to be sure that the main DepthwiseConv() function in
// optimized_ops.h dispatches to a correctly-executing kernel.
kNone = 0, // The "default" option: use the normal
// DepthwiseConv kernel (entry) function.
kUseGenericKernel, // Forced use of generic kernel.
kUseNeon3x3, // 3x3 kernel that uses NEON when available.
kUseNeon3x3DotProduct, // 3x3 kernel that uses dot-product enabled NEON
// when available.
kUseCModel3x3DotProduct, // 3x3 kernel, reference C model that is intended
// to match overall design NEON code.
kUseUnwound3x3DotProduct, // 3x3 kernel, reference C model with unwound loops
// and some arrays.
kUseIntrinsics3x3DotProduct, // 3x3 kernel using NEON intrinsics.
};
// Category of depthwise convolution output rounding.
enum class DepthwiseConvOutputRounding {
kNone = 0, // Invalid: specific method must be specified.
kAwayFromZero, // Original method: exact halves rounded away from zero.
kUpward, // Halves towards +infinity: adds 0.5 before truncate.
// This is where a future kNearestEven would be placed.
};
// Category of depthwise convolution depth multiplication.
enum class DepthwiseConvDepthMultiplication {
kNoMultiplication = 0, // Depth multiplier = 1.
kUnitInputDepth, // Input depth = 1, output depth = depth multiplier.
};
namespace reference_ops {
namespace depthwise_conv {
template <DepthwiseConvOutputRounding output_rounding>
inline int32_t DepthwiseConvRound(int32_t x, int32_t quantized_multiplier,
int shift) {
TFLITE_DCHECK_NE(output_rounding, DepthwiseConvOutputRounding::kNone);
return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
}
// Single-rounding MultiplyByQuantizedMultiplier
#if TFLITE_SINGLE_ROUNDING
template <>
inline int32_t DepthwiseConvRound<DepthwiseConvOutputRounding::kAwayFromZero>(
int32_t x, int32_t quantized_multiplier, int shift) {
using gemmlowp::RoundingDivideByPOT;
using gemmlowp::SaturatingRoundingDoublingHighMul;
int left_shift = shift > 0 ? shift : 0;
int right_shift = shift > 0 ? 0 : -shift;
return RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(
x * (1 << left_shift), quantized_multiplier),
right_shift);
}
template <>
inline int32_t DepthwiseConvRound<DepthwiseConvOutputRounding::kUpward>(
int32_t x, int32_t quantized_multiplier, int shift) {
return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
}
// Double-rounding MultiplyByQuantizedMultiplier
#else
template <>
inline int32_t DepthwiseConvRound<DepthwiseConvOutputRounding::kAwayFromZero>(
int32_t x, int32_t quantized_multiplier, int shift) {
return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
}
template <>
inline int32_t DepthwiseConvRound<DepthwiseConvOutputRounding::kUpward>(
int32_t x, int32_t quantized_multiplier, int shift) {
using gemmlowp::SaturatingRoundingDoublingHighMul;
const int left_shift = shift > 0 ? shift : 0;
const int right_shift = shift > 0 ? 0 : -shift;
const int rounding_offset = right_shift > 0 ? 1 << (right_shift - 1) : 0;
return (SaturatingRoundingDoublingHighMul(x * (1 << left_shift),
quantized_multiplier) +
rounding_offset) >>
right_shift;
}
#endif // TFLITE_SINGLE_ROUNDING
template <DepthwiseConvOutputRounding output_rounding>
struct DepthwiseConvBasicKernel {
static inline void Run(
const DepthwiseParams& params, const RuntimeShape& input_shape,
const uint8_t* input_data, const RuntimeShape& filter_shape,
const uint8_t* filter_data, const RuntimeShape& bias_shape,
const int32_t* bias_data, const RuntimeShape& output_shape,
uint8_t* output_data) {
const int stride_width = params.stride_width;
const int stride_height = params.stride_height;
const int dilation_width_factor = params.dilation_width_factor;
const int dilation_height_factor = params.dilation_height_factor;
const int pad_width = params.padding_values.width;
const int pad_height = params.padding_values.height;
const int depth_multiplier = params.depth_multiplier;
const int32_t output_activation_min = params.quantized_activation_min;
const int32_t output_activation_max = params.quantized_activation_max;
const int32_t input_offset = params.input_offset;
const int32_t filter_offset = params.weights_offset;
const int32_t output_offset = params.output_offset;
const int32_t output_multiplier = params.output_multiplier;
const int output_shift = params.output_shift;
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
const int input_height = input_shape.Dims(1);
const int input_width = input_shape.Dims(2);
const int input_depth = input_shape.Dims(3);
const int filter_height = filter_shape.Dims(1);
const int filter_width = filter_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
for (int b = 0; b < batches; ++b) {
for (int out_y = 0; out_y < output_height; ++out_y) {
for (int out_x = 0; out_x < output_width; ++out_x) {
for (int ic = 0; ic < input_depth; ++ic) {
for (int m = 0; m < depth_multiplier; m++) {
const int oc = m + ic * depth_multiplier;
const int in_x_origin = (out_x * stride_width) - pad_width;
const int in_y_origin = (out_y * stride_height) - pad_height;
int32_t acc = 0;
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
const int in_x =
in_x_origin + dilation_width_factor * filter_x;
const int in_y =
in_y_origin + dilation_height_factor * filter_y;
// If the location is outside the bounds of the input image,
// use zero as a default value.
if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
(in_y < input_height)) {
int32_t input_val =
input_data[Offset(input_shape, b, in_y, in_x, ic)];
int32_t filter_val = filter_data[Offset(
filter_shape, 0, filter_y, filter_x, oc)];
acc += (filter_val + filter_offset) *
(input_val + input_offset);
}
}
}
if (bias_data) {
acc += bias_data[oc];
}
acc = DepthwiseConvRound<output_rounding>(acc, output_multiplier,
output_shift);
acc += output_offset;
acc = std::max(acc, output_activation_min);
acc = std::min(acc, output_activation_max);
output_data[Offset(output_shape, b, out_y, out_x, oc)] =
static_cast<uint8_t>(acc);
}
}
}
}
}
}
// TODO(b/148596273): Reconcile reference versions, perhaps with common
// MultiplyByQuantizedMultiplier or DepthwiseConvRound function.
static inline void RunPerChannel(
const DepthwiseParams& params, const RuntimeShape& input_shape,
const int8_t* input_data, const RuntimeShape& filter_shape,
const int8_t* filter_data, const RuntimeShape& bias_shape,
const int32_t* bias_data, const RuntimeShape& output_shape,
int8_t* output_data) {
// Get parameters.
// TODO(b/141565753): Re-introduce ScopedProfilingLabel on Micro.
const int stride_width = params.stride_width;
const int stride_height = params.stride_height;
const int dilation_width_factor = params.dilation_width_factor;
const int dilation_height_factor = params.dilation_height_factor;
const int pad_width = params.padding_values.width;
const int pad_height = params.padding_values.height;
const int depth_multiplier = params.depth_multiplier;
const int32_t input_offset = params.input_offset;
const int32_t output_offset = params.output_offset;
const int32_t output_activation_min = params.quantized_activation_min;
const int32_t output_activation_max = params.quantized_activation_max;
const int32_t* output_multiplier = params.output_multiplier_per_channel;
const int32_t* output_shift = params.output_shift_per_channel;
// Check dimensions of the tensors.
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
const int input_height = input_shape.Dims(1);
const int input_width = input_shape.Dims(2);
const int input_depth = input_shape.Dims(3);
const int filter_height = filter_shape.Dims(1);
const int filter_width = filter_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
for (int batch = 0; batch < batches; ++batch) {
for (int out_y = 0; out_y < output_height; ++out_y) {
for (int out_x = 0; out_x < output_width; ++out_x) {
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
for (int m = 0; m < depth_multiplier; ++m) {
const int output_channel = m + in_channel * depth_multiplier;
const int in_x_origin = (out_x * stride_width) - pad_width;
const int in_y_origin = (out_y * stride_height) - pad_height;
int32_t acc = 0;
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
const int in_x =
in_x_origin + dilation_width_factor * filter_x;
const int in_y =
in_y_origin + dilation_height_factor * filter_y;
// Zero padding by omitting the areas outside the image.
const bool is_point_inside_image =
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
(in_y < input_height);
if (is_point_inside_image) {
int32_t input_val = input_data[Offset(
input_shape, batch, in_y, in_x, in_channel)];
int32_t filter_val = filter_data[Offset(
filter_shape, 0, filter_y, filter_x, output_channel)];
// Accumulate with 32 bits accumulator.
// In the nudging process during model quantization, we
// force real value of 0.0 be represented by a quantized
// value. This guarantees that the input_offset is a int8_t,
// even though it is represented using int32_t. int32_t +=
// int8_t
// * (int8_t - int8_t) so the highest value we can get from
// each accumulation is [-127, 127] * ([-128, 127] -
// [-128, 127]), which is [-32512, 32512]. log2(32512)
// = 14.98, which means we can accumulate at least 2^16
// multiplications without overflow. The accumulator is
// applied to a filter so the accumulation logic will hold
// as long as the filter size (filter_y * filter_x *
// in_channel) does not exceed 2^16, which is the case in
// all the models we have seen so far.
acc += filter_val * (input_val + input_offset);
}
}
}
if (bias_data) {
acc += bias_data[output_channel];
}
acc = DepthwiseConvRound<output_rounding>(
acc, output_multiplier[output_channel],
output_shift[output_channel]);
acc += output_offset;
acc = std::max(acc, output_activation_min);
acc = std::min(acc, output_activation_max);
output_data[Offset(output_shape, batch, out_y, out_x,
output_channel)] = static_cast<int8_t>(acc);
}
}
}
}
}
}
};
} // namespace depthwise_conv
inline void DepthwiseConv(
const DepthwiseParams& params, const RuntimeShape& input_shape,
const uint8_t* input_data, const RuntimeShape& filter_shape,
const uint8_t* filter_data, const RuntimeShape& bias_shape,
const int32_t* bias_data, const RuntimeShape& output_shape,
uint8_t* output_data) {
return depthwise_conv::DepthwiseConvBasicKernel<
DepthwiseConvOutputRounding::kAwayFromZero>::Run(params, input_shape,
input_data, filter_shape,
filter_data, bias_shape,
bias_data, output_shape,
output_data);
}
} // namespace reference_ops
} // end namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_

View File

@@ -0,0 +1,78 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEQUANTIZE_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEQUANTIZE_H_
#include <limits.h>
#include <vector>
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
// Dequantizes into a float without rounding.
template <typename InputT, typename OutputT>
inline void Dequantize(const tflite::DequantizationParams& op_params,
const RuntimeShape& input_shape,
const InputT* input_data,
const RuntimeShape& output_shape, OutputT* output_data) {
int32_t zero_point = op_params.zero_point;
const double scale = op_params.scale;
const int flat_size = MatchingFlatSize(input_shape, output_shape);
for (int i = 0; i < flat_size; i++) {
const int32_t val = input_data[i];
const OutputT result = static_cast<OutputT>(scale * (val - zero_point));
output_data[i] = result;
}
}
// Dequantizes per-channel quantized tensor to float.
template <typename T>
inline void PerChannelDequantize(
const tflite::PerChannelDequantizationParams& op_params,
const RuntimeShape& input_shape, const T* input_data,
const RuntimeShape& output_shape, float* output_data) {
// Ensure flat size is same.
MatchingFlatSize(input_shape, output_shape);
const int32_t* zero_point = op_params.zero_point;
const float* scale = op_params.scale;
const int32_t quantized_dimension = op_params.quantized_dimension;
const int32_t num_dims = input_shape.DimensionsCount();
const int32_t* dims_data = input_shape.DimsData();
std::vector<int> current_dim(num_dims, 0);
do {
size_t offset =
ReducedOutputOffset(num_dims, reinterpret_cast<const int*>(dims_data),
current_dim.data(), 0, nullptr);
const int channel = current_dim[quantized_dimension];
const int32_t val = input_data[offset];
const float result =
static_cast<float>(scale[channel] * (val - zero_point[channel]));
output_data[offset] = result;
} while (NextIndex(num_dims, reinterpret_cast<const int*>(dims_data),
current_dim.data()));
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEQUANTIZE_H_

View File

@@ -0,0 +1,37 @@
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ELU_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ELU_H_
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
inline void Elu(const RuntimeShape& input_shape, const float* input_data,
const RuntimeShape& output_shape, float* output_data) {
const int flat_size = MatchingFlatSize(input_shape, output_shape);
for (int i = 0; i < flat_size; ++i) {
const float val = input_data[i];
output_data[i] = val < 0.0f ? TfLiteExpm1(val) : val;
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ELU_H_

View File

@@ -0,0 +1,38 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_EXP_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_EXP_H_
#include <cmath>
#include "ruy/profiler/instrumentation.h" // from @ruy
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
template <typename T>
inline void Exp(const T* input_data, const size_t num_elements,
T* output_data) {
ruy::profiler::ScopeLabel label("Exp");
for (size_t idx = 0; idx < num_elements; ++idx) {
output_data[idx] = std::exp(input_data[idx]);
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_EXP_H_

View File

@@ -0,0 +1,38 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FILL_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FILL_H_
#include <cmath>
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
template <typename T>
void Fill(const RuntimeShape& value_shape, const T* value_data,
const RuntimeShape& output_shape, T* output_data) {
TFLITE_DCHECK_EQ(value_shape.DimensionsCount(), 0);
const int flat_size = output_shape.FlatSize();
for (int i = 0; i < flat_size; ++i) {
output_data[i] = *value_data;
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FILL_H_

View File

@@ -0,0 +1,39 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FLOOR_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FLOOR_H_
#include <cmath>
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
inline void Floor(const RuntimeShape& input_shape, const float* input_data,
const RuntimeShape& output_shape, float* output_data) {
const int flat_size = MatchingFlatSize(input_shape, output_shape);
for (int i = 0; i < flat_size; i++) {
int offset = i;
output_data[offset] = std::floor(input_data[offset]);
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FLOOR_H_

View File

@@ -0,0 +1,35 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FLOOR_DIV_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FLOOR_DIV_H_
#include <cmath>
#include <functional>
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
template <typename T>
T FloorDiv(T input1, T input2) {
return std::floor(std::divides<double>()(static_cast<double>(input1),
static_cast<double>(input2)));
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FLOOR_DIV_H_

View File

@@ -0,0 +1,44 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FLOOR_MOD_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FLOOR_MOD_H_
#include <cmath>
#include <functional>
namespace tflite {
namespace reference_ops {
template <typename T>
T FloorMod(T input1, T input2) {
struct FloatMod {
float operator()(const float lhs, const float rhs) const {
return std::fmod(lhs, rhs);
}
};
using ModFunc = typename std::conditional<std::is_integral<T>::value,
std::modulus<T>, FloatMod>::type;
ModFunc mod_func;
T trunc_mod = mod_func(input1, input2);
return (trunc_mod != 0) && ((input2 < 0) != (trunc_mod < 0))
? (trunc_mod + input2)
: trunc_mod;
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FLOOR_MOD_H_

View File

@@ -0,0 +1,321 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_
#include "ruy/profiler/instrumentation.h" // from @ruy
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
inline void FullyConnected(
const FullyConnectedParams& params, const RuntimeShape& input_shape,
const float* input_data, const RuntimeShape& weights_shape,
const float* weights_data, const RuntimeShape& bias_shape,
const float* bias_data, const RuntimeShape& output_shape,
float* output_data) {
const float output_activation_min = params.float_activation_min;
const float output_activation_max = params.float_activation_max;
// TODO(b/62193649): This really should be:
// const int batches = ArraySize(output_dims, 1);
// but the current --variable_batch hack consists in overwriting the 3rd
// dimension with the runtime batch size, as we don't keep track for each
// array of which dimension is the batch dimension in it.
const int output_dims_count = output_shape.DimensionsCount();
const int weights_dims_count = weights_shape.DimensionsCount();
const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1);
const int output_depth = MatchingDim(weights_shape, weights_dims_count - 2,
output_shape, output_dims_count - 1);
const int accum_depth = weights_shape.Dims(weights_dims_count - 1);
for (int b = 0; b < batches; ++b) {
for (int out_c = 0; out_c < output_depth; ++out_c) {
float total = 0.f;
for (int d = 0; d < accum_depth; ++d) {
total += input_data[b * accum_depth + d] *
weights_data[out_c * accum_depth + d];
}
float bias_value = 0.0f;
if (bias_data) {
bias_value = bias_data[out_c];
}
output_data[out_c + output_depth * b] = ActivationFunctionWithMinMax(
total + bias_value, output_activation_min, output_activation_max);
}
}
}
inline void FullyConnected(
const FullyConnectedParams& params, const RuntimeShape& input_shape,
const uint8_t* input_data, const RuntimeShape& filter_shape,
const uint8_t* filter_data, const RuntimeShape& bias_shape,
const int32_t* bias_data, const RuntimeShape& output_shape,
uint8_t* output_data) {
const int32_t input_offset = params.input_offset;
const int32_t filter_offset = params.weights_offset;
const int32_t output_offset = params.output_offset;
const int32_t output_multiplier = params.output_multiplier;
const int output_shift = params.output_shift;
const int32_t output_activation_min = params.quantized_activation_min;
const int32_t output_activation_max = params.quantized_activation_max;
TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
// TODO(b/62193649): This really should be:
// const int batches = ArraySize(output_dims, 1);
// but the current --variable_batch hack consists in overwriting the 3rd
// dimension with the runtime batch size, as we don't keep track for each
// array of which dimension is the batch dimension in it.
const int output_dim_count = output_shape.DimensionsCount();
const int filter_dim_count = filter_shape.DimensionsCount();
const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
const int output_depth = MatchingDim(filter_shape, filter_dim_count - 2,
output_shape, output_dim_count - 1);
const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
for (int b = 0; b < batches; ++b) {
for (int out_c = 0; out_c < output_depth; ++out_c) {
int32_t acc = 0;
for (int d = 0; d < accum_depth; ++d) {
int32_t input_val = input_data[b * accum_depth + d];
int32_t filter_val = filter_data[out_c * accum_depth + d];
acc += (filter_val + filter_offset) * (input_val + input_offset);
}
if (bias_data) {
acc += bias_data[out_c];
}
acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
acc += output_offset;
acc = std::max(acc, output_activation_min);
acc = std::min(acc, output_activation_max);
output_data[out_c + output_depth * b] = static_cast<uint8_t>(acc);
}
}
}
inline void FullyConnected(
const FullyConnectedParams& params, const RuntimeShape& input_shape,
const uint8_t* input_data, const RuntimeShape& filter_shape,
const uint8_t* filter_data, const RuntimeShape& bias_shape,
const int32_t* bias_data, const RuntimeShape& output_shape,
int16_t* output_data) {
const int32_t input_offset = params.input_offset;
const int32_t filter_offset = params.weights_offset;
const int32_t output_offset = params.output_offset;
const int32_t output_multiplier = params.output_multiplier;
const int output_shift = params.output_shift;
const int32_t output_activation_min = params.quantized_activation_min;
const int32_t output_activation_max = params.quantized_activation_max;
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
TFLITE_DCHECK_EQ(output_offset, 0);
// TODO(b/62193649): This really should be:
// const int batches = ArraySize(output_dims, 1);
// but the current --variable_batch hack consists in overwriting the 3rd
// dimension with the runtime batch size, as we don't keep track for each
// array of which dimension is the batch dimension in it.
const int output_dim_count = output_shape.DimensionsCount();
const int filter_dim_count = filter_shape.DimensionsCount();
const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
const int output_depth = MatchingDim(filter_shape, filter_dim_count - 2,
output_shape, output_dim_count - 1);
const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
for (int b = 0; b < batches; ++b) {
for (int out_c = 0; out_c < output_depth; ++out_c) {
// Internal accumulation.
// Initialize accumulator with the bias-value.
int32_t accum = bias_data[out_c];
// Accumulation loop.
for (int d = 0; d < accum_depth; ++d) {
int16_t input_val = input_data[b * accum_depth + d] + input_offset;
int16_t filter_val =
filter_data[out_c * accum_depth + d] + filter_offset;
accum += filter_val * input_val;
}
// Down-scale the final int32_t accumulator to the scale used by our
// (16-bit, typically 3 integer bits) fixed-point format. The quantized
// multiplier and shift here have been pre-computed offline
// (e.g. by toco).
accum =
MultiplyByQuantizedMultiplier(accum, output_multiplier, output_shift);
// Saturate, cast to int16_t, and store to output array.
accum = std::max(accum, output_activation_min - output_offset);
accum = std::min(accum, output_activation_max - output_offset);
accum += output_offset;
output_data[out_c + output_depth * b] = accum;
}
}
}
inline void ShuffledFullyConnected(
const FullyConnectedParams& params, const RuntimeShape& input_shape,
const uint8_t* input_data, const RuntimeShape& weights_shape,
const uint8_t* shuffled_weights_data, const RuntimeShape& bias_shape,
const int32_t* bias_data, const RuntimeShape& output_shape,
int16_t* output_data, uint8_t* shuffled_input_workspace_data) {
const int32_t output_multiplier = params.output_multiplier;
const int output_shift = params.output_shift;
const int32_t output_activation_min = params.quantized_activation_min;
const int32_t output_activation_max = params.quantized_activation_max;
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1);
TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
// TODO(b/62193649): This really should be:
// const int batches = ArraySize(output_dims, 1);
// but the current --variable_batch hack consists in overwriting the 3rd
// dimension with the runtime batch size, as we don't keep track for each
// array of which dimension is the batch dimension in it.
const int output_dim_count = output_shape.DimensionsCount();
const int weights_dim_count = weights_shape.DimensionsCount();
const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
const int output_depth = MatchingDim(weights_shape, weights_dim_count - 2,
output_shape, output_dim_count - 1);
const int accum_depth = weights_shape.Dims(weights_dim_count - 1);
TFLITE_DCHECK((accum_depth % 16) == 0);
TFLITE_DCHECK((output_depth % 4) == 0);
// Shuffling and xoring of input activations into the workspace buffer
uint8_t* shuffled_input_workspace_ptr = shuffled_input_workspace_data;
if (batches == 1) {
for (int i = 0; i < accum_depth; i++) {
shuffled_input_workspace_data[i] = input_data[i] ^ 0x80;
}
} else if (batches == 4) {
for (int c = 0; c < accum_depth; c += 16) {
for (int b = 0; b < 4; b++) {
const uint8_t* src_data_ptr = input_data + b * accum_depth + c;
for (int j = 0; j < 16; j++) {
uint8_t src_val = *src_data_ptr++;
// Flip the sign bit, so that the kernel will only need to
// reinterpret these uint8_t values as int8_t, getting for free the
// subtraction of the zero_point value 128.
uint8_t dst_val = src_val ^ 0x80;
*shuffled_input_workspace_ptr++ = dst_val;
}
}
}
} else {
TFLITE_DCHECK(false);
return;
}
// Actual computation
if (batches == 1) {
int16_t* output_ptr = output_data;
// Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
// so that just reinterpreting them as int8_t values is equivalent to
// subtracting 128 from them, thus implementing for free the subtraction of
// the zero_point value 128.
const int8_t* shuffled_weights_ptr =
reinterpret_cast<const int8_t*>(shuffled_weights_data);
// Likewise, we preshuffled and pre-xored the input data above.
const int8_t* shuffled_input_data =
reinterpret_cast<const int8_t*>(shuffled_input_workspace_data);
for (int c = 0; c < output_depth; c += 4) {
// Internal accumulation.
// Initialize accumulator with the bias-value.
int32_t accum[4] = {0};
// Accumulation loop.
for (int d = 0; d < accum_depth; d += 16) {
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 16; j++) {
int8_t input_val = shuffled_input_data[d + j];
int8_t weights_val = *shuffled_weights_ptr++;
accum[i] += weights_val * input_val;
}
}
}
for (int i = 0; i < 4; i++) {
// Add bias value
int32_t acc = accum[i] + bias_data[c + i];
// Down-scale the final int32_t accumulator to the scale used by our
// (16-bit, typically 3 integer bits) fixed-point format. The quantized
// multiplier and shift here have been pre-computed offline
// (e.g. by toco).
acc =
MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
// Saturate, cast to int16_t, and store to output array.
acc = std::max(acc, output_activation_min);
acc = std::min(acc, output_activation_max);
output_ptr[c + i] = acc;
}
}
} else if (batches == 4) {
int16_t* output_ptr = output_data;
// Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
// so that just reinterpreting them as int8_t values is equivalent to
// subtracting 128 from them, thus implementing for free the subtraction of
// the zero_point value 128.
const int8_t* shuffled_weights_ptr =
reinterpret_cast<const int8_t*>(shuffled_weights_data);
// Likewise, we preshuffled and pre-xored the input data above.
const int8_t* shuffled_input_data =
reinterpret_cast<const int8_t*>(shuffled_input_workspace_data);
for (int c = 0; c < output_depth; c += 4) {
const int8_t* shuffled_input_ptr = shuffled_input_data;
// Accumulation loop.
// Internal accumulation.
// Initialize accumulator with the bias-value.
int32_t accum[4][4];
for (int i = 0; i < 4; i++) {
for (int b = 0; b < 4; b++) {
accum[i][b] = 0;
}
}
for (int d = 0; d < accum_depth; d += 16) {
for (int i = 0; i < 4; i++) {
for (int b = 0; b < 4; b++) {
for (int j = 0; j < 16; j++) {
int8_t input_val = shuffled_input_ptr[16 * b + j];
int8_t weights_val = shuffled_weights_ptr[16 * i + j];
accum[i][b] += weights_val * input_val;
}
}
}
shuffled_input_ptr += 64;
shuffled_weights_ptr += 64;
}
for (int i = 0; i < 4; i++) {
for (int b = 0; b < 4; b++) {
// Add bias value
int32_t acc = accum[i][b] + bias_data[c + i];
// Down-scale the final int32_t accumulator to the scale used by our
// (16-bit, typically 3 integer bits) fixed-point format. The
// quantized multiplier and shift here have been pre-computed offline
// (e.g. by toco).
acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
output_shift);
// Saturate, cast to int16_t, and store to output array.
acc = std::max(acc, output_activation_min);
acc = std::min(acc, output_activation_max);
output_ptr[b * output_depth + c + i] = acc;
}
}
}
} else {
TFLITE_DCHECK(false);
return;
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_

View File

@@ -0,0 +1,166 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ACTIVATIONS_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ACTIVATIONS_H_
#include "ruy/profiler/instrumentation.h" // from @ruy
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
inline int16_t SaturatingLeftShift(int16_t value, int amount) {
int32_t result = static_cast<int32_t>(value) * (1 << amount);
result = std::min<int32_t>(result, std::numeric_limits<int16_t>::max());
result = std::max<int32_t>(result, std::numeric_limits<int16_t>::min());
return result;
}
// Similar to ARM instruction SQDMULH.
// Similar to gemmlowp::SaturatingRoundingDoublingHighMul except
// rounding to zero instead of to nearest (SQRDMULH).
inline std::int16_t SaturatingDoublingHighMul(std::int16_t a, std::int16_t b) {
bool overflow = a == b && a == std::numeric_limits<std::int16_t>::min();
std::int32_t a_32(a);
std::int32_t b_32(b);
std::int32_t ab_32 = a_32 * b_32;
std::int16_t ab_x2_high16 = static_cast<std::int16_t>((ab_32) / (1 << 15));
return overflow ? std::numeric_limits<std::int16_t>::max() : ab_x2_high16;
}
template <typename T>
inline void HardSwish(const RuntimeShape& input_shape, const T* input_data,
const RuntimeShape& output_shape, T* output_data) {
ruy::profiler::ScopeLabel label("ReferenceHardSwish/Float");
auto matching_size = MatchingFlatSize(input_shape, output_shape);
const T* in_end = input_data + matching_size;
for (; input_data < in_end; input_data++, output_data++) {
const float in = *input_data;
*output_data =
in * std::min(static_cast<T>(6), std::max(static_cast<T>(0), in + 3)) /
6;
}
}
template <typename T>
inline void HardSwish(const HardSwishParams& params,
const RuntimeShape& input_shape, const T* input_data,
const RuntimeShape& output_shape, T* output_data) {
ruy::profiler::ScopeLabel label("ReferenceHardSwish/Quantized");
const int flat_size = MatchingFlatSize(input_shape, output_shape);
for (int i = 0; i < flat_size; i++) {
const int16_t input_value = input_data[i] - params.input_zero_point;
// Left-shift as much as we can without overflow/saturation to put
// significant bits in the high bits of our 16-bit fixedpoint values, so
// that fixed-point approximate computations below are as accurate as
// possible.
const int16_t input_value_on_hires_input_scale = input_value * (1 << 7);
// Compute the input value on essentially the output scale, just not
// right-shifted yet. This is the value that we'll use in the (x >= +3)
// case, and that in the general case we'll multiply against the "relu-ish"
// fixed-point multiplier in [0, 1].
const int16_t input_value_on_preshift_output_scale =
gemmlowp::SaturatingRoundingDoublingHighMul(
input_value_on_hires_input_scale,
params.output_multiplier_fixedpoint_int16);
// Now compute the "relu-ish multiplier". In the (-3 <= x <= +3) case, that
// is just an affine rescaling of x from [-3, 3] to [0, 1]. In the general
// case, it is just that plus saturation at the boundaries of [-3, 3].
// First, we rescale from [-3, 3] to [-1, 1], saturating.
// That is done by rescaling the input value with a fixed-point multiplier
// (reluish_multiplier_fixedpoint) and bit-shift such that we represent
// that input value on the scale where the real value 3.0f is represented
// by the quantized value 32768. (+32768 is actually not representable as
// int16_t, so this saturates at +32767, and that is seen empirically to be
// a negligible contribution to numerical error/bias).
//
// This code is careful to correctly implement any magnitude of multiplier,
// involving either a right shift or a left shift, with correct saturation
// behavior in the left-shift case. This forces this code to be more
// complicated, but is necessary for real applications: a partially
// trained quantized MobileNet v3-small model that motivated this code
// exhibits some large [min, max] range boundaries, of the order of
// magnitude of 10 or 100 depending on layers.
//
// The next few lines are basically just an ordinary
// MultiplyByQuantizedMultiplier, except that we are more careful here
// about the fine details of saturation when left-shifting, because here
// overflow in left-shift is a common case, not an anomaly as
// MultiplyByQuantizedMultiplier assumes.
int16_t reluish_value = input_value_on_hires_input_scale;
// Shift left, saturating, as much as we can while ensuring that this
// saturation will not contribute to the result. That is, left shift amount
// reduced by 1.
if (params.reluish_multiplier_exponent > 0) {
reluish_value = SaturatingLeftShift(
reluish_value, params.reluish_multiplier_exponent - 1);
}
// Apply the fixed-point multiplier, dividing the value by a divisor
// ranging in [1, 2].
reluish_value = gemmlowp::SaturatingRoundingDoublingHighMul(
reluish_value, params.reluish_multiplier_fixedpoint_int16);
// Apply the last bit of left-shift. Thus, in the left-shifting case, if
// any saturation affects the result, it is happening here --- any
// saturation having occurred above is overwritten here, not affecting the
// result.
if (params.reluish_multiplier_exponent > 0) {
reluish_value = SaturatingLeftShift(reluish_value, 1);
}
// Shift right, in the right-shifting case.
if (params.reluish_multiplier_exponent < 0) {
reluish_value = gemmlowp::RoundingDivideByPOT(
reluish_value, -params.reluish_multiplier_exponent);
}
// At this point we have rescaled the value into a 16bit fixedpoint
// reluish_value in [-1, 1].
// We now convert that to a 16bit fixedpoint value in [0, 1].
reluish_value = (reluish_value + (1 << 15)) >> 1;
// Use of SaturatingDoublingHighMul here is important to cancel the biases
// from the above SaturatingRoundingDoublingHighMul.
//
// On a partially trained MobileNet-v3-small,
//
// | bias on | ImageNet
// | quantized | Top-1
// Operation used here | values | accuracy (50k)
// --------------------------------------+------------+-----------
// SaturatingDoublingHighMul | -0.0024 | 58.920
// SaturatingRoundingDoublingHighMul | -0.0067 | 58.064
//
// In activations_test, this is covered by this testcase:
// QuantizedActivationsOpTest.HardSwishBias
//
const int16_t preshift_output_value = SaturatingDoublingHighMul(
reluish_value, input_value_on_preshift_output_scale);
// We were so far operating on the pre-shift output scale. Now we finally
// apply that output shift, arriving at the final output scale.
int16_t output_value = gemmlowp::RoundingDivideByPOT(
preshift_output_value, -params.output_multiplier_exponent);
output_value += params.output_zero_point;
output_value =
std::min<int16_t>(output_value, std::numeric_limits<T>::max());
output_value =
std::max<int16_t>(output_value, std::numeric_limits<T>::min());
output_data[i] = output_value;
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CONV_H_

View File

@@ -0,0 +1,144 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_ADD_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_ADD_H_
#include <limits>
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_integer_ops {
inline void CheckArithmeticParams(const ArithmeticParams& params) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
// Input offset is negative input zero point. Activation tensors are
// asymmetric quantized so they span the full int8 range.
TFLITE_DCHECK_GE(-params.input1_offset, std::numeric_limits<int8_t>::min());
TFLITE_DCHECK_GE(-params.input2_offset, std::numeric_limits<int8_t>::min());
TFLITE_DCHECK_LE(-params.input1_offset, std::numeric_limits<int8_t>::max());
TFLITE_DCHECK_LE(-params.input2_offset, std::numeric_limits<int8_t>::max());
}
inline void ElementWise(
int size, const ArithmeticParams& params, const int8_t* input1_data,
const int8_t* input2_data, int8_t* output_data,
void (*check_arithmetic_params)(const ArithmeticParams&),
int8_t (*binary_func)(int8_t, int8_t, const ArithmeticParams&)) {
CheckArithmeticParams(params);
for (int i = 0; i < size; ++i) {
output_data[i] = binary_func(input1_data[i], input2_data[i], params);
}
}
inline void BroadcastBinaryFunction4DSlow(
const ArithmeticParams& params, const RuntimeShape& input1_shape,
const int8_t* input1_data, const RuntimeShape& input2_shape,
const int8_t* input2_data, const RuntimeShape& output_shape,
int8_t* output_data,
void (*check_arithmetic_params)(const ArithmeticParams&),
int8_t (*binary_func)(int8_t, int8_t, const ArithmeticParams&)) {
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
&desc2);
const RuntimeShape extended_output_shape =
RuntimeShape::ExtendedShape(4, output_shape);
// In Tensorflow, the dimensions are canonically named (batch_number, row,
// col, channel), with extents (batches, height, width, depth), with the
// trailing dimension changing most rapidly (channels has the smallest stride,
// typically 1 element).
//
// In generated C code, we store arrays with the dimensions reversed. The
// first dimension has smallest stride.
//
// We name our variables by their Tensorflow convention, but generate C code
// nesting loops such that the innermost loop has the smallest stride for the
// best cache behavior.
for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
output_data[Offset(extended_output_shape, b, y, x, c)] = binary_func(
input1_data[SubscriptToIndex(desc1, b, y, x, c)],
input2_data[SubscriptToIndex(desc2, b, y, x, c)], params);
}
}
}
}
}
inline int8_t AddFunc(int8_t x, int8_t y, const ArithmeticParams& params) {
const int32_t input1_val = params.input1_offset + x;
const int32_t input2_val = params.input2_offset + y;
const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
const int32_t scaled_input1_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input1_val, params.input1_multiplier, params.input1_shift);
const int32_t scaled_input2_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input2_val, params.input2_multiplier, params.input2_shift);
const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
const int32_t raw_output =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
raw_sum, params.output_multiplier, params.output_shift) +
params.output_offset;
const int32_t clamped_output =
std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, raw_output));
return static_cast<int8_t>(clamped_output);
}
// Element-wise add that can often be used for inner loop of broadcast add as
// well as the non-broadcast add.
inline void AddElementwise(int size, const ArithmeticParams& params,
const int8_t* input1_data, const int8_t* input2_data,
int8_t* output_data) {
ElementWise(size, params, input1_data, input2_data, output_data,
CheckArithmeticParams, AddFunc);
}
inline void Add(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const int8_t* input1_data,
const RuntimeShape& input2_shape, const int8_t* input2_data,
const RuntimeShape& output_shape, int8_t* output_data) {
CheckArithmeticParams(params);
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
AddElementwise(flat_size, params, input1_data, input2_data, output_data);
}
inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const int8_t* input1_data,
const RuntimeShape& input2_shape,
const int8_t* input2_data,
const RuntimeShape& output_shape,
int8_t* output_data) {
BroadcastBinaryFunction4DSlow(params, input1_shape, input1_data, input2_shape,
input2_data, output_shape, output_data,
CheckArithmeticParams, AddFunc);
}
} // namespace reference_integer_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_ADD_H_

View File

@@ -0,0 +1,221 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_CONV_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_CONV_H_
#include "tensorflow/lite/kernels/internal/common.h"
namespace tflite {
namespace reference_integer_ops {
// Fixed-point per-channel-quantization convolution reference kernel.
inline void ConvPerChannel(
const ConvParams& params, const int32_t* output_multiplier,
const int32_t* output_shift, const RuntimeShape& input_shape,
const int8_t* input_data, const RuntimeShape& filter_shape,
const int8_t* filter_data, const RuntimeShape& bias_shape,
const int32_t* bias_data, const RuntimeShape& output_shape,
int8_t* output_data) {
// Get parameters.
const int32_t input_offset = params.input_offset; // r = s(q - Z)
const int stride_width = params.stride_width;
const int stride_height = params.stride_height;
const int dilation_width_factor = params.dilation_width_factor;
const int dilation_height_factor = params.dilation_height_factor;
const int pad_width = params.padding_values.width;
const int pad_height = params.padding_values.height;
const int32_t output_offset = params.output_offset;
// Set min and max value of the output.
const int32_t output_activation_min = params.quantized_activation_min;
const int32_t output_activation_max = params.quantized_activation_max;
// Consistency check.
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
if (bias_data) {
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
}
// Check dimensions of the tensors.
const int input_height = input_shape.Dims(1);
const int input_width = input_shape.Dims(2);
const int filter_height = filter_shape.Dims(1);
const int filter_width = filter_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
for (int batch = 0; batch < batches; ++batch) {
for (int out_y = 0; out_y < output_height; ++out_y) {
const int in_y_origin = (out_y * stride_height) - pad_height;
for (int out_x = 0; out_x < output_width; ++out_x) {
const int in_x_origin = (out_x * stride_width) - pad_width;
for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
int32_t acc = 0;
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
const int in_y = in_y_origin + dilation_height_factor * filter_y;
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
const int in_x = in_x_origin + dilation_width_factor * filter_x;
// Zero padding by omitting the areas outside the image.
const bool is_point_inside_image =
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
(in_y < input_height);
if (!is_point_inside_image) {
continue;
}
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
int32_t input_val = input_data[Offset(input_shape, batch, in_y,
in_x, in_channel)];
int32_t filter_val = filter_data[Offset(
filter_shape, out_channel, filter_y, filter_x, in_channel)];
// Accumulate with 32 bits accumulator.
// In the nudging process during model quantization, we force
// real value of 0.0 be represented by a quantized value. This
// guarantees that the input_offset is a int8_t, even though
// it is represented using int32_t. int32_t += int8_t *
// (int8_t - int8_t) so the highest value we can get from each
// accumulation is [-127, 127] * ([-128, 127] -
// [-128, 127]), which is [-32512, 32512]. log2(32512)
// = 14.98, which means we can accumulate at least 2^16
// multiplications without overflow. The accumulator is
// applied to a filter so the accumulation logic will hold as
// long as the filter size (filter_y * filter_x * in_channel)
// does not exceed 2^16, which is the case in all the models
// we have seen so far.
// TODO(b/174275578): Add a check to make sure the
// accumulator depth is smaller than 2^16.
acc += filter_val * (input_val + input_offset);
}
}
}
if (bias_data) {
acc += bias_data[out_channel];
}
acc = MultiplyByQuantizedMultiplier(
acc, output_multiplier[out_channel], output_shift[out_channel]);
acc += output_offset;
acc = std::max(acc, output_activation_min);
acc = std::min(acc, output_activation_max);
output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
static_cast<int8_t>(acc);
}
}
}
}
}
// Fixed-point per-channel-quantization convolution reference kernel.
// 16-bit data and 8-bit filter
inline void ConvPerChannel(
const ConvParams& params, const int32_t* output_multiplier,
const int32_t* output_shift, const RuntimeShape& input_shape,
const int16_t* input_data, const RuntimeShape& filter_shape,
const int8_t* filter_data, const RuntimeShape& bias_shape,
const std::int64_t* bias_data, const RuntimeShape& output_shape,
int16_t* output_data) {
// Get parameters.
const int stride_width = params.stride_width;
const int stride_height = params.stride_height;
const int dilation_width_factor = params.dilation_width_factor;
const int dilation_height_factor = params.dilation_height_factor;
const int pad_width = params.padding_values.width;
const int pad_height = params.padding_values.height;
// Set min and max value of the output.
const int32_t output_activation_min = params.quantized_activation_min;
const int32_t output_activation_max = params.quantized_activation_max;
// Consistency check.
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
if (bias_data) {
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
}
// Check dimensions of the tensors.
const int input_height = input_shape.Dims(1);
const int input_width = input_shape.Dims(2);
const int filter_height = filter_shape.Dims(1);
const int filter_width = filter_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
for (int batch = 0; batch < batches; ++batch) {
for (int out_y = 0; out_y < output_height; ++out_y) {
const int in_y_origin = (out_y * stride_height) - pad_height;
for (int out_x = 0; out_x < output_width; ++out_x) {
const int in_x_origin = (out_x * stride_width) - pad_width;
for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
std::int64_t acc = 0;
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
const int in_y = in_y_origin + dilation_height_factor * filter_y;
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
const int in_x = in_x_origin + dilation_width_factor * filter_x;
// Zero padding by omitting the areas outside the image.
const bool is_point_inside_image =
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
(in_y < input_height);
if (!is_point_inside_image) {
continue;
}
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
int32_t input_val = input_data[Offset(input_shape, batch, in_y,
in_x, in_channel)];
int32_t filter_val = filter_data[Offset(
filter_shape, out_channel, filter_y, filter_x, in_channel)];
// Accumulate with 64 bits accumulator.
// int64_t += int8_t * int16_t so the highest value we can
// get from each accumulation is [-127, 127] * ([-32768,
// 32767] -
// [-32768, 32767]), which is [-8322945, 8322945].
// log2(8322945) = 22.99.
acc += filter_val * input_val;
}
}
}
if (bias_data) {
acc += bias_data[out_channel];
}
int32_t scaled_acc = MultiplyByQuantizedMultiplier(
acc, output_multiplier[out_channel], output_shift[out_channel]);
scaled_acc = std::max(scaled_acc, output_activation_min);
scaled_acc = std::min(scaled_acc, output_activation_max);
output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
static_cast<int16_t>(scaled_acc);
}
}
}
}
}
} // namespace reference_integer_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_CONV_H_

View File

@@ -0,0 +1,289 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_
#include "tensorflow/lite/kernels/internal/common.h"
namespace tflite {
namespace reference_integer_ops {
inline void DepthwiseConvPerChannel(
const DepthwiseParams& params, const int32_t* output_multiplier,
const int32_t* output_shift, const RuntimeShape& input_shape,
const int8_t* input_data, const RuntimeShape& filter_shape,
const int8_t* filter_data, const RuntimeShape& bias_shape,
const int32_t* bias_data, const RuntimeShape& output_shape,
int8_t* output_data) {
// Get parameters.
// TODO(b/141565753): Re-introduce ScopedProfilingLabel on Micro.
const int stride_width = params.stride_width;
const int stride_height = params.stride_height;
const int dilation_width_factor = params.dilation_width_factor;
const int dilation_height_factor = params.dilation_height_factor;
const int pad_width = params.padding_values.width;
const int pad_height = params.padding_values.height;
const int depth_multiplier = params.depth_multiplier;
const int32_t input_offset = params.input_offset;
const int32_t output_offset = params.output_offset;
const int32_t output_activation_min = params.quantized_activation_min;
const int32_t output_activation_max = params.quantized_activation_max;
// Check dimensions of the tensors.
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
const int input_height = input_shape.Dims(1);
const int input_width = input_shape.Dims(2);
const int input_depth = input_shape.Dims(3);
const int filter_height = filter_shape.Dims(1);
const int filter_width = filter_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
for (int batch = 0; batch < batches; ++batch) {
for (int out_y = 0; out_y < output_height; ++out_y) {
for (int out_x = 0; out_x < output_width; ++out_x) {
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
for (int m = 0; m < depth_multiplier; ++m) {
const int output_channel = m + in_channel * depth_multiplier;
const int in_x_origin = (out_x * stride_width) - pad_width;
const int in_y_origin = (out_y * stride_height) - pad_height;
int32_t acc = 0;
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
const int in_x = in_x_origin + dilation_width_factor * filter_x;
const int in_y =
in_y_origin + dilation_height_factor * filter_y;
// Zero padding by omitting the areas outside the image.
const bool is_point_inside_image =
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
(in_y < input_height);
if (is_point_inside_image) {
int32_t input_val = input_data[Offset(
input_shape, batch, in_y, in_x, in_channel)];
int32_t filter_val = filter_data[Offset(
filter_shape, 0, filter_y, filter_x, output_channel)];
// Accumulate with 32 bits accumulator.
// In the nudging process during model quantization, we force
// real value of 0.0 be represented by a quantized value. This
// guarantees that the input_offset is a int8_t, even though
// it is represented using int32_t. int32_t += int8_t *
// (int8_t - int8_t) so the highest value we can get from each
// accumulation is [-127, 127] * ([-128, 127] -
// [-128, 127]), which is [-32512, 32512]. log2(32512)
// = 14.98, which means we can accumulate at least 2^16
// multiplications without overflow. The accumulator is
// applied to a filter so the accumulation logic will hold as
// long as the filter size (filter_y * filter_x * in_channel)
// does not exceed 2^16, which is the case in all the models
// we have seen so far.
// TODO(b/174275578): Add a check to make sure the
// accumulator depth is smaller than 2^16.
acc += filter_val * (input_val + input_offset);
}
}
}
if (bias_data) {
acc += bias_data[output_channel];
}
acc = MultiplyByQuantizedMultiplier(
acc, output_multiplier[output_channel],
output_shift[output_channel]);
acc += output_offset;
acc = std::max(acc, output_activation_min);
acc = std::min(acc, output_activation_max);
output_data[Offset(output_shape, batch, out_y, out_x,
output_channel)] = static_cast<int8_t>(acc);
}
}
}
}
}
}
inline void DepthwiseConvPerChannel(
const DepthwiseParams& params, const int32_t* output_multiplier,
const int32_t* output_shift, const RuntimeShape& input_shape,
const int16_t* input_data, const RuntimeShape& filter_shape,
const int8_t* filter_data, const RuntimeShape& bias_shape,
const std::int64_t* bias_data, const RuntimeShape& output_shape,
int16_t* output_data) {
// Get parameters.
const int stride_width = params.stride_width;
const int stride_height = params.stride_height;
const int dilation_width_factor = params.dilation_width_factor;
const int dilation_height_factor = params.dilation_height_factor;
const int pad_width = params.padding_values.width;
const int pad_height = params.padding_values.height;
const int depth_multiplier = params.depth_multiplier;
const int32_t output_activation_min = params.quantized_activation_min;
const int32_t output_activation_max = params.quantized_activation_max;
// Check dimensions of the tensors.
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
const int input_height = input_shape.Dims(1);
const int input_width = input_shape.Dims(2);
const int input_depth = input_shape.Dims(3);
const int filter_height = filter_shape.Dims(1);
const int filter_width = filter_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
for (int batch = 0; batch < batches; ++batch) {
for (int out_y = 0; out_y < output_height; ++out_y) {
for (int out_x = 0; out_x < output_width; ++out_x) {
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
for (int m = 0; m < depth_multiplier; ++m) {
const int output_channel = m + in_channel * depth_multiplier;
const int in_x_origin = (out_x * stride_width) - pad_width;
const int in_y_origin = (out_y * stride_height) - pad_height;
std::int64_t acc = 0;
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
const int in_x = in_x_origin + dilation_width_factor * filter_x;
const int in_y =
in_y_origin + dilation_height_factor * filter_y;
// Zero padding by omitting the areas outside the image.
const bool is_point_inside_image =
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
(in_y < input_height);
if (is_point_inside_image) {
int32_t input_val = input_data[Offset(
input_shape, batch, in_y, in_x, in_channel)];
int32_t filter_val = filter_data[Offset(
filter_shape, 0, filter_y, filter_x, output_channel)];
// Accumulate with 64 bits accumulator.
// We assume maximum of 2^16 accumulations as with the 8-bit
// case so actually the value in the accumulator should not
// exceed 40 bits
acc += static_cast<int64_t>(filter_val) *
static_cast<int64_t>(input_val);
}
}
}
if (bias_data) {
acc += bias_data[output_channel];
}
int32_t scaled_acc = MultiplyByQuantizedMultiplier(
acc, output_multiplier[output_channel],
output_shift[output_channel]);
scaled_acc = std::max(scaled_acc, output_activation_min);
scaled_acc = std::min(scaled_acc, output_activation_max);
output_data[Offset(output_shape, batch, out_y, out_x,
output_channel)] =
static_cast<int16_t>(scaled_acc);
}
}
}
}
}
}
inline void DepthwiseConvHybridPerChannel(
const DepthwiseParams& params, float* scaling_factors_ptr,
const RuntimeShape& input_shape, const int8_t* input_data,
const RuntimeShape& filter_shape, const int8_t* filter_data,
const RuntimeShape& bias_shape, const float* bias_data,
const RuntimeShape& output_shape, float* output_data,
const float* per_channel_scale, int32_t* input_offset) {
const int stride_width = params.stride_width;
const int stride_height = params.stride_height;
const int dilation_width_factor = params.dilation_width_factor;
const int dilation_height_factor = params.dilation_height_factor;
const int pad_width = params.padding_values.width;
const int pad_height = params.padding_values.height;
const int depth_multiplier = params.depth_multiplier;
const float output_activation_min = params.float_activation_min;
const float output_activation_max = params.float_activation_max;
// Check dimensions of the tensors.
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
const int input_height = input_shape.Dims(1);
const int input_width = input_shape.Dims(2);
const int input_depth = input_shape.Dims(3);
const int filter_height = filter_shape.Dims(1);
const int filter_width = filter_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
const int bias_depth = bias_shape.FlatSize();
TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
TFLITE_DCHECK_EQ(bias_depth, output_depth);
for (int batch = 0; batch < batches; ++batch) {
for (int out_y = 0; out_y < output_height; ++out_y) {
for (int out_x = 0; out_x < output_width; ++out_x) {
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
for (int m = 0; m < depth_multiplier; ++m) {
const int output_channel = m + in_channel * depth_multiplier;
const int in_x_origin = (out_x * stride_width) - pad_width;
const int in_y_origin = (out_y * stride_height) - pad_height;
int32_t acc = 0;
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
const int in_x = in_x_origin + dilation_width_factor * filter_x;
const int in_y =
in_y_origin + dilation_height_factor * filter_y;
// Zero padding by omitting the areas outside the image.
const bool is_point_inside_image =
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
(in_y < input_height);
if (is_point_inside_image) {
int32_t input_val = input_data[Offset(
input_shape, batch, in_y, in_x, in_channel)];
int32_t filter_val = filter_data[Offset(
filter_shape, 0, filter_y, filter_x, output_channel)];
acc += filter_val * (input_val - input_offset[batch]);
}
}
}
float acc_float = static_cast<float>(acc);
acc_float *=
per_channel_scale[output_channel] * scaling_factors_ptr[batch];
if (bias_data && output_channel < bias_depth) {
acc_float += bias_data[output_channel];
}
output_data[Offset(output_shape, batch, out_y, out_x,
output_channel)] =
ActivationFunctionWithMinMax(acc_float, output_activation_min,
output_activation_max);
}
}
}
}
}
}
} // namespace reference_integer_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_

View File

@@ -0,0 +1,109 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_FULLY_CONNECTED_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_FULLY_CONNECTED_H_
#include "tensorflow/lite/kernels/internal/common.h"
namespace tflite {
namespace reference_integer_ops {
inline void FullyConnected(
const FullyConnectedParams& params, const RuntimeShape& input_shape,
const int8_t* input_data, const RuntimeShape& filter_shape,
const int8_t* filter_data, const RuntimeShape& bias_shape,
const int32_t* bias_data, const RuntimeShape& output_shape,
int8_t* output_data) {
const int32_t input_offset = params.input_offset;
const int32_t filter_offset = params.weights_offset;
const int32_t output_offset = params.output_offset;
const int32_t output_multiplier = params.output_multiplier;
const int output_shift = params.output_shift;
const int32_t output_activation_min = params.quantized_activation_min;
const int32_t output_activation_max = params.quantized_activation_max;
TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 2);
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
const int filter_dim_count = filter_shape.DimensionsCount();
const int batches = output_shape.Dims(0);
const int output_depth = output_shape.Dims(1);
TFLITE_DCHECK_LE(output_depth, filter_shape.Dims(filter_dim_count - 2));
const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
for (int b = 0; b < batches; ++b) {
for (int out_c = 0; out_c < output_depth; ++out_c) {
int32_t acc = 0;
for (int d = 0; d < accum_depth; ++d) {
int32_t input_val = input_data[b * accum_depth + d];
int32_t filter_val = filter_data[out_c * accum_depth + d];
acc += (filter_val + filter_offset) * (input_val + input_offset);
}
if (bias_data) {
acc += bias_data[out_c];
}
acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
acc += output_offset;
acc = std::max(acc, output_activation_min);
acc = std::min(acc, output_activation_max);
output_data[out_c + output_depth * b] = static_cast<int8_t>(acc);
}
}
}
inline void FullyConnected(
const FullyConnectedParams& params, const RuntimeShape& input_shape,
const int16_t* input_data, const RuntimeShape& filter_shape,
const int8_t* filter_data, const RuntimeShape& bias_shape,
const int64_t* bias_data, const RuntimeShape& output_shape,
int16_t* output_data) {
const int32_t filter_offset = params.weights_offset;
const int32_t output_multiplier = params.output_multiplier;
const int output_shift = params.output_shift;
const int32_t output_activation_min = params.quantized_activation_min;
const int32_t output_activation_max = params.quantized_activation_max;
TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
const int filter_dim_count = filter_shape.DimensionsCount();
const int output_dim_count = output_shape.DimensionsCount();
const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
const int output_depth = output_shape.Dims(output_dim_count - 1);
TFLITE_DCHECK_LE(output_depth, filter_shape.Dims(filter_dim_count - 2));
const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
for (int b = 0; b < batches; ++b) {
for (int out_c = 0; out_c < output_depth; ++out_c) {
int64_t acc = 0;
for (int d = 0; d < accum_depth; ++d) {
int32_t input_val = input_data[b * accum_depth + d];
int32_t filter_val = filter_data[out_c * accum_depth + d];
acc += (filter_val + filter_offset) * input_val;
}
if (bias_data) {
acc += bias_data[out_c];
}
int32_t acc_scaled =
MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
acc_scaled = std::max(acc_scaled, output_activation_min);
acc_scaled = std::min(acc_scaled, output_activation_max);
output_data[out_c + output_depth * b] = static_cast<int16_t>(acc_scaled);
}
}
}
} // namespace reference_integer_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_FULLY_CONNECTED_H_

View File

@@ -0,0 +1,65 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_L2NORMALIZATION_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_L2NORMALIZATION_H_
#include "tensorflow/lite/kernels/internal/common.h"
namespace tflite {
namespace reference_integer_ops {
inline void L2Normalization(int32_t input_zero_point, int32_t outer_size,
int32_t depth, const int8_t* input_data,
int8_t* output_data) {
static constexpr int8_t kMinInt8 = std::numeric_limits<int8_t>::min();
static constexpr int8_t kMaxInt8 = std::numeric_limits<int8_t>::max();
// The output scale must be in sync with Prepare().
// Output is in 1/128 scale so the actual output range is nudged from [-1, 1]
// to [-1, 127/128].
static constexpr int32_t kOutputScale = 7;
for (int outer_index = 0; outer_index < outer_size; ++outer_index) {
// int32_t = (int8_t - int8_t) ^ 2.
// ([-128, 127] - [-128, 127]) ^ 2 = [0, (2^8 - 1)^2] so the accumulator is
// safe from overflowing in at least 2^16 steps.
int32_t acc = 0;
for (int inner_index = 0; inner_index < depth; ++inner_index) {
int32_t input =
input_data[depth * outer_index + inner_index] - input_zero_point;
acc += input * input;
}
int32_t inv_l2norm_multiplier;
int inv_l2norm_shift;
GetInvSqrtQuantizedMultiplierExp(acc, kReverseShift, &inv_l2norm_multiplier,
&inv_l2norm_shift);
for (int inner_index = 0; inner_index < depth; ++inner_index) {
int32_t input =
input_data[depth * outer_index + inner_index] - input_zero_point;
// Rescale and downcast. Rescale is folded into the division.
int32_t output_in_q24 = MultiplyByQuantizedMultiplier(
input, inv_l2norm_multiplier, inv_l2norm_shift + kOutputScale);
output_in_q24 =
std::min(static_cast<int32_t>(kMaxInt8),
std::max(static_cast<int32_t>(kMinInt8), output_in_q24));
output_data[depth * outer_index + inner_index] =
static_cast<int8_t>(output_in_q24);
}
}
}
} // namespace reference_integer_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_L2NORMALIZATION_H_

View File

@@ -0,0 +1,119 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_LOGISTIC_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_LOGISTIC_H_
#include <limits>
#include "tensorflow/lite/kernels/internal/common.h"
namespace tflite {
namespace reference_integer_ops {
inline void Logistic(int32_t input_zero_point, int32_t input_range_radius,
int32_t input_multiplier, int32_t input_left_shift,
int32_t input_size, const int8_t* input_data,
int8_t* output_data) {
// Integer bits must be in sync with Prepare() function.
static constexpr int32_t kInputIntegerBits = 4;
static constexpr int32_t kOutputIntegerBits = 8;
static constexpr int8_t kMinInt8 = std::numeric_limits<int8_t>::min();
static constexpr int8_t kMaxInt8 = std::numeric_limits<int8_t>::max();
static constexpr int32_t kOutputZeroPoint = -128;
for (int i = 0; i < input_size; ++i) {
const int32_t input =
static_cast<int32_t>(input_data[i]) - input_zero_point;
if (input <= -input_range_radius) {
output_data[i] = kMinInt8;
} else if (input >= input_range_radius) {
output_data[i] = kMaxInt8;
} else {
const int32_t input_in_q4 = MultiplyByQuantizedMultiplier(
input, input_multiplier, input_left_shift);
using FixedPoint4 = gemmlowp::FixedPoint<int32_t, kInputIntegerBits>;
const int32_t output_in_q0 =
gemmlowp::logistic(FixedPoint4::FromRaw(input_in_q4)).raw();
// Rescale and downcast.
using gemmlowp::RoundingDivideByPOT;
int32_t output_in_q23 =
RoundingDivideByPOT(output_in_q0, 31 - kOutputIntegerBits);
output_in_q23 = std::min(std::max(output_in_q23 + kOutputZeroPoint,
static_cast<int32_t>(kMinInt8)),
static_cast<int32_t>(kMaxInt8));
output_data[i] = static_cast<int8_t>(output_in_q23);
}
}
}
inline void Logistic(int32_t input_multiplier, int32_t input_left_shift,
int32_t input_size, const int16_t* ptr_input_data,
int16_t* ptr_output_data) {
// We use the LUT for sigmoid and take into account, that
// tanh(x) = 2*sigmoid(2*x) - 1
// We scale by 3/4 to expand range [-8,8]->[-10.7,10.7].
// In case of general parameter scale, multiplier 3 is taken into account
// in TanhPrepare function and it is included in
// input_multiplier already.
TFLITE_DCHECK_GE(input_left_shift, 0);
if (input_multiplier == 0) { // power of two case
input_multiplier = 3 << input_left_shift;
input_left_shift = 0;
}
int32_t round = (input_left_shift > 0) ? 1 << (input_left_shift - 1) : 0;
for (int i = 0; i < input_size; ++i, ptr_input_data++, ptr_output_data++) {
int32_t input_data =
((*ptr_input_data) * input_multiplier + round) >> input_left_shift;
// We do interpolation on unsigned values.
uint32_t abs_input_data = abs(input_data);
// We divide by 2 power of 9, because
// we need to divide by 2 in power of 7 for
// the input conversion + 1/4 from the scale above.
// Define uh as uint32_t type not to make this function overflow.
uint32_t uh = abs_input_data >> 9;
uint32_t result;
if (uh >= 255) {
// Saturate to maximum.
result = 0x7FFF << 10;
} else {
uint32_t ua = sigmoid_table_uint16[uh];
uint32_t ub = sigmoid_table_uint16[uh + 1];
uint32_t ut = abs_input_data & 0x1ff;
// Interpolation is done using the fractional bit.
result = (ua << 9) + ut * (ub - ua);
}
result = (input_data >= 0) ? (result + (1 << 9))
: ((1 << (16 + 9)) - result + (1 << 9) - 1);
// Back to 16-bit.
result >>= 10;
*ptr_output_data = result;
}
}
} // namespace reference_integer_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_LOGISTIC_H_

View File

@@ -0,0 +1,77 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MEAN_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MEAN_H_
#include "tensorflow/lite/kernels/internal/common.h"
namespace tflite {
namespace reference_integer_ops {
template <typename integer_type>
inline void Mean(const tflite::MeanParams& op_params, int32_t multiplier,
int32_t shift, const RuntimeShape& unextended_input_shape,
const integer_type* input_data, int32_t input_zero_point,
const RuntimeShape& unextended_output_shape,
integer_type* output_data, int32_t output_zero_point) {
// Current implementation only supports dimension equals 4 and simultaneous
// reduction over width and height.
TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4);
TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4);
const RuntimeShape input_shape =
RuntimeShape::ExtendedShape(4, unextended_input_shape);
const RuntimeShape output_shape =
RuntimeShape::ExtendedShape(4, unextended_output_shape);
const int output_batch = output_shape.Dims(0);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
const int output_depth = output_shape.Dims(3);
const int input_height = input_shape.Dims(1);
const int input_width = input_shape.Dims(2);
const int num_elements_in_axis = input_width * input_height;
TFLITE_CHECK_EQ(op_params.axis_count, 2);
TFLITE_CHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
(op_params.axis[0] == 2 && op_params.axis[1] == 1));
TFLITE_CHECK_EQ(output_height, 1);
TFLITE_CHECK_EQ(output_width, 1);
static constexpr int32_t kMinInt = std::numeric_limits<integer_type>::min();
static constexpr int32_t kMaxInt = std::numeric_limits<integer_type>::max();
for (int out_b = 0; out_b < output_batch; ++out_b) {
for (int out_d = 0; out_d < output_depth; ++out_d) {
int32_t acc = 0;
for (int in_h = 0; in_h < input_height; ++in_h) {
for (int in_w = 0; in_w < input_width; ++in_w) {
acc += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)] -
input_zero_point;
}
}
acc = MultiplyByQuantizedMultiplier(acc, multiplier, shift);
acc = acc > 0 ? (acc + num_elements_in_axis / 2) / num_elements_in_axis
: (acc - num_elements_in_axis / 2) / num_elements_in_axis;
acc += output_zero_point;
acc = std::min(std::max(acc, kMinInt), kMaxInt);
output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
static_cast<integer_type>(acc);
}
}
}
} // namespace reference_integer_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MEAN_H_

View File

@@ -0,0 +1,131 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MUL_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MUL_H_
#include "fixedpoint/fixedpoint.h"
#include "ruy/profiler/instrumentation.h" // from @ruy
#include "tensorflow/lite/kernels/internal/common.h"
namespace tflite {
namespace reference_integer_ops {
template <typename T>
inline void MulElementwise(int size, const ArithmeticParams& params,
const T* input1_data, const T* input2_data,
T* output_data) {
for (int i = 0; i < size; ++i) {
const int32_t input1_val = params.input1_offset + input1_data[i];
const int32_t input2_val = params.input2_offset + input2_data[i];
const int32_t unclamped_result =
params.output_offset +
MultiplyByQuantizedMultiplier(input1_val * input2_val,
params.output_multiplier,
params.output_shift);
const int32_t clamped_output =
std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, unclamped_result));
output_data[i] = static_cast<T>(clamped_output);
}
}
template <typename T>
inline void Mul(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const T* input1_data,
const RuntimeShape& input2_shape, const T* input2_data,
const RuntimeShape& output_shape, T* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
ruy::profiler::ScopeLabel label("Mul/8bit");
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
MulElementwise(flat_size, params, input1_data, input2_data, output_data);
}
// Mul with 16 bit inputs and int8_t outputs.
inline void Mul(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const int16_t* input1_data,
const RuntimeShape& input2_shape, const int16_t* input2_data,
const RuntimeShape& output_shape, int8_t* output_data) {
ruy::profiler::ScopeLabel label("Mul/Int16Int8");
int32_t output_offset = params.output_offset;
int32_t output_activation_min = params.quantized_activation_min;
int32_t output_activation_max = params.quantized_activation_max;
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
for (int i = 0; i < flat_size; i++) {
// F0 uses 0 integer bits, range [-1, 1].
using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
F0 unclamped_result =
F0::FromRaw(input1_data[i]) * F0::FromRaw(input2_data[i]);
int16_t rescaled_result =
gemmlowp::RoundingDivideByPOT(unclamped_result.raw(), 8);
int16_t clamped_result = std::min<int16_t>(
output_activation_max - output_offset, rescaled_result);
clamped_result = std::max<int16_t>(output_activation_min - output_offset,
clamped_result);
output_data[i] = output_offset + clamped_result;
}
}
template <typename T>
inline void BroadcastMul4DSlow(
const ArithmeticParams& params, const RuntimeShape& input1_shape,
const T* input1_data, const RuntimeShape& input2_shape,
const T* input2_data, const RuntimeShape& output_shape, T* output_data) {
ruy::profiler::ScopeLabel label("BroadcastMul4DSlow");
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
// The input shapes are extended as part of NdArrayDesc initialization.
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
&desc2);
const RuntimeShape extended_output_shape =
RuntimeShape::ExtendedShape(4, output_shape);
for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
const int32_t input1_val =
params.input1_offset +
input1_data[SubscriptToIndex(desc1, b, y, x, c)];
const int32_t input2_val =
params.input2_offset +
input2_data[SubscriptToIndex(desc2, b, y, x, c)];
const int32_t unclamped_result =
params.output_offset +
MultiplyByQuantizedMultiplier(input1_val * input2_val,
params.output_multiplier,
params.output_shift);
const int32_t clamped_output = std::min(
params.quantized_activation_max,
std::max(params.quantized_activation_min, unclamped_result));
output_data[Offset(extended_output_shape, b, y, x, c)] =
static_cast<T>(clamped_output);
}
}
}
}
}
} // namespace reference_integer_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MUL_H_

View File

@@ -0,0 +1,262 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_POOLING_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_POOLING_H_
#include <limits>
#include "tensorflow/lite/kernels/internal/common.h"
namespace tflite {
namespace reference_integer_ops {
inline bool AveragePool(const PoolParams& params,
const RuntimeShape& input_shape,
const int8_t* input_data,
const RuntimeShape& output_shape, int8_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
const int depth = MatchingDim(input_shape, 3, output_shape, 3);
const int input_height = input_shape.Dims(1);
const int input_width = input_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
const int stride_height = params.stride_height;
const int stride_width = params.stride_width;
for (int batch = 0; batch < batches; ++batch) {
for (int out_y = 0; out_y < output_height; ++out_y) {
for (int out_x = 0; out_x < output_width; ++out_x) {
for (int channel = 0; channel < depth; ++channel) {
const int in_x_origin =
(out_x * stride_width) - params.padding_values.width;
const int in_y_origin =
(out_y * stride_height) - params.padding_values.height;
// Compute the boundaries of the filter region clamped so as to
// ensure that the filter window fits in the input array.
const int filter_x_start = std::max(0, -in_x_origin);
const int filter_x_end =
std::min(params.filter_width, input_width - in_x_origin);
const int filter_y_start = std::max(0, -in_y_origin);
const int filter_y_end =
std::min(params.filter_height, input_height - in_y_origin);
int32_t acc = 0;
int filter_count = 0;
for (int filter_y = filter_y_start; filter_y < filter_y_end;
++filter_y) {
for (int filter_x = filter_x_start; filter_x < filter_x_end;
++filter_x) {
const int in_x = in_x_origin + filter_x;
const int in_y = in_y_origin + filter_y;
acc +=
input_data[Offset(input_shape, batch, in_y, in_x, channel)];
filter_count++;
}
}
if (filter_count == 0) return false;
// Round to the closest integer value.
acc = acc > 0 ? (acc + filter_count / 2) / filter_count
: (acc - filter_count / 2) / filter_count;
acc = std::max(acc, params.quantized_activation_min);
acc = std::min(acc, params.quantized_activation_max);
output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
static_cast<int8_t>(acc);
}
}
}
}
return true;
}
inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
const int8_t* input_data, const RuntimeShape& output_shape,
int8_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
TFLITE_DCHECK_GE(params.quantized_activation_min,
std::numeric_limits<int8_t>::min());
TFLITE_DCHECK_LE(params.quantized_activation_max,
std::numeric_limits<int8_t>::max());
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
const int depth = MatchingDim(input_shape, 3, output_shape, 3);
const int input_height = input_shape.Dims(1);
const int input_width = input_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
const int stride_height = params.stride_height;
const int stride_width = params.stride_width;
for (int batch = 0; batch < batches; ++batch) {
for (int out_y = 0; out_y < output_height; ++out_y) {
for (int out_x = 0; out_x < output_width; ++out_x) {
for (int channel = 0; channel < depth; ++channel) {
const int in_x_origin =
(out_x * stride_width) - params.padding_values.width;
const int in_y_origin =
(out_y * stride_height) - params.padding_values.height;
// Compute the boundaries of the filter region clamped so as to
// ensure that the filter window fits in the input array.
const int filter_x_start = std::max(0, -in_x_origin);
const int filter_x_end =
std::min(params.filter_width, input_width - in_x_origin);
const int filter_y_start = std::max(0, -in_y_origin);
const int filter_y_end =
std::min(params.filter_height, input_height - in_y_origin);
int8_t max = std::numeric_limits<int8_t>::lowest();
for (int filter_y = filter_y_start; filter_y < filter_y_end;
++filter_y) {
for (int filter_x = filter_x_start; filter_x < filter_x_end;
++filter_x) {
const int in_x = in_x_origin + filter_x;
const int in_y = in_y_origin + filter_y;
max = std::max(
max,
input_data[Offset(input_shape, batch, in_y, in_x, channel)]);
}
}
max = std::max<int8_t>(max, params.quantized_activation_min);
max = std::min<int8_t>(max, params.quantized_activation_max);
output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
static_cast<int8_t>(max);
}
}
}
}
}
inline bool AveragePool(const PoolParams& params,
const RuntimeShape& input_shape,
const int16_t* input_data,
const RuntimeShape& output_shape,
int16_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
const int depth = MatchingDim(input_shape, 3, output_shape, 3);
const int input_height = input_shape.Dims(1);
const int input_width = input_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
const int stride_height = params.stride_height;
const int stride_width = params.stride_width;
for (int batch = 0; batch < batches; ++batch) {
for (int out_y = 0; out_y < output_height; ++out_y) {
for (int out_x = 0; out_x < output_width; ++out_x) {
for (int channel = 0; channel < depth; ++channel) {
const int in_x_origin =
(out_x * stride_width) - params.padding_values.width;
const int in_y_origin =
(out_y * stride_height) - params.padding_values.height;
// Compute the boundaries of the filter region clamped so as to
// ensure that the filter window fits in the input array.
const int filter_x_start = std::max(0, -in_x_origin);
const int filter_x_end =
std::min(params.filter_width, input_width - in_x_origin);
const int filter_y_start = std::max(0, -in_y_origin);
const int filter_y_end =
std::min(params.filter_height, input_height - in_y_origin);
int32_t acc = 0;
int filter_count = 0;
for (int filter_y = filter_y_start; filter_y < filter_y_end;
++filter_y) {
for (int filter_x = filter_x_start; filter_x < filter_x_end;
++filter_x) {
const int in_x = in_x_origin + filter_x;
const int in_y = in_y_origin + filter_y;
acc +=
input_data[Offset(input_shape, batch, in_y, in_x, channel)];
filter_count++;
}
}
if (filter_count == 0) return false;
// Round to the closest integer value.
acc = acc > 0 ? (acc + filter_count / 2) / filter_count
: (acc - filter_count / 2) / filter_count;
acc = std::max(acc, params.quantized_activation_min);
acc = std::min(acc, params.quantized_activation_max);
output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
static_cast<int16_t>(acc);
}
}
}
}
return true;
}
inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
const int16_t* input_data, const RuntimeShape& output_shape,
int16_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
TFLITE_DCHECK_GE(params.quantized_activation_min,
std::numeric_limits<int16_t>::min());
TFLITE_DCHECK_LE(params.quantized_activation_max,
std::numeric_limits<int16_t>::max());
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
const int depth = MatchingDim(input_shape, 3, output_shape, 3);
const int input_height = input_shape.Dims(1);
const int input_width = input_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
const int stride_height = params.stride_height;
const int stride_width = params.stride_width;
for (int batch = 0; batch < batches; ++batch) {
for (int out_y = 0; out_y < output_height; ++out_y) {
for (int out_x = 0; out_x < output_width; ++out_x) {
for (int channel = 0; channel < depth; ++channel) {
const int in_x_origin =
(out_x * stride_width) - params.padding_values.width;
const int in_y_origin =
(out_y * stride_height) - params.padding_values.height;
// Compute the boundaries of the filter region clamped so as to
// ensure that the filter window fits in the input array.
const int filter_x_start = std::max(0, -in_x_origin);
const int filter_x_end =
std::min(params.filter_width, input_width - in_x_origin);
const int filter_y_start = std::max(0, -in_y_origin);
const int filter_y_end =
std::min(params.filter_height, input_height - in_y_origin);
int16_t max = std::numeric_limits<int16_t>::lowest();
for (int filter_y = filter_y_start; filter_y < filter_y_end;
++filter_y) {
for (int filter_x = filter_x_start; filter_x < filter_x_end;
++filter_x) {
const int in_x = in_x_origin + filter_x;
const int in_y = in_y_origin + filter_y;
max = std::max(
max,
input_data[Offset(input_shape, batch, in_y, in_x, channel)]);
}
}
max = std::max<int16_t>(max, params.quantized_activation_min);
max = std::min<int16_t>(max, params.quantized_activation_max);
output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
static_cast<int16_t>(max);
}
}
}
}
}
} // namespace reference_integer_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_POOLING_H_

View File

@@ -0,0 +1,116 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TANH_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TANH_H_
#include <limits>
#include "fixedpoint/fixedpoint.h"
#include "tensorflow/lite/kernels/internal/common.h"
namespace tflite {
namespace reference_integer_ops {
inline void Tanh(int32_t input_zero_point, int32_t input_range_radius,
int32_t input_multiplier, int32_t input_shift,
const RuntimeShape& input_shape, const int8_t* input_data,
const RuntimeShape& output_shape, int8_t* output_data) {
// Integer bits must be in sync with Prepare() function.
static constexpr int32_t kInputIntegerBits = 4;
static constexpr int32_t kOutputScale = 7;
static constexpr int32_t kMinInt8 = std::numeric_limits<int8_t>::min();
static constexpr int32_t kMaxInt8 = std::numeric_limits<int8_t>::max();
using F4 = gemmlowp::FixedPoint<int32_t, kInputIntegerBits>;
const int flat_size = MatchingFlatSize(input_shape, output_shape);
for (int i = 0; i < flat_size; ++i) {
const int32_t input =
static_cast<int32_t>(input_data[i]) - input_zero_point;
if (input <= -input_range_radius) {
output_data[i] = kMinInt8;
} else if (input >= input_range_radius) {
output_data[i] = kMaxInt8;
} else {
const int32_t input_in_q4 =
MultiplyByQuantizedMultiplier(input, input_multiplier, input_shift);
const int32_t output_in_q0 =
gemmlowp::tanh(F4::FromRaw(input_in_q4)).raw();
// Rescale and downcast.
using gemmlowp::RoundingDivideByPOT;
int32_t output_in_q24 =
RoundingDivideByPOT(output_in_q0, 31 - kOutputScale);
output_in_q24 = std::min(std::max(output_in_q24, kMinInt8), kMaxInt8);
output_data[i] = static_cast<int8_t>(output_in_q24);
}
}
}
inline void Tanh(int32_t input_multiplier, int32_t input_left_shift,
const RuntimeShape& input_shape, const int16_t* ptr_input_data,
const RuntimeShape& output_shape, int16_t* ptr_output_data) {
// We use the LUT for sigmoid and take into account, that
// tanh(x) = 2*sigmoid(2*x) - 1
// We scale by 3/4 to expand range [-8,8]->[-10.7,10.7].
// In case of general parameter scale, multiplier 3 is taken into account
// in TanhPrepare function and it is included in
// input_multiplier already.
if (input_multiplier == 0) { // power of two case
input_multiplier = 3 << input_left_shift;
input_left_shift = 0;
}
int32_t round = (input_left_shift > 0) ? 1 << (input_left_shift - 1) : 0;
int flat_size = MatchingFlatSize(input_shape, output_shape);
for (int i = 0; i < flat_size; ++i, ptr_input_data++, ptr_output_data++) {
int32_t input_data =
((*ptr_input_data) * input_multiplier + round) >> input_left_shift;
uint32_t abs_input_data = abs(input_data);
uint32_t uh = abs_input_data >> 8;
int32_t result;
if (uh >= 255) {
// Saturate to maximum.
result = 0xFFFF << 8;
} else {
uint32_t ua = sigmoid_table_uint16[uh];
uint32_t ub = sigmoid_table_uint16[uh + 1];
uint8_t ut = abs_input_data & 0xFF;
result = (ua << 8) + ut * (ub - ua);
}
result = (input_data >= 0)
? (result - (1 << (14 + 9)) + (1 << (9 - 2)))
: (-result + (1 << (14 + 9)) + (1 << (9 - 2)) - 1);
// Convert back to 16-bit.
result >>= (9 - 1);
*ptr_output_data = result;
}
}
} // namespace reference_integer_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TANH_H_

View File

@@ -0,0 +1,221 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TRANSPOSE_CONV_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TRANSPOSE_CONV_H_
#include "tensorflow/lite/kernels/internal/common.h"
namespace tflite {
namespace reference_integer_ops {
// Fixed-point per-channel-quantization transpose convolution reference kernel.
inline void TransposeConv(
const ConvParams& params, const int32_t* output_multiplier,
const int32_t* output_shift, const RuntimeShape& input_shape,
const int8_t* input_data, const RuntimeShape& filter_shape,
const int8_t* filter_data, const RuntimeShape& bias_shape,
const int32_t* bias_data, const RuntimeShape& output_shape,
int8_t* output_data, const RuntimeShape& im2col_shape, int8_t* im2col_data,
int32_t* scratch_buffer) {
const int stride_width = params.stride_width;
const int stride_height = params.stride_height;
const int pad_width = params.padding_values.width;
const int pad_height = params.padding_values.height;
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
(void)im2col_data; // only used in optimized code.
(void)im2col_shape; // only used in optimized code.
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
if (bias_data) {
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
}
const int input_height = input_shape.Dims(1);
const int input_width = input_shape.Dims(2);
const int filter_height = filter_shape.Dims(1);
const int filter_width = filter_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
const int32_t input_offset = params.input_offset;
const int32_t output_offset = params.output_offset;
const int32_t output_activation_min = std::numeric_limits<int8_t>::min();
const int32_t output_activation_max = std::numeric_limits<int8_t>::max();
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
const int num_elements = output_shape.FlatSize();
// We need to initialize scratch_buffer to all 0s, as we apply the same
// 'scatter' based trick as in float version.
memset(scratch_buffer, 0, num_elements * sizeof(int32_t));
// Loop through input elements one at a time.
for (int batch = 0; batch < batches; ++batch) {
for (int in_y = 0; in_y < input_height; ++in_y) {
for (int in_x = 0; in_x < input_width; ++in_x) {
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
// Loop through the output elements it will influence.
const int out_x_origin = (in_x * stride_width) - pad_width;
const int out_y_origin = (in_y * stride_height) - pad_height;
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
for (int out_channel = 0; out_channel < output_depth;
++out_channel) {
// Compute output element location.
const int out_x = out_x_origin + filter_x;
const int out_y = out_y_origin + filter_y;
// We cannot accumulate out of bounds.
if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) &&
(out_y < output_height)) {
const int8_t input_value = input_data[Offset(
input_shape, batch, in_y, in_x, in_channel)];
const int8_t filter_value =
filter_data[Offset(filter_shape, out_channel, filter_y,
filter_x, in_channel)];
scratch_buffer[Offset(output_shape, batch, out_y, out_x,
out_channel)] +=
(input_value + input_offset) * filter_value;
}
}
}
}
}
}
}
}
for (int batch = 0; batch < batches; ++batch) {
for (int out_y = 0; out_y < output_height; ++out_y) {
for (int out_x = 0; out_x < output_width; ++out_x) {
for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
int32_t acc = scratch_buffer[Offset(output_shape, batch, out_y, out_x,
out_channel)];
if (bias_data) {
acc += bias_data[out_channel];
}
acc = MultiplyByQuantizedMultiplier(
acc, output_multiplier[out_channel], output_shift[out_channel]);
acc += output_offset;
acc = std::max(acc, output_activation_min);
acc = std::min(acc, output_activation_max);
output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
static_cast<int8_t>(acc);
}
}
}
}
}
// int16_t input (zero_point=0), int8_t filter, int64 accumulator
inline void TransposeConv(
const ConvParams& params, const int32_t* output_multiplier,
const int32_t* output_shift, const RuntimeShape& input_shape,
const int16_t* input_data, const RuntimeShape& filter_shape,
const int8_t* filter_data, const RuntimeShape& bias_shape,
const std::int64_t* bias_data, const RuntimeShape& output_shape,
int16_t* output_data, const RuntimeShape& im2col_shape, int8_t* im2col_data,
std::int64_t* scratch_buffer) {
const int stride_width = params.stride_width;
const int stride_height = params.stride_height;
const int pad_width = params.padding_values.width;
const int pad_height = params.padding_values.height;
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
(void)im2col_data; // only used in optimized code.
(void)im2col_shape; // only used in optimized code.
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
if (bias_data) {
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
}
const int input_height = input_shape.Dims(1);
const int input_width = input_shape.Dims(2);
const int filter_height = filter_shape.Dims(1);
const int filter_width = filter_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
const int32_t output_activation_min = std::numeric_limits<int16_t>::min();
const int32_t output_activation_max = std::numeric_limits<int16_t>::max();
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
const int num_elements = output_shape.FlatSize();
// We need to initialize scratch_buffer to all 0s, as we apply the same
// 'scatter' based trick as in float version.
memset(scratch_buffer, 0, num_elements * sizeof(std::int64_t));
// Loop through input elements one at a time.
for (int batch = 0; batch < batches; ++batch) {
for (int in_y = 0; in_y < input_height; ++in_y) {
for (int in_x = 0; in_x < input_width; ++in_x) {
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
// Loop through the output elements it will influence.
const int out_x_origin = (in_x * stride_width) - pad_width;
const int out_y_origin = (in_y * stride_height) - pad_height;
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
for (int out_channel = 0; out_channel < output_depth;
++out_channel) {
// Compute output element location.
const int out_x = out_x_origin + filter_x;
const int out_y = out_y_origin + filter_y;
// We cannot accumulate out of bounds.
if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) &&
(out_y < output_height)) {
const int32_t input_value = input_data[Offset(
input_shape, batch, in_y, in_x, in_channel)];
const int32_t filter_value =
filter_data[Offset(filter_shape, out_channel, filter_y,
filter_x, in_channel)];
scratch_buffer[Offset(output_shape, batch, out_y, out_x,
out_channel)] +=
input_value * filter_value;
}
}
}
}
}
}
}
}
for (int batch = 0; batch < batches; ++batch) {
for (int out_y = 0; out_y < output_height; ++out_y) {
for (int out_x = 0; out_x < output_width; ++out_x) {
for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
std::int64_t acc = scratch_buffer[Offset(output_shape, batch, out_y,
out_x, out_channel)];
if (bias_data) {
acc += bias_data[out_channel];
}
int32_t scaled_acc = MultiplyByQuantizedMultiplier(
acc, output_multiplier[out_channel], output_shift[out_channel]);
scaled_acc = std::max(scaled_acc, output_activation_min);
scaled_acc = std::min(scaled_acc, output_activation_max);
output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
static_cast<int16_t>(scaled_acc);
}
}
}
}
}
} // namespace reference_integer_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TRANSPOSE_CONV_H_

View File

@@ -0,0 +1,90 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_L2NORMALIZATION_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_L2NORMALIZATION_H_
#include <algorithm>
#include <cmath>
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
inline void L2Normalization(const tflite::L2NormalizationParams& op_params,
const RuntimeShape& input_shape,
const float* input_data,
const RuntimeShape& output_shape,
float* output_data, float epsilon = 1e-6) {
const int trailing_dim = input_shape.DimensionsCount() - 1;
const int outer_size =
MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
const int depth =
MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
for (int i = 0; i < outer_size; ++i) {
float squared_l2_norm = 0;
for (int c = 0; c < depth; ++c) {
const float val = input_data[depth * i + c];
squared_l2_norm += val * val;
}
float l2_norm = std::sqrt(squared_l2_norm);
l2_norm = std::max(l2_norm, epsilon);
for (int c = 0; c < depth; ++c) {
output_data[depth * i + c] = input_data[depth * i + c] / l2_norm;
}
}
}
inline void L2Normalization(const tflite::L2NormalizationParams& op_params,
const RuntimeShape& input_shape,
const uint8_t* input_data,
const RuntimeShape& output_shape,
uint8_t* output_data) {
const int trailing_dim = input_shape.DimensionsCount() - 1;
const int depth =
MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
const int outer_size =
MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
const int32_t input_zero_point = op_params.input_zero_point;
for (int i = 0; i < outer_size; ++i) {
int32_t square_l2_norm = 0;
for (int c = 0; c < depth; c++) {
int32_t diff = input_data[depth * i + c] - input_zero_point;
square_l2_norm += diff * diff;
}
int32_t inv_l2norm_multiplier;
int inv_l2norm_shift;
GetInvSqrtQuantizedMultiplierExp(square_l2_norm, kReverseShift,
&inv_l2norm_multiplier, &inv_l2norm_shift);
for (int c = 0; c < depth; c++) {
int32_t diff = input_data[depth * i + c] - input_zero_point;
int32_t rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp(
128 * diff, inv_l2norm_multiplier, inv_l2norm_shift);
int32_t unclamped_output_val = 128 + rescaled_diff;
int32_t output_val =
std::min(static_cast<int32_t>(255),
std::max(static_cast<int32_t>(0), unclamped_output_val));
output_data[depth * i + c] = static_cast<uint8_t>(output_val);
}
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_L2NORMALIZATION_H_

View File

@@ -0,0 +1,69 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LEAKY_RELU_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LEAKY_RELU_H_
#include <algorithm>
#include <limits>
#include "tensorflow/lite/kernels/internal/common.h"
namespace tflite {
namespace reference_ops {
inline void LeakyRelu(const tflite::LeakyReluParams& params,
const RuntimeShape& input_shape, const float* input_data,
const RuntimeShape& output_shape, float* output_data) {
const int flat_size = MatchingFlatSize(input_shape, output_shape);
for (int i = 0; i < flat_size; ++i) {
const float val = input_data[i];
// Note that alpha might be > 1 or < 0, so we don't use std::max here.
output_data[i] = val > 0 ? val : val * params.alpha;
}
}
template <typename T>
inline void QuantizeLeakyRelu(const LeakyReluParams& params,
const RuntimeShape& input_shape,
const T* input_data,
const RuntimeShape& output_shape,
T* output_data) {
const int flat_size = MatchingFlatSize(input_shape, output_shape);
static const int32_t quantized_min = std::numeric_limits<T>::min();
static const int32_t quantized_max = std::numeric_limits<T>::max();
for (int i = 0; i < flat_size; ++i) {
const int32_t input_value = input_data[i] - params.input_offset;
int32_t unclamped_output;
if (input_value >= 0) {
unclamped_output = params.output_offset +
MultiplyByQuantizedMultiplier(
input_value, params.output_multiplier_identity,
params.output_shift_identity);
} else {
unclamped_output = params.output_offset +
MultiplyByQuantizedMultiplier(
input_value, params.output_multiplier_alpha,
params.output_shift_alpha);
}
const T clamped_output =
std::min(quantized_max, std::max(quantized_min, unclamped_output));
output_data[i] = static_cast<T>(clamped_output);
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LEAKY_RELU_H_

View File

@@ -0,0 +1,256 @@
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LOG_SOFTMAX_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LOG_SOFTMAX_H_
#include <algorithm>
#include <cstddef>
#include <limits>
#include "fixedpoint/fixedpoint.h"
#include "tensorflow/lite/kernels/internal/common.h"
namespace tflite {
namespace reference_ops {
inline void LogSoftmax(const SoftmaxParams& params,
const RuntimeShape& input_shape, const float* input_data,
const RuntimeShape& output_shape, float* output_data) {
const int trailing_dim = input_shape.DimensionsCount() - 1;
const int outer_size =
MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
const int depth =
MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
for (int i = 0; i < outer_size; ++i) {
// Find max element value which we'll use to ensure numerical stability
// taking advantage of the following equality:
// log(exp(x[i])/sum(exp(x[i]))) == log(exp(x[i]+C)/sum(exp(x[i]+C)))
float max = std::numeric_limits<float>::lowest();
for (int c = 0; c < depth; ++c) {
max = std::max(max, input_data[i * depth + c]);
}
// Compute sum.
float sum = 0.f;
for (int c = 0; c < depth; ++c) {
sum += std::exp(input_data[i * depth + c] - max);
}
// Compute result.
const float log_sum = std::log(sum);
for (int c = 0; c < depth; ++c) {
output_data[i * depth + c] = input_data[i * depth + c] - max - log_sum;
}
}
}
inline void LogSoftmax(const SoftmaxParams& params,
const RuntimeShape& input_shape,
const uint8_t* input_data,
const RuntimeShape& output_shape, uint8_t* output_data) {
const int32_t input_multiplier = params.input_multiplier;
const int32_t input_left_shift = params.input_left_shift;
const int32_t reverse_scaling_divisor = params.reverse_scaling_divisor;
const int32_t reverse_scaling_right_shift =
params.reverse_scaling_right_shift;
const int diff_min = params.diff_min;
// The representation chosen for the input to the exp() function is Q5.26.
// We need to leave extra space since values that we skip might be as large
// as -32 before multiplying by input_beta_multiplier, and therefore as
// large as -16 afterwards. Note that exp(-8) is definitely not
// insignificant to accumulation, but exp(-16) definitely is.
static constexpr int kScaledDiffIntegerBits = 5;
static constexpr int kAccumulationIntegerBits = 12;
static constexpr int kOutputIntegerBits = 4;
using FixedPointScaledDiff =
gemmlowp::FixedPoint<int32_t, kScaledDiffIntegerBits>;
using FixedPointAccum =
gemmlowp::FixedPoint<int32_t, kAccumulationIntegerBits>;
const int trailing_dim = input_shape.DimensionsCount() - 1;
const int outer_size =
MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
const int depth =
MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
for (int i = 0; i < outer_size; ++i) {
uint8_t max_in_row = 0;
for (int c = 0; c < depth; ++c) {
max_in_row = std::max(max_in_row, input_data[i * depth + c]);
}
FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
for (int c = 0; c < depth; ++c) {
int32_t input_diff =
static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
if (input_diff >= diff_min) {
const int32_t input_diff_rescaled =
MultiplyByQuantizedMultiplierGreaterThanOne(
input_diff, input_multiplier, input_left_shift);
const FixedPointScaledDiff scaled_diff_f8 =
FixedPointScaledDiff::FromRaw(input_diff_rescaled);
sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
exp_on_negative_values(scaled_diff_f8));
}
}
const int32_t fixed_log_sum_of_exps =
log_x_for_x_greater_than_or_equal_to_1<kScaledDiffIntegerBits>(
sum_of_exps)
.raw();
// rescaled_diff_min is smallest representable in
// Q(kScaledDiffIntegerBits).(31-kScaledDiffIntegerBits) plus the
// log-sub-exps that will be subtracted in the loop.
//
// The thresholds diff_min, etc are negative.
const int rescaled_diff_min =
fixed_log_sum_of_exps + std::numeric_limits<int32_t>::lowest();
const int adjusted_diff_min =
std::max(static_cast<int32_t>(
diff_min - 1), // Note use of > below instead of >= above.
MultiplyByQuantizedMultiplierSmallerThanOneExp(
rescaled_diff_min, reverse_scaling_divisor,
-reverse_scaling_right_shift));
for (int c = 0; c < depth; ++c) {
int32_t input_diff =
static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
if (input_diff > adjusted_diff_min) {
const int32_t input_diff_rescaled =
MultiplyByQuantizedMultiplierGreaterThanOne(
input_diff, input_multiplier, input_left_shift);
int32_t unsat_output =
gemmlowp::RoundingDivideByPOT(
(input_diff_rescaled - fixed_log_sum_of_exps),
31 - kScaledDiffIntegerBits - kOutputIntegerBits) +
255;
output_data[i * depth + c] = static_cast<uint8_t>(
std::max(std::min(unsat_output, static_cast<int32_t>(255)),
static_cast<int32_t>(0)));
} else {
// Set output to smallest value.
output_data[i * depth + c] = 0;
}
}
}
}
template <typename T>
inline void LogSoftmaxQuantized(const SoftmaxParams& params,
const size_t outer_size, const size_t depth,
const RuntimeShape& input_shape,
const T* input_data,
const RuntimeShape& output_shape,
T* output_data) {
const int32_t input_multiplier = params.input_multiplier;
const int32_t input_left_shift = params.input_left_shift;
const int32_t reverse_scaling_divisor = params.reverse_scaling_divisor;
const int32_t reverse_scaling_right_shift =
params.reverse_scaling_right_shift;
const int diff_min = params.diff_min;
static constexpr T kMinT8 = std::numeric_limits<T>::min();
static constexpr T kMaxT8 = std::numeric_limits<T>::max();
static constexpr int32_t kMinInt32 = std::numeric_limits<int32_t>::min();
// All IntegerBits must agree with Prepare function.
// Input is chosen as Q5.26 so exp(-1 * 2^5 * 2^-1) = exp(-16) is negligible.
static constexpr int kInputIntegerBits = 5;
static constexpr int kAccumulationIntegerBits = 12;
static constexpr int kOutputIntegerBits = 4;
using F5 = gemmlowp::FixedPoint<int32_t, kInputIntegerBits>;
using F12 = gemmlowp::FixedPoint<int32_t, kAccumulationIntegerBits>;
for (size_t outer_index = 0; outer_index < outer_size; ++outer_index) {
T max_in_row = kMinT8;
for (size_t inner_index = 0; inner_index < depth; ++inner_index) {
max_in_row =
std::max(max_in_row, input_data[outer_index * depth + inner_index]);
}
// Accumulator "sum_of_exps_in_q12" is safe from overflowing in 2^12 steps.
F12 sum_of_exps_in_q12 = F12::FromRaw(0);
for (size_t inner_index = 0; inner_index < depth; ++inner_index) {
int32_t input_diff =
static_cast<int32_t>(input_data[outer_index * depth + inner_index]) -
max_in_row;
if (input_diff >= diff_min) {
const int32_t input_diff_in_q5 = MultiplyByQuantizedMultiplier(
input_diff, input_multiplier, input_left_shift);
sum_of_exps_in_q12 =
sum_of_exps_in_q12 +
gemmlowp::Rescale<kAccumulationIntegerBits>(
exp_on_negative_values(F5::FromRaw(input_diff_in_q5)));
}
}
const int32_t log_sum_of_exps_in_q5 =
log_x_for_x_greater_than_or_equal_to_1<kInputIntegerBits>(
sum_of_exps_in_q12)
.raw();
// Potentially reduced the valid range. shifted_log_sum_of_exps_in_q5 is
// smallest representable in Q5.26 plus the log_sum_of_exps.
const int32_t shifted_log_sum_of_exps_in_q5 =
log_sum_of_exps_in_q5 + kMinInt32;
const int32_t adjusted_diff_min =
std::max(static_cast<int32_t>(diff_min - 1),
MultiplyByQuantizedMultiplier(shifted_log_sum_of_exps_in_q5,
reverse_scaling_divisor,
-reverse_scaling_right_shift));
for (size_t inner_index = 0; inner_index < depth; ++inner_index) {
int32_t input_diff =
static_cast<int32_t>(input_data[outer_index * depth + inner_index]) -
max_in_row;
// Note use of > below instead of >= above.
if (input_diff > adjusted_diff_min) {
const int32_t input_diff_in_q5 = MultiplyByQuantizedMultiplier(
input_diff, input_multiplier, input_left_shift);
// Rescale and downcast.
int32_t output_in_q27 =
gemmlowp::RoundingDivideByPOT(
(input_diff_in_q5 - log_sum_of_exps_in_q5),
31 - kInputIntegerBits - kOutputIntegerBits) +
kMaxT8;
output_in_q27 =
std::max(std::min(output_in_q27, static_cast<int32_t>(kMaxT8)),
static_cast<int32_t>(kMinT8));
output_data[outer_index * depth + inner_index] =
static_cast<T>(output_in_q27);
} else {
output_data[outer_index * depth + inner_index] = kMinT8;
}
}
}
}
inline void LogSoftmax(const SoftmaxParams& params, const size_t outer_size,
const size_t depth, const RuntimeShape& input_shape,
const int8_t* input_data,
const RuntimeShape& output_shape, int8_t* output_data) {
LogSoftmaxQuantized(params, outer_size, depth, input_shape, input_data,
output_shape, output_data);
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LOG_SOFTMAX_H_

View File

@@ -0,0 +1,132 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LOGISTIC_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LOGISTIC_H_
#include <cmath>
#include "fixedpoint/fixedpoint.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/types.h"
#include "tensorflow/lite/kernels/op_macros.h"
namespace tflite {
namespace reference_ops {
inline void Logistic(const RuntimeShape& input_shape, const float* input_data,
const RuntimeShape& output_shape, float* output_data) {
const float cutoff_upper = 16.619047164916992188f;
const float cutoff_lower = -9.f;
const int flat_size = MatchingFlatSize(input_shape, output_shape);
// Rational for using approximation in reference kernel.
// 0. This approximation gives enough precision for float.
// 1. This works around an issue on an embedded chipset where exp() does not
// return correctly as expected - exp(x) should return inf when overflown
// not 1.701417 IEEE 754 defines representation for inf.
// 2. This will speed up calculation and is matching the behavior in the
// optimized kernels. (check the definition of scalar_logistic_op<float>)
for (int i = 0; i < flat_size; i++) {
float val = input_data[i];
float result;
if (val > cutoff_upper) {
result = 1.0f;
} else if (val < cutoff_lower) {
result = std::exp(val);
} else {
result = 1.f / (1.f + std::exp(-val));
}
output_data[i] = result;
}
}
// Convenience version that allows, for example, generated-code calls to be
// uniform between data types.
inline void Logistic(const LogisticParams&, const RuntimeShape& input_shape,
const float* input_data, const RuntimeShape& output_shape,
float* output_data) {
// Drop params: not needed.
Logistic(input_shape, input_data, output_shape, output_data);
}
inline void Logistic(const LogisticParams& params,
const RuntimeShape& input_shape, const int16_t* input_data,
const RuntimeShape& output_shape, int16_t* output_data) {
const int flat_size = MatchingFlatSize(input_shape, output_shape);
for (int i = 0; i < flat_size; i++) {
// F0 uses 0 integer bits, range [-1, 1].
// This is the return type of math functions such as tanh, logistic,
// whose range is in [-1, 1].
using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
// F3 uses 3 integer bits, range [-8, 8], the input range expected here.
using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
const F3 input = F3::FromRaw(input_data[i]);
F0 output = gemmlowp::logistic(input);
output_data[i] = output.raw();
}
}
// Quantized int8_t logistic activation. Cheats by dequantizing and
// requantizing around the floating point logistic method. This implementation
// is slow on platforms without a floating point unit.
// TODO(b/141211002): Delete this int8_t implementation once we can reuse the
// approach used in TFLite for int8_t Logistic.
inline void Logistic(const RuntimeShape& input_shape, const int8_t* input_data,
float input_scale, int input_zero_point,
const RuntimeShape& output_shape, int8_t* output_data,
float output_scale, int output_zero_point) {
const float cutoff_upper = 16.619047164916992188f;
const float cutoff_lower = -9.f;
const int flat_size = MatchingFlatSize(input_shape, output_shape);
// Rational for using approximation in reference kernel.
// 0. This approximation gives enough precision for float.
// 1. This works around an issue on an embedded chipset where exp() does not
// return correctly as expected - exp(x) should return inf when overflown
// not 1.701417 IEEE 754 defines representation for inf.
// 2. This will speed up calculation and is matching the behavior in the
// optimized kernels. (check the definition of scalar_logistic_op<float>)
for (int i = 0; i < flat_size; i++) {
// Dequantize.
float val =
static_cast<float>((input_data[i] - input_zero_point) * input_scale);
float result;
if (val > cutoff_upper) {
result = 1.0f;
} else if (val < cutoff_lower) {
result = std::exp(val);
} else {
result = 1.f / (1.f + std::exp(-val));
}
// Requantize
int8_t output =
static_cast<int8_t>(result / output_scale + output_zero_point);
output_data[i] = output;
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LOGISTIC_H_

View File

@@ -0,0 +1,64 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_MAXIMUM_MINIMUM_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_MAXIMUM_MINIMUM_H_
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
template <typename T, typename Op, int N = 5>
void MaximumMinimumBroadcastSlow(const RuntimeShape& unextended_input1_shape,
const T* input1_data,
const RuntimeShape& unextended_input2_shape,
const T* input2_data,
const RuntimeShape& unextended_output_shape,
T* output_data, Op op) {
// Uses element-wise calculation if broadcast is not required.
if (unextended_input1_shape == unextended_input2_shape) {
const int flat_size =
MatchingElementsSize(unextended_input1_shape, unextended_input2_shape,
unextended_output_shape);
for (int i = 0; i < flat_size; ++i) {
output_data[i] = op(input1_data[i], input2_data[i]);
}
} else {
TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), N);
TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), N);
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), N);
NdArrayDesc<N> desc1;
NdArrayDesc<N> desc2;
NdArrayDesc<N> output_desc;
NdArrayDescsForElementwiseBroadcast(
unextended_input1_shape, unextended_input2_shape, &desc1, &desc2);
CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape),
&output_desc);
auto maxmin_func = [&](int indexes[N]) {
output_data[SubscriptToIndex(output_desc, indexes)] =
op(input1_data[SubscriptToIndex(desc1, indexes)],
input2_data[SubscriptToIndex(desc2, indexes)]);
};
NDOpsHelper<N>(output_desc, maxmin_func);
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_MAXIMUM_MINIMUM_H_

View File

@@ -0,0 +1,166 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_MUL_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_MUL_H_
#include "tensorflow/lite/kernels/internal/common.h"
namespace tflite {
namespace reference_ops {
// Element-wise mul that can often be used for inner loop of broadcast Mul as
// well as the non-broadcast Mul.
inline void MulElementwise(int size, const ArithmeticParams& params,
const uint8_t* input1_data,
const uint8_t* input2_data, uint8_t* output_data) {
for (int i = 0; i < size; ++i) {
const int32_t input1_val = params.input1_offset + input1_data[i];
const int32_t input2_val = params.input2_offset + input2_data[i];
const int32_t unclamped_result =
params.output_offset +
MultiplyByQuantizedMultiplier(input1_val * input2_val,
params.output_multiplier,
params.output_shift);
const int32_t clamped_output =
std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, unclamped_result));
output_data[i] = static_cast<uint8_t>(clamped_output);
}
}
template <typename T>
inline void Mul(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const T* input1_data,
const RuntimeShape& input2_shape, const T* input2_data,
const RuntimeShape& output_shape, T* output_data) {
T output_activation_min;
T output_activation_max;
GetActivationParams(params, &output_activation_min, &output_activation_max);
const int flat_size =
MatchingExtendedShapeFlatSize(input1_shape, input2_shape, output_shape);
for (int i = 0; i < flat_size; ++i) {
output_data[i] = ActivationFunctionWithMinMax(
input1_data[i] * input2_data[i], output_activation_min,
output_activation_max);
}
}
inline void Mul(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const uint8_t* input1_data,
const RuntimeShape& input2_shape, const uint8_t* input2_data,
const RuntimeShape& output_shape, uint8_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
const int flat_size =
MatchingExtendedShapeFlatSize(input1_shape, input2_shape, output_shape);
MulElementwise(flat_size, params, input1_data, input2_data, output_data);
}
inline void BroadcastMul4DSlow(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const uint8_t* input1_data,
const RuntimeShape& input2_shape,
const uint8_t* input2_data,
const RuntimeShape& output_shape,
uint8_t* output_data) {
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
&desc2);
const RuntimeShape extended_output_shape =
RuntimeShape::ExtendedShape(4, output_shape);
for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
const int32_t input1_val =
params.input1_offset +
input1_data[SubscriptToIndex(desc1, b, y, x, c)];
const int32_t input2_val =
params.input2_offset +
input2_data[SubscriptToIndex(desc2, b, y, x, c)];
const int32_t unclamped_result =
params.output_offset +
MultiplyByQuantizedMultiplier(input1_val * input2_val,
params.output_multiplier,
params.output_shift);
const int32_t clamped_output = std::min(
params.quantized_activation_max,
std::max(params.quantized_activation_min, unclamped_result));
output_data[Offset(extended_output_shape, b, y, x, c)] =
static_cast<uint8_t>(clamped_output);
}
}
}
}
}
template <typename T>
void BroadcastMul4DSlow(const ArithmeticParams& params,
const RuntimeShape& unextended_input1_shape,
const T* input1_data,
const RuntimeShape& unextended_input2_shape,
const T* input2_data,
const RuntimeShape& unextended_output_shape,
T* output_data) {
T output_activation_min;
T output_activation_max;
GetActivationParams(params, &output_activation_min, &output_activation_max);
TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
const RuntimeShape output_shape =
RuntimeShape::ExtendedShape(4, unextended_output_shape);
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
unextended_input2_shape, &desc1, &desc2);
// In Tensorflow, the dimensions are canonically named (batch_number, row,
// col, channel), with extents (batches, height, width, depth), with the
// trailing dimension changing most rapidly (channels has the smallest stride,
// typically 1 element).
//
// In generated C code, we store arrays with the dimensions reversed. The
// first dimension has smallest stride.
//
// We name our variables by their Tensorflow convention, but generate C code
// nesting loops such that the innermost loop has the smallest stride for the
// best cache behavior.
for (int b = 0; b < output_shape.Dims(0); ++b) {
for (int y = 0; y < output_shape.Dims(1); ++y) {
for (int x = 0; x < output_shape.Dims(2); ++x) {
for (int c = 0; c < output_shape.Dims(3); ++c) {
output_data[Offset(output_shape, b, y, x, c)] =
ActivationFunctionWithMinMax(
input1_data[SubscriptToIndex(desc1, b, y, x, c)] *
input2_data[SubscriptToIndex(desc2, b, y, x, c)],
output_activation_min, output_activation_max);
}
}
}
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_MUL_H_

View File

@@ -0,0 +1,37 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_NEG_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_NEG_H_
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
template <typename T>
inline void Negate(const RuntimeShape& input_shape, const T* input_data,
const RuntimeShape& output_shape, T* output_data) {
const int flat_size = MatchingFlatSize(input_shape, output_shape);
for (int i = 0; i < flat_size; ++i) {
output_data[i] = -input_data[i];
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_NEG_H_

View File

@@ -0,0 +1,169 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PAD_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PAD_H_
#include <vector>
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
// TFLite Pad supports activation tensors with up to 5 dimensions.
constexpr int PadKernelMaxDimensionCount() { return 5; }
// There are two versions of pad: Pad and PadV2. In PadV2 there is a second
// scalar input that provides the padding value. Therefore pad_value_ptr can be
// equivalent to a simple input1_data. For Pad, it should point to a zero
// value.
//
// Note that two typenames are required, so that T=P=int32_t is considered a
// specialization distinct from P=int32_t.
template <typename T, typename P>
inline void PadImpl(const tflite::PadParams& op_params,
const RuntimeShape& input_shape, const T* input_data,
const P* pad_value_ptr, const RuntimeShape& output_shape,
T* output_data) {
const RuntimeShape ext_input_shape =
RuntimeShape::ExtendedShape(PadKernelMaxDimensionCount(), input_shape);
const RuntimeShape ext_output_shape =
RuntimeShape::ExtendedShape(PadKernelMaxDimensionCount(), output_shape);
TFLITE_DCHECK_LE(op_params.left_padding_count, PadKernelMaxDimensionCount());
TFLITE_DCHECK_LE(op_params.right_padding_count, PadKernelMaxDimensionCount());
// Runtime calls are currently fixed at 5 dimensions. Copy inputs so we can
// pad them to 5 dims (yes, we are "padding the padding").
int left_padding_copy[PadKernelMaxDimensionCount()];
for (int i = 0; i < PadKernelMaxDimensionCount(); i++) {
left_padding_copy[i] = 0;
}
for (int i = 0; i < op_params.left_padding_count; ++i) {
left_padding_copy[i + PadKernelMaxDimensionCount() -
op_params.left_padding_count] = op_params.left_padding[i];
}
int right_padding_copy[PadKernelMaxDimensionCount()];
for (int i = 0; i < PadKernelMaxDimensionCount(); i++) {
right_padding_copy[i] = 0;
}
for (int i = 0; i < op_params.right_padding_count; ++i) {
right_padding_copy[i + PadKernelMaxDimensionCount() -
op_params.right_padding_count] =
op_params.right_padding[i];
}
const int output_batch = ext_output_shape.Dims(0);
const int output_plane = ext_output_shape.Dims(1);
const int output_height = ext_output_shape.Dims(2);
const int output_width = ext_output_shape.Dims(3);
const int output_depth = ext_output_shape.Dims(4);
const int left_b_padding = left_padding_copy[0];
const int left_p_padding = left_padding_copy[1];
const int left_h_padding = left_padding_copy[2];
const int left_w_padding = left_padding_copy[3];
const int left_d_padding = left_padding_copy[4];
const int right_b_padding = right_padding_copy[0];
const int right_p_padding = right_padding_copy[1];
const int right_h_padding = right_padding_copy[2];
const int right_w_padding = right_padding_copy[3];
const int right_d_padding = right_padding_copy[4];
const T pad_value = *pad_value_ptr;
const T* in_ptr = input_data;
T* out_ptr = output_data;
for (int out_b = 0; out_b < output_batch; ++out_b) {
for (int out_p = 0; out_p < output_plane; ++out_p) {
for (int out_h = 0; out_h < output_height; ++out_h) {
for (int out_w = 0; out_w < output_width; ++out_w) {
for (int out_d = 0; out_d < output_depth; ++out_d) {
if (out_b < left_b_padding ||
out_b >= output_batch - right_b_padding ||
out_p < left_p_padding ||
out_p >= output_plane - right_p_padding ||
out_h < left_h_padding ||
out_h >= output_height - right_h_padding ||
out_w < left_w_padding ||
out_w >= output_width - right_w_padding ||
out_d < left_d_padding ||
out_d >= output_depth - right_d_padding) {
*out_ptr++ = pad_value;
} else {
*out_ptr++ = *in_ptr++;
}
}
}
}
}
}
}
template <typename T, typename P>
inline void Pad(const tflite::PadParams& op_params,
const RuntimeShape& input_shape, const T* input_data,
const P* pad_value_ptr, const RuntimeShape& output_shape,
T* output_data) {
PadImpl(op_params, input_shape, input_data, pad_value_ptr, output_shape,
output_data);
}
// The second (pad-value) input can be int32_t when, say, the first is uint8_t.
template <typename T>
inline void Pad(const tflite::PadParams& op_params,
const RuntimeShape& input_shape, const T* input_data,
const int32_t* pad_value_ptr, const RuntimeShape& output_shape,
T* output_data) {
const T converted_pad_value = static_cast<T>(*pad_value_ptr);
PadImpl(op_params, input_shape, input_data, &converted_pad_value,
output_shape, output_data);
}
// This version avoids conflicting template matching.
template <>
inline void Pad(const tflite::PadParams& op_params,
const RuntimeShape& input_shape, const int32_t* input_data,
const int32_t* pad_value_ptr, const RuntimeShape& output_shape,
int32_t* output_data) {
PadImpl(op_params, input_shape, input_data, pad_value_ptr, output_shape,
output_data);
}
template <typename T, typename P>
inline void PadImageStyle(const tflite::PadParams& op_params,
const RuntimeShape& input_shape, const T* input_data,
const P* pad_value_ptr,
const RuntimeShape& output_shape, T* output_data) {
Pad(op_params, input_shape, input_data, pad_value_ptr, output_shape,
output_data);
}
template <typename P>
inline void PadImageStyle(const tflite::PadParams& op_params,
const RuntimeShape& input_shape,
const float* input_data, const P* pad_value_ptr,
const RuntimeShape& output_shape,
float* output_data) {
Pad(op_params, input_shape, input_data, pad_value_ptr, output_shape,
output_data);
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PAD_H_

View File

@@ -0,0 +1,301 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_POOLING_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_POOLING_H_
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
inline bool AveragePool(const PoolParams& params,
const RuntimeShape& input_shape,
const float* input_data,
const RuntimeShape& output_shape, float* output_data) {
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
const int depth = MatchingDim(input_shape, 3, output_shape, 3);
const int input_height = input_shape.Dims(1);
const int input_width = input_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
const int stride_height = params.stride_height;
const int stride_width = params.stride_width;
for (int batch = 0; batch < batches; ++batch) {
for (int out_y = 0; out_y < output_height; ++out_y) {
for (int out_x = 0; out_x < output_width; ++out_x) {
for (int channel = 0; channel < depth; ++channel) {
const int in_x_origin =
(out_x * stride_width) - params.padding_values.width;
const int in_y_origin =
(out_y * stride_height) - params.padding_values.height;
// Compute the boundaries of the filter region clamped so as to
// ensure that the filter window fits in the input array.
const int filter_x_start = std::max(0, -in_x_origin);
const int filter_x_end =
std::min(params.filter_width, input_width - in_x_origin);
const int filter_y_start = std::max(0, -in_y_origin);
const int filter_y_end =
std::min(params.filter_height, input_height - in_y_origin);
float total = 0.f;
float filter_count = 0;
for (int filter_y = filter_y_start; filter_y < filter_y_end;
++filter_y) {
for (int filter_x = filter_x_start; filter_x < filter_x_end;
++filter_x) {
const int in_x = in_x_origin + filter_x;
const int in_y = in_y_origin + filter_y;
total +=
input_data[Offset(input_shape, batch, in_y, in_x, channel)];
filter_count++;
}
}
if (filter_count == 0) return false;
const float average = total / filter_count;
output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
ActivationFunctionWithMinMax(average, params.float_activation_min,
params.float_activation_max);
}
}
}
}
return true;
}
inline bool AveragePool(const PoolParams& params,
const RuntimeShape& input_shape,
const uint8_t* input_data,
const RuntimeShape& output_shape,
uint8_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
const int depth = MatchingDim(input_shape, 3, output_shape, 3);
const int input_height = input_shape.Dims(1);
const int input_width = input_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
const int stride_height = params.stride_height;
const int stride_width = params.stride_width;
for (int batch = 0; batch < batches; ++batch) {
for (int out_y = 0; out_y < output_height; ++out_y) {
for (int out_x = 0; out_x < output_width; ++out_x) {
for (int channel = 0; channel < depth; ++channel) {
const int in_x_origin =
(out_x * stride_width) - params.padding_values.width;
const int in_y_origin =
(out_y * stride_height) - params.padding_values.height;
// Compute the boundaries of the filter region clamped so as to
// ensure that the filter window fits in the input array.
const int filter_x_start = std::max(0, -in_x_origin);
const int filter_x_end =
std::min(params.filter_width, input_width - in_x_origin);
const int filter_y_start = std::max(0, -in_y_origin);
const int filter_y_end =
std::min(params.filter_height, input_height - in_y_origin);
int32_t acc = 0;
int filter_count = 0;
for (int filter_y = filter_y_start; filter_y < filter_y_end;
++filter_y) {
for (int filter_x = filter_x_start; filter_x < filter_x_end;
++filter_x) {
const int in_x = in_x_origin + filter_x;
const int in_y = in_y_origin + filter_y;
acc +=
input_data[Offset(input_shape, batch, in_y, in_x, channel)];
filter_count++;
}
}
if (filter_count == 0) return false;
acc = (acc + filter_count / 2) / filter_count;
acc = std::max(acc, params.quantized_activation_min);
acc = std::min(acc, params.quantized_activation_max);
output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
static_cast<uint8_t>(acc);
}
}
}
}
return true;
}
inline void L2Pool(const PoolParams& params, const RuntimeShape& input_shape,
const float* input_data, const RuntimeShape& output_shape,
float* output_data) {
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
const int depth = MatchingDim(input_shape, 3, output_shape, 3);
const int input_height = input_shape.Dims(1);
const int input_width = input_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
const int stride_height = params.stride_height;
const int stride_width = params.stride_width;
for (int batch = 0; batch < batches; ++batch) {
for (int out_y = 0; out_y < output_height; ++out_y) {
for (int out_x = 0; out_x < output_width; ++out_x) {
for (int channel = 0; channel < depth; ++channel) {
const int in_x_origin =
(out_x * stride_width) - params.padding_values.width;
const int in_y_origin =
(out_y * stride_height) - params.padding_values.height;
// Compute the boundaries of the filter region clamped so as to
// ensure that the filter window fits in the input array.
const int filter_x_start = std::max(0, -in_x_origin);
const int filter_x_end =
std::min(params.filter_width, input_width - in_x_origin);
const int filter_y_start = std::max(0, -in_y_origin);
const int filter_y_end =
std::min(params.filter_height, input_height - in_y_origin);
float sum_squares = 0.f;
int filter_count = 0;
for (int filter_y = filter_y_start; filter_y < filter_y_end;
++filter_y) {
for (int filter_x = filter_x_start; filter_x < filter_x_end;
++filter_x) {
const int in_x = in_x_origin + filter_x;
const int in_y = in_y_origin + filter_y;
const float val =
input_data[Offset(input_shape, batch, in_y, in_x, channel)];
sum_squares += val * val;
filter_count++;
}
}
const float l2pool_result = std::sqrt(sum_squares / filter_count);
output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
ActivationFunctionWithMinMax(l2pool_result,
params.float_activation_min,
params.float_activation_max);
}
}
}
}
}
inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
const float* input_data, const RuntimeShape& output_shape,
float* output_data) {
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
const int depth = MatchingDim(input_shape, 3, output_shape, 3);
const int input_height = input_shape.Dims(1);
const int input_width = input_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
const int stride_height = params.stride_height;
const int stride_width = params.stride_width;
for (int batch = 0; batch < batches; ++batch) {
for (int out_y = 0; out_y < output_height; ++out_y) {
for (int out_x = 0; out_x < output_width; ++out_x) {
for (int channel = 0; channel < depth; ++channel) {
const int in_x_origin =
(out_x * stride_width) - params.padding_values.width;
const int in_y_origin =
(out_y * stride_height) - params.padding_values.height;
// Compute the boundaries of the filter region clamped so as to
// ensure that the filter window fits in the input array.
const int filter_x_start = std::max(0, -in_x_origin);
const int filter_x_end =
std::min(params.filter_width, input_width - in_x_origin);
const int filter_y_start = std::max(0, -in_y_origin);
const int filter_y_end =
std::min(params.filter_height, input_height - in_y_origin);
float max = std::numeric_limits<float>::lowest();
for (int filter_y = filter_y_start; filter_y < filter_y_end;
++filter_y) {
for (int filter_x = filter_x_start; filter_x < filter_x_end;
++filter_x) {
const int in_x = in_x_origin + filter_x;
const int in_y = in_y_origin + filter_y;
max = std::max(
max,
input_data[Offset(input_shape, batch, in_y, in_x, channel)]);
}
}
output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
ActivationFunctionWithMinMax(max, params.float_activation_min,
params.float_activation_max);
}
}
}
}
}
inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
const uint8_t* input_data, const RuntimeShape& output_shape,
uint8_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
TFLITE_DCHECK_GE(params.quantized_activation_min, 0);
TFLITE_DCHECK_LE(params.quantized_activation_max, 255);
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
const int depth = MatchingDim(input_shape, 3, output_shape, 3);
const int input_height = input_shape.Dims(1);
const int input_width = input_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
const int stride_height = params.stride_height;
const int stride_width = params.stride_width;
for (int batch = 0; batch < batches; ++batch) {
for (int out_y = 0; out_y < output_height; ++out_y) {
for (int out_x = 0; out_x < output_width; ++out_x) {
for (int channel = 0; channel < depth; ++channel) {
const int in_x_origin =
(out_x * stride_width) - params.padding_values.width;
const int in_y_origin =
(out_y * stride_height) - params.padding_values.height;
// Compute the boundaries of the filter region clamped so as to
// ensure that the filter window fits in the input array.
const int filter_x_start = std::max(0, -in_x_origin);
const int filter_x_end =
std::min(params.filter_width, input_width - in_x_origin);
const int filter_y_start = std::max(0, -in_y_origin);
const int filter_y_end =
std::min(params.filter_height, input_height - in_y_origin);
uint8_t max = 0;
for (int filter_y = filter_y_start; filter_y < filter_y_end;
++filter_y) {
for (int filter_x = filter_x_start; filter_x < filter_x_end;
++filter_x) {
const int in_x = in_x_origin + filter_x;
const int in_y = in_y_origin + filter_y;
max = std::max(
max,
input_data[Offset(input_shape, batch, in_y, in_x, channel)]);
}
}
max = std::max<uint8_t>(max, params.quantized_activation_min);
max = std::min<uint8_t>(max, params.quantized_activation_max);
output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
static_cast<uint8_t>(max);
}
}
}
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_POOLING_H_

View File

@@ -0,0 +1,774 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <limits>
#include <utility>
#include "fixedpoint/fixedpoint.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h"
#if defined(_MSC_VER)
#define __restrict__ __restrict
#endif
namespace tflite {
namespace tensor_utils {
namespace {
const int32_t kInt16Max = std::numeric_limits<int16_t>::max();
const int32_t kInt16Min = std::numeric_limits<int16_t>::min();
} // namespace
void PortableSymmetricQuantizeFloats(const float* values, const int size,
int8_t* quantized_values, float* min_value,
float* max_value, float* scaling_factor) {
auto minmax = std::minmax_element(values, values + size);
*min_value = *minmax.first;
*max_value = *minmax.second;
PortableSymmetricQuantizeFloats(values, size, quantized_values, *min_value,
*max_value, scaling_factor);
}
void PortableSymmetricQuantizeFloats(const float* values, const int size,
int8_t* quantized_values, float min_value,
float max_value, float* scaling_factor) {
const int32_t kScale = 127;
const float range = std::max(std::abs(min_value), std::abs(max_value));
if (range == 0) {
memset(quantized_values, 0, size * sizeof(int8_t));
*scaling_factor = 1;
return;
}
*scaling_factor = range / kScale;
const float scaling_factor_inv = kScale / range;
for (int i = 0; i < size; ++i) {
const int32_t quantized_value =
static_cast<int32_t>(TfLiteRound(values[i] * scaling_factor_inv));
// Clamp: just in case some odd numeric offset.
quantized_values[i] = static_cast<int8_t>(
std::min(kScale, std::max(-kScale, quantized_value)));
}
}
void PortableAsymmetricQuantizeFloats(const float* values, const int size,
int8_t* quantized_values,
float* scaling_factor, int32_t* offset) {
const int32_t kMinScale = -128;
const int32_t kMaxScale = 127;
const double qmin_double = kMinScale;
const double qmax_double = kMaxScale;
const auto minmax = std::minmax_element(values, values + size);
const double rmin = static_cast<double>(std::min(0.0f, *minmax.first));
const double rmax = static_cast<double>(std::max(0.0f, *minmax.second));
if (rmin == rmax) {
memset(quantized_values, 0, size * sizeof(int8_t));
*scaling_factor = 1;
*offset = 0;
return;
} else {
double scale = (rmax - rmin) / (qmax_double - qmin_double);
const double zero_point_from_min = qmin_double - rmin / scale;
const double zero_point_from_max = qmax_double - rmax / scale;
const double zero_point_from_min_error =
std::abs(qmin_double) + std::abs(rmin / scale);
const double zero_point_from_max_error =
std::abs(qmax_double) + std::abs(rmax / scale);
const double zero_point_double =
zero_point_from_min_error < zero_point_from_max_error
? zero_point_from_min
: zero_point_from_max;
int8_t nudged_zero_point = 0;
if (zero_point_double <= qmin_double) {
nudged_zero_point = kMinScale;
} else if (zero_point_double >= qmax_double) {
nudged_zero_point = kMaxScale;
} else {
nudged_zero_point = static_cast<int8_t>(round(zero_point_double));
}
*scaling_factor = scale;
*offset = nudged_zero_point;
}
const float scaling_factor_inv = 1.0f / *scaling_factor;
for (int i = 0; i < size; ++i) {
const int32_t quantized_value = static_cast<int32_t>(
TfLiteRound(*offset + values[i] * scaling_factor_inv));
quantized_values[i] =
std::min(kMaxScale, std::max(kMinScale, quantized_value));
}
}
void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
int m_rows, int m_cols,
const float* vector,
int n_batch, float* result) {
float* result_in_batch = result;
for (int b = 0; b < n_batch; b++) {
const float* matrix_ptr = matrix;
for (int r = 0; r < m_rows; r++) {
float dot_prod = 0.0f;
const float* vector_in_batch = vector + b * m_cols;
for (int c = 0; c < m_cols; c++) {
dot_prod += *matrix_ptr++ * *vector_in_batch++;
}
*result_in_batch += dot_prod;
++result_in_batch;
}
}
}
void PortableMatrixBatchVectorMultiplyAccumulate(
const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
const int8_t* __restrict__ vectors, const float* scaling_factors,
int n_batch, float* __restrict__ result) {
for (int batch = 0; batch < n_batch; ++batch, vectors += m_cols) {
const float batch_scaling_factor = scaling_factors[batch];
// Get the address of the first row.
const int8_t* row_ptr = matrix;
for (int row = 0; row < m_rows; ++row) {
// Initialize the dot product sum for the row to 0.
int32_t dotprod = 0;
#if defined(__GNUC__)
// Prefetch the row to cache.
__builtin_prefetch(row_ptr, 0 /* prefetch for read */,
3 /* temporal locality */);
#endif
for (int col = 0; col < m_cols; ++col, ++row_ptr) {
dotprod += (*row_ptr) * (vectors[col]);
} // for col
*result += dotprod * batch_scaling_factor;
++result;
} // for row
} // for batch
}
void PortableMatrixBatchVectorMultiplyAccumulate(
const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
const int8_t* __restrict__ vectors, const float* scaling_factors,
int n_batch, float* __restrict__ result, const float* per_channel_scale,
const int32_t* input_offset, int32_t* scratch, int32_t* row_sums,
bool* compute_row_sums, CpuBackendContext* context) {
if (input_offset == nullptr) {
PortableMatrixBatchVectorMultiplyAccumulate(
matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result);
return;
}
if (!compute_row_sums || *compute_row_sums) {
PortableReductionSumVector(matrix, row_sums, m_rows, m_cols);
if (compute_row_sums) {
*compute_row_sums = false;
}
}
for (int batch = 0; batch < n_batch; ++batch, vectors += m_cols) {
const float batch_scaling_factor = scaling_factors[batch];
const int32_t batch_offset = input_offset[batch];
const int8_t* row_ptr = matrix;
for (int row = 0; row < m_rows; ++row) {
int32_t dotprod = 0;
float scale = batch_scaling_factor;
if (per_channel_scale) {
scale *= per_channel_scale[row];
}
#if defined(__GNUC__)
// Prefetch the row to cache.
__builtin_prefetch(row_ptr, 0 /* prefetch for read */,
3 /* temporal locality */);
#endif
for (int col = 0; col < m_cols; ++col, ++row_ptr) {
dotprod += (*row_ptr) * vectors[col];
} // for col
dotprod -= row_sums[row] * batch_offset;
*result += dotprod * scale;
++result;
} // for row
} // for batch
}
void PortableSparseMatrixBatchVectorMultiplyAccumulate1x4(
const float* __restrict__ matrix, const int32_t* __restrict__ segments,
const int32_t* __restrict__ indices, int m_rows, int m_cols,
const float* __restrict__ vector, int n_batch, float* __restrict__ result) {
const int kBlockSize = 4;
TFLITE_DCHECK_EQ(m_cols % kBlockSize, 0);
for (int batch = 0; batch < n_batch; batch++) {
const float* matrix_ptr = matrix;
for (int row = 0; row < m_rows; row++) {
float dot_prod = 0.0f;
const float* vector_in_batch = vector + batch * m_cols;
for (int i = segments[row]; i < segments[row + 1]; i++) {
const int block_start_index = indices[i] * kBlockSize;
const float* vector_block_in_batch_ptr =
vector_in_batch + block_start_index;
for (int c = 0; c < kBlockSize; c++) {
dot_prod += *matrix_ptr++ * *vector_block_in_batch_ptr++;
}
}
result[batch * m_rows + row] += dot_prod;
}
}
}
void PortableSparseMatrixBatchVectorMultiplyAccumulate(
const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
float* __restrict__ result) {
const int kBlockSize = 16;
TFLITE_DCHECK_EQ( // NOLINT
m_cols % kBlockSize, 0);
for (int batch = 0; batch < n_batch; batch++) {
const float* matrix_ptr = matrix;
const uint8_t* ledger_ptr = ledger;
for (int row = 0; row < m_rows; row++) {
float dot_prod = 0.0f;
int num_nonzero_blocks = *ledger_ptr++;
if (num_nonzero_blocks > 0) {
const float* vector_in_batch = vector + batch * m_cols;
for (int i = 0; i < num_nonzero_blocks; i++) {
const int block_start_index = *ledger_ptr++ * kBlockSize;
const float* vector_block_in_batch_ptr =
vector_in_batch + block_start_index;
for (int c = 0; c < kBlockSize; c++) {
dot_prod += *matrix_ptr++ * *vector_block_in_batch_ptr++;
}
}
}
result[batch * m_rows + row] += dot_prod;
}
}
}
void PortableSparseMatrixBatchVectorMultiplyAccumulate(
const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
const int m_cols, const int8_t* __restrict__ vectors,
const float* scaling_factors, int n_batch, float* __restrict__ result) {
static const int kBlockSize = 16;
TFLITE_DCHECK_EQ( // NOLINT
m_cols % kBlockSize, 0);
for (int batch = 0; batch < n_batch; ++batch, vectors += m_cols) {
const float batch_scaling_factor = scaling_factors[batch];
const uint8_t* ledger_ptr = ledger;
// Get the address of the first row.
const int8_t* row_ptr = matrix;
for (int row = 0; row < m_rows; ++row) {
// Initialize the dot product sum for the row to 0.
int32_t dotprod = 0;
#if defined(__GNUC__)
// Prefetch the row to cache.
__builtin_prefetch(row_ptr, 0 /* prefetch for read */,
3 /* temporal locality */);
#endif
int num_nonzero_blocks = *ledger_ptr++;
for (int i = 0; i < num_nonzero_blocks; i++) {
const int block_start_index = *ledger_ptr++ * kBlockSize;
const int8_t* vector_block_ptr = vectors + block_start_index;
for (int c = 0; c < kBlockSize; c++) {
dotprod += (*row_ptr++) * (*vector_block_ptr++);
} // for block
} // for num_nonzero_blocks
result[batch * m_rows + row] += dotprod * batch_scaling_factor;
} // for row
} // for batch
}
template <typename T>
void PortableMatrixBatchVectorMultiplyAccumulateImpl(
const int8_t* input, const int32_t* bias,
const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
T* output) {
const int16_t output_max = std::numeric_limits<T>::max();
const int16_t output_min = std::numeric_limits<T>::min();
for (int batch = 0; batch < n_batch; ++batch) {
for (int row = 0; row < n_output; ++row) {
int32_t acc = bias[row];
for (int col = 0; col < n_input; ++col) {
int8_t input_val = input[batch * n_input + col];
int8_t weights_val = input_to_gate_weights[row * n_input + col];
acc += input_val * weights_val;
}
acc = MultiplyByQuantizedMultiplier(acc, multiplier, shift);
acc += output_zp;
acc += output[batch * n_output + row];
if (acc > output_max) {
acc = output_max;
}
if (acc < output_min) {
acc = output_min;
}
output[batch * n_output + row] = static_cast<T>(acc);
}
}
}
void PortableMatrixBatchVectorMultiplyAccumulate(
const int8_t* input, const int32_t* bias,
const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
int32_t* scratch, int16_t* output, CpuBackendContext* context) {
PortableMatrixBatchVectorMultiplyAccumulateImpl(
input, bias, input_to_gate_weights, multiplier, shift, n_batch, n_input,
n_output, output_zp, output);
}
void PortableMatrixBatchVectorMultiplyAccumulate(
const int8_t* input, const int32_t* bias,
const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
int32_t* scratch, int8_t* output, CpuBackendContext* context) {
PortableMatrixBatchVectorMultiplyAccumulateImpl(
input, bias, input_to_gate_weights, multiplier, shift, n_batch, n_input,
n_output, output_zp, output);
}
void PortableMatrixBatchVectorMultiply(const int8_t* input,
int32_t input_zeropoint,
const int8_t* input_to_gate_weights,
int32_t input_to_gate_effective_scale_a,
int32_t input_to_gate_effective_scale_b,
int32_t n_batch, int32_t n_input,
int32_t n_cell, int8_t* gate_output,
int8_t gate_output_zp) {
const int32_t int8_max = std::numeric_limits<int8_t>::max();
const int32_t int8_min = std::numeric_limits<int8_t>::min();
for (int batch = 0; batch < n_batch; ++batch) {
for (int row = 0; row < n_cell; ++row) {
int32_t acc = 0;
for (int col = 0; col < n_input; ++col) {
int32_t input_val = input[batch * n_input + col];
int8_t weights_val = input_to_gate_weights[row * n_input + col];
acc += (input_val - input_zeropoint) * weights_val;
}
acc = MultiplyByQuantizedMultiplier(acc, input_to_gate_effective_scale_a,
input_to_gate_effective_scale_b);
acc += gate_output_zp;
if (acc > int8_max) {
acc = int8_max;
}
if (acc < int8_min) {
acc = int8_min;
}
gate_output[batch * n_cell + row] = static_cast<int8_t>(acc);
}
}
}
void PortableMatrixBatchVectorMultiply(
const int16_t* hidden, const int8_t* hidden_to_output_weights,
int32_t proj_effective_scale_a, int32_t proj_effective_scale_b,
const int32_t* gate_bias, int32_t n_batch, int32_t n_hidden,
int32_t n_output, int32_t output_zp, int8_t* proj_output) {
const int16_t int8_max = std::numeric_limits<int8_t>::max();
const int16_t int8_min = std::numeric_limits<int8_t>::min();
for (int batch = 0; batch < n_batch; ++batch) {
for (int row = 0; row < n_output; ++row) {
int64_t acc = gate_bias[row];
for (int col = 0; col < n_hidden; ++col) {
int16_t input_val = hidden[batch * n_hidden + col];
int8_t weights_val = hidden_to_output_weights[row * n_hidden + col];
int64_t curr = acc;
acc += input_val * weights_val;
if (input_val * weights_val > 0 && acc < curr) {
acc = std::numeric_limits<int32_t>::max();
}
if (input_val * weights_val < 0 && acc > curr) {
acc = std::numeric_limits<int32_t>::min();
}
}
acc = MultiplyByQuantizedMultiplier(acc, proj_effective_scale_a,
proj_effective_scale_b);
acc += output_zp;
if (acc > int8_max) {
acc = int8_max;
}
if (acc < int8_min) {
acc = int8_min;
}
proj_output[batch * n_output + row] = acc;
}
}
}
void PortableApplyLayerNorm(const int16_t* input,
const int16_t* layer_norm_weights,
const int32_t* bias, int32_t layer_norm_scale_a,
int32_t layer_norm_scale_b, int32_t variance_limit,
int n_batch, int n_input, int16_t* output) {
// The square of std::pow(2, 10), which is the extra factor that makes sure
// normalized values has enough resolution.
static const int kTwoToPower20 = 1 << 20;
for (int i = 0; i < n_batch; ++i) {
int64_t sum = 0;
int64_t sum_sq = 0;
for (int j = 0; j < n_input; ++j) {
const int32_t index = i * n_input + j;
int32_t val = static_cast<int32_t>(input[index]);
sum += val;
sum_sq += val * val;
}
int32_t mean =
static_cast<int32_t>(static_cast<int64_t>(sum) * 1024 / n_input);
// TODO(b/173994730): Avoids overflow but only works for POT n_input.
int32_t temp = kTwoToPower20 / n_input;
int64_t variance =
sum_sq * temp - static_cast<int64_t>(mean) * static_cast<int64_t>(mean);
int32_t variance2 = static_cast<int32_t>(variance / kTwoToPower20);
if (variance2 < 1) {
variance2 = variance_limit;
}
int32_t stddev_inverse_a;
int stddev_inverse_b;
GetInvSqrtQuantizedMultiplierExp(variance2, /*reverse_shift*/ -1,
&stddev_inverse_a, &stddev_inverse_b);
for (int j = 0; j < n_input; ++j) {
const int32_t index = i * n_input + j;
int32_t val = static_cast<int32_t>(input[index]);
int32_t shifted = 1024 * val - mean;
int32_t rescaled = MultiplyByQuantizedMultiplier(
shifted, stddev_inverse_a, stddev_inverse_b);
// TODO(jianlijianli): Saturate this.
int64_t val3 = rescaled * layer_norm_weights[j] + bias[j];
int32_t val4 =
static_cast<int32_t>((val3 > 0 ? val3 + 512 : val3 - 512) / 1024);
int32_t val5 = MultiplyByQuantizedMultiplier(val4, layer_norm_scale_a,
layer_norm_scale_b + 12);
val5 = std::min(std::max(kInt16Min, val5), kInt16Max);
output[index] = static_cast<int16_t>(val5);
}
}
}
void PortableApplyLayerNormFloat(const int16_t* input,
const int16_t* layer_norm_weights,
int32_t layer_norm_scale_a,
int32_t layer_norm_scale_b,
const int32_t* bias, int n_batch, int n_input,
int16_t* output) {
const int32_t int16_max = std::numeric_limits<int16_t>::max();
const int32_t int16_min = std::numeric_limits<int16_t>::min();
const float layer_norm_scale =
layer_norm_scale_a *
std::pow(2.0, static_cast<double>(layer_norm_scale_b - 31));
const float bias_scale =
static_cast<float>(std::pow(2.0, -10)) * layer_norm_scale;
for (int batch = 0; batch < n_batch; ++batch) {
float sum = 0.0f;
float sum_sq = 0.0f;
for (int i = 0; i < n_input; ++i) {
const int index = batch * n_input + i;
const float value = static_cast<float>(input[index]);
sum += value;
sum_sq += value * value;
}
const float mean = sum / n_input;
float stddev_inv = 0.0f;
const float variance = sum_sq / n_input - mean * mean;
if (variance == 0) {
stddev_inv = 1.0f / std::sqrt(1e-8f);
} else {
stddev_inv = 1.0f / std::sqrt(variance);
}
for (int i = 0; i < n_input; ++i) {
const int index = batch * n_input + i;
const float normalized_value =
(static_cast<float>(input[index]) - mean) * stddev_inv;
const float weighted_normalized_value =
normalized_value * layer_norm_weights[i] * layer_norm_scale +
bias[i] * bias_scale;
const int32_t quant_output = static_cast<int32_t>(round(
weighted_normalized_value * static_cast<float>(std::pow(2, 12))));
output[index] = std::min(int16_max, std::max(int16_min, quant_output));
}
}
}
void PortableMatrixScalarMultiplyAccumulate(const int8_t* matrix,
int32_t scalar, int32_t n_row,
int32_t n_col, int32_t* output) {
for (int i = 0; i < n_row; ++i) {
int32_t row_sum = 0;
for (int j = 0; j < n_col; ++j) {
row_sum += *matrix++;
}
output[i] += row_sum * scalar;
}
}
void PortableApplySigmoid(const int16_t* input, int32_t n_batch,
int32_t n_input, int16_t* output) {
for (int batch = 0; batch < n_batch; ++batch) {
for (int c = 0; c < n_input; c++) {
using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
const int index = batch * n_input + c;
F3 sigmoid_input = F3::FromRaw(input[index]);
F0 sigmoid_output = gemmlowp::logistic(sigmoid_input);
output[index] = sigmoid_output.raw();
}
}
}
void PortableApplySigmoidFloat(const int16_t* input, int32_t n_batch,
int32_t n_input, int16_t* output) {
const int32_t int16_max = std::numeric_limits<int16_t>::max();
const int32_t int16_min = std::numeric_limits<int16_t>::min();
for (int batch = 0; batch < n_batch; ++batch) {
for (int i = 0; i < n_input; ++i) {
const int index = batch * n_input + i;
const float float_input =
input[index] * static_cast<float>(std::pow(2, -12));
const float float_output = 1.0f / (1.0f + std::exp(-float_input));
const int32_t quant_output = static_cast<int32_t>(
float_output * static_cast<float>(std::pow(2, 15)));
const int32_t quant_output_clamped =
std::min(int16_max, std::max(int16_min, quant_output));
output[index] = static_cast<int16_t>(quant_output_clamped);
}
}
}
template <int IntegerBits>
void PortableApplyTanhImpl(const int16_t* input, int32_t n_batch,
int32_t n_input, int16_t* output) {
using FX = gemmlowp::FixedPoint<std::int16_t, IntegerBits>;
using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
for (int batch = 0; batch < n_batch; ++batch) {
for (int i = 0; i < n_input; ++i) {
const int index = batch * n_input + i;
FX tanh_input = FX::FromRaw(input[index]);
F0 tanh_output = gemmlowp::tanh(tanh_input);
output[index] = tanh_output.raw();
}
}
}
void PortableApplyTanh(int32_t integer_bits, const int16_t* input,
int32_t n_batch, int32_t n_input, int16_t* output) {
assert(integer_bits <= 6);
#define DISPATCH_TANH(i) \
case i: \
PortableApplyTanhImpl<i>(input, n_batch, n_input, output); \
break;
switch (integer_bits) {
DISPATCH_TANH(0);
DISPATCH_TANH(1);
DISPATCH_TANH(2);
DISPATCH_TANH(3);
DISPATCH_TANH(4);
DISPATCH_TANH(5);
DISPATCH_TANH(6);
default:
return;
}
#undef DISPATCH_TANH
}
void PortableApplyTanhFloat(const int16_t* input, int32_t n_batch,
int32_t n_input, int32_t integer_bits,
int16_t* output) {
const int32_t int16_max = std::numeric_limits<int16_t>::max();
const int32_t int16_min = std::numeric_limits<int16_t>::min();
const double two = 2.0;
for (int batch = 0; batch < n_batch; ++batch) {
for (int i = 0; i < n_input; ++i) {
const int index = batch * n_input + i;
const float float_input =
input[index] * std::pow(two, static_cast<double>(integer_bits));
const float float_output = std::tanh(float_input);
const int32_t quant_output = static_cast<int32_t>(
float_output * static_cast<float>(std::pow(2, 15)));
const int32_t quant_output_clamped =
std::min(int16_max, std::max(int16_min, quant_output));
output[index] = static_cast<int16_t>(quant_output_clamped);
}
}
}
void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2,
int n_batch, int n_input, int shift, int16_t* output) {
for (int batch = 0; batch < n_batch; ++batch) {
for (int i = 0; i < n_input; ++i) {
const int index = batch * n_input + i;
const int16_t a = input_1[index];
const int16_t b = input_2[index];
const int32_t value = static_cast<int32_t>(a) * static_cast<int32_t>(b);
output[index] =
static_cast<int16_t>(gemmlowp::RoundingDivideByPOT(value, shift));
}
}
}
void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2,
int32_t multiplier, int32_t shift, int32_t n_batch,
int32_t n_input, int32_t output_zp, int8_t* output) {
for (int batch = 0; batch < n_batch; ++batch) {
for (int i = 0; i < n_input; ++i) {
const int index = batch * n_input + i;
const int16_t a = input_1[index];
const int16_t b = input_2[index];
int32_t value = static_cast<int32_t>(a) * static_cast<int32_t>(b);
value = MultiplyByQuantizedMultiplier(value, multiplier, shift);
value -= output_zp;
value = std::min(std::max(static_cast<int32_t>(-128), value),
static_cast<int32_t>(127));
output[index] = static_cast<int8_t>(value);
}
}
}
void PortableCwiseAdd(const int16_t* input_1, const int16_t* input_2,
int n_batch, int n_input, int16_t* output) {
for (int batch = 0; batch < n_batch; ++batch) {
for (int i = 0; i < n_input; ++i) {
const int index = batch * n_input + i;
int32_t sum = input_1[index] + input_2[index];
const int32_t sum_clamped = std::min(kInt16Max, std::max(kInt16Min, sum));
output[index] = static_cast<int16_t>(sum_clamped);
}
}
}
float PortableVectorVectorDotProduct(const float* vector1, const float* vector2,
int v_size) {
float result = 0.0;
for (int v = 0; v < v_size; v++) {
result += *vector1++ * *vector2++;
}
return result;
}
namespace {
inline int32_t VectorVectorDotProduct(const int16_t* vector1,
const int16_t* vector2, int v_size) {
int32_t result = 0;
for (int v = 0; v < v_size; v++) {
result += *vector1++ * *vector2++;
}
return result;
}
} // namespace
void PortableBatchVectorBatchVectorDotProduct(const int16_t* vector1,
const int16_t* vector2,
int v_size, int n_batch,
int32_t* result) {
for (int b = 0; b < n_batch; b++) {
result[b] = VectorVectorDotProduct(vector1, vector2, v_size);
vector1 += v_size;
vector2 += v_size;
}
}
void PortableVectorBatchVectorCwiseProductAccumulate(
const int16_t* vector, int v_size, const int16_t* batch_vector, int n_batch,
int32_t multiplier, int shift, int16_t* result) {
for (int b = 0; b < n_batch; b++) {
for (int v = 0; v < v_size; v++) {
int32_t prod = vector[v] * *batch_vector++;
prod = MultiplyByQuantizedMultiplier(prod, multiplier, shift);
int32_t output = prod + *result;
output = std::max(std::min(static_cast<int32_t>(32767), output),
static_cast<int32_t>(-32768));
*result++ = output;
}
}
}
void PortableSub1Vector(const float* vector, int v_size, float* result) {
for (int v = 0; v < v_size; v++) {
*result++ = 1.0f - *vector++;
}
}
void PortableSub1Vector(const int16_t* vector, int v_size, int16_t* result) {
static const int16_t kOne = 32767;
for (int v = 0; v < v_size; v++) {
*result++ = kOne - *vector++;
}
}
void PortableVectorScalarMultiply(const int8_t* vector, const int v_size,
const float scale, float* result) {
for (int v = 0; v < v_size; ++v) {
*result++ = scale * *vector++;
}
}
void PortableMeanStddevNormalization(const float* __restrict__ input_vector,
float* __restrict__ output_vector,
int v_size, int n_batch) {
for (int batch = 0; batch < n_batch; ++batch) {
float sum = 0.0f;
for (int i = 0; i < v_size; ++i) {
sum += input_vector[i];
}
const float mean = sum / v_size;
float sum_diff_sq = 0.0f;
for (int i = 0; i < v_size; ++i) {
const float diff = input_vector[i] - mean;
sum_diff_sq += diff * diff;
}
const float variance = sum_diff_sq / v_size;
constexpr float kNormalizationConstant = 1e-8f;
const float stddev_inv =
1.0f / std::sqrt(variance + kNormalizationConstant);
for (int i = 0; i < v_size; ++i) {
output_vector[i] = (input_vector[i] - mean) * stddev_inv;
}
input_vector += v_size;
output_vector += v_size;
}
}
void PortableTwoGateSaturatingAdd(const int8_t* input, int8_t input_zp,
const int8_t* recurrent, int8_t recurrent_zp,
int32_t input_effective_scale_a,
int32_t input_effective_scale_b,
int32_t recurrent_effective_scale_a,
int32_t recurrent_effective_scale_b,
int32_t n_batch, int32_t n_cell,
int16_t* output) {
const int32_t int16_max = std::numeric_limits<int16_t>::max();
const int32_t int16_min = std::numeric_limits<int16_t>::min();
for (int i = 0; i < n_batch * n_cell; ++i) {
int32_t x = static_cast<int32_t>(input[i]) - static_cast<int32_t>(input_zp);
int32_t h =
static_cast<int32_t>(recurrent[i]) - static_cast<int32_t>(recurrent_zp);
int32_t x_scaled = MultiplyByQuantizedMultiplier(x, input_effective_scale_a,
input_effective_scale_b);
int32_t h_scaled = MultiplyByQuantizedMultiplier(
h, recurrent_effective_scale_a, recurrent_effective_scale_b);
int32_t y = h_scaled + x_scaled;
if (y > int16_max) {
y = int16_max;
}
if (y < int16_min) {
y = int16_min;
}
output[i] = static_cast<int16_t>(y);
}
}
} // namespace tensor_utils
} // namespace tflite

View File

@@ -0,0 +1,235 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_IMPL_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_IMPL_H_
#include <algorithm>
#include <cstdint>
#if defined(_MSC_VER)
#define __restrict__ __restrict
#endif
namespace tflite {
// Not all backends support CpuBackendContext usage, so forward declare to avoid
// pulling in its implementation.
class CpuBackendContext;
namespace tensor_utils {
template <typename T>
bool PortableIsZeroVector(const T* vector, int v_size) {
for (int i = 0; i < v_size; ++i) {
if (vector[i] != 0) {
return false;
}
}
return true;
}
void PortableSymmetricQuantizeFloats(const float* values, const int size,
int8_t* quantized_values, float* min_value,
float* max_value, float* scaling_factor);
void PortableSymmetricQuantizeFloats(const float* values, const int size,
int8_t* quantized_values, float min_value,
float max_value, float* scaling_factor);
void PortableAsymmetricQuantizeFloats(const float* values, const int size,
int8_t* quantized_values,
float* scaling_factor, int32_t* offset);
// Multiply a matrix by a batch vector, and store results in a batch-size
// vector.
void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
int m_rows, int m_cols,
const float* vector,
int n_batch, float* result);
void PortableMatrixBatchVectorMultiplyAccumulate(
const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
const int8_t* __restrict__ vectors, const float* scaling_factors,
int n_batch, float* __restrict__ result);
void PortableMatrixBatchVectorMultiplyAccumulate(
const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
const int8_t* __restrict__ vectors, const float* scaling_factors,
int n_batch, float* __restrict__ result, const float* per_channel_scale,
const int32_t* input_offset, int32_t* scratch, int32_t* row_sums,
bool* compute_row_sums, CpuBackendContext* context);
void PortableMatrixBatchVectorMultiplyAccumulate(
const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
const int8_t* __restrict__ vector, const float* scaling_factors,
int n_batch, int32_t* scratch, float* __restrict__ result,
CpuBackendContext* context);
void PortableSparseMatrixBatchVectorMultiplyAccumulate1x4(
const float* __restrict__ matrix, const int32_t* __restrict__ segments,
const int32_t* __restrict__ indices, int m_rows, int m_cols,
const float* __restrict__ vector, int n_batch, float* __restrict__ result);
void PortableSparseMatrixBatchVectorMultiplyAccumulate(
const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
float* __restrict__ result);
void PortableSparseMatrixBatchVectorMultiplyAccumulate(
const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
const int m_cols, const int8_t* __restrict__ vectors,
const float* scaling_factors, int n_batch, float* __restrict__ result);
// Dot product of two vectors.
float PortableVectorVectorDotProduct(const float* vector1, const float* vector2,
int v_size);
void PortableBatchVectorBatchVectorDotProduct(const int16_t* vector1,
const int16_t* vector2,
int v_size, int n_batch,
int32_t* result);
void PortableVectorBatchVectorCwiseProductAccumulate(
const int16_t* vector, int v_size, const int16_t* batch_vector, int n_batch,
int32_t multiplier, int shift, int16_t* result);
void PortableMatrixBatchVectorMultiplyAccumulate(
const int8_t* input, const int32_t* bias,
const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
int32_t* scratch, int16_t* output, CpuBackendContext* context);
void PortableMatrixBatchVectorMultiplyAccumulate(
const int8_t* input, const int32_t* bias,
const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
int32_t* scratch, int8_t* output, CpuBackendContext* context);
void PortableMatrixBatchVectorMultiply(const int8_t* input,
int32_t input_zeropoint,
const int8_t* input_to_gate_weights,
int32_t input_to_gate_effective_scale_a,
int32_t input_to_gate_effective_scale_b,
int32_t n_batch, int32_t n_input,
int32_t n_cell, int8_t* gate_output,
int8_t gate_output_zp);
void PortableMatrixBatchVectorMultiply(
const int16_t* hidden, const int8_t* hidden_to_output_weights,
int32_t proj_effective_scale_a, int32_t proj_effective_scale_b,
const int32_t* gate_bias, int32_t n_batch, int32_t n_hidden,
int32_t n_output, int32_t output_zp, int8_t* proj_output);
void PortableMatrixScalarMultiplyAccumulate(const int8_t* matrix,
int32_t scalar, int32_t n_row,
int32_t n_col, int32_t* output);
void PortableApplyLayerNorm(const int16_t* input,
const int16_t* layer_norm_weights,
const int32_t* bias, int32_t layer_norm_scale_a,
int32_t layer_norm_scale_b, int32_t variance_limit,
int n_batch, int n_input, int16_t* output);
void PortableApplyLayerNormFloat(const int16_t* input,
const int16_t* layer_norm_weights,
int32_t layer_norm_scale_a,
int32_t layer_norm_scale_b,
const int32_t* bias, int n_batch, int n_input,
int16_t* output);
void PortableApplySigmoid(const int16_t* input, int32_t n_batch,
int32_t n_input, int16_t* output);
void PortableApplySigmoidFloat(const int16_t* input, int32_t n_batch,
int32_t n_input, int16_t* output);
void PortableApplyTanh(int32_t integer_bits, const int16_t* input,
int32_t n_batch, int32_t n_input, int16_t* output);
void PortableApplyTanhFloat(const int16_t* input, int32_t n_batch,
int32_t n_input, int32_t integer_bits,
int16_t* output);
void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2,
int n_batch, int n_input, int shift, int16_t* output);
void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2,
int32_t multiplier, int32_t shift, int32_t n_batch,
int32_t n_input, int32_t output_zp, int8_t* output);
void PortableCwiseAdd(const int16_t* input_1, const int16_t* input_2,
int n_batch, int n_input, int16_t* output);
template <typename T>
void PortableCwiseClipping(T* vector, const int v_size,
const T& clipping_value) {
for (int i = 0; i < v_size; i++) {
vector[i] = std::max(std::min(clipping_value, vector[i]),
static_cast<T>(-clipping_value));
}
}
// Batch vector initialization with another vector.
void PortableVectorBatchVectorAssign(const float* vector, int v_size,
int n_batch, float* batch_vector);
// Compute "1.0f - elements of vector" (used in CIFG).
void PortableSub1Vector(const float* vector, int v_size, float* result);
void PortableSub1Vector(const int16_t* vector, int v_size, int16_t* result);
// Multiply all elements of vector with a scalar.
void PortableVectorScalarMultiply(const int8_t* vector, int v_size, float scale,
float* result);
// Reduce-sum on a vector:
// input_vector: pointer to input vector.
// output_vector: pointer to vector.
// output_size: output vector size.
// reduction_size: number of consecutive elements from input vector which are
// added to get one element of output.
template <typename INPUT, typename OUTPUT>
void PortableReductionSumVector(const INPUT* input_vector,
OUTPUT* output_vector, int output_size,
int reduction_size) {
for (int o = 0; o < output_size; o++) {
OUTPUT result = 0;
for (int r = 0; r < reduction_size; r++) {
result += input_vector[r];
}
output_vector[o] = result;
input_vector += reduction_size;
}
}
// Layer norm for each batch.
void PortableMeanStddevNormalization(const float* __restrict__ input_vector,
float* __restrict__ output_vector,
int v_size, int n_batch);
// Saturate Add.
void PortableTwoGateSaturatingAdd(const int8_t* input, int8_t input_zp,
const int8_t* recurrent, int8_t recurrent_zp,
int32_t input_effective_scale_a,
int32_t input_effective_scale_b,
int32_t recurrent_effective_scale_a,
int32_t recurrent_effective_scale_b,
int32_t n_batch, int32_t n_cell,
int16_t* output);
} // namespace tensor_utils
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_IMPL_H_

View File

@@ -0,0 +1,109 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PRELU_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PRELU_H_
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
// Broadcast prelu to output_shape for quantized uint8_t/int8_t data.
template <typename T>
inline void BroadcastPrelu4DSlow(
const PreluParams& params, const RuntimeShape& input_shape,
const T* input_data, const RuntimeShape& alpha_shape, const T* alpha_data,
const RuntimeShape& output_shape, T* output_data) {
TFLITE_DCHECK_LE(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(alpha_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(output_shape.DimensionsCount(), 4);
const RuntimeShape extended_output_shape =
RuntimeShape::ExtendedShape(4, output_shape);
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
NdArrayDescsForElementwiseBroadcast(input_shape, alpha_shape, &desc1, &desc2);
for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
int output_index = Offset(extended_output_shape, b, y, x, c);
int input_index = SubscriptToIndex(desc1, b, y, x, c);
const int32_t input_value =
params.input_offset + input_data[input_index];
int32_t output_value;
if (input_value >= 0) {
output_value = MultiplyByQuantizedMultiplier(
input_value, params.output_multiplier_1, params.output_shift_1);
} else {
auto alpha_index = SubscriptToIndex(desc2, b, y, x, c);
const int32_t alpha_value =
params.alpha_offset + alpha_data[alpha_index];
output_value = MultiplyByQuantizedMultiplier(
input_value * alpha_value, params.output_multiplier_2,
params.output_shift_2);
}
output_value += params.output_offset;
const int32_t quantized_min = std::numeric_limits<T>::min();
const int32_t quantized_max = std::numeric_limits<T>::max();
const int32_t clamped_output =
std::min(quantized_max, std::max(quantized_min, output_value));
output_data[output_index] = static_cast<T>(clamped_output);
}
}
}
}
}
template <typename T>
inline void Prelu(const PreluParams& params, const RuntimeShape& input_shape,
const T* input_data, const RuntimeShape& alpha_shape,
const T* alpha_data, const RuntimeShape& output_shape,
T* output_data) {
const int32_t quantized_min = std::numeric_limits<T>::min();
const int32_t quantized_max = std::numeric_limits<T>::max();
const int flat_size =
MatchingElementsSize(input_shape, alpha_shape, output_shape);
for (int i = 0; i < flat_size; ++i) {
const int32_t input_value = params.input_offset + input_data[i];
int32_t output_value;
if (input_value >= 0) {
output_value = MultiplyByQuantizedMultiplier(
input_value, params.output_multiplier_1, params.output_shift_1);
} else {
const int32_t alpha_value = params.alpha_offset + alpha_data[i];
output_value = MultiplyByQuantizedMultiplier(input_value * alpha_value,
params.output_multiplier_2,
params.output_shift_2);
}
output_value += params.output_offset;
const int32_t clamped_output =
std::min(quantized_max, std::max(quantized_min, output_value));
output_data[i] = static_cast<T>(clamped_output);
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PRELU_H_

View File

@@ -0,0 +1,138 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PROCESS_BROADCAST_SHAPES_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PROCESS_BROADCAST_SHAPES_H_
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
// Consolidates dimensions in broadcast inputs, checks for five-fold pattern.
//
// For example, if sequence of dimensions of one input is
// ..., 1, 3, 1, 7, 9, 5,... and the other is ..., 2, 3, 1, 7, 1, 1, ...
// we can consolidate these as
// ..., 1, 3*7, 9*5, ... and 2, 3*7, 1.
//
// The category is updated in the less-frequent case of shapes that are
// not suited to a fivefold-loop broadcast.
//
// Falls back to generic pattern when it does not know how to process properly.
//
// Returns true iff there is some sort of broadcast, which includes five-fold
// patterns and falling back to generic broadcast.
inline bool ProcessBroadcastShapes(const RuntimeShape& shape0,
const RuntimeShape& shape1,
tflite::ArithmeticParams* params) {
const int dims_count =
std::max(shape0.DimensionsCount(), shape1.DimensionsCount());
params->broadcast_category = BroadcastableOpCategory::kGenericBroadcast;
RuntimeShape scalar_shape(dims_count, 1);
auto extended_shape0 = RuntimeShape::ExtendedShape(dims_count, shape0);
auto extended_shape1 = RuntimeShape::ExtendedShape(dims_count, shape1);
// Check for "exact" match, implicitly accepting any scalar shapes.
if (extended_shape0 == extended_shape1) {
params->broadcast_category = BroadcastableOpCategory::kNonBroadcast;
return false;
}
for (int i = dims_count - 1; i >= 0; --i) {
if (extended_shape0.Dims(i) == extended_shape1.Dims(i)) {
continue;
} else if (extended_shape0.Dims(i) == 1) {
params->broadcast_category =
BroadcastableOpCategory::kFirstInputBroadcastsFast;
break;
} else if (extended_shape1.Dims(i) == 1) {
params->broadcast_category =
BroadcastableOpCategory::kSecondInputBroadcastsFast;
break;
} else {
// This case is erroneous: there is a dimension that does not match and
// is not a broadcast from one shape to the other.
params->broadcast_category = BroadcastableOpCategory::kGenericBroadcast;
return true;
}
}
if (params->broadcast_category !=
BroadcastableOpCategory::kFirstInputBroadcastsFast &&
params->broadcast_category !=
BroadcastableOpCategory::kSecondInputBroadcastsFast) {
// This is unreachable because at least one else clause in the above loop
// must be reached.
TFLITE_DCHECK(false);
params->broadcast_category = BroadcastableOpCategory::kNonBroadcast;
return false;
}
// From this point it is assumed contractually that corresponding dimensions
// in shape0 and shape1 are either (a) equal or (b) one or other equals 1.
const bool swap_inputs = params->broadcast_category ==
BroadcastableOpCategory::kSecondInputBroadcastsFast;
const RuntimeShape* shape_a =
swap_inputs ? &extended_shape1 : &extended_shape0;
const RuntimeShape* shape_b =
swap_inputs ? &extended_shape0 : &extended_shape1;
int i = dims_count - 1;
params->broadcast_shape[0] = 1;
params->broadcast_shape[1] = 1;
params->broadcast_shape[2] = 1;
params->broadcast_shape[3] = 1;
params->broadcast_shape[4] = 1;
// y_0 is greedy: include dims if both or neither equal 1: in other words,
// test for equality rather than (shape_a->Dims(i) != 1).
while (i >= 0 && shape_a->Dims(i) == shape_b->Dims(i)) {
params->broadcast_shape[4] *= shape_b->Dims(i);
--i;
}
// Here either input_a or input_b has dim of 1 (if i >= 0). If it is input_b
// that has the unit dimension, the next two loops are not entered.
while (i >= 0 && shape_a->Dims(i) == 1) {
params->broadcast_shape[3] *= shape_b->Dims(i);
--i;
}
while (i >= 0 && shape_a->Dims(i) == shape_b->Dims(i)) {
params->broadcast_shape[2] *= shape_a->Dims(i);
--i;
}
// Here either input_a or input_b has dim of 1 (if i >= 0).
while (i >= 0 && shape_b->Dims(i) == 1) {
params->broadcast_shape[1] *= shape_a->Dims(i);
--i;
}
while (i >= 0 && shape_a->Dims(i) == shape_b->Dims(i)) {
params->broadcast_shape[0] *= shape_b->Dims(i);
--i;
}
// Rarer case is when the broadcast dimensions cannot be handled by a fivefold
// loop.
if (i >= 0) {
params->broadcast_category = BroadcastableOpCategory::kGenericBroadcast;
}
return true;
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PROCESS_BROADCAST_SHAPES_H_

View File

@@ -0,0 +1,89 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_QUANTIZE_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_QUANTIZE_H_
#include <algorithm>
#include <limits>
#include <vector>
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
template <typename InputT, typename OutputT>
inline void AffineQuantize(const tflite::QuantizationParams& op_params,
const RuntimeShape& input_shape,
const InputT* input_data,
const RuntimeShape& output_shape,
OutputT* output_data) {
const int32_t zero_point = op_params.zero_point;
const double scale = op_params.scale;
const int flat_size = MatchingFlatSize(input_shape, output_shape);
static constexpr int32_t min_val = std::numeric_limits<OutputT>::min();
static constexpr int32_t max_val = std::numeric_limits<OutputT>::max();
for (int i = 0; i < flat_size; i++) {
const InputT val = input_data[i];
int32_t unclamped =
static_cast<int32_t>(TfLiteRound(val / static_cast<float>(scale))) +
zero_point;
int32_t clamped = std::min(std::max(unclamped, min_val), max_val);
output_data[i] = clamped;
}
}
// Quantizes per-channel.
template <typename InputT, typename OutputT>
inline void PerChannelQuantize(
const tflite::PerChannelQuantizationParams& op_params,
const RuntimeShape& input_shape, const InputT* input_data,
const RuntimeShape& output_shape, OutputT* output_data) {
// Ensure flat size is same.
MatchingFlatSize(input_shape, output_shape);
const int32_t* zero_point = op_params.zero_point;
const float* scale = op_params.scale;
const int32_t quantized_dimension = op_params.quantized_dimension;
const int32_t num_dims = input_shape.DimensionsCount();
const int32_t* dims_data = input_shape.DimsData();
std::vector<int> current_dim(num_dims, 0);
static constexpr int32_t min_val = std::numeric_limits<OutputT>::min();
static constexpr int32_t max_val = std::numeric_limits<OutputT>::max();
do {
size_t offset =
ReducedOutputOffset(num_dims, reinterpret_cast<const int*>(dims_data),
current_dim.data(), 0, nullptr);
const InputT val = input_data[offset];
const int channel = current_dim[quantized_dimension];
int32_t unclamped = static_cast<int32_t>(TfLiteRound(
val / static_cast<float>(scale[channel]))) +
zero_point[channel];
int32_t clamped = std::min(std::max(unclamped, min_val), max_val);
output_data[offset] = static_cast<OutputT>(clamped);
} while (NextIndex(num_dims, reinterpret_cast<const int*>(dims_data),
current_dim.data()));
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_QUANTIZE_H_

View File

@@ -0,0 +1,524 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REDUCE_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REDUCE_H_
#include "ruy/profiler/instrumentation.h" // from @ruy
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/max.h"
#include "tensorflow/lite/kernels/internal/min.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/types.h"
// Check if the reduction at index is the first one along the dimensions given
// in axis.
inline bool IsFirstReduction(const int* index, const int num_axis,
const int* axis) {
if (num_axis == 0) {
return true;
}
TFLITE_DCHECK(index != nullptr);
TFLITE_DCHECK(axis != nullptr);
for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx) {
if (index[axis[axis_idx]] != 0) {
return false;
}
}
return true;
}
namespace tflite {
namespace reference_ops {
// A generic reduce method that can be used for reduce_sum, reduce_mean, etc.
// This method iterates through input data and reduce elements along the
// dimensions given in axis.
template <typename In, typename Out>
inline bool Reduce(const In* input_data, const int* input_dims,
const int* output_dims, const int input_num_dims,
const int output_num_dims, const int* axis,
const int num_axis, int* input_iter,
Out reducer(Out current, const In in), Out* output_data) {
// Reset input iterator.
for (int idx = 0; idx < input_num_dims; ++idx) {
input_iter[idx] = 0;
}
// Iterate through input_data.
do {
size_t input_offset =
ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr);
size_t output_offset = ReducedOutputOffset(input_num_dims, input_dims,
input_iter, num_axis, axis);
output_data[output_offset] =
reducer(output_data[output_offset], input_data[input_offset]);
} while (NextIndex(input_num_dims, input_dims, input_iter));
return true;
}
// Similar to above Reduce function but takes two reducer functions.
// The 'reducer_first' is called with the first value of the reduction,
// 'reducer_next' is then called for all the others.
template <typename In, typename Out>
inline bool Reduce(const In* input_data, const int* input_dims,
const int* output_dims, const int input_num_dims,
const int output_num_dims, const int* axis,
const int num_axis, int* input_iter,
const std::function<Out(In in)>& reducer_first,
const std::function<Out(Out current, In in)>& reducer_next,
Out* output_data) {
// Reset input iterator.
for (int idx = 0; idx < input_num_dims; ++idx) {
input_iter[idx] = 0;
}
// Iterate through input_data.
do {
size_t input_offset =
ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr);
size_t output_offset = ReducedOutputOffset(input_num_dims, input_dims,
input_iter, num_axis, axis);
if (IsFirstReduction(input_iter, num_axis, axis)) {
output_data[output_offset] = reducer_first(input_data[input_offset]);
} else {
output_data[output_offset] =
reducer_next(output_data[output_offset], input_data[input_offset]);
}
} while (NextIndex(input_num_dims, input_dims, input_iter));
return true;
}
// This method parses the input 'axis' to remove duplicates and handle negative
// values, and returns a valid 'out_axis'
inline bool ResolveAxis(const int num_dims, const int* axis,
const int64_t num_axis, int* out_axis,
int* out_num_axis) {
*out_num_axis = 0; // Just in case.
// Short-circuit axis resolution for scalars; the axis will go unused.
if (num_dims == 0) {
return true;
}
// o(n^2) is fine since out_num_axis should be really small, mostly <= 4
for (int64_t idx = 0; idx < num_axis; ++idx) {
// Handle negative index. A positive index 'p_idx' can be represented as a
// negative index 'n_idx' as: n_idx = p_idx-num_dims
// eg: For num_dims=3, [0, 1, 2] is the same as [-3, -2, -1] */
int current = axis[idx] < 0 ? (axis[idx] + num_dims) : axis[idx];
TFLITE_DCHECK(current >= 0 && current < num_dims);
if (current < 0 || current >= num_dims) {
return false;
}
bool is_dup = false;
for (int j = 0; j < *out_num_axis; ++j) {
if (out_axis[j] == current) {
is_dup = true;
break;
}
}
if (!is_dup) {
out_axis[*out_num_axis] = current;
*out_num_axis += 1;
}
}
return true;
}
// This method expects that output_data has been initialized.
template <typename In, typename Out>
inline bool ReduceSumImpl(const In* input_data, const int* input_dims,
const int* output_dims, const int input_num_dims,
const int output_num_dims, const int* axis,
const int num_axis, int* input_iter,
Out* output_data) {
auto reducer = [](const Out current, const In in) -> Out {
const Out actual_in = static_cast<Out>(in);
return current + actual_in;
};
return Reduce<In, Out>(input_data, input_dims, output_dims, input_num_dims,
output_num_dims, axis, num_axis, input_iter, reducer,
output_data);
}
template <typename T>
inline bool InitTensorDataForReduce(const int* dims, const int num_dims,
const T init_value, T* data) {
size_t num_elements = 1;
for (int idx = 0; idx < num_dims; ++idx) {
size_t current = static_cast<size_t>(dims[idx]);
// Overflow prevention.
if (current > 0 &&
num_elements > std::numeric_limits<size_t>::max() / current) {
return false;
}
num_elements *= current;
}
for (size_t idx = 0; idx < num_elements; ++idx) {
data[idx] = init_value;
}
return true;
}
// Computes the generic value (i.e., sum/max/min/prod) of elements across
// dimensions given in axis. It needs to pass in init_value and reducer.
template <typename T>
inline bool ReduceGeneric(const T* input_data, const int* input_dims,
const int input_num_dims, T* output_data,
const int* output_dims, const int output_num_dims,
const int* axis, const int64_t num_axis_dimensions,
bool keep_dims, int* temp_index, int* resolved_axis,
T init_value,
T reducer(const T current, const T in)) {
// Reset output data.
if (!InitTensorDataForReduce(output_dims, output_num_dims, init_value,
output_data)) {
return false;
}
// Return early when input shape has zero dim. This is done after initializing
// data for output tensor because there are cases that the input tensor is
// empty but output tensor is not. In that case, output tensor should be
// filled with init_value.
for (int i = 0; i < input_num_dims; ++i) {
if (input_dims[i] == 0) return true;
}
// Resolve axis.
int num_resolved_axis = 0;
if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis,
&num_resolved_axis)) {
return false;
}
return Reduce<T, T>(input_data, input_dims, output_dims, input_num_dims,
output_num_dims, resolved_axis, num_resolved_axis,
temp_index, reducer, output_data);
}
// Computes the mean of elements across dimensions given in axis.
// It does so in two stages, first calculates the sum of elements along the axis
// then divides it by the number of element in axis.
template <typename T, typename U>
inline bool Mean(const T* input_data, const int* input_dims,
const int input_num_dims, T* output_data,
const int* output_dims, const int output_num_dims,
const int* axis, const int num_axis_dimensions, bool keep_dims,
int* temp_index, int* resolved_axis, U* temp_sum) {
ruy::profiler::ScopeLabel label("Mean");
// Reset output data.
size_t num_outputs = 1;
for (int idx = 0; idx < output_num_dims; ++idx) {
size_t current = static_cast<size_t>(output_dims[idx]);
// Overflow prevention.
if (num_outputs > std::numeric_limits<size_t>::max() / current) {
return false;
}
num_outputs *= current;
}
for (size_t idx = 0; idx < num_outputs; ++idx) {
output_data[idx] = T();
temp_sum[idx] = U();
}
// Resolve axis.
int num_resolved_axis = 0;
if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis,
&num_resolved_axis)) {
return false;
}
if (!ReduceSumImpl<T, U>(input_data, input_dims, output_dims, input_num_dims,
output_num_dims, resolved_axis, num_resolved_axis,
temp_index, temp_sum)) {
return false;
}
// Calculate mean by dividing output_data by num of aggregated element.
size_t num_elements_in_axis = 1;
for (int idx = 0; idx < num_resolved_axis; ++idx) {
size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]);
// Overflow prevention.
if (current > (std::numeric_limits<size_t>::max() / num_elements_in_axis)) {
return false;
}
num_elements_in_axis *= current;
}
if (num_elements_in_axis > 0) {
for (size_t idx = 0; idx < num_outputs; ++idx) {
output_data[idx] =
static_cast<T>(temp_sum[idx] / static_cast<U>(num_elements_in_axis));
}
}
return true;
}
template <typename T>
inline void Mean(const tflite::MeanParams& op_params,
const RuntimeShape& unextended_input_shape,
const T* input_data,
const RuntimeShape& unextended_output_shape, T* output_data) {
ruy::profiler::ScopeLabel label("Mean4D");
// Current implementation only supports dimension equals 4 and simultaneous
// reduction over width and height.
TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4);
TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4);
const RuntimeShape input_shape =
RuntimeShape::ExtendedShape(4, unextended_input_shape);
const RuntimeShape output_shape =
RuntimeShape::ExtendedShape(4, unextended_output_shape);
const int output_batch = output_shape.Dims(0);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
const int output_depth = output_shape.Dims(3);
const int input_height = input_shape.Dims(1);
const int input_width = input_shape.Dims(2);
TFLITE_CHECK_EQ(op_params.axis_count, 2);
TFLITE_CHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
(op_params.axis[0] == 2 && op_params.axis[1] == 1));
TFLITE_CHECK_EQ(output_height, 1);
TFLITE_CHECK_EQ(output_width, 1);
for (int out_b = 0; out_b < output_batch; ++out_b) {
for (int out_d = 0; out_d < output_depth; ++out_d) {
float value = 0;
for (int in_h = 0; in_h < input_height; ++in_h) {
for (int in_w = 0; in_w < input_width; ++in_w) {
value += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)];
}
}
output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
value / (input_width * input_height);
}
}
}
inline void Mean(const tflite::MeanParams& op_params,
const RuntimeShape& unextended_input_shape,
const uint8_t* input_data, int32_t input_zero_point,
float input_scale, const RuntimeShape& unextended_output_shape,
uint8_t* output_data, int32_t output_zero_point,
float output_scale) {
ruy::profiler::ScopeLabel label("Mean4D/Uint8");
// Current implementation only supports dimension equals 4 and simultaneous
// reduction over width and height.
TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4);
TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4);
const RuntimeShape input_shape =
RuntimeShape::ExtendedShape(4, unextended_input_shape);
const RuntimeShape output_shape =
RuntimeShape::ExtendedShape(4, unextended_output_shape);
const int output_batch = output_shape.Dims(0);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
const int output_depth = output_shape.Dims(3);
const int input_height = input_shape.Dims(1);
const int input_width = input_shape.Dims(2);
const float num_elements_in_axis = input_width * input_height;
TFLITE_CHECK_EQ(op_params.axis_count, 2);
TFLITE_CHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
(op_params.axis[0] == 2 && op_params.axis[1] == 1));
TFLITE_CHECK_EQ(output_height, 1);
TFLITE_CHECK_EQ(output_width, 1);
constexpr int32_t kMinValue = std::numeric_limits<uint8_t>::min();
constexpr int32_t kMaxValue = std::numeric_limits<uint8_t>::max();
float temp = input_zero_point * input_scale / output_scale;
temp = temp > 0 ? temp + 0.5f : temp - 0.5f;
int32_t bias = output_zero_point - static_cast<int32_t>(temp);
double real_scale =
static_cast<double>(input_scale / (num_elements_in_axis * output_scale));
int32_t multiplier;
int shift;
QuantizeMultiplier(real_scale, &multiplier, &shift);
for (int out_b = 0; out_b < output_batch; ++out_b) {
for (int out_d = 0; out_d < output_depth; ++out_d) {
int32_t acc = 0;
for (int in_h = 0; in_h < input_height; ++in_h) {
for (int in_w = 0; in_w < input_width; ++in_w) {
acc += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)];
}
}
acc = MultiplyByQuantizedMultiplier(acc, multiplier, shift);
acc += bias;
acc = std::min(std::max(acc, kMinValue), kMaxValue);
output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
static_cast<uint8_t>(acc);
}
}
}
// Computes the mean of elements across dimensions given in axis.
// It does so in two stages, first calculates the sum of elements along the axis
// then divides it by the number of element in axis for quantized values.
template <typename T, typename U>
inline bool QuantizedMeanOrSum(const T* input_data, int32_t input_zero_point,
float input_scale, const int* input_dims,
const int input_num_dims, T* output_data,
int32_t output_zero_point, float output_scale,
const int* output_dims,
const int output_num_dims, const int* axis,
const int num_axis_dimensions, bool keep_dims,
int* temp_index, int* resolved_axis, U* temp_sum,
bool compute_sum) {
const bool uint8_case = std::is_same<T, uint8_t>::value;
const bool int16_case = std::is_same<T, int16_t>::value;
if (uint8_case) {
ruy::profiler::ScopeLabel label(compute_sum ? "Sum/Uint8" : "Mean/Uint8");
} else if (int16_case) {
ruy::profiler::ScopeLabel label(compute_sum ? "Sum/Int16" : "Mean/Int16");
} else {
ruy::profiler::ScopeLabel label(compute_sum ? "Sum/Int8" : "Mean/Int8");
}
// Reset output data.
size_t num_outputs = 1;
for (int idx = 0; idx < output_num_dims; ++idx) {
size_t current = static_cast<size_t>(output_dims[idx]);
// Overflow prevention.
if (num_outputs > std::numeric_limits<size_t>::max() / current) {
return false;
}
num_outputs *= current;
}
for (size_t idx = 0; idx < num_outputs; ++idx) {
output_data[idx] = T();
temp_sum[idx] = U();
}
// Return early when input shape has zero dim. This is done after initializing
// data for output tensor because there are cases that the input tensor is
// empty but output tensor is not. In that case, output tensor should be
// filled with init_value.
for (int i = 0; i < input_num_dims; ++i) {
if (input_dims[i] == 0) return true;
}
// Resolve axis.
int num_resolved_axis = 0;
if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis,
&num_resolved_axis)) {
return false;
}
if (!ReduceSumImpl<T, U>(input_data, input_dims, output_dims, input_num_dims,
output_num_dims, resolved_axis, num_resolved_axis,
temp_index, temp_sum)) {
return false;
}
// Calculate mean by dividing output_data by num of aggregated element.
size_t num_elements_in_axis = 1;
for (int idx = 0; idx < num_resolved_axis; ++idx) {
size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]);
// Overflow prevention.
if (current > (std::numeric_limits<size_t>::max() / num_elements_in_axis)) {
return false;
}
num_elements_in_axis *= current;
}
if (num_elements_in_axis > 0) {
const float scale = input_scale / output_scale;
if (compute_sum) {
// TODO(b/116341117): Eliminate float and do this completely in 8bit.
const float bias = -input_zero_point * scale * num_elements_in_axis;
for (size_t idx = 0; idx < num_outputs; ++idx) {
const U value =
static_cast<U>(TfLiteRound(temp_sum[idx] * scale + bias)) +
output_zero_point;
output_data[idx] = static_cast<T>(value);
}
} else {
const float bias = -input_zero_point * scale;
for (size_t idx = 0; idx < num_outputs; ++idx) {
float float_mean = static_cast<float>(temp_sum[idx]) /
static_cast<float>(num_elements_in_axis);
float result = TfLiteMin(
TfLiteRound(float_mean * scale + bias) + output_zero_point,
static_cast<float>(std::numeric_limits<T>::max()));
result = TfLiteMax(result,
static_cast<float>(std::numeric_limits<T>::min()));
output_data[idx] = static_cast<T>(result);
}
}
}
return true;
}
template <typename T>
inline bool QuantizedReduceProd(const T* input_data, int32_t input_zero_point,
const RuntimeShape& input_shape, T* output_data,
int32_t output_zero_point,
const RuntimeShape& output_shape,
const int* axis,
const int64_t num_axis_dimensions,
bool keep_dims, int* temp_index,
int* resolved_axis, int32_t* temp_prod,
int32_t scaling_multiplier, int scaling_shift) {
const int32_t kMinValue = std::numeric_limits<T>::min();
const int32_t kMaxValue = std::numeric_limits<T>::max();
// Resolve axis.
int num_resolved_axis = 0;
if (!ResolveAxis(input_shape.DimensionsCount(), axis, num_axis_dimensions,
resolved_axis, &num_resolved_axis)) {
return false;
}
// Calculate the reduced product by rescaling each multiplication step to
// avoid an overflow.
auto reducer_first = [&](T in) -> int32_t { return in - input_zero_point; };
auto reducer_next = [&](int32_t current, T in) -> int32_t {
const int64_t result =
static_cast<int64_t>(current) * (in - input_zero_point);
return MultiplyByQuantizedMultiplier(result, scaling_multiplier,
scaling_shift);
};
if (!Reduce<T, int32_t>(
input_data, input_shape.DimsData(), output_shape.DimsData(),
input_shape.DimensionsCount(), output_shape.DimensionsCount(),
resolved_axis, num_resolved_axis, temp_index, reducer_first,
reducer_next, temp_prod)) {
return false;
}
for (int i = 0; i < output_shape.FlatSize(); i++) {
int32_t result =
MultiplyByQuantizedMultiplier(static_cast<int64_t>(temp_prod[i]),
scaling_multiplier, scaling_shift) +
output_zero_point;
result = std::min(std::max(result, kMinValue), kMaxValue);
output_data[i] = static_cast<T>(result);
}
return true;
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REDUCE_H_

View File

@@ -0,0 +1,68 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REQUANTIZE_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REQUANTIZE_H_
#include "ruy/profiler/instrumentation.h" // from @ruy
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
template <typename input_type, typename output_type>
inline void Requantize(const input_type* input_data, int32_t size,
int32_t effective_scale_multiplier,
int32_t effective_scale_shift, int32_t input_zeropoint,
int32_t output_zeropoint, output_type* output_data) {
ruy::profiler::ScopeLabel label("Requantize");
const bool same_scale =
(effective_scale_multiplier == 1 << 30 && effective_scale_shift == 1);
if (same_scale) {
const bool mixed_type_int8_uint8 =
std::is_same<input_type, int8_t>::value &&
std::is_same<output_type, uint8_t>::value;
const bool mixed_type_uint8_int8 =
std::is_same<input_type, uint8_t>::value &&
std::is_same<output_type, int8_t>::value;
const int32_t zero_point_diff = input_zeropoint - output_zeropoint;
// Fast path to do requantization for the case when just a shift of 128 is
// needed.
if ((mixed_type_int8_uint8 && zero_point_diff == -128) ||
(mixed_type_uint8_int8 && zero_point_diff == 128)) {
for (int i = 0; i < size; ++i) {
output_data[i] = input_data[i] ^ 0x80;
}
return;
}
}
static constexpr int32_t kMinOutput = std::numeric_limits<output_type>::min();
static constexpr int32_t kMaxOutput = std::numeric_limits<output_type>::max();
for (int i = 0; i < size; ++i) {
const int32_t input = input_data[i] - input_zeropoint;
const int32_t output =
MultiplyByQuantizedMultiplier(input, effective_scale_multiplier,
effective_scale_shift) +
output_zeropoint;
const int32_t clamped_output =
std::max(std::min(output, kMaxOutput), kMinOutput);
output_data[i] = static_cast<output_type>(clamped_output);
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REQUANTIZE_H_

View File

@@ -0,0 +1,228 @@
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_RESIZE_BILINEAR_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_RESIZE_BILINEAR_H_
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
inline void ComputeInterpolationValues(const float value, const float scale,
const bool half_pixel_centers,
int32_t input_size, float* scaled_value,
int32_t* lower_bound,
int32_t* upper_bound) {
if (half_pixel_centers) {
*scaled_value = (value + 0.5f) * scale - 0.5f;
} else {
*scaled_value = value * scale;
}
float scaled_value_floor = std::floor(*scaled_value);
*lower_bound = std::max(static_cast<int32_t>(scaled_value_floor),
static_cast<int32_t>(0));
*upper_bound =
std::min(static_cast<int32_t>(std::ceil(*scaled_value)), input_size - 1);
}
template <typename T>
inline void ResizeBilinear(const tflite::ResizeBilinearParams& op_params,
const RuntimeShape& unextended_input_shape,
const T* input_data,
const RuntimeShape& unextended_output_size_shape,
const int32_t* output_size_data,
const RuntimeShape& unextended_output_shape,
T* output_data) {
// If half_pixel_centers is True, align_corners must be False.
TFLITE_DCHECK(!op_params.half_pixel_centers || !op_params.align_corners);
TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_output_size_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
const RuntimeShape input_shape =
RuntimeShape::ExtendedShape(4, unextended_input_shape);
const RuntimeShape output_size_shape =
RuntimeShape::ExtendedShape(4, unextended_output_size_shape);
const RuntimeShape output_shape =
RuntimeShape::ExtendedShape(4, unextended_output_shape);
int32_t batches = MatchingDim(input_shape, 0, output_shape, 0);
int32_t input_height = input_shape.Dims(1);
int32_t input_width = input_shape.Dims(2);
int32_t depth = MatchingDim(input_shape, 3, output_shape, 3);
TFLITE_DCHECK_EQ(output_size_shape.Dims(0), 1);
TFLITE_DCHECK_EQ(output_size_shape.Dims(1), 1);
TFLITE_DCHECK_EQ(output_size_shape.Dims(2), 1);
TFLITE_DCHECK_EQ(output_size_shape.Dims(3), 2);
int32_t output_height =
output_size_data[Offset(output_size_shape, 0, 0, 0, 0)];
int32_t output_width =
output_size_data[Offset(output_size_shape, 0, 0, 0, 1)];
float height_scale = static_cast<float>(input_height) / output_height;
float width_scale = static_cast<float>(input_width) / output_width;
if (op_params.align_corners && output_height > 1) {
height_scale = static_cast<float>(input_height - 1) / (output_height - 1);
}
if (op_params.align_corners && output_width > 1) {
width_scale = static_cast<float>(input_width - 1) / (output_width - 1);
}
const float rounding_offset = std::numeric_limits<T>::is_integer ? .5f : .0f;
for (int b = 0; b < batches; ++b) {
for (int y = 0; y < output_height; ++y) {
float input_y;
int32_t y0, y1;
ComputeInterpolationValues(y, height_scale, op_params.half_pixel_centers,
input_height, &input_y, &y0, &y1);
for (int x = 0; x < output_width; ++x) {
float input_x;
int32_t x0, x1;
ComputeInterpolationValues(x, width_scale, op_params.half_pixel_centers,
input_width, &input_x, &x0, &x1);
for (int c = 0; c < depth; ++c) {
T interpolation =
static_cast<T>(input_data[Offset(input_shape, b, y0, x0, c)] *
(1 - (input_y - y0)) * (1 - (input_x - x0)) +
input_data[Offset(input_shape, b, y1, x0, c)] *
(input_y - y0) * (1 - (input_x - x0)) +
input_data[Offset(input_shape, b, y0, x1, c)] *
(1 - (input_y - y0)) * (input_x - x0) +
input_data[Offset(input_shape, b, y1, x1, c)] *
(input_y - y0) * (input_x - x0) +
rounding_offset);
output_data[Offset(output_shape, b, y, x, c)] = interpolation;
}
}
}
}
}
inline void ComputeInterpolationValuesInteger(
const int32_t value, const int32_t scale_10, const bool half_pixel_centers,
int32_t input_size, int32_t* scaled_value, int32_t* lower_bound,
int32_t* upper_bound) {
if (half_pixel_centers) {
*scaled_value = value * scale_10 + scale_10 / 2 - (1 << 9);
} else {
*scaled_value = value * scale_10;
}
constexpr int32_t zero = 0;
*lower_bound = std::max(*scaled_value / (1 << 10), zero);
*upper_bound =
std::min((*scaled_value + (1 << 10) - 1) / (1 << 10), input_size - 1);
}
// Same as above but doesn't use any floating-point for the resize
template <typename T>
inline void ResizeBilinearInteger(
const tflite::ResizeBilinearParams& op_params,
const RuntimeShape& unextended_input_shape, const T* input_data,
const RuntimeShape& unextended_output_size_shape,
const int32_t* output_size_data,
const RuntimeShape& unextended_output_shape, T* output_data) {
// If half_pixel_centers is True, align_corners must be False.
TFLITE_DCHECK(!op_params.half_pixel_centers || !op_params.align_corners);
TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_output_size_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
const RuntimeShape input_shape =
RuntimeShape::ExtendedShape(4, unextended_input_shape);
const RuntimeShape output_size_shape =
RuntimeShape::ExtendedShape(4, unextended_output_size_shape);
const RuntimeShape output_shape =
RuntimeShape::ExtendedShape(4, unextended_output_shape);
const int32_t batches = MatchingDim(input_shape, 0, output_shape, 0);
const int32_t input_height = input_shape.Dims(1);
const int32_t input_width = input_shape.Dims(2);
const int32_t depth = MatchingDim(input_shape, 3, output_shape, 3);
TFLITE_DCHECK_EQ(output_size_shape.Dims(0), 1);
TFLITE_DCHECK_EQ(output_size_shape.Dims(1), 1);
TFLITE_DCHECK_EQ(output_size_shape.Dims(2), 1);
TFLITE_DCHECK_EQ(output_size_shape.Dims(3), 2);
const int32_t output_height =
output_size_data[Offset(output_size_shape, 0, 0, 0, 0)];
const int32_t output_width =
output_size_data[Offset(output_size_shape, 0, 0, 0, 1)];
int32_t height_scale_10 =
((1 << 10) * input_height + output_height / 2) / output_height;
int32_t width_scale_10 =
((1 << 10) * input_width + output_width / 2) / output_width;
if (op_params.align_corners && output_height > 1) {
height_scale_10 =
((1 << 10) * (input_height - 1) + (output_height - 1) / 2) /
(output_height - 1);
}
if (op_params.align_corners && output_width > 1) {
width_scale_10 = ((1 << 10) * (input_width - 1) + (output_width - 1) / 2) /
(output_width - 1);
}
for (int b = 0; b < batches; ++b) {
for (int y = 0; y < output_height; ++y) {
int32_t input_y, y0, y1;
ComputeInterpolationValuesInteger(y, height_scale_10,
op_params.half_pixel_centers,
input_height, &input_y, &y0, &y1);
for (int x = 0; x < output_width; ++x) {
int32_t input_x, x0, x1;
ComputeInterpolationValuesInteger(x, width_scale_10,
op_params.half_pixel_centers,
input_width, &input_x, &x0, &x1);
for (int c = 0; c < depth; ++c) {
const int64_t output_20_ll =
static_cast<int64_t>(
input_data[Offset(input_shape, b, y0, x0, c)]) *
((1 << 10) - (input_y - (1 << 10) * y0)) *
((1 << 10) - (input_x - (1 << 10) * x0));
const int64_t output_20_lu =
static_cast<int64_t>(
input_data[Offset(input_shape, b, y1, x0, c)]) *
(input_y - (1 << 10) * y0) *
((1 << 10) - (input_x - (1 << 10) * x0));
const int64_t output_20_rl =
static_cast<int64_t>(
input_data[Offset(input_shape, b, y0, x1, c)]) *
((1 << 10) - (input_y - (1 << 10) * y0)) *
(input_x - (1 << 10) * x0);
const int64_t output_20_ru =
static_cast<int64_t>(
input_data[Offset(input_shape, b, y1, x1, c)]) *
(input_y - (1 << 10) * y0) * (input_x - (1 << 10) * x0);
const int64_t output_20 =
output_20_ll + output_20_lu + output_20_rl + output_20_ru;
const int64_t round = (output_20 > 0) ? (1 << 19) : -(1 << 19);
const T interpolation =
static_cast<T>((output_20 + round) / (1 << 20));
output_data[Offset(output_shape, b, y, x, c)] = interpolation;
}
}
}
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_RESIZE_BILINEAR_H_

View File

@@ -0,0 +1,101 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_RESIZE_NEAREST_NEIGHBOR_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_RESIZE_NEAREST_NEIGHBOR_H_
#include <cmath>
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
inline int32_t GetNearestNeighbor(const int input_value,
const int32_t input_size,
const int32_t output_size,
const bool align_corners,
const bool half_pixel_centers) {
const float scale =
(align_corners && output_size > 1)
? (input_size - 1) / static_cast<float>(output_size - 1)
: input_size / static_cast<float>(output_size);
const float offset = half_pixel_centers ? 0.5f : 0.0f;
int32_t output_value = std::min(
align_corners
? static_cast<int32_t>(TfLiteRound((input_value + offset) * scale))
: static_cast<int32_t>(std::floor((input_value + offset) * scale)),
input_size - 1);
if (half_pixel_centers) {
output_value = std::max(static_cast<int32_t>(0), output_value);
}
return output_value;
}
template <typename T>
inline void ResizeNearestNeighbor(
const tflite::ResizeNearestNeighborParams& op_params,
const RuntimeShape& unextended_input_shape, const T* input_data,
const RuntimeShape& output_size_shape, const int32_t* output_size_data,
const RuntimeShape& unextended_output_shape, T* output_data) {
TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
const RuntimeShape input_shape =
RuntimeShape::ExtendedShape(4, unextended_input_shape);
const RuntimeShape output_shape =
RuntimeShape::ExtendedShape(4, unextended_output_shape);
int32_t batches = MatchingDim(input_shape, 0, output_shape, 0);
int32_t input_height = input_shape.Dims(1);
int32_t input_width = input_shape.Dims(2);
int32_t depth = MatchingDim(input_shape, 3, output_shape, 3);
// The Tensorflow version of this op allows resize on the width and height
// axis only.
TFLITE_DCHECK_EQ(output_size_shape.FlatSize(), 2);
int32_t output_height = output_size_data[0];
int32_t output_width = output_size_data[1];
const int col_offset = input_shape.Dims(3);
const int row_offset = input_shape.Dims(2) * col_offset;
const int batch_offset = input_shape.Dims(1) * row_offset;
const T* input_ptr = input_data;
T* output_ptr = output_data;
for (int b = 0; b < batches; ++b) {
for (int y = 0; y < output_height; ++y) {
int32_t in_y = GetNearestNeighbor(y, input_height, output_height,
op_params.align_corners,
op_params.half_pixel_centers);
const T* y_input_ptr = input_ptr + in_y * row_offset;
for (int x = 0; x < output_width; ++x) {
int32_t in_x = GetNearestNeighbor(x, input_width, output_width,
op_params.align_corners,
op_params.half_pixel_centers);
const T* x_input_ptr = y_input_ptr + in_x * col_offset;
memcpy(output_ptr, x_input_ptr, depth * sizeof(T));
output_ptr += depth;
}
}
input_ptr += batch_offset;
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_RESIZE_NEAREST_NEIGHBOR_H_

View File

@@ -0,0 +1,51 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ROUND_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ROUND_H_
#include <cmath>
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
inline float RoundToNearest(float value) {
auto floor_val = std::floor(value);
auto diff = value - floor_val;
if ((diff < 0.5f) ||
((diff == 0.5f) && (static_cast<int>(floor_val) % 2 == 0))) {
return floor_val;
} else {
return floor_val = floor_val + 1.0f;
}
}
inline void Round(const RuntimeShape& input_shape, const float* input_data,
const RuntimeShape& output_shape, float* output_data) {
const int flat_size = MatchingFlatSize(input_shape, output_shape);
for (int i = 0; i < flat_size; ++i) {
// Note that this implementation matches that of tensorFlow tf.round
// and corresponds to the bankers rounding method.
// cfenv (for fesetround) is not yet supported universally on Android, so
// using a work around.
output_data[i] = RoundToNearest(input_data[i]);
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ROUND_H_

View File

@@ -0,0 +1,80 @@
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SLICE_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SLICE_H_
#include "tensorflow/lite/kernels/internal/portable_tensor.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
template <typename T>
inline void Slice(const tflite::SliceParams& op_params,
const RuntimeShape& input_shape,
const RuntimeShape& output_shape,
SequentialTensorWriter<T>* writer) {
const RuntimeShape ext_shape = RuntimeShape::ExtendedShape(5, input_shape);
TFLITE_DCHECK_LE(op_params.begin_count, 5);
TFLITE_DCHECK_LE(op_params.size_count, 5);
const int begin_count = op_params.begin_count;
const int size_count = op_params.size_count;
// We front-pad the begin and size vectors.
int start[5];
int stop[5];
for (int i = 0; i < 5; ++i) {
int padded_i = 5 - i;
start[i] =
begin_count < padded_i ? 0 : op_params.begin[begin_count - padded_i];
stop[i] =
(size_count < padded_i || op_params.size[size_count - padded_i] == -1)
? ext_shape.Dims(i)
: start[i] + op_params.size[size_count - padded_i];
}
for (int i0 = start[0]; i0 < stop[0]; ++i0) {
for (int i1 = start[1]; i1 < stop[1]; ++i1) {
for (int i2 = start[2]; i2 < stop[2]; ++i2) {
for (int i3 = start[3]; i3 < stop[3]; ++i3) {
for (int i4 = start[4]; i4 < stop[4]; ++i4) {
writer->Write(Offset(ext_shape, i0, i1, i2, i3, i4));
}
}
}
}
}
}
template <typename T>
inline void Slice(const tflite::SliceParams& op_params,
const RuntimeShape& input_shape, const T* input_data,
const RuntimeShape& output_shape, T* output_data) {
SequentialTensorWriter<T> writer(input_data, output_data);
return Slice(op_params, input_shape, output_shape, &writer);
}
template <typename T>
inline void Slice(const tflite::SliceParams& op_params,
const RuntimeShape& input_shape, const TfLiteTensor* input,
const RuntimeShape& output_shape, TfLiteTensor* output) {
SequentialTensorWriter<T> writer(input, output);
return Slice(op_params, input_shape, output_shape, &writer);
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SLICE_H_

View File

@@ -0,0 +1,232 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SOFTMAX_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SOFTMAX_H_
#include <limits>
#include "fixedpoint/fixedpoint.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/types.h"
#include "tensorflow/lite/kernels/op_macros.h"
namespace tflite {
namespace reference_ops {
inline void Softmax(const SoftmaxParams& params,
const RuntimeShape& input_shape, const float* input_data,
const RuntimeShape& output_shape, float* output_data) {
const int trailing_dim = input_shape.DimensionsCount() - 1;
const int outer_size =
MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
const int depth =
MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
for (int i = 0; i < outer_size; ++i) {
// Find max element value which we'll use to ensure numerical stability
// taking advantage of the following equality:
// exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))
float max = std::numeric_limits<float>::lowest();
for (int c = 0; c < depth; ++c) {
max = std::max(max, input_data[i * depth + c]);
}
// Compute sum.
float sum = 0.f;
for (int c = 0; c < depth; ++c) {
const float exp_c = std::exp((input_data[i * depth + c] - max) *
static_cast<float>(params.beta));
output_data[i * depth + c] = exp_c;
sum += exp_c;
}
// Compute result.
for (int c = 0; c < depth; ++c) {
output_data[i * depth + c] = output_data[i * depth + c] / sum;
}
}
}
// Quantized softmax with int8_t/uint8_t input and int8_t/uint8_t/int16_t
// output.
template <typename InputT, typename OutputT>
inline void Softmax(const SoftmaxParams& params,
const RuntimeShape& input_shape, const InputT* input_data,
const RuntimeShape& output_shape, OutputT* output_data) {
const int32_t input_beta_multiplier = params.input_multiplier;
const int32_t input_beta_left_shift = params.input_left_shift;
const int diff_min = params.diff_min;
// The representation chosen for the input to the exp() function is Q5.26.
// We need to leave extra space since values that we skip might be as large as
// -32 before multiplying by input_beta_multiplier, and therefore as large as
// -16 afterwards. Note that exp(-8) is definitely not insignificant to
// accumulation, but exp(-16) definitely is.
static const int kScaledDiffIntegerBits = 5;
static const int kAccumulationIntegerBits = 12;
using FixedPointScaledDiff =
gemmlowp::FixedPoint<int32_t, kScaledDiffIntegerBits>;
using FixedPointAccum =
gemmlowp::FixedPoint<int32_t, kAccumulationIntegerBits>;
using FixedPoint0 = gemmlowp::FixedPoint<int32_t, 0>;
const int trailing_dim = input_shape.DimensionsCount() - 1;
const int outer_size =
MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
const int depth =
MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
for (int i = 0; i < outer_size; ++i) {
InputT max_in_row = std::numeric_limits<InputT>::min();
for (int c = 0; c < depth; ++c) {
max_in_row = std::max(max_in_row, input_data[i * depth + c]);
}
FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
for (int c = 0; c < depth; ++c) {
int32_t input_diff =
static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
if (input_diff >= diff_min) {
const int32_t input_diff_rescaled =
MultiplyByQuantizedMultiplierGreaterThanOne(
input_diff, input_beta_multiplier, input_beta_left_shift);
const FixedPointScaledDiff scaled_diff_f8 =
FixedPointScaledDiff::FromRaw(input_diff_rescaled);
sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
exp_on_negative_values(scaled_diff_f8));
}
}
int num_bits_over_unit;
FixedPoint0 shifted_scale = FixedPoint0::FromRaw(GetReciprocal(
sum_of_exps.raw(), kAccumulationIntegerBits, &num_bits_over_unit));
for (int c = 0; c < depth; ++c) {
int32_t input_diff =
static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
if (input_diff >= diff_min) {
const int32_t input_diff_rescaled =
MultiplyByQuantizedMultiplierGreaterThanOne(
input_diff, input_beta_multiplier, input_beta_left_shift);
const FixedPointScaledDiff scaled_diff_f8 =
FixedPointScaledDiff::FromRaw(input_diff_rescaled);
FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
int32_t unsat_output = gemmlowp::RoundingDivideByPOT(
(shifted_scale * exp_in_0).raw(),
num_bits_over_unit + 31 - (sizeof(OutputT) * 8));
const int32_t shifted_output =
unsat_output +
static_cast<int32_t>(std::numeric_limits<OutputT>::min());
output_data[i * depth + c] = static_cast<OutputT>(std::max(
std::min(shifted_output,
static_cast<int32_t>(std::numeric_limits<OutputT>::max())),
static_cast<int32_t>(std::numeric_limits<OutputT>::min())));
} else {
output_data[i * depth + c] = std::numeric_limits<OutputT>::min();
}
}
}
}
// Computes exp(input - max_input)
inline int16_t SoftMaxCalculateExp(const SoftmaxParams& params,
const int16_t* input_data, const int depth,
int16_t max_in_row, int i, int c) {
int32_t input_diff = input_data[i * depth + c] - max_in_row;
// scale the input_diff such that [-65535, 0] correspond to [-10.0, 0.0]
// exp lut generated with range [-10, 0], as exp(-10) is negligible.
int32_t scaled_diff = MultiplyByQuantizedMultiplier(
input_diff, params.input_multiplier, params.input_left_shift);
// recenter to [-32768, 32767]
int32_t sym_scaled_diff = scaled_diff + 32767;
int16_t sat_sym_scaled_diff =
std::min(std::max(sym_scaled_diff, static_cast<int32_t>(-32768)),
static_cast<int32_t>(32767));
// apply the exp() LUT activation function
return lut_lookup(sat_sym_scaled_diff, params.exp_lut);
}
// Quantized softmax with int16_t input and int16_t output.
inline void SoftmaxInt16(const SoftmaxParams& params,
const RuntimeShape& input_shape,
const int16_t* input_data,
const RuntimeShape& output_shape,
int16_t* output_data) {
const int trailing_dim = input_shape.DimensionsCount() - 1;
const int outer_size =
MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
const int depth =
MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
for (int i = 0; i < outer_size; ++i) {
// Find the largest element
int16_t max_in_row = std::numeric_limits<int16_t>::min();
for (int c = 0; c < depth; ++c) {
max_in_row = std::max(max_in_row, input_data[i * depth + c]);
}
// This loops computes the exp values and their sum. We will need the exp
// values later on in the function so we cache them in the output_data
// buffer. This is an optimization done to avoid calculating the exp values
// twice making use of the output_data buffer as scratch memory.
int32_t sum_of_exps = 0; // Q16.15 fixed point format.
int16_t* exp_results_Q015 = output_data + i * depth;
for (int c = 0; c < depth; ++c) {
exp_results_Q015[c] =
SoftMaxCalculateExp(params, input_data, depth, max_in_row, i, c);
sum_of_exps += exp_results_Q015[c];
}
// Compute the reciprocal 1/sum_of_exps
uint8_t headroom_plus_one =
CountLeadingZeros(static_cast<uint32_t>(sum_of_exps));
int32_t shifted_sum =
((static_cast<int64_t>(sum_of_exps) << (headroom_plus_one - 1)) +
(1 << 13)) >>
14;
// since the LUT computes 1/(1 + x) we need to first compute x = (sum - 1).
// also, the LUT expects a symmetrical input, so we must also recenter x
// from [0, 65535] to [-32768, 32767].
int32_t sym_shifted_sum = shifted_sum + (-((1 << 15) + (1 << 16)));
int16_t sat_sym_shifted_sum = static_cast<int16_t>(
std::min(std::max(sym_shifted_sum, static_cast<int32_t>(-32768)),
static_cast<int32_t>(32767)));
// apply 1/(1 + x) LUT activation function
int16_t reciprocal_scale_Q015 =
lut_lookup(sat_sym_shifted_sum, params.one_over_one_plus_x_lut);
// Rescale the exp_result with reciprocal
// range of output is [0, 32767] correspond to [0.0, 1.0]
for (int c = 0; c < depth; ++c) {
uint8_t right_shift = 31 - headroom_plus_one;
int64_t round = 1 << (right_shift - 1);
int32_t result = (static_cast<int64_t>(exp_results_Q015[c]) *
static_cast<int64_t>(reciprocal_scale_Q015) +
round) >>
right_shift;
output_data[i * depth + c] = static_cast<int16_t>(
std::min(std::max(result, static_cast<int32_t>(0)),
static_cast<int32_t>(32767)));
}
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SOFTMAX_H_

View File

@@ -0,0 +1,109 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SPACE_TO_BATCH_ND_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SPACE_TO_BATCH_ND_H_
#include <cmath>
#include "ruy/profiler/instrumentation.h" // from @ruy
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
// TODO(b/135760455): Move this method anonymous namespace in a cc file.
inline RuntimeShape ExtendShapeSpaceToBatch(const RuntimeShape& shape) {
if (shape.DimensionsCount() == 4) {
return shape;
}
RuntimeShape new_shape(4, 1);
new_shape.SetDim(0, shape.Dims(0));
new_shape.SetDim(1, shape.Dims(1));
new_shape.SetDim(3, shape.Dims(2));
return new_shape;
}
template <typename T>
inline void SpaceToBatchND(const SpaceToBatchParams& params,
const RuntimeShape& unextended_input1_shape,
const T* input1_data,
const RuntimeShape& unextended_input2_shape,
const int32_t* block_shape_data,
const RuntimeShape& unextended_input3_shape,
const int32_t* paddings_data,
const RuntimeShape& unextended_output_shape,
T* output_data) {
ruy::profiler::ScopeLabel label("SpaceToBatchND");
TFLITE_DCHECK_GE(unextended_input1_shape.DimensionsCount(), 3);
TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(unextended_input1_shape.DimensionsCount(),
unextended_output_shape.DimensionsCount());
// Extends the input/output shape from 3D to 4D if needed, NHC -> NH1C.
const RuntimeShape input1_shape =
ExtendShapeSpaceToBatch(unextended_input1_shape);
const RuntimeShape output_shape =
ExtendShapeSpaceToBatch(unextended_output_shape);
const int depth = input1_shape.Dims(3);
const int input_width = input1_shape.Dims(2);
const int input_height = input1_shape.Dims(1);
const int input_batch_size = input1_shape.Dims(0);
const int output_width = output_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_batch_size = output_shape.Dims(0);
const int block_shape_height = block_shape_data[0];
const int block_shape_width =
unextended_input1_shape.DimensionsCount() == 4 ? block_shape_data[1] : 1;
const int padding_top = paddings_data[0];
const int padding_left =
unextended_input1_shape.DimensionsCount() == 4 ? paddings_data[2] : 0;
// For uint8 quantized, the correct padding "zero value" is the output offset.
const int32_t pad_value = params.output_offset;
for (int out_b = 0; out_b < output_batch_size; ++out_b) {
int input_batch = out_b % input_batch_size;
int shift_w = (out_b / input_batch_size) % block_shape_width;
int shift_h = (out_b / input_batch_size) / block_shape_width;
for (int out_h = 0; out_h < output_height; ++out_h) {
for (int out_w = 0; out_w < output_width; ++out_w) {
T* out = output_data + Offset(output_shape, out_b, out_h, out_w, 0);
if (out_h * block_shape_height + shift_h < padding_top ||
out_h * block_shape_height + shift_h >=
padding_top + input_height ||
out_w * block_shape_width + shift_w < padding_left ||
out_w * block_shape_width + shift_w >= padding_left + input_width) {
// This may not execute correctly when pad_value != 0 and T != uint8.
memset(out, pad_value, depth * sizeof(T));
} else {
const T* in =
input1_data +
Offset(input1_shape, input_batch,
(out_h * block_shape_height + shift_h) - padding_top,
(out_w * block_shape_width + shift_w) - padding_left, 0);
memcpy(out, in, depth * sizeof(T));
}
}
}
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SPACE_TO_BATCH_ND_H_

View File

@@ -0,0 +1,80 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SPACE_TO_DEPTH_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SPACE_TO_DEPTH_H_
#include <cstdint>
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
template <typename T>
inline void SpaceToDepth(const tflite::SpaceToDepthParams& op_params,
const RuntimeShape& unextended_input_shape,
const T* input_data,
const RuntimeShape& unextended_output_shape,
T* output_data) {
TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
const RuntimeShape input_shape =
RuntimeShape::ExtendedShape(4, unextended_input_shape);
const RuntimeShape output_shape =
RuntimeShape::ExtendedShape(4, unextended_output_shape);
const int input_depth = input_shape.Dims(3);
const int input_width = input_shape.Dims(2);
const int input_height = input_shape.Dims(1);
const int input_batch = input_shape.Dims(0);
const int output_depth = output_shape.Dims(3);
const int output_width = output_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_batch = output_shape.Dims(0);
const int32_t block_size = op_params.block_size;
TFLITE_DCHECK_EQ(input_width, output_width * block_size);
TFLITE_DCHECK_EQ(input_height, output_height * block_size);
TFLITE_DCHECK_EQ(input_depth * block_size * block_size, output_depth);
TFLITE_DCHECK_EQ(input_batch, output_batch);
for (int in_b = 0; in_b < input_batch; ++in_b) {
for (int in_h = 0; in_h < input_height; ++in_h) {
for (int in_w = 0; in_w < input_width; ++in_w) {
for (int in_d = 0; in_d < input_depth; ++in_d) {
const int out_d =
in_d + ((in_h % block_size) * block_size + in_w % block_size) *
input_depth;
const int out_w = in_w / block_size;
const int out_h = in_h / block_size;
const int out_b = in_b;
const int input_index = Offset(input_shape, in_b, in_h, in_w, in_d);
const int output_index =
Offset(output_shape, out_b, out_h, out_w, out_d);
output_data[output_index] = input_data[input_index];
}
}
}
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SPACE_TO_DEPTH_H_

View File

@@ -0,0 +1,121 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_STRIDED_SLICE_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_STRIDED_SLICE_H_
#include "ruy/profiler/instrumentation.h" // from @ruy
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/portable_tensor.h"
#include "tensorflow/lite/kernels/internal/strided_slice_logic.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
template <typename T>
inline void StridedSlice(const tflite::StridedSliceParams& op_params,
const RuntimeShape& unextended_input_shape,
const RuntimeShape& unextended_output_shape,
SequentialTensorWriter<T>* writer) {
using strided_slice::LoopCondition;
using strided_slice::StartForAxis;
using strided_slice::StopForAxis;
ruy::profiler::ScopeLabel label("StridedSlice");
// Note that the output_shape is not used herein.
tflite::StridedSliceParams params_copy = op_params;
TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 5);
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 5);
const RuntimeShape input_shape =
RuntimeShape::ExtendedShape(5, unextended_input_shape);
const RuntimeShape output_shape =
RuntimeShape::ExtendedShape(5, unextended_output_shape);
// Reverse and pad to 5 dimensions because that is what the runtime code
// requires (ie. all shapes must be 5D and are given backwards).
strided_slice::StridedSlicePadIndices(&params_copy, 5);
const int start_0 = StartForAxis(params_copy, input_shape, 0);
const int stop_0 = StopForAxis(params_copy, input_shape, 0, start_0);
const int start_1 = StartForAxis(params_copy, input_shape, 1);
const int stop_1 = StopForAxis(params_copy, input_shape, 1, start_1);
const int start_2 = StartForAxis(params_copy, input_shape, 2);
const int stop_2 = StopForAxis(params_copy, input_shape, 2, start_2);
const int start_3 = StartForAxis(params_copy, input_shape, 3);
const int stop_3 = StopForAxis(params_copy, input_shape, 3, start_3);
const int start_4 = StartForAxis(params_copy, input_shape, 4);
const int stop_4 = StopForAxis(params_copy, input_shape, 4, start_4);
for (int offset_0 = start_0 * input_shape.Dims(1),
end_0 = stop_0 * input_shape.Dims(1),
step_0 = params_copy.strides[0] * input_shape.Dims(1);
!LoopCondition(offset_0, end_0, params_copy.strides[0]);
offset_0 += step_0) {
for (int offset_1 = (offset_0 + start_1) * input_shape.Dims(2),
end_1 = (offset_0 + stop_1) * input_shape.Dims(2),
step_1 = params_copy.strides[1] * input_shape.Dims(2);
!LoopCondition(offset_1, end_1, params_copy.strides[1]);
offset_1 += step_1) {
for (int offset_2 = (offset_1 + start_2) * input_shape.Dims(3),
end_2 = (offset_1 + stop_2) * input_shape.Dims(3),
step_2 = params_copy.strides[2] * input_shape.Dims(3);
!LoopCondition(offset_2, end_2, params_copy.strides[2]);
offset_2 += step_2) {
for (int offset_3 = (offset_2 + start_3) * input_shape.Dims(4),
end_3 = (offset_2 + stop_3) * input_shape.Dims(4),
step_3 = params_copy.strides[3] * input_shape.Dims(4);
!LoopCondition(offset_3, end_3, params_copy.strides[3]);
offset_3 += step_3) {
for (int offset_4 = offset_3 + start_4, end_4 = offset_3 + stop_4;
!LoopCondition(offset_4, end_4, params_copy.strides[4]);
offset_4 += params_copy.strides[4]) {
writer->Write(offset_4);
}
}
}
}
}
}
template <typename T>
inline void StridedSlice(const tflite::StridedSliceParams& op_params,
const RuntimeShape& unextended_input_shape,
const T* input_data,
const RuntimeShape& unextended_output_shape,
T* output_data) {
SequentialTensorWriter<T> writer(input_data, output_data);
StridedSlice<T>(op_params, unextended_input_shape, unextended_output_shape,
&writer);
}
template <typename T>
inline void StridedSlice(const tflite::StridedSliceParams& op_params,
const RuntimeShape& unextended_input_shape,
const TfLiteTensor* input,
const RuntimeShape& unextended_output_shape,
TfLiteTensor* output) {
SequentialTensorWriter<T> writer(input, output);
StridedSlice<T>(op_params, unextended_input_shape, unextended_output_shape,
&writer);
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_STRIDED_SLICE_H_

View File

@@ -0,0 +1,476 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SUB_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SUB_H_
#include <stdint.h>
#include <algorithm>
#include <limits>
#include "ruy/profiler/instrumentation.h" // from @ruy
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
inline void SubNonBroadcast(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const float* input1_data,
const RuntimeShape& input2_shape,
const float* input2_data,
const RuntimeShape& output_shape,
float* output_data) {
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
for (int i = 0; i < flat_size; ++i) {
output_data[i] = ActivationFunctionWithMinMax(
input1_data[i] - input2_data[i], params.float_activation_min,
params.float_activation_max);
}
}
inline void SubNonBroadcast(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const int32_t* input1_data,
const RuntimeShape& input2_shape,
const int32_t* input2_data,
const RuntimeShape& output_shape,
int32_t* output_data) {
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
for (int i = 0; i < flat_size; ++i) {
output_data[i] = ActivationFunctionWithMinMax(
input1_data[i] - input2_data[i], params.quantized_activation_min,
params.quantized_activation_max);
}
}
// TODO(b/151345304): We can implement BroadcastSub on buffers of arbitrary
// dimensionality if the runtime code does a single loop over one dimension
// that handles broadcasting as the base case. The code generator would then
// generate max(D1, D2) nested for loops.
template <int N = 5>
inline void BroadcastSubSlow(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const float* input1_data,
const RuntimeShape& input2_shape,
const float* input2_data,
const RuntimeShape& output_shape,
float* output_data) {
ruy::profiler::ScopeLabel label("BroadcastSubSlow/float");
TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
NdArrayDesc<N> desc1;
NdArrayDesc<N> desc2;
NdArrayDesc<N> output_desc;
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
&desc2);
CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
// In Tensorflow, the dimensions are canonically named (batch_number, row,
// col, channel), with extents (batches, height, width, depth), with the
// trailing dimension changing most rapidly (channels has the smallest stride,
// typically 1 element).
//
// In generated C code, we store arrays with the dimensions reversed. The
// first dimension has smallest stride.
//
// We name our variables by their Tensorflow convention, but generate C code
// nesting loops such that the innermost loop has the smallest stride for the
// best cache behavior.
auto sub_func = [&](int indexes[N]) {
output_data[SubscriptToIndex(output_desc, indexes)] =
ActivationFunctionWithMinMax(
input1_data[SubscriptToIndex(desc1, indexes)] -
input2_data[SubscriptToIndex(desc2, indexes)],
params.float_activation_min, params.float_activation_max);
};
NDOpsHelper<N>(output_desc, sub_func);
}
template <int N = 5>
inline void BroadcastSubSlow(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const int32_t* input1_data,
const RuntimeShape& input2_shape,
const int32_t* input2_data,
const RuntimeShape& output_shape,
int32_t* output_data) {
ruy::profiler::ScopeLabel label("BroadcastSubSlow/int32_t");
TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
NdArrayDesc<N> desc1;
NdArrayDesc<N> desc2;
NdArrayDesc<N> output_desc;
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
&desc2);
CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
// In Tensorflow, the dimensions are canonically named (batch_number, row,
// col, channel), with extents (batches, height, width, depth), with the
// trailing dimension changing most rapidly (channels has the smallest stride,
// typically 1 element).
//
// In generated C code, we store arrays with the dimensions reversed. The
// first dimension has smallest stride.
//
// We name our variables by their Tensorflow convention, but generate C code
// nesting loops such that the innermost loop has the smallest stride for the
// best cache behavior.
auto sub_func = [&](int indexes[N]) {
output_data[SubscriptToIndex(output_desc, indexes)] =
ActivationFunctionWithMinMax(
input1_data[SubscriptToIndex(desc1, indexes)] -
input2_data[SubscriptToIndex(desc2, indexes)],
params.quantized_activation_min, params.quantized_activation_max);
};
NDOpsHelper<N>(output_desc, sub_func);
}
template <int N = 5>
void BroadcastSubSlow(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const int64_t* input1_data,
const RuntimeShape& input2_shape,
const int64_t* input2_data,
const RuntimeShape& output_shape, int64_t* output_data) {
ruy::profiler::ScopeLabel label("BroadcastSubSlow/int64_t");
TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
NdArrayDesc<N> desc1;
NdArrayDesc<N> desc2;
NdArrayDesc<N> output_desc;
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
&desc2);
CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
// In Tensorflow, the dimensions are canonically named (batch_number, row,
// col, channel), with extents (batches, height, width, depth), with the
// trailing dimension changing most rapidly (channels has the smallest stride,
// typically 1 element).
//
// In generated C code, we store arrays with the dimensions reversed. The
// first dimension has smallest stride.
//
// We name our variables by their Tensorflow convention, but generate C code
// nesting loops such that the innermost loop has the smallest stride for the
// best cache behavior.
auto sub_func = [&](int indexes[N]) {
output_data[SubscriptToIndex(output_desc, indexes)] =
ActivationFunctionWithMinMax(
input1_data[SubscriptToIndex(desc1, indexes)] -
input2_data[SubscriptToIndex(desc2, indexes)],
params.int64_activation_min, params.int64_activation_max);
};
NDOpsHelper<N>(output_desc, sub_func);
}
template <typename T, int N = 5>
void BroadcastSubSlow(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const T* input1_data,
const RuntimeShape& input2_shape, const T* input2_data,
const RuntimeShape& output_shape, T* output_data) {
ruy::profiler::ScopeLabel label("BroadcastSubSlow/templated");
TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
NdArrayDesc<N> desc1;
NdArrayDesc<N> desc2;
NdArrayDesc<N> output_desc;
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
&desc2);
CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
// In Tensorflow, the dimensions are canonically named (batch_number, row,
// col, channel), with extents (batches, height, width, depth), with the
// trailing dimension changing most rapidly (channels has the smallest stride,
// typically 1 element).
//
// In generated C code, we store arrays with the dimensions reversed. The
// first dimension has smallest stride.
//
// We name our variables by their Tensorflow convention, but generate C code
// nesting loops such that the innermost loop has the smallest stride for the
// best cache behavior.
auto sub_func = [&](int indexes[N]) {
output_data[SubscriptToIndex(output_desc, indexes)] =
ActivationFunctionWithMinMax(
input1_data[SubscriptToIndex(desc1, indexes)] -
input2_data[SubscriptToIndex(desc2, indexes)],
params.quantized_activation_min, params.quantized_activation_max);
};
NDOpsHelper<N>(output_desc, sub_func);
}
template <int N = 5>
inline void BroadcastSub16POTSlow(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const int16_t* input1_data,
const RuntimeShape& input2_shape,
const int16_t* input2_data,
const RuntimeShape& output_shape,
int16_t* output_data) {
ruy::profiler::ScopeLabel label("BroadcastSub16POTSlow/int16_t");
NdArrayDesc<N> desc1;
NdArrayDesc<N> desc2;
NdArrayDesc<N> output_desc;
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
&desc2);
CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
// In Tensorflow, the dimensions are canonically named (batch_number, row,
// col, channel), with extents (batches, height, width, depth), with the
// trailing dimension changing most rapidly (channels has the smallest stride,
// typically 1 element).
//
// In generated C code, we store arrays with the dimensions reversed. The
// first dimension has smallest stride.
//
// We name our variables by their Tensorflow convention, but generate C code
// nesting loops such that the innermost loop has the smallest stride for the
// best cache behavior.
auto sub_func = [&](int indexes[N]) {
const int32_t input1_val = input1_data[SubscriptToIndex(desc1, indexes)];
const int32_t input2_val = input2_data[SubscriptToIndex(desc2, indexes)];
const int32_t scaled_input1_val =
gemmlowp::RoundingDivideByPOT(input1_val, -params.input1_shift);
const int32_t scaled_input2_val =
gemmlowp::RoundingDivideByPOT(input2_val, -params.input2_shift);
const int32_t raw_output = scaled_input1_val - scaled_input2_val;
const int32_t clamped_output =
std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, raw_output));
output_data[SubscriptToIndex(output_desc, indexes)] =
static_cast<int16_t>(clamped_output);
};
NDOpsHelper<N>(output_desc, sub_func);
}
template <typename T, int N = 5>
void BroadcastQuantSubSlow(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const T* input1_data,
const RuntimeShape& input2_shape,
const T* input2_data,
const RuntimeShape& output_shape, T* output_data) {
ruy::profiler::ScopeLabel label("BroadcastQuantSubSlow/T");
NdArrayDesc<N> desc1;
NdArrayDesc<N> desc2;
NdArrayDesc<N> output_desc;
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
&desc2);
CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
// In Tensorflow, the dimensions are canonically named (batch_number, row,
// col, channel), with extents (batches, height, width, depth), with the
// trailing dimension changing most rapidly (channels has the smallest stride,
// typically 1 element).
//
// In generated C code, we store arrays with the dimensions reversed. The
// first dimension has smallest stride.
//
// We name our variables by their Tensorflow convention, but generate C code
// nesting loops such that the innermost loop has the smallest stride for the
// best cache behavior.
auto sub_func = [&](int indexes[N]) {
const int32_t input1_val =
params.input1_offset + input1_data[SubscriptToIndex(desc1, indexes)];
const int32_t input2_val =
params.input2_offset + input2_data[SubscriptToIndex(desc2, indexes)];
const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
const int32_t scaled_input1_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input1_val, params.input1_multiplier, params.input1_shift);
const int32_t scaled_input2_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input2_val, params.input2_multiplier, params.input2_shift);
const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
const int32_t raw_output =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
raw_sub, params.output_multiplier, params.output_shift) +
params.output_offset;
const int32_t clamped_output =
std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, raw_output));
output_data[SubscriptToIndex(output_desc, indexes)] =
static_cast<T>(clamped_output);
};
NDOpsHelper<N>(output_desc, sub_func);
}
// Element-wise add that can often be used for inner loop of broadcast add as
// well as the non-broadcast add.
template <typename T>
inline void SubElementwise(int size, const ArithmeticParams& params,
const T* input1_data, const T* input2_data,
T* output_data) {
for (int i = 0; i < size; ++i) {
const int32_t input1_val = params.input1_offset + input1_data[i];
const int32_t input2_val = params.input2_offset + input2_data[i];
const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
const int32_t scaled_input1_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input1_val, params.input1_multiplier, params.input1_shift);
const int32_t scaled_input2_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input2_val, params.input2_multiplier, params.input2_shift);
const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
const int32_t raw_output =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
raw_sub, params.output_multiplier, params.output_shift) +
params.output_offset;
const int32_t clamped_output =
std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, raw_output));
output_data[i] = static_cast<T>(clamped_output);
}
}
inline void Sub(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const uint8_t* input1_data,
const RuntimeShape& input2_shape, const uint8_t* input2_data,
const RuntimeShape& output_shape, uint8_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
TFLITE_DCHECK_GT(params.input1_offset, -256);
TFLITE_DCHECK_GT(params.input2_offset, -256);
TFLITE_DCHECK_LT(params.input1_offset, 256);
TFLITE_DCHECK_LT(params.input2_offset, 256);
SubElementwise(flat_size, params, input1_data, input2_data, output_data);
}
inline void Sub(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const int8_t* input1_data,
const RuntimeShape& input2_shape, const int8_t* input2_data,
const RuntimeShape& output_shape, int8_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
TFLITE_DCHECK_GE(params.input1_offset, -128);
TFLITE_DCHECK_GE(params.input2_offset, -128);
// offset = -quantization_params.zero_point in PrepareGeneralSubOp().
// So it's maximum can be 128 not 127.
TFLITE_DCHECK_LE(params.input1_offset, 128);
TFLITE_DCHECK_LE(params.input2_offset, 128);
SubElementwise(flat_size, params, input1_data, input2_data, output_data);
}
inline void Sub(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const int16_t* input1_data,
const RuntimeShape& input2_shape, const int16_t* input2_data,
const RuntimeShape& output_shape, int16_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
TFLITE_DCHECK_EQ(params.input1_offset, 0);
TFLITE_DCHECK_EQ(params.input2_offset, 0);
SubElementwise(flat_size, params, input1_data, input2_data, output_data);
}
template <typename T>
void Sub(const ArithmeticParams& params, const RuntimeShape& input1_shape,
const T* input1_data, const RuntimeShape& input2_shape,
const T* input2_data, const RuntimeShape& output_shape,
T* output_data) {
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
&desc2);
const RuntimeShape extended_output_shape =
RuntimeShape::ExtendedShape(4, output_shape);
// In Tensorflow, the dimensions are canonically named (batch_number, row,
// col, channel), with extents (batches, height, width, depth), with the
// trailing dimension changing most rapidly (channels has the smallest stride,
// typically 1 element).
//
// In generated C code, we store arrays with the dimensions reversed. The
// first dimension has smallest stride.
//
// We name our variables by their Tensorflow convention, but generate C code
// nesting loops such that the innermost loop has the smallest stride for the
// best cache behavior.
for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
output_data[Offset(extended_output_shape, b, y, x, c)] =
input1_data[SubscriptToIndex(desc1, b, y, x, c)] -
input2_data[SubscriptToIndex(desc2, b, y, x, c)];
}
}
}
}
}
inline void SetActivationMinMax(const ArithmeticParams& params,
int32_t* activation_min,
int32_t* activation_max) {
*activation_min = params.quantized_activation_min;
*activation_max = params.quantized_activation_max;
}
inline void SetActivationMinMax(const ArithmeticParams& params,
float* activation_min, float* activation_max) {
*activation_min = params.float_activation_min;
*activation_max = params.float_activation_max;
}
inline void SetActivationMinMax(const ArithmeticParams& params,
int64_t* activation_min,
int64_t* activation_max) {
*activation_min = params.int64_activation_min;
*activation_max = params.int64_activation_max;
}
template <typename T>
inline void SubWithActivation(
const ArithmeticParams& params, const RuntimeShape& input1_shape,
const T* input1_data, const RuntimeShape& input2_shape,
const T* input2_data, const RuntimeShape& output_shape, T* output_data) {
ruy::profiler::ScopeLabel label("SubWithActivation");
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
T activation_min, activation_max;
SetActivationMinMax(params, &activation_min, &activation_max);
for (int i = 0; i < flat_size; ++i) {
output_data[i] = ActivationFunctionWithMinMax(
input1_data[i] - input2_data[i], activation_min, activation_max);
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SUB_H_

View File

@@ -0,0 +1,129 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TANH_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TANH_H_
#include <cmath>
#include "fixedpoint/fixedpoint.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/types.h"
#include "tensorflow/lite/kernels/op_macros.h"
namespace tflite {
namespace reference_ops {
inline void Tanh(const RuntimeShape& input_shape, const float* input_data,
const RuntimeShape& output_shape, float* output_data) {
const int flat_size = MatchingFlatSize(input_shape, output_shape);
for (int i = 0; i < flat_size; i++) {
float val = input_data[i];
float result = std::tanh(val);
output_data[i] = result;
}
}
// Convenience version that allows, for example, generated-code calls to be
// uniform between data types.
inline void Tanh(const TanhParams&, const RuntimeShape& input_shape,
const float* input_data, const RuntimeShape& output_shape,
float* output_data) {
// Drop params: not needed.
Tanh(input_shape, input_data, output_shape, output_data);
}
inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape,
const int16_t* input_data, const RuntimeShape& output_shape,
int16_t* output_data) {
const int input_left_shift = params.input_left_shift;
// Support for shifts is limited until we have a parameterized version of
// SaturatingRoundingMultiplyByPOT().
TFLITE_DCHECK_GE(input_left_shift, 0);
TFLITE_DCHECK_LE(input_left_shift, 1);
const int flat_size = MatchingFlatSize(input_shape, output_shape);
// F0 uses 0 integer bits, range [-1, 1].
// This is the return type of math functions such as tanh, logistic,
// whose range is in [-1, 1].
using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
// F3 uses 3 integer bits, range [-8, 8], the input range expected here.
using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
if (input_left_shift == 0) {
for (int i = 0; i < flat_size; i++) {
F3 input = F3::FromRaw(input_data[i]);
F0 output = gemmlowp::tanh(input);
output_data[i] = output.raw();
}
} else {
for (int i = 0; i < flat_size; i++) {
F3 input = F3::FromRaw(
gemmlowp::SaturatingRoundingMultiplyByPOT<1>(input_data[i]));
F0 output = gemmlowp::tanh(input);
output_data[i] = output.raw();
}
}
}
inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape,
const uint8_t* input_data, const RuntimeShape& output_shape,
uint8_t* output_data) {
const int32_t input_zero_point = params.input_zero_point;
const int32_t input_range_radius = params.input_range_radius;
const int32_t input_multiplier = params.input_multiplier;
const int input_left_shift = params.input_left_shift;
const int32_t output_zero_point = 128;
const int flat_size = MatchingFlatSize(input_shape, output_shape);
for (int i = 0; i < flat_size; i++) {
const uint8_t input_val_u8 = input_data[i];
const int32_t input_val_centered =
static_cast<int32_t>(input_val_u8) - input_zero_point;
uint8_t output_val;
if (input_val_centered <= -input_range_radius) {
output_val = 0;
} else if (input_val_centered >= input_range_radius) {
output_val = 255;
} else {
const int32_t input_val_rescaled =
MultiplyByQuantizedMultiplierGreaterThanOne(
input_val_centered, input_multiplier, input_left_shift);
using FixedPoint4 = gemmlowp::FixedPoint<int32_t, 4>;
using FixedPoint0 = gemmlowp::FixedPoint<int32_t, 0>;
const FixedPoint4 input_val_f4 = FixedPoint4::FromRaw(input_val_rescaled);
const FixedPoint0 output_val_f0 = gemmlowp::tanh(input_val_f4);
// Convert from Q0.31 to Q24.7.
using gemmlowp::RoundingDivideByPOT;
int32_t output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 24);
output_val_s32 += output_zero_point;
if (output_val_s32 == 256) {
output_val_s32 = 255;
}
// Reinterpret as Q0.7, encoded in uint8_t.
TFLITE_DCHECK_GE(output_val_s32, 0);
TFLITE_DCHECK_LE(output_val_s32, 255);
output_val = static_cast<uint8_t>(output_val_s32);
}
output_data[i] = output_val;
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TANH_H_

View File

@@ -0,0 +1,111 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TRANSPOSE_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TRANSPOSE_H_
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
template <typename T, int N>
void TransposeImpl(const TransposeParams& params,
const RuntimeShape& unextended_input_shape,
const T* input_data,
const RuntimeShape& unextended_output_shape,
T* output_data) {
const int unextended_input_size = unextended_input_shape.DimensionsCount();
const int unextended_output_size = unextended_output_shape.DimensionsCount();
TFLITE_DCHECK_LE(unextended_input_size, N);
TFLITE_DCHECK_LE(unextended_output_size, N);
TFLITE_DCHECK_EQ(unextended_output_size, params.perm_count);
const int input_ext_size = N - unextended_input_size;
const int output_ext_size = N - unextended_output_size;
NdArrayDesc<N> input_desc;
NdArrayDesc<N> output_desc;
CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_input_shape),
&input_desc);
CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape),
&output_desc);
// The perm data is extended to match the output, each index incremented by
// the amount of front padding of the input shape.
int extended_perm[N];
for (int i = 0; i < N; ++i) {
extended_perm[i] = i < output_ext_size
? i
: params.perm[i - output_ext_size] + input_ext_size;
}
// Permutes the input shape so we don't need to permute the indexes inside
// the loop. Check to make sure output_dims is matching input_dims.
NdArrayDesc<N> perm_input_desc;
for (int k = 0; k < N; ++k) {
TFLITE_DCHECK_EQ(input_desc.extents[extended_perm[k]],
output_desc.extents[k]);
perm_input_desc.extents[k] = input_desc.extents[extended_perm[k]];
perm_input_desc.strides[k] = input_desc.strides[extended_perm[k]];
}
// Naive transpose loop (iterate on output index and compute input index).
auto tranpose_func = [&](int indexes[N]) {
output_data[SubscriptToIndex(output_desc, indexes)] =
input_data[SubscriptToIndex(perm_input_desc, indexes)];
};
NDOpsHelper<N>(output_desc, tranpose_func);
}
template <typename T, int N = 5>
void Transpose(const TransposeParams& params,
const RuntimeShape& unextended_input_shape, const T* input_data,
const RuntimeShape& unextended_output_shape, T* output_data) {
// Transpose kernel only does rearranging values not numeric evaluations on
// each cell. It's safe to implement per size of scalar type and this trick
// keeps the total code size in a reasonable range.
switch (sizeof(T)) {
case 1:
TransposeImpl<int8_t, N>(params, unextended_input_shape,
reinterpret_cast<const int8_t*>(input_data),
unextended_output_shape,
reinterpret_cast<int8_t*>(output_data));
break;
case 2:
TransposeImpl<int16_t, N>(params, unextended_input_shape,
reinterpret_cast<const int16_t*>(input_data),
unextended_output_shape,
reinterpret_cast<int16_t*>(output_data));
break;
case 4:
TransposeImpl<int32_t, N>(params, unextended_input_shape,
reinterpret_cast<const int32_t*>(input_data),
unextended_output_shape,
reinterpret_cast<int32_t*>(output_data));
break;
case 8:
TransposeImpl<int64_t, N>(params, unextended_input_shape,
reinterpret_cast<const int64_t*>(input_data),
unextended_output_shape,
reinterpret_cast<int64_t*>(output_data));
break;
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TRANSPOSE_H_

View File

@@ -0,0 +1,217 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TRANSPOSE_CONV_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TRANSPOSE_CONV_H_
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
inline void TransposeConv(
const ConvParams& params, const RuntimeShape& input_shape,
const float* input_data, const RuntimeShape& filter_shape,
const float* filter_data, const RuntimeShape& bias_shape,
const float* bias_data, const RuntimeShape& output_shape,
float* output_data, const RuntimeShape& im2col_shape, float* im2col_data) {
const int stride_width = params.stride_width;
const int stride_height = params.stride_height;
const int pad_width = params.padding_values.width;
const int pad_height = params.padding_values.height;
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
(void)im2col_data; // only used in optimized code.
(void)im2col_shape; // only used in optimized code.
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
const int input_height = input_shape.Dims(1);
const int input_width = input_shape.Dims(2);
const int filter_height = filter_shape.Dims(1);
const int filter_width = filter_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
if (bias_data) {
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
}
// Although transpose convolution simplifies to convolution with transposed
// weights for strides of 1, non-unitary striding complicates matters. To
// keep this reference implementation as clear as possible, we use a
// "scatter" access pattern, where we loop through all the input elements,
// computing their influence on the output, rather than looping through the
// output elements in the typical "gather" access pattern of a conv. We
// therefore must initialize the output array to zero.
const int num_elements = output_shape.FlatSize();
for (int i = 0; i < num_elements; i++) {
output_data[i] = 0.0f;
}
// Loop through input elements one at a time.
for (int batch = 0; batch < batches; ++batch) {
for (int in_y = 0; in_y < input_height; ++in_y) {
for (int in_x = 0; in_x < input_width; ++in_x) {
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
// Loop through the output elements it will influence
const int out_x_origin = (in_x * stride_width) - pad_width;
const int out_y_origin = (in_y * stride_height) - pad_height;
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
for (int out_channel = 0; out_channel < output_depth;
++out_channel) {
// Compute output element location
const int out_x = out_x_origin + filter_x;
const int out_y = out_y_origin + filter_y;
// We cannot accumulate out of bounds
if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) &&
(out_y < output_height)) {
float input_value = input_data[Offset(
input_shape, batch, in_y, in_x, in_channel)];
float filter_value =
filter_data[Offset(filter_shape, out_channel, filter_y,
filter_x, in_channel)];
output_data[Offset(output_shape, batch, out_y, out_x,
out_channel)] +=
input_value * filter_value;
}
}
}
}
}
}
}
}
if (bias_data) {
for (int batch = 0; batch < batches; ++batch) {
for (int out_y = 0; out_y < output_height; ++out_y) {
for (int out_x = 0; out_x < output_width; ++out_x) {
for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
output_data[Offset(output_shape, batch, out_y, out_x,
out_channel)] += bias_data[out_channel];
}
}
}
}
}
}
inline void TransposeConv(
const ConvParams& params, const RuntimeShape& input_shape,
const uint8_t* input_data, const RuntimeShape& filter_shape,
const uint8_t* filter_data, const RuntimeShape& bias_shape,
const int32_t* bias_data, const RuntimeShape& output_shape,
uint8_t* output_data, const RuntimeShape& im2col_shape,
uint8_t* im2col_data, int32_t* scratch_buffer) {
const int stride_width = params.stride_width;
const int stride_height = params.stride_height;
const int pad_width = params.padding_values.width;
const int pad_height = params.padding_values.height;
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
(void)im2col_data; // only used in optimized code.
(void)im2col_shape; // only used in optimized code.
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
const int input_height = input_shape.Dims(1);
const int input_width = input_shape.Dims(2);
const int filter_height = filter_shape.Dims(1);
const int filter_width = filter_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
const int32_t input_offset = params.input_offset;
const int32_t filter_offset = params.weights_offset;
const int32_t output_offset = params.output_offset;
const int32_t output_multiplier = params.output_multiplier;
const int output_shift = params.output_shift;
const int32_t output_activation_min = params.quantized_activation_min;
const int32_t output_activation_max = params.quantized_activation_max;
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
if (bias_data) {
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
}
const int num_elements = output_shape.FlatSize();
// We need to initialize scratch_buffer to all 0s, as we apply the same
// 'scatter' based trick as in float version.
memset(scratch_buffer, 0, num_elements * sizeof(int32_t));
// Loop through input elements one at a time.
for (int batch = 0; batch < batches; ++batch) {
for (int in_y = 0; in_y < input_height; ++in_y) {
for (int in_x = 0; in_x < input_width; ++in_x) {
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
// Loop through the output elements it will influence.
const int out_x_origin = (in_x * stride_width) - pad_width;
const int out_y_origin = (in_y * stride_height) - pad_height;
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
for (int out_channel = 0; out_channel < output_depth;
++out_channel) {
// Compute output element location.
const int out_x = out_x_origin + filter_x;
const int out_y = out_y_origin + filter_y;
// We cannot accumulate out of bounds.
if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) &&
(out_y < output_height)) {
uint8_t input_value = input_data[Offset(
input_shape, batch, in_y, in_x, in_channel)];
uint8_t filter_value =
filter_data[Offset(filter_shape, out_channel, filter_y,
filter_x, in_channel)];
scratch_buffer[Offset(output_shape, batch, out_y, out_x,
out_channel)] +=
(input_value + input_offset) *
(filter_value + filter_offset);
}
}
}
}
}
}
}
}
for (int batch = 0; batch < batches; ++batch) {
for (int out_y = 0; out_y < output_height; ++out_y) {
for (int out_x = 0; out_x < output_width; ++out_x) {
for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
int32_t acc = scratch_buffer[Offset(output_shape, batch, out_y, out_x,
out_channel)];
if (bias_data) {
acc += bias_data[out_channel];
}
int32_t scaled_acc = MultiplyByQuantizedMultiplier(
acc, output_multiplier, output_shift);
scaled_acc += output_offset;
scaled_acc = std::max(scaled_acc, output_activation_min);
scaled_acc = std::min(scaled_acc, output_activation_max);
output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
static_cast<uint8_t>(scaled_acc);
}
}
}
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TRANSPOSE_CONV_H_

View File

@@ -0,0 +1,155 @@
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_RUNTIME_SHAPE_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_RUNTIME_SHAPE_H_
namespace tflite {
template <int N>
struct Dims {
int sizes[N];
int strides[N];
};
class RuntimeShape {
public:
RuntimeShape& operator=(RuntimeShape const&) = delete;
RuntimeShape() : size_(0) {}
explicit RuntimeShape(int dimensions_count) : size_(dimensions_count) {}
RuntimeShape(int shape_size, int32_t value) : size_(shape_size) {
for (int i = 0; i < shape_size; ++i) {
SetDim(i, value);
}
}
RuntimeShape(int dimensions_count, const int32_t* dims_data)
: size_(dimensions_count) {
ReplaceWith(dimensions_count, dims_data);
}
bool operator==(const RuntimeShape& comp) const {
return this->size_ == comp.size_ &&
std::memcmp(DimsData(), comp.DimsData(), size_ * sizeof(int32_t)) ==
0;
}
~RuntimeShape() {}
int32_t DimensionsCount() const { return size_; }
int32_t Dims(int i) const {
TFLITE_DCHECK_GE(i, 0);
TFLITE_DCHECK_LT(i, size_);
return dims_[i];
}
void SetDim(int i, int32_t val) {
TFLITE_DCHECK_GE(i, 0);
TFLITE_DCHECK_LT(i, size_);
dims_[i] = val;
}
static RuntimeShape ExtendedShape(int new_shape_size,
const RuntimeShape& shape) {
return RuntimeShape(new_shape_size, shape, 1);
}
int32_t* DimsData() { return dims_; }
const int32_t* DimsData() const { return dims_; }
const int32_t* DimsDataUpTo5D() const { return dims_; }
void ReplaceWith(int dimensions_count, const int32_t* dims_data) {
size_ = dimensions_count;
int32_t* dst_dims = DimsData();
std::memcpy(dst_dims, dims_data, dimensions_count * sizeof(int32_t));
}
// Returns the total count of elements, that is the size when flattened into a
// vector.
int FlatSize() const {
int buffer_size = 1;
const int* dims_data = reinterpret_cast<const int*>(DimsData());
for (int i = 0; i < size_; i++) {
buffer_size *= dims_data[i];
}
return buffer_size;
}
private:
// For use only by ExtendedShape(), written to guarantee (return-value) copy
// elision in C++17.
// This creates a shape padded to the desired size with the specified value.
RuntimeShape(int new_shape_size, const RuntimeShape& shape, int pad_value)
: size_(new_shape_size) {
// If the following check fails, it is likely because a 4D-only kernel is
// being used with an array of larger dimension count.
TFLITE_CHECK_GE(new_shape_size, shape.DimensionsCount());
const int size_increase = new_shape_size - shape.DimensionsCount();
for (int i = 0; i < size_increase; ++i) {
SetDim(i, pad_value);
}
std::memcpy(DimsData() + size_increase, shape.DimsData(),
sizeof(int32_t) * shape.DimensionsCount());
}
// A maximum of 4 dimensions are supported on TFLM.
static constexpr int kMaxSize = 5;
int32_t size_;
union {
int32_t dims_[kMaxSize];
};
};
// Since tensors with '0' in their shape are valid in TF, these offset functions
// allow that as long as the corresponding index is also 0. It is upto the
// calling ops to ensure that they perform verification checks on tensor shapes
// if they don't support a particular behavior.
inline int Offset(const RuntimeShape& shape, int i0, int i1, int i2, int i3) {
TFLITE_DCHECK_EQ(shape.DimensionsCount(), 4);
const int* dims_data = reinterpret_cast<const int*>(shape.DimsData());
TFLITE_DCHECK((dims_data[0] == 0 && i0 == 0) ||
(i0 >= 0 && i0 < dims_data[0]));
TFLITE_DCHECK((dims_data[1] == 0 && i1 == 0) ||
(i1 >= 0 && i1 < dims_data[1]));
TFLITE_DCHECK((dims_data[2] == 0 && i2 == 0) ||
(i2 >= 0 && i2 < dims_data[2]));
TFLITE_DCHECK((dims_data[3] == 0 && i3 == 0) ||
(i3 >= 0 && i3 < dims_data[3]));
return ((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3;
}
inline int Offset(const RuntimeShape& shape, int i0, int i1, int i2, int i3,
int i4) {
TFLITE_DCHECK_EQ(shape.DimensionsCount(), 5);
const int* dims_data = reinterpret_cast<const int*>(shape.DimsData());
TFLITE_DCHECK((dims_data[0] == 0 && i0 == 0) ||
(i0 >= 0 && i0 < dims_data[0]));
TFLITE_DCHECK((dims_data[1] == 0 && i1 == 0) ||
(i1 >= 0 && i1 < dims_data[1]));
TFLITE_DCHECK((dims_data[2] == 0 && i2 == 0) ||
(i2 >= 0 && i2 < dims_data[2]));
TFLITE_DCHECK((dims_data[3] == 0 && i3 == 0) ||
(i3 >= 0 && i3 < dims_data[3]));
TFLITE_DCHECK((dims_data[4] == 0 && i4 == 0) ||
(i4 >= 0 && i4 < dims_data[4]));
return (((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3) *
dims_data[4] +
i4;
}
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_RUNTIME_SHAPE_H_

View File

@@ -0,0 +1,211 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_STRIDED_SLICE_LOGIC_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_STRIDED_SLICE_LOGIC_H_
#include <limits>
#include <vector>
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace strided_slice {
// Use until std::clamp() is available from C++17.
inline int Clamp(const int v, const int lo, const int hi) {
TFLITE_DCHECK(!(hi < lo));
if (hi < v) return hi;
if (v < lo) return lo;
return v;
}
inline void StridedSlicePadIndices(tflite::StridedSliceParams* p,
int dim_count) {
// Add indices and mask bits to fully include extra dimensions
TFLITE_CHECK_LE(dim_count, 5);
TFLITE_CHECK_GE(dim_count, p->start_indices_count);
TFLITE_CHECK_EQ(p->start_indices_count, p->stop_indices_count);
TFLITE_CHECK_EQ(p->stop_indices_count, p->strides_count);
const int pad_count = dim_count - p->start_indices_count;
// Pad indices at start, so move arrays by pad_count.
for (int i = p->start_indices_count - 1; i >= 0; --i) {
p->strides[i + pad_count] = p->strides[i];
p->start_indices[i + pad_count] = p->start_indices[i];
p->stop_indices[i + pad_count] = p->stop_indices[i];
}
for (int i = 0; i < pad_count; ++i) {
p->start_indices[i] = 0;
p->stop_indices[i] = 1;
p->strides[i] = 1;
}
// Pad masks with 0s or 1s as required.
p->shrink_axis_mask <<= pad_count;
p->ellipsis_mask <<= pad_count;
p->new_axis_mask <<= pad_count;
p->begin_mask <<= pad_count;
p->end_mask <<= pad_count;
p->begin_mask |= (1 << pad_count) - 1;
p->end_mask |= (1 << pad_count) - 1;
p->start_indices_count = dim_count;
p->stop_indices_count = dim_count;
p->strides_count = dim_count;
}
// Return the index for the first element along that axis. This index will be a
// positive integer between [0, axis_size] (or [-1, axis_size -1] if stride < 0)
// that can be used to index directly into the data.
inline int StartForAxis(const tflite::StridedSliceParams& params,
const RuntimeShape& input_shape, int axis) {
const auto begin_mask = params.begin_mask;
const auto* start_indices = params.start_indices;
const auto* strides = params.strides;
const int axis_size = input_shape.Dims(axis);
if (axis_size == 0) {
return 0;
}
// Begin with the specified index.
int start = start_indices[axis];
// begin_mask override
if (begin_mask & 1 << axis) {
if (strides[axis] > 0) {
// Forward iteration - use the first element. These values will get
// clamped below (Note: We could have set them to 0 and axis_size-1, but
// use lowest() and max() to maintain symmetry with StopForAxis())
start = std::numeric_limits<int>::lowest();
} else {
// Backward iteration - use the last element.
start = std::numeric_limits<int>::max();
}
}
// Handle negative indices
if (start < 0) {
start += axis_size;
}
// Clamping
if (strides[axis] > 0) {
// Forward iteration
start = Clamp(start, 0, axis_size);
} else {
// Backward iteration
start = Clamp(start, -1, axis_size - 1);
}
return start;
}
// Return the "real" index for the end of iteration along that axis. This is an
// "end" in the traditional C sense, in that it points to one past the last
// element. ie. So if you were iterating through all elements of a 1D array of
// size 4, this function would return 4 as the stop, because it is one past the
// "real" indices of 0, 1, 2 & 3.
inline int StopForAxis(const tflite::StridedSliceParams& params,
const RuntimeShape& input_shape, int axis,
int start_for_axis) {
const auto end_mask = params.end_mask;
const auto shrink_axis_mask = params.shrink_axis_mask;
const auto* stop_indices = params.stop_indices;
const auto* strides = params.strides;
const int axis_size = input_shape.Dims(axis);
if (axis_size == 0) {
return 0;
}
// Begin with the specified index
const bool shrink_axis = shrink_axis_mask & (1 << axis);
int stop = stop_indices[axis];
// When shrinking an axis, the end position does not matter (and can be
// incorrect when negative indexing is used, see Issue #19260). Always use
// start_for_axis + 1 to generate a length 1 slice, since start_for_axis has
// already been adjusted for negative indices.
if (shrink_axis) {
return start_for_axis + 1;
}
// end_mask override
if (end_mask & (1 << axis)) {
if (strides[axis] > 0) {
// Forward iteration - use the last element. These values will get
// clamped below
stop = std::numeric_limits<int>::max();
} else {
// Backward iteration - use the first element.
stop = std::numeric_limits<int>::lowest();
}
}
// Handle negative indices
if (stop < 0) {
stop += axis_size;
}
// Clamping
// Because the end index points one past the last element, we need slightly
// different clamping ranges depending on the direction.
if (strides[axis] > 0) {
// Forward iteration
stop = Clamp(stop, 0, axis_size);
} else {
// Backward iteration
stop = Clamp(stop, -1, axis_size - 1);
}
return stop;
}
inline bool LoopCondition(int index, int stop, int stride) {
// True when we have reached the end of an axis and should loop.
return stride > 0 ? index >= stop : index <= stop;
}
inline tflite::StridedSliceParams BuildStridedSliceParams(
int begin_mask, int end_mask, int shrink_axis_mask,
const std::vector<int>& start_indices, const std::vector<int>& stop_indices,
const std::vector<int>& strides) {
tflite::StridedSliceParams op_params;
const int dims_count = start_indices.size();
op_params.start_indices_count = dims_count;
op_params.stop_indices_count = dims_count;
op_params.strides_count = dims_count;
for (int i = 0; i < dims_count; ++i) {
op_params.start_indices[i] = start_indices[i];
op_params.stop_indices[i] = stop_indices[i];
op_params.strides[i] = strides[i];
}
op_params.begin_mask = begin_mask;
op_params.ellipsis_mask = 0;
op_params.end_mask = end_mask;
op_params.new_axis_mask = 0;
op_params.shrink_axis_mask = shrink_axis_mask;
return op_params;
}
} // namespace strided_slice
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_STRIDED_SLICE_LOGIC_H_

View File

@@ -0,0 +1,47 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_CTYPES_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_CTYPES_H_
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
template <typename T>
inline T* GetTensorData(TfLiteTensor* tensor) {
return tensor != nullptr ? reinterpret_cast<T*>(tensor->data.raw) : nullptr;
}
template <typename T>
inline const T* GetTensorData(const TfLiteTensor* tensor) {
return tensor != nullptr ? reinterpret_cast<const T*>(tensor->data.raw)
: nullptr;
}
inline RuntimeShape GetTensorShape(const TfLiteTensor* tensor) {
if (tensor == nullptr) {
return RuntimeShape();
}
TfLiteIntArray* dims = tensor->dims;
const int dims_size = dims->size;
const int32_t* dims_data = reinterpret_cast<const int32_t*>(dims->data);
return RuntimeShape(dims_size, dims_data);
}
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_CTYPES_H_

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,578 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/kernel_util.h"
#include <stdint.h>
#include <stdlib.h>
#include <algorithm>
#include <complex>
#include <limits>
#include <memory>
#ifndef TF_LITE_STATIC_MEMORY
#include <string>
#endif // TF_LITE_STATIC_MEMORY
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#if defined(__APPLE__)
#include "TargetConditionals.h"
#endif
namespace tflite {
namespace {
// Assumes tensor_index is a valid index (in bounds)
inline TfLiteTensor* GetTensorAtIndex(const TfLiteContext* context,
int tensor_index) {
if (context->tensors != nullptr) {
return &context->tensors[tensor_index];
} else {
return context->GetTensor(context, tensor_index);
}
}
// Validate in a single place to reduce binary size
inline TfLiteStatus ValidateTensorIndexingSafe(const TfLiteContext* context,
int index, int max_size,
const int* tensor_indices,
int* tensor_index) {
if (index < 0 || index >= max_size) {
TF_LITE_KERNEL_LOG(const_cast<TfLiteContext*>(context),
"Invalid tensor index %d (not in [0, %d))\n", index,
max_size);
return kTfLiteError;
}
if (tensor_indices[index] == kTfLiteOptionalTensor) {
TF_LITE_KERNEL_LOG(const_cast<TfLiteContext*>(context),
"Tensor at index %d was optional but was expected\n",
index);
return kTfLiteError;
}
*tensor_index = tensor_indices[index];
return kTfLiteOk;
}
// Same as above but returns -1 for invalid inputs instead of status + logging
// error.
inline int ValidateTensorIndexing(const TfLiteContext* context, int index,
int max_size, const int* tensor_indices) {
if (index >= 0 && index < max_size) {
const int tensor_index = tensor_indices[index];
if (tensor_index != kTfLiteOptionalTensor) {
return tensor_index;
}
}
return -1;
}
inline TfLiteTensor* GetMutableInput(const TfLiteContext* context,
const TfLiteNode* node, int index) {
const int tensor_index = ValidateTensorIndexing(
context, index, node->inputs->size, node->inputs->data);
if (tensor_index < 0) {
return nullptr;
}
return GetTensorAtIndex(context, tensor_index);
}
inline TfLiteStatus GetMutableInputSafe(const TfLiteContext* context,
const TfLiteNode* node, int index,
const TfLiteTensor** tensor) {
int tensor_index;
TF_LITE_ENSURE_OK(
context, ValidateTensorIndexingSafe(context, index, node->inputs->size,
node->inputs->data, &tensor_index));
*tensor = GetTensorAtIndex(context, tensor_index);
return kTfLiteOk;
}
} // anonymous namespace.
const TfLiteTensor* GetInput(const TfLiteContext* context,
const TfLiteNode* node, int index) {
return GetMutableInput(context, node, index);
}
TfLiteStatus GetInputSafe(const TfLiteContext* context, const TfLiteNode* node,
int index, const TfLiteTensor** tensor) {
return GetMutableInputSafe(context, node, index, tensor);
}
TfLiteTensor* GetVariableInput(TfLiteContext* context, const TfLiteNode* node,
int index) {
TfLiteTensor* tensor = GetMutableInput(context, node, index);
if (tensor == nullptr) return nullptr;
return tensor->is_variable ? tensor : nullptr;
}
TfLiteTensor* GetOutput(TfLiteContext* context, const TfLiteNode* node,
int index) {
const int tensor_index = ValidateTensorIndexing(
context, index, node->outputs->size, node->outputs->data);
if (tensor_index < 0) {
return nullptr;
}
return GetTensorAtIndex(context, tensor_index);
}
TfLiteStatus GetOutputSafe(const TfLiteContext* context, const TfLiteNode* node,
int index, TfLiteTensor** tensor) {
int tensor_index;
TF_LITE_ENSURE_OK(
context, ValidateTensorIndexingSafe(context, index, node->outputs->size,
node->outputs->data, &tensor_index));
*tensor = GetTensorAtIndex(context, tensor_index);
return kTfLiteOk;
}
const TfLiteTensor* GetOptionalInputTensor(const TfLiteContext* context,
const TfLiteNode* node, int index) {
return GetInput(context, node, index);
}
#ifndef TF_LITE_STATIC_MEMORY
TfLiteTensor* GetTemporary(TfLiteContext* context, const TfLiteNode* node,
int index) {
const int tensor_index = ValidateTensorIndexing(
context, index, node->temporaries->size, node->temporaries->data);
if (tensor_index < 0) {
return nullptr;
}
return GetTensorAtIndex(context, tensor_index);
}
TfLiteStatus GetTemporarySafe(const TfLiteContext* context,
const TfLiteNode* node, int index,
TfLiteTensor** tensor) {
int tensor_index;
TF_LITE_ENSURE_OK(context, ValidateTensorIndexingSafe(
context, index, node->temporaries->size,
node->temporaries->data, &tensor_index));
*tensor = GetTensorAtIndex(context, tensor_index);
return kTfLiteOk;
}
const TfLiteTensor* GetIntermediates(TfLiteContext* context,
const TfLiteNode* node, int index) {
const int tensor_index = ValidateTensorIndexing(
context, index, node->intermediates->size, node->intermediates->data);
if (tensor_index < 0) {
return nullptr;
}
return GetTensorAtIndex(context, tensor_index);
}
TfLiteStatus GetIntermediatesSafe(const TfLiteContext* context,
const TfLiteNode* node, int index,
TfLiteTensor** tensor) {
int tensor_index;
TF_LITE_ENSURE_OK(context, ValidateTensorIndexingSafe(
context, index, node->intermediates->size,
node->intermediates->data, &tensor_index));
*tensor = GetTensorAtIndex(context, tensor_index);
return kTfLiteOk;
}
#endif // TF_LITE_STATIC_MEMORY
// Per-axis
TfLiteStatus PopulateConvolutionQuantizationParams(
TfLiteContext* context, const TfLiteTensor* input,
const TfLiteTensor* filter, const TfLiteTensor* bias, TfLiteTensor* output,
const TfLiteFusedActivation& activation, int32_t* multiplier, int* shift,
int32_t* output_activation_min, int32_t* output_activation_max,
int32_t* per_channel_multiplier, int32_t* per_channel_shift) {
const auto* affine_quantization =
reinterpret_cast<TfLiteAffineQuantization*>(filter->quantization.params);
return PopulateConvolutionQuantizationParams(
context, input, filter, bias, output, activation, multiplier, shift,
output_activation_min, output_activation_max, per_channel_multiplier,
per_channel_shift, affine_quantization->scale->size);
}
// Per-axis & per-tensor
TfLiteStatus PopulateConvolutionQuantizationParams(
TfLiteContext* context, const TfLiteTensor* input,
const TfLiteTensor* filter, const TfLiteTensor* bias, TfLiteTensor* output,
const TfLiteFusedActivation& activation, int32_t* multiplier, int* shift,
int32_t* output_activation_min, int32_t* output_activation_max,
int32_t* per_channel_multiplier, int32_t* per_channel_shift,
int num_channels) {
TF_LITE_ENSURE_EQ(context, input->quantization.type,
kTfLiteAffineQuantization);
TF_LITE_ENSURE_EQ(context, filter->quantization.type,
kTfLiteAffineQuantization);
// TODO(jianlijianli): Enable bias type check and bias scale == input scale
// * filter scale for each channel in affine quantization once bias
// quantization is properly populated.
// TF_LITE_ENSURE_EQ(context, bias->quantization.type,
// kTfLiteAffineQuantization);
// Check data type.
const auto* affine_quantization =
reinterpret_cast<TfLiteAffineQuantization*>(filter->quantization.params);
TF_LITE_ENSURE(context, affine_quantization);
TF_LITE_ENSURE(context, affine_quantization->scale);
const bool is_per_channel = affine_quantization->scale->size > 1;
if (is_per_channel) {
// Currently only Int8/Int16 is supported for per channel quantization.
TF_LITE_ENSURE(context,
input->type == kTfLiteInt8 || input->type == kTfLiteInt16);
TF_LITE_ENSURE_EQ(context, filter->type, kTfLiteInt8);
TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size, num_channels);
TF_LITE_ENSURE_EQ(
context, num_channels,
filter->dims->data[affine_quantization->quantized_dimension]);
}
// Populate multiplier and shift using affine quantization.
const float input_scale = input->params.scale;
const float output_scale = output->params.scale;
const float* filter_scales = affine_quantization->scale->data;
for (int i = 0; i < num_channels; ++i) {
// If per-tensor quantization parameter is specified, broadcast it along the
// quantization dimension (channels_out).
const float scale = is_per_channel ? filter_scales[i] : filter_scales[0];
const double filter_scale = static_cast<double>(scale);
const double effective_output_scale = static_cast<double>(input_scale) *
filter_scale /
static_cast<double>(output_scale);
int32_t significand;
int channel_shift;
QuantizeMultiplier(effective_output_scale, &significand, &channel_shift);
per_channel_multiplier[i] = significand;
per_channel_shift[i] = channel_shift;
}
// Populate scalar quantization parameters.
// This check on legacy quantization parameters is kept only for backward
// compatibility.
if (input->type == kTfLiteUInt8) {
// Check bias scale == input scale * filter scale.
double real_multiplier = 0.0;
TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
context, input, filter, bias, output, &real_multiplier));
int exponent;
// Populate quantization parameters with multiplier and shift.
QuantizeMultiplier(real_multiplier, multiplier, &exponent);
*shift = -exponent;
}
if (input->type == kTfLiteInt8 || input->type == kTfLiteUInt8 ||
input->type == kTfLiteInt16) {
TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
context, activation, output, output_activation_min,
output_activation_max));
}
return kTfLiteOk;
}
TfLiteStatus GetQuantizedConvolutionMultipler(TfLiteContext* context,
const TfLiteTensor* input,
const TfLiteTensor* filter,
const TfLiteTensor* bias,
TfLiteTensor* output,
double* multiplier) {
const double input_product_scale = static_cast<double>(input->params.scale) *
static_cast<double>(filter->params.scale);
// The following conditions must be guaranteed by the training pipeline.
if (bias) {
const double bias_scale = static_cast<double>(bias->params.scale);
// Here we're making sure the input_product_scale & bias_scale are about the
// same. Since we have:
// (output - output_zp) * output_scale =
// input_product_scale * input_product + bias * bias_scale ---- (0)
//
// (0) equals:
// (input_product + bias) * input_product_scale ----- (1)
// +
// bias * (bias_scale - input_product_scale) ------ (2)
//
// For the real kernel computation, we're doing (1), so we really need to
// make sure (2) has minimum impact on the output, so:
// bias * (bias_scale - input_product_scale) / output_scale should be
// a small number for an integer.
// Since normally bias should be within a small range.
// We should expect (bias_scale - input_product_scale) / output_scale to
// be a small number like 0.02.
const double scale_diff = std::abs(input_product_scale - bias_scale);
const double output_scale = static_cast<double>(output->params.scale);
TF_LITE_ENSURE(context, scale_diff / output_scale <= 0.02);
}
return GetQuantizedConvolutionMultipler(context, input, filter, output,
multiplier);
}
TfLiteStatus GetQuantizedConvolutionMultipler(TfLiteContext* context,
const TfLiteTensor* input,
const TfLiteTensor* filter,
TfLiteTensor* output,
double* multiplier) {
const double input_product_scale =
static_cast<double>(input->params.scale * filter->params.scale);
TF_LITE_ENSURE(context, input_product_scale >= 0);
*multiplier = input_product_scale / static_cast<double>(output->params.scale);
return kTfLiteOk;
}
namespace {
inline TfLiteStatus Quantize(TfLiteContext* context, float scale,
int32_t zero_point, float f, int32_t& q) {
const float tmp = TfLiteRound(f / scale);
const bool no_integer_overflow_from_quantization =
(tmp >= static_cast<float>(std::numeric_limits<int32_t>::min()) &&
tmp <= static_cast<float>(std::numeric_limits<int32_t>::max()));
TF_LITE_ENSURE(context, no_integer_overflow_from_quantization);
q = zero_point + static_cast<int32_t>(tmp);
return kTfLiteOk;
}
TfLiteStatus CalculateActivationRangeQuantizedImpl(
TfLiteContext* context, TfLiteFusedActivation activation, int32_t qmin,
int32_t qmax, TfLiteTensor* output, int32_t* act_min, int32_t* act_max) {
const auto scale = output->params.scale;
const auto zero_point = output->params.zero_point;
int32_t tmp_q;
if (activation == kTfLiteActRelu) {
TF_LITE_ENSURE_OK(context,
Quantize(context, scale, zero_point, 0.0, tmp_q));
*act_min = std::max(qmin, tmp_q);
*act_max = qmax;
} else if (activation == kTfLiteActRelu6) {
TF_LITE_ENSURE_OK(context,
Quantize(context, scale, zero_point, 0.0, tmp_q));
*act_min = std::max(qmin, tmp_q);
TF_LITE_ENSURE_OK(context,
Quantize(context, scale, zero_point, 6.0, tmp_q));
*act_max = std::min(qmax, tmp_q);
} else if (activation == kTfLiteActReluN1To1) {
TF_LITE_ENSURE_OK(context,
Quantize(context, scale, zero_point, -1.0, tmp_q));
*act_min = std::max(qmin, tmp_q);
TF_LITE_ENSURE_OK(context,
Quantize(context, scale, zero_point, 1.0, tmp_q));
*act_max = std::min(qmax, tmp_q);
} else {
*act_min = qmin;
*act_max = qmax;
}
return kTfLiteOk;
}
} // namespace
TfLiteStatus CalculateActivationRangeQuantized(TfLiteContext* context,
TfLiteFusedActivation activation,
TfLiteTensor* output,
int32_t* act_min,
int32_t* act_max) {
int32_t qmin = 0;
int32_t qmax = 0;
if (output->type == kTfLiteUInt8) {
qmin = std::numeric_limits<uint8_t>::min();
qmax = std::numeric_limits<uint8_t>::max();
} else if (output->type == kTfLiteInt8) {
qmin = std::numeric_limits<int8_t>::min();
qmax = std::numeric_limits<int8_t>::max();
} else if (output->type == kTfLiteInt16) {
qmin = std::numeric_limits<int16_t>::min();
qmax = std::numeric_limits<int16_t>::max();
} else {
TF_LITE_ENSURE(context, false);
}
return CalculateActivationRangeQuantizedImpl(context, activation, qmin, qmax,
output, act_min, act_max);
}
bool HaveSameShapes(const TfLiteTensor* input1, const TfLiteTensor* input2) {
return TfLiteIntArrayEqual(input1->dims, input2->dims);
}
#ifndef TF_LITE_STATIC_MEMORY
TfLiteStatus GetOutputShapeFromInput(TfLiteContext* context,
const TfLiteTensor* input,
TfLiteIntArray** output_shape) {
if (NumDimensions(input) != 1) {
TF_LITE_KERNEL_LOG(const_cast<TfLiteContext*>(context),
"Invalid %dD input tensor (must be a 1D tensor).",
NumDimensions(input));
return kTfLiteError;
}
const int output_dims = SizeOfDimension(input, 0);
std::unique_ptr<TfLiteIntArray, void (*)(TfLiteIntArray*)> shape(
TfLiteIntArrayCreate(output_dims), TfLiteIntArrayFree);
for (int i = 0; i < output_dims; i++) {
shape->data[i] = input->data.i32[i];
}
*output_shape = shape.release();
return kTfLiteOk;
}
// TODO(b/172067338): Having this function be part of TF_LITE_STATIC_MEMORY
// build results in a 6KB size increase, even though the function is unsused for
// that build. What appears to be happening is that while the linker drops the
// unsused function, the string library that gets pulled in is not dropped,
// resulting in the increased binary size.
const std::string GetShapeDebugString(const TfLiteIntArray* shape) {
std::string str;
for (int d = 0; d < shape->size; ++d) {
if (str.empty())
str = "[" + std::to_string(shape->data[d]);
else
// Don't add space after "," to make the output consistent with
// tensorflow::shape_inference::InferenceContext::DebugString()
str += "," + std::to_string(shape->data[d]);
}
if (str.empty()) {
str = "[]";
} else {
str += "]";
}
return str;
}
TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
const TfLiteTensor* input1,
const TfLiteTensor* input2,
TfLiteIntArray** output_shape) {
const int dims1 = NumDimensions(input1);
const int dims2 = NumDimensions(input2);
const int out_dims = std::max(dims1, dims2);
std::unique_ptr<TfLiteIntArray, void (*)(TfLiteIntArray*)> shape(
TfLiteIntArrayCreate(out_dims), TfLiteIntArrayFree);
for (int i = 0; i < out_dims; ++i) {
const int d1 = i >= dims1 ? 1 : SizeOfDimension(input1, dims1 - i - 1);
const int d2 = i >= dims2 ? 1 : SizeOfDimension(input2, dims2 - i - 1);
if (!(d1 == d2 || d1 == 1 || d2 == 1)) {
context->ReportError(context,
"Given shapes, %s and %s, are not broadcastable.",
GetShapeDebugString(input1->dims).c_str(),
GetShapeDebugString(input2->dims).c_str());
return kTfLiteError;
}
if (d1 == 0 || d2 == 0) {
shape->data[out_dims - i - 1] = 0;
} else {
shape->data[out_dims - i - 1] = std::max(d1, d2);
}
}
*output_shape = shape.release();
return kTfLiteOk;
}
TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
const TfLiteTensor* input1,
const TfLiteTensor* input2,
const TfLiteTensor* input3,
TfLiteIntArray** output_shape) {
const int dims1 = NumDimensions(input1);
const int dims2 = NumDimensions(input2);
const int dims3 = NumDimensions(input3);
const int out_dims = std::max(std::max(dims1, dims2), dims3);
std::unique_ptr<TfLiteIntArray, void (*)(TfLiteIntArray*)> shape(
TfLiteIntArrayCreate(out_dims), TfLiteIntArrayFree);
for (int i = 0; i < out_dims; ++i) {
const int d1 = i >= dims1 ? 1 : SizeOfDimension(input1, dims1 - i - 1);
const int d2 = i >= dims2 ? 1 : SizeOfDimension(input2, dims2 - i - 1);
const int d3 = i >= dims3 ? 1 : SizeOfDimension(input3, dims3 - i - 1);
const int min_value = std::min(std::min(d1, d2), d3);
int max_value = std::max(std::max(d1, d2), d3);
// If one dimention is 0, others must be 0 or 1.
if (min_value == 0) max_value = 0;
if (!(d1 == 1 || d1 == max_value) || !(d2 == 1 || d2 == max_value) ||
!(d3 == 1 || d3 == max_value)) {
context->ReportError(
context, "Given shapes, %s, %s and %s, are not broadcastable.",
GetShapeDebugString(input1->dims).c_str(),
GetShapeDebugString(input2->dims).c_str(),
GetShapeDebugString(input3->dims).c_str());
return kTfLiteError;
}
shape->data[out_dims - i - 1] = max_value;
}
*output_shape = shape.release();
return kTfLiteOk;
}
#endif // TF_LITE_STATIC_MEMORY
// Size of string is not constant, return 0 in such case.
int TfLiteTypeGetSize(TfLiteType type) {
switch (type) {
case kTfLiteUInt8:
static_assert(sizeof(uint8_t) == 1, "");
return 1;
case kTfLiteInt8:
static_assert(sizeof(int8_t) == 1, "");
return 1;
case kTfLiteBool:
return sizeof(bool);
case kTfLiteInt16:
static_assert(sizeof(int16_t) == 2, "");
return 2;
case kTfLiteFloat16:
static_assert(sizeof(int16_t) == 2, "");
return 2;
case kTfLiteFloat32:
static_assert(sizeof(float) == 4, "");
return 4;
case kTfLiteInt32:
static_assert(sizeof(int32_t) == 4, "");
return 4;
case kTfLiteUInt32:
static_assert(sizeof(uint32_t) == 4, "");
return 4;
case kTfLiteInt64:
static_assert(sizeof(int64_t) == 8, "");
return 8;
case kTfLiteUInt64:
static_assert(sizeof(uint64_t) == 8, "");
return 8;
case kTfLiteFloat64:
static_assert(sizeof(double) == 8, "");
return 8;
case kTfLiteComplex64:
static_assert(sizeof(std::complex<float>) == 8, "");
return 8;
case kTfLiteComplex128:
static_assert(sizeof(std::complex<double>) == 16, "");
return 16;
default:
return 0;
}
}
bool IsMobilePlatform() {
#if defined(ANDROID) || defined(__ANDROID__)
return true;
#elif defined(__APPLE__)
#if TARGET_IPHONE_SIMULATOR || TARGET_OS_IPHONE
return true;
#endif
#endif
return false;
}
} // namespace tflite

View File

@@ -0,0 +1,319 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_KERNEL_UTIL_H_
#define TENSORFLOW_LITE_KERNELS_KERNEL_UTIL_H_
#include <stdint.h>
#include <limits>
#ifndef TF_LITE_STATIC_MEMORY
#include <string>
#endif // TF_LITE_STATIC_MEMORY
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
namespace tflite {
// A fair number of functions in this header have historically been inline.
// It is ok to change functions to not be inline if the latency with
// benchmark_model for MobileNet + MobileBERT is unaffected. If such a change is
// made, move the newly non-inlined function declarations to the top of this
// header file.
// Note: You must check if result is not null:
//
// TfLiteTensor* my_tensor = GetInput(context, node, kMyTensorIdx);
// TF_LITE_ENSURE(context, my_tensor != nullptr);
//
// This is because the index might point to the optional tensor constant
// (kTfLiteOptionalTensor) in which case there is no tensor to return.
const TfLiteTensor* GetInput(const TfLiteContext* context,
const TfLiteNode* node, int index);
// Same as `GetInput` but returns boolean and uses output argument for tensor.
//
// TfLiteTensor* my_tensor;
// TF_LITE_ENSURE_OK(context,
// GetInputSafe(context, node, kMyTensorIdx, &my_tensor));
// // can use my_tensor directly from here onwards, it is not nullptr
//
// Should be used in cases where the binary size is too large.
TfLiteStatus GetInputSafe(const TfLiteContext* context, const TfLiteNode* node,
int index, const TfLiteTensor** tensor);
// Note: You must check if result is not null:
//
// TfLiteTensor* my_tensor = GetVariableInput(context, node, kMyTensorIdx);
// TF_LITE_ENSURE(context, my_tensor != nullptr);
//
// This is because the index might point to the optional tensor constant
// (kTfLiteOptionalTensor) in which case there is no tensor to return.
TfLiteTensor* GetVariableInput(TfLiteContext* context, const TfLiteNode* node,
int index);
// Note: You must check if result is not null:
//
// TfLiteTensor* my_tensor = GetOutput(context, node, kMyTensorIdx);
// TF_LITE_ENSURE(context, my_tensor != nullptr);
//
// This is because the index might point to the optional tensor constant
// (kTfLiteOptionalTensor) in which case there is no tensor to return.
TfLiteTensor* GetOutput(TfLiteContext* context, const TfLiteNode* node,
int index);
// Same as `GetOutput` but returns boolean and uses output argument for tensor.
//
// TfLiteTensor* my_tensor;
// TF_LITE_ENSURE_OK(context,
// GetOutputSafe(context, node, kMyTensorIdx, &my_tensor));
// // can use my_tensor directly from here onwards, it is not nullptr
//
// Should be used in cases where the binary size is too large.
TfLiteStatus GetOutputSafe(const TfLiteContext* context, const TfLiteNode* node,
int index, TfLiteTensor** tensor);
// Note: You must check if result is not null:
//
// TfLiteTensor* my_tensor = GetOptionalInputTensor(context, node, kIdx);
// TF_LITE_ENSURE(context, my_tensor != nullptr);
//
// This is because the index might point to the optional tensor constant
// (kTfLiteOptionalTensor) in which case there is no tensor to return.
//
// Deprecated. GetInput has the same functionality.
const TfLiteTensor* GetOptionalInputTensor(const TfLiteContext* context,
const TfLiteNode* node, int index);
#ifndef TF_LITE_STATIC_MEMORY
// Note: You must check if result is not null:
//
// TfLiteTensor* my_tensor = GetTemporary(context, node, kMyTensorIdx);
// TF_LITE_ENSURE(context, my_tensor != nullptr);
//
// This is because the index might point to the optional tensor constant
// (kTfLiteOptionalTensor) in which case there is no tensor to return.
TfLiteTensor* GetTemporary(TfLiteContext* context, const TfLiteNode* node,
int index);
// Same as `GetTemporary` but returns boolean and uses output argument for
// tensor.
//
// TfLiteTensor* my_tensor;
// TF_LITE_ENSURE_OK(context,
// GetTemporarySafe(context, node, kMyTensorIdx,
// &my_tensor));
// // can use my_tensor directly from here onwards, it is not nullptr
//
// Should be used in cases where the binary size is too large.
TfLiteStatus GetTemporarySafe(const TfLiteContext* context,
const TfLiteNode* node, int index,
TfLiteTensor** tensor);
// Note: You must check if result is not null:
//
// TfLiteTensor* my_tensor = GetIntermediates(context, node, kMyTensorIdx);
// TF_LITE_ENSURE(context, my_tensor != nullptr);
//
// This is because the index might point to the optional tensor constant
// (kTfLiteOptionalTensor) in which case there is no tensor to return.
const TfLiteTensor* GetIntermediates(TfLiteContext* context,
const TfLiteNode* node, int index);
// Same as `GetIntermediates` but returns boolean and uses output argument for
// tensor.
//
// TfLiteTensor* my_tensor;
// TF_LITE_ENSURE_OK(context,
// GetIntermediatesSafe(context, node, kMyTensorIdx,
// &my_tensor));
// // can use my_tensor directly from here onwards, it is not nullptr
//
// Should be used in cases where the binary size is too large.
TfLiteStatus GetIntermediatesSafe(const TfLiteContext* context,
const TfLiteNode* node, int index,
TfLiteTensor** tensor);
#endif // TF_LITE_STATIC_MEMORY
inline int NumDimensions(const TfLiteTensor* t) { return t->dims->size; }
inline int SizeOfDimension(const TfLiteTensor* t, int dim) {
return t->dims->data[dim];
}
inline int NumInputs(const TfLiteNode* node) {
return node->inputs == nullptr ? 0 : node->inputs->size;
}
inline int NumOutputs(const TfLiteNode* node) {
return node->outputs == nullptr ? 0 : node->outputs->size;
}
#ifndef TF_LITE_STATIC_MEMORY
inline int NumIntermediates(const TfLiteNode* node) {
return node->intermediates->size;
}
#endif // TF_LITE_STATIC_MEMORY
inline int64_t NumElements(const TfLiteIntArray* dims) {
int64_t count = 1;
for (int i = 0; i < dims->size; ++i) {
count *= dims->data[i];
}
return count;
}
inline int64_t NumElements(const TfLiteTensor* t) {
return NumElements(t->dims);
}
// Determines whether tensor is constant.
// TODO(b/138199592): Introduce new query which checks for constant OR
// persistent-read-only, which would be useful for most tensor kernels that
// are potentially dynamic based on the input tensor value availability at the
// time of prepare.
inline bool IsConstantTensor(const TfLiteTensor* tensor) {
return tensor->allocation_type == kTfLiteMmapRo;
}
inline bool IsConstantOrPersistentTensor(const TfLiteTensor* tensor) {
return IsConstantTensor(tensor) ||
(tensor->allocation_type == kTfLitePersistentRo);
}
// Determines whether tensor is dynamic. Note that a tensor can be non-const and
// not dynamic. This function specifically checks for a dynamic tensor.
inline bool IsDynamicTensor(const TfLiteTensor* tensor) {
return tensor->allocation_type == kTfLiteDynamic;
}
// Sets tensor to dynamic.
inline void SetTensorToDynamic(TfLiteTensor* tensor) {
if (tensor->allocation_type != kTfLiteDynamic) {
tensor->allocation_type = kTfLiteDynamic;
tensor->data.raw = nullptr;
}
}
// Sets tensor to persistent and read-only.
inline void SetTensorToPersistentRo(TfLiteTensor* tensor) {
if (tensor->allocation_type != kTfLitePersistentRo) {
tensor->allocation_type = kTfLitePersistentRo;
tensor->data.raw = nullptr;
}
}
// Determines whether it is a hybrid op - one that has float inputs and
// quantized weights.
inline bool IsHybridOp(const TfLiteTensor* input, const TfLiteTensor* weight) {
return ((weight->type == kTfLiteUInt8 || weight->type == kTfLiteInt8) &&
input->type == kTfLiteFloat32);
}
// Check dimensionality match and populate OpData for Conv and DepthwiseConv.
TfLiteStatus PopulateConvolutionQuantizationParams(
TfLiteContext* context, const TfLiteTensor* input,
const TfLiteTensor* filter, const TfLiteTensor* bias, TfLiteTensor* output,
const TfLiteFusedActivation& activation, int32_t* multiplier, int* shift,
int32_t* output_activation_min, int32_t* output_activation_max,
int32_t* per_channel_multiplier, int32_t* per_channel_shift);
TfLiteStatus PopulateConvolutionQuantizationParams(
TfLiteContext* context, const TfLiteTensor* input,
const TfLiteTensor* filter, const TfLiteTensor* bias, TfLiteTensor* output,
const TfLiteFusedActivation& activation, int32_t* multiplier, int* shift,
int32_t* output_activation_min, int32_t* output_activation_max,
int32_t* per_channel_multiplier, int32_t* per_channel_shift,
int num_channels);
// Calculates the multiplication factor for a quantized convolution (or
// quantized depthwise convolution) involving the given tensors. Returns an
// error if the scales of the tensors are not compatible.
TfLiteStatus GetQuantizedConvolutionMultipler(TfLiteContext* context,
const TfLiteTensor* input,
const TfLiteTensor* filter,
const TfLiteTensor* bias,
TfLiteTensor* output,
double* multiplier);
TfLiteStatus GetQuantizedConvolutionMultipler(TfLiteContext* context,
const TfLiteTensor* input,
const TfLiteTensor* filter,
TfLiteTensor* output,
double* multiplier);
// Calculates the useful quantized range of an activation layer given its
// activation tensor.
TfLiteStatus CalculateActivationRangeQuantized(TfLiteContext* context,
TfLiteFusedActivation activation,
TfLiteTensor* output,
int32_t* act_min,
int32_t* act_max);
// Calculates the useful range of an activation layer given its activation
// tensor.a
template <typename T>
void CalculateActivationRange(TfLiteFusedActivation activation,
T* activation_min, T* activation_max) {
if (activation == kTfLiteActRelu) {
*activation_min = 0;
*activation_max = std::numeric_limits<T>::max();
} else if (activation == kTfLiteActRelu6) {
*activation_min = 0;
*activation_max = 6;
} else if (activation == kTfLiteActReluN1To1) {
*activation_min = -1;
*activation_max = 1;
} else {
*activation_min = std::numeric_limits<T>::lowest();
*activation_max = std::numeric_limits<T>::max();
}
}
// Return true if the given tensors have the same shape.
bool HaveSameShapes(const TfLiteTensor* input1, const TfLiteTensor* input2);
#if !defined(TF_LITE_STATIC_MEMORY)
// Gets the output shape from the input tensor.
TfLiteStatus GetOutputShapeFromInput(TfLiteContext* context,
const TfLiteTensor* input,
TfLiteIntArray** output_shape);
const std::string GetShapeDebugString(const TfLiteIntArray* shape);
#endif // !defined(TF_LITE_STATIC_MEMORY)
// Calculates the output_shape that is necessary for element-wise operations
// with broadcasting involving the two input tensors.
TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
const TfLiteTensor* input1,
const TfLiteTensor* input2,
TfLiteIntArray** output_shape);
// Calculates the output_shape that is necessary for element-wise operations
// with broadcasting involving the three input tensors.
TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
const TfLiteTensor* input1,
const TfLiteTensor* input2,
const TfLiteTensor* input3,
TfLiteIntArray** output_shape);
// Return the size of given type in bytes. Return 0 in in case of string.
int TfLiteTypeGetSize(TfLiteType type);
// Whether the current platform is mobile (Android or iOS).
bool IsMobilePlatform();
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_KERNEL_UTIL_H_

View File

@@ -0,0 +1,38 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_OP_MACROS_H_
#define TENSORFLOW_LITE_KERNELS_OP_MACROS_H_
#include "tensorflow/lite/micro/debug_log.h"
#if !defined(TF_LITE_MCU_DEBUG_LOG)
#include <cstdlib>
#define TFLITE_ABORT abort()
#else
inline void AbortImpl() {
DebugLog("HALTED\n");
while (1) {
}
}
#define TFLITE_ABORT AbortImpl();
#endif
#if defined(NDEBUG)
#define TFLITE_ASSERT_FALSE (static_cast<void>(0))
#else
#define TFLITE_ASSERT_FALSE TFLITE_ABORT
#endif
#endif // TENSORFLOW_LITE_KERNELS_OP_MACROS_H_

View File

@@ -0,0 +1,115 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_PADDING_H_
#define TENSORFLOW_LITE_KERNELS_PADDING_H_
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
inline int ComputePadding(int stride, int dilation_rate, int in_size,
int filter_size, int out_size) {
int effective_filter_size = (filter_size - 1) * dilation_rate + 1;
int padding = ((out_size - 1) * stride + effective_filter_size - in_size) / 2;
return padding > 0 ? padding : 0;
}
// It's not guaranteed that padding is symmetric. It's important to keep
// offset for algorithms need all paddings.
inline int ComputePaddingWithOffset(int stride, int dilation_rate, int in_size,
int filter_size, int out_size,
int* offset) {
int effective_filter_size = (filter_size - 1) * dilation_rate + 1;
int total_padding =
((out_size - 1) * stride + effective_filter_size - in_size);
total_padding = total_padding > 0 ? total_padding : 0;
*offset = total_padding % 2;
return total_padding / 2;
}
// Matching GetWindowedOutputSize in TensorFlow.
inline int ComputeOutSize(TfLitePadding padding, int image_size,
int filter_size, int stride, int dilation_rate = 1) {
int effective_filter_size = (filter_size - 1) * dilation_rate + 1;
// TODO(b/186448822): This uses 0 since the function has no other way to
// report error case
if (stride == 0) return 0;
switch (padding) {
case kTfLitePaddingSame:
return (image_size + stride - 1) / stride;
case kTfLitePaddingValid:
return (image_size + stride - effective_filter_size) / stride;
default:
return 0;
}
}
inline TfLitePaddingValues ComputePaddingHeightWidth(
int stride_height, int stride_width, int dilation_rate_height,
int dilation_rate_width, int in_height, int in_width, int filter_height,
int filter_width, TfLitePadding padding, int* out_height, int* out_width) {
*out_width = ComputeOutSize(padding, in_width, filter_width, stride_width,
dilation_rate_width);
*out_height = ComputeOutSize(padding, in_height, filter_height, stride_height,
dilation_rate_height);
TfLitePaddingValues padding_values;
int offset = 0;
padding_values.height =
ComputePaddingWithOffset(stride_height, dilation_rate_height, in_height,
filter_height, *out_height, &offset);
padding_values.height_offset = offset;
padding_values.width =
ComputePaddingWithOffset(stride_width, dilation_rate_width, in_width,
filter_width, *out_width, &offset);
padding_values.width_offset = offset;
return padding_values;
}
inline Padding3DValues ComputePadding3DValues(
int stride_height, int stride_width, int stride_depth,
int dilation_rate_height, int dilation_rate_width, int dilation_rate_depth,
int in_height, int in_width, int in_depth, int filter_height,
int filter_width, int filter_depth, TfLitePadding padding, int* out_height,
int* out_width, int* out_depth) {
*out_width = ComputeOutSize(padding, in_width, filter_width, stride_width,
dilation_rate_width);
*out_height = ComputeOutSize(padding, in_height, filter_height, stride_height,
dilation_rate_height);
*out_depth = ComputeOutSize(padding, in_depth, filter_depth, stride_depth,
dilation_rate_depth);
Padding3DValues padding_values;
int offset = 0;
padding_values.depth =
ComputePaddingWithOffset(stride_depth, dilation_rate_depth, in_depth,
filter_depth, *out_depth, &offset);
padding_values.depth_offset = offset;
padding_values.height =
ComputePaddingWithOffset(stride_height, dilation_rate_height, in_height,
filter_height, *out_height, &offset);
padding_values.height_offset = offset;
padding_values.width =
ComputePaddingWithOffset(stride_width, dilation_rate_width, in_width,
filter_width, *out_width, &offset);
padding_values.width_offset = offset;
return padding_values;
}
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_PADDING_H_