mirror of
https://github.com/jomjol/AI-on-the-edge-device.git
synced 2025-12-10 21:46:55 +03:00
Rolling 20220103
This commit is contained in:
1179
code/components/tflite-lib/tensorflow/lite/kernels/internal/common.h
Normal file
1179
code/components/tflite-lib/tensorflow/lite/kernels/internal/common.h
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,112 @@
|
||||
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#include "tensorflow/lite/kernels/op_macros.h"
|
||||
|
||||
#ifndef TFLITE_DCHECK
|
||||
#define TFLITE_DCHECK(condition) (condition) ? (void)0 : TFLITE_ASSERT_FALSE
|
||||
#endif
|
||||
|
||||
#ifndef TFLITE_DCHECK_EQ
|
||||
#define TFLITE_DCHECK_EQ(x, y) ((x) == (y)) ? (void)0 : TFLITE_ASSERT_FALSE
|
||||
#endif
|
||||
|
||||
#ifndef TFLITE_DCHECK_NE
|
||||
#define TFLITE_DCHECK_NE(x, y) ((x) != (y)) ? (void)0 : TFLITE_ASSERT_FALSE
|
||||
#endif
|
||||
|
||||
#ifndef TFLITE_DCHECK_GE
|
||||
#define TFLITE_DCHECK_GE(x, y) ((x) >= (y)) ? (void)0 : TFLITE_ASSERT_FALSE
|
||||
#endif
|
||||
|
||||
#ifndef TFLITE_DCHECK_GT
|
||||
#define TFLITE_DCHECK_GT(x, y) ((x) > (y)) ? (void)0 : TFLITE_ASSERT_FALSE
|
||||
#endif
|
||||
|
||||
#ifndef TFLITE_DCHECK_LE
|
||||
#define TFLITE_DCHECK_LE(x, y) ((x) <= (y)) ? (void)0 : TFLITE_ASSERT_FALSE
|
||||
#endif
|
||||
|
||||
#ifndef TFLITE_DCHECK_LT
|
||||
#define TFLITE_DCHECK_LT(x, y) ((x) < (y)) ? (void)0 : TFLITE_ASSERT_FALSE
|
||||
#endif
|
||||
|
||||
// TODO(ahentz): Clean up: We should stick to the DCHECK versions.
|
||||
#ifndef TFLITE_CHECK
|
||||
#define TFLITE_CHECK(condition) (condition) ? (void)0 : TFLITE_ABORT
|
||||
#endif
|
||||
|
||||
#ifndef TFLITE_CHECK_EQ
|
||||
#define TFLITE_CHECK_EQ(x, y) ((x) == (y)) ? (void)0 : TFLITE_ABORT
|
||||
#endif
|
||||
|
||||
#ifndef TFLITE_CHECK_NE
|
||||
#define TFLITE_CHECK_NE(x, y) ((x) != (y)) ? (void)0 : TFLITE_ABORT
|
||||
#endif
|
||||
|
||||
#ifndef TFLITE_CHECK_GE
|
||||
#define TFLITE_CHECK_GE(x, y) ((x) >= (y)) ? (void)0 : TFLITE_ABORT
|
||||
#endif
|
||||
|
||||
#ifndef TFLITE_CHECK_GT
|
||||
#define TFLITE_CHECK_GT(x, y) ((x) > (y)) ? (void)0 : TFLITE_ABORT
|
||||
#endif
|
||||
|
||||
#ifndef TFLITE_CHECK_LE
|
||||
#define TFLITE_CHECK_LE(x, y) ((x) <= (y)) ? (void)0 : TFLITE_ABORT
|
||||
#endif
|
||||
|
||||
#ifndef TFLITE_CHECK_LT
|
||||
#define TFLITE_CHECK_LT(x, y) ((x) < (y)) ? (void)0 : TFLITE_ABORT
|
||||
#endif
|
||||
|
||||
#ifndef TF_LITE_STATIC_MEMORY
|
||||
// TODO(b/162019032): Consider removing these type-aliases.
|
||||
using int8 = std::int8_t;
|
||||
using uint8 = std::uint8_t;
|
||||
using int16 = std::int16_t;
|
||||
using uint16 = std::uint16_t;
|
||||
using int32 = std::int32_t;
|
||||
using uint32 = std::uint32_t;
|
||||
#endif // !defined(TF_LITE_STATIC_MEMORY)
|
||||
|
||||
// TFLITE_DEPRECATED()
|
||||
//
|
||||
// Duplicated from absl/base/macros.h to avoid pulling in that library.
|
||||
// Marks a deprecated class, struct, enum, function, method and variable
|
||||
// declarations. The macro argument is used as a custom diagnostic message (e.g.
|
||||
// suggestion of a better alternative).
|
||||
//
|
||||
// Example:
|
||||
//
|
||||
// class TFLITE_DEPRECATED("Use Bar instead") Foo {...};
|
||||
// TFLITE_DEPRECATED("Use Baz instead") void Bar() {...}
|
||||
//
|
||||
// Every usage of a deprecated entity will trigger a warning when compiled with
|
||||
// clang's `-Wdeprecated-declarations` option. This option is turned off by
|
||||
// default, but the warnings will be reported by clang-tidy.
|
||||
#if defined(__clang__) && __cplusplus >= 201103L
|
||||
#define TFLITE_DEPRECATED(message) __attribute__((deprecated(message)))
|
||||
#endif
|
||||
|
||||
#ifndef TFLITE_DEPRECATED
|
||||
#define TFLITE_DEPRECATED(message)
|
||||
#endif
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_
|
||||
@@ -0,0 +1,40 @@
|
||||
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_CPPMATH_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_CPPMATH_H_
|
||||
|
||||
#include <cmath>
|
||||
|
||||
namespace tflite {
|
||||
|
||||
#if defined(TF_LITE_USE_GLOBAL_CMATH_FUNCTIONS) || \
|
||||
(defined(__ANDROID__) && !defined(__NDK_MAJOR__)) || defined(__ZEPHYR__)
|
||||
#define TF_LITE_GLOBAL_STD_PREFIX
|
||||
#else
|
||||
#define TF_LITE_GLOBAL_STD_PREFIX std
|
||||
#endif
|
||||
|
||||
#define DECLARE_STD_GLOBAL_SWITCH1(tf_name, std_name) \
|
||||
template <class T> \
|
||||
inline T tf_name(const T x) { \
|
||||
return TF_LITE_GLOBAL_STD_PREFIX::std_name(x); \
|
||||
}
|
||||
|
||||
DECLARE_STD_GLOBAL_SWITCH1(TfLiteRound, round);
|
||||
DECLARE_STD_GLOBAL_SWITCH1(TfLiteExpm1, expm1);
|
||||
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_CPPMATH_H_
|
||||
@@ -0,0 +1,35 @@
|
||||
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_MAX_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_MAX_H_
|
||||
|
||||
#include <cmath>
|
||||
|
||||
namespace tflite {
|
||||
|
||||
#if defined(TF_LITE_USE_GLOBAL_MAX) || defined(__ZEPHYR__)
|
||||
inline float TfLiteMax(const float& x, const float& y) {
|
||||
return std::max(x, y);
|
||||
}
|
||||
#else
|
||||
template <class T>
|
||||
inline T TfLiteMax(const T& x, const T& y) {
|
||||
return std::fmax(x, y);
|
||||
}
|
||||
#endif
|
||||
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_MAX_H_
|
||||
@@ -0,0 +1,35 @@
|
||||
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_MIN_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_MIN_H_
|
||||
|
||||
#include <cmath>
|
||||
|
||||
namespace tflite {
|
||||
|
||||
#if defined(TF_LITE_USE_GLOBAL_MIN) || defined(__ZEPHYR__)
|
||||
inline float TfLiteMin(const float& x, const float& y) {
|
||||
return std::min(x, y);
|
||||
}
|
||||
#else
|
||||
template <class T>
|
||||
inline T TfLiteMin(const T& x, const T& y) {
|
||||
return std::fmin(x, y);
|
||||
}
|
||||
#endif
|
||||
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_MIN_H_
|
||||
@@ -0,0 +1,20 @@
|
||||
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_CHECK_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_CHECK_H_
|
||||
|
||||
// TFLM does not need to utilize any Neon optimizations.
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_CHECK_H_
|
||||
@@ -0,0 +1,122 @@
|
||||
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_PORTABLE_TENSOR_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_PORTABLE_TENSOR_H_
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "tensorflow/lite/c/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
inline RuntimeShape GetTensorShape(std::vector<int32_t> data) {
|
||||
return RuntimeShape(data.size(), data.data());
|
||||
}
|
||||
|
||||
// A list of tensors in a format that can be used by kernels like split and
|
||||
// concatenation.
|
||||
template <typename T>
|
||||
class VectorOfTensors {
|
||||
public:
|
||||
// Build with the tensors in 'tensor_list'.
|
||||
VectorOfTensors(const TfLiteContext& context,
|
||||
const TfLiteIntArray& tensor_list) {
|
||||
int num_tensors = tensor_list.size;
|
||||
|
||||
all_data_.reserve(num_tensors);
|
||||
all_shape_.reserve(num_tensors);
|
||||
all_shape_ptr_.reserve(num_tensors);
|
||||
|
||||
for (int i = 0; i < num_tensors; ++i) {
|
||||
TfLiteTensor* t = &context.tensors[tensor_list.data[i]];
|
||||
all_data_.push_back(GetTensorData<T>(t));
|
||||
all_shape_.push_back(GetTensorShape(t));
|
||||
}
|
||||
|
||||
// Taking the pointer from inside a std::vector is only OK if the vector is
|
||||
// never modified, so we populate all_shape in the previous loop and then we
|
||||
// are free to grab iterators here.
|
||||
for (int i = 0; i < num_tensors; ++i) {
|
||||
all_shape_ptr_.push_back(&all_shape_[i]);
|
||||
}
|
||||
}
|
||||
// Return a pointer to the data pointers of all tensors in the list. For
|
||||
// example:
|
||||
// float* const* f = v.data();
|
||||
// f[0][1] is the second element of the first tensor.
|
||||
T* const* data() const { return all_data_.data(); }
|
||||
|
||||
// Return a pointer the shape pointers of all tensors in the list. For
|
||||
// example:
|
||||
// const RuntimeShape* const* d = v.dims();
|
||||
// dims[1] are the dimensions of the second tensor in the list.
|
||||
const RuntimeShape* const* shapes() const { return all_shape_ptr_.data(); }
|
||||
|
||||
private:
|
||||
std::vector<T*> all_data_;
|
||||
std::vector<RuntimeShape> all_shape_;
|
||||
std::vector<RuntimeShape*> all_shape_ptr_;
|
||||
};
|
||||
|
||||
// A list of quantized tensors in a format that can be used by kernels like
|
||||
// split and concatenation.
|
||||
class VectorOfQuantizedTensors : public VectorOfTensors<uint8_t> {
|
||||
public:
|
||||
// Build with the tensors in 'tensor_list'.
|
||||
VectorOfQuantizedTensors(const TfLiteContext& context,
|
||||
const TfLiteIntArray& tensor_list)
|
||||
: VectorOfTensors<uint8_t>(context, tensor_list) {
|
||||
for (int i = 0; i < tensor_list.size; ++i) {
|
||||
TfLiteTensor* t = &context.tensors[tensor_list.data[i]];
|
||||
zero_point_.push_back(t->params.zero_point);
|
||||
scale_.push_back(t->params.scale);
|
||||
}
|
||||
}
|
||||
|
||||
const float* scale() const { return scale_.data(); }
|
||||
const int32_t* zero_point() const { return zero_point_.data(); }
|
||||
|
||||
private:
|
||||
std::vector<int32_t> zero_point_;
|
||||
std::vector<float> scale_;
|
||||
};
|
||||
|
||||
// Writes randomly accessed values from `input` sequentially into `output`.
|
||||
template <typename T>
|
||||
class SequentialTensorWriter {
|
||||
public:
|
||||
SequentialTensorWriter(const TfLiteTensor* input, TfLiteTensor* output) {
|
||||
input_data_ = GetTensorData<T>(input);
|
||||
output_ptr_ = GetTensorData<T>(output);
|
||||
}
|
||||
SequentialTensorWriter(const T* input_data, T* output_data)
|
||||
: input_data_(input_data), output_ptr_(output_data) {}
|
||||
|
||||
void Write(int position) { *output_ptr_++ = input_data_[position]; }
|
||||
void WriteN(int position, int len) {
|
||||
memcpy(output_ptr_, &input_data_[position], sizeof(T) * len);
|
||||
output_ptr_ += len;
|
||||
}
|
||||
|
||||
private:
|
||||
const T* input_data_;
|
||||
T* output_ptr_;
|
||||
};
|
||||
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_PORTABLE_TENSOR_H_
|
||||
@@ -0,0 +1,124 @@
|
||||
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_PORTABLE_TENSOR_UTILS_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_PORTABLE_TENSOR_UTILS_H_
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
|
||||
#include "tensorflow/lite/c/builtin_op_data.h"
|
||||
#include "tensorflow/lite/c/common.h"
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#define __restrict__ __restrict
|
||||
#endif
|
||||
|
||||
namespace tflite {
|
||||
|
||||
namespace tensor_utils {
|
||||
|
||||
// Multiplies a matrix with a scalar and reduce the result on each row to a
|
||||
// scalar.
|
||||
// Parameters:
|
||||
// - matrix: matrix of size n_row * n_col
|
||||
// - scalar: the scalar that is multiplied to each element in the matrix
|
||||
// - n_row: the row count of the matrix
|
||||
// - n_col: the column count of the matrix
|
||||
// - output: the 32bit output
|
||||
// Note: We do not need saturation because the int8 * int8 is safe from overflow
|
||||
// in (2^31-1) / (2^14) = 131072, which is bigger than the n_row. Non-zero
|
||||
// initial output value is not exceptionally large.
|
||||
void MatrixScalarMultiplyAccumulate(const int8_t* matrix, int32_t scalar,
|
||||
int32_t n_row, int32_t n_col,
|
||||
int32_t* output);
|
||||
|
||||
// Add another vector for each batch in the batch vector.
|
||||
template <typename T>
|
||||
void VectorBatchVectorAdd(const T* vector, int v_size, int n_batch,
|
||||
T* batch_vector) {
|
||||
for (int b = 0; b < n_batch; b++) {
|
||||
for (int i = 0; i < v_size; ++i) {
|
||||
batch_vector[i] += vector[i];
|
||||
}
|
||||
batch_vector += v_size;
|
||||
}
|
||||
}
|
||||
|
||||
// Cwise product of two vectors.
|
||||
template <typename T>
|
||||
inline void VectorVectorCwiseProduct(const T* __restrict__ vector1,
|
||||
const T* __restrict__ vector2, int v_size,
|
||||
T* __restrict__ result) {
|
||||
for (int v = 0; v < v_size; v++) {
|
||||
*result++ = *vector1++ * *vector2++;
|
||||
}
|
||||
}
|
||||
|
||||
// Cwise product of a vector and a batch-vector.
|
||||
template <typename T>
|
||||
inline void VectorBatchVectorCwiseProduct(const T* vector, int v_size,
|
||||
const T* batch_vector, int n_batch,
|
||||
T* result) {
|
||||
for (int b = 0; b < n_batch; b++) {
|
||||
VectorVectorCwiseProduct(vector, batch_vector, v_size, result);
|
||||
// Update the pointers.
|
||||
result += v_size;
|
||||
batch_vector += v_size;
|
||||
}
|
||||
}
|
||||
|
||||
// Cwise product and accumulate of two vectors. Since it's a MAC operation, the
|
||||
// assumption here is that result array is initialized to valid values.
|
||||
template <typename T>
|
||||
inline void VectorVectorCwiseProductAccumulate(const T* __restrict__ vector1,
|
||||
const T* __restrict__ vector2,
|
||||
int v_size,
|
||||
T* __restrict__ result) {
|
||||
for (int v = 0; v < v_size; v++) {
|
||||
*result++ += *vector1++ * *vector2++;
|
||||
}
|
||||
}
|
||||
|
||||
// Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC
|
||||
// operation, the assumption here is that result array is initialized to valid
|
||||
// values.
|
||||
template <typename T>
|
||||
inline void VectorBatchVectorCwiseProductAccumulate(const T* vector, int v_size,
|
||||
const T* batch_vector,
|
||||
int n_batch, T* result) {
|
||||
for (int b = 0; b < n_batch; b++) {
|
||||
VectorVectorCwiseProductAccumulate(vector, batch_vector, v_size, result);
|
||||
// Update the pointers.
|
||||
result += v_size;
|
||||
batch_vector += v_size;
|
||||
}
|
||||
}
|
||||
|
||||
// Batch vector initialization with another vector.
|
||||
template <typename T>
|
||||
void VectorBatchVectorAssign(const T* vector, int v_size, int n_batch,
|
||||
T* batch_vector) {
|
||||
for (int b = 0; b < n_batch; b++) {
|
||||
std::copy_n(vector, v_size, batch_vector + b * v_size);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace tensor_utils
|
||||
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_PORTABLE_TENSOR_UTILS_H_
|
||||
@@ -0,0 +1,416 @@
|
||||
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/quantization_util.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <limits>
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/compatibility.h"
|
||||
#include "tensorflow/lite/kernels/internal/cppmath.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
namespace {
|
||||
// These constants are used to manipulate the binary representation of doubles.
|
||||
// Double-precision binary64 floating point format is:
|
||||
// Bit | 63 | 62-52 | 51-0 |
|
||||
// | Sign | Exponent | Fraction |
|
||||
// To avoid 64-bit integers as much as possible, I break this into high and
|
||||
// low 32-bit chunks. High is:
|
||||
// Bit | 31 | 30-20 | 19-0 |
|
||||
// | Sign | Exponent | High Fraction |
|
||||
// Low is:
|
||||
// Bit | 31-0 |
|
||||
// | Low Fraction |
|
||||
// We then access the components through logical bit-wise operations to
|
||||
// extract the parts needed, with the positions and masks derived from the
|
||||
// layout shown above.
|
||||
constexpr uint64_t kSignMask = 0x8000000000000000LL;
|
||||
constexpr uint64_t kExponentMask = 0x7ff0000000000000LL;
|
||||
constexpr int32_t kExponentShift = 52;
|
||||
constexpr int32_t kExponentBias = 1023;
|
||||
constexpr uint32_t kExponentIsBadNum = 0x7ff;
|
||||
constexpr uint64_t kFractionMask = 0x000fffffffc00000LL;
|
||||
constexpr uint32_t kFractionShift = 22;
|
||||
constexpr uint32_t kFractionRoundingMask = 0x003fffff;
|
||||
constexpr uint32_t kFractionRoundingThreshold = 0x00200000;
|
||||
} // namespace
|
||||
|
||||
void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier,
|
||||
int* shift) {
|
||||
#if TFLITE_SINGLE_ROUNDING
|
||||
// Single-rounding MultiplyByQuantizedMultiplier only supports positive
|
||||
// multipliers.
|
||||
// TFLITE_DCHECK(double_multiplier >= 0);
|
||||
#endif
|
||||
if (double_multiplier == 0.) {
|
||||
*quantized_multiplier = 0;
|
||||
*shift = 0;
|
||||
return;
|
||||
}
|
||||
#ifdef TFLITE_EMULATE_FLOAT
|
||||
// If we're trying to avoid the use of floating-point instructions (for
|
||||
// example on microcontrollers) then use an alternative implementation
|
||||
// that only requires integer and bitwise operations. To enable this, you
|
||||
// need to set the define during the build process for your platform.
|
||||
int64_t q_fixed = IntegerFrExp(double_multiplier, shift);
|
||||
#else // TFLITE_EMULATE_FLOAT
|
||||
const double q = std::frexp(double_multiplier, shift);
|
||||
auto q_fixed = static_cast<int64_t>(TfLiteRound(q * (1LL << 31)));
|
||||
#endif // TFLITE_EMULATE_FLOAT
|
||||
TFLITE_CHECK(q_fixed <= (1LL << 31));
|
||||
if (q_fixed == (1LL << 31)) {
|
||||
q_fixed /= 2;
|
||||
++*shift;
|
||||
}
|
||||
TFLITE_CHECK_LE(q_fixed, std::numeric_limits<int32_t>::max());
|
||||
// A shift amount smaller than -31 would cause all bits to be shifted out
|
||||
// and thus all results would be zero. We implement that instead with
|
||||
// q_fixed==0, so as to avoid hitting issues with right-shift
|
||||
// operations with shift amounts greater than 31. Note that this happens
|
||||
// roughly when abs(double_multiplier) < 2^-31 and the present handling means
|
||||
// that we're effectively flushing tiny double_multiplier's to zero.
|
||||
// We could conceivably handle values in the range (roughly) [32, 63]
|
||||
// as 'denormals' i.e. (shift==0, q_fixed < 2^30). In that point of view
|
||||
// the present handling is just doing 'flush denormals to zero'. We could
|
||||
// reconsider and actually generate nonzero denormals if a need arises.
|
||||
if (*shift < -31) {
|
||||
*shift = 0;
|
||||
q_fixed = 0;
|
||||
}
|
||||
#if TFLITE_SINGLE_ROUNDING
|
||||
// Single-rounding MultiplyByQuantizedMultiplier doesn't support a shift > 30,
|
||||
// saturate it.
|
||||
if (*shift > 30) {
|
||||
*shift = 30;
|
||||
q_fixed = (1LL << 31) - 1;
|
||||
}
|
||||
#endif
|
||||
*quantized_multiplier = static_cast<int32_t>(q_fixed);
|
||||
}
|
||||
|
||||
void QuantizeMultiplierGreaterThanOne(double double_multiplier,
|
||||
int32_t* quantized_multiplier,
|
||||
int* left_shift) {
|
||||
TFLITE_CHECK_GT(double_multiplier, 1.);
|
||||
QuantizeMultiplier(double_multiplier, quantized_multiplier, left_shift);
|
||||
TFLITE_CHECK_GE(*left_shift, 0);
|
||||
}
|
||||
|
||||
void QuantizeMultiplierSmallerThanOneExp(double double_multiplier,
|
||||
int32_t* quantized_multiplier,
|
||||
int* left_shift) {
|
||||
TFLITE_CHECK_LT(double_multiplier, 1.);
|
||||
TFLITE_CHECK_GT(double_multiplier, 0.);
|
||||
int shift;
|
||||
QuantizeMultiplier(double_multiplier, quantized_multiplier, &shift);
|
||||
TFLITE_CHECK_LE(shift, 0);
|
||||
*left_shift = shift;
|
||||
}
|
||||
|
||||
int64_t IntegerFrExp(double input, int* shift) {
|
||||
// Make sure our assumptions about the double layout hold.
|
||||
TFLITE_CHECK_EQ(8, sizeof(double));
|
||||
|
||||
// We want to access the bits of the input double value directly, which is
|
||||
// tricky to do safely, so use a union to handle the casting.
|
||||
union {
|
||||
double double_value;
|
||||
uint64_t double_as_uint;
|
||||
} cast_union;
|
||||
cast_union.double_value = input;
|
||||
const uint64_t u = cast_union.double_as_uint;
|
||||
|
||||
// If the bitfield is all zeros apart from the sign bit, this is a normalized
|
||||
// zero value, so return standard values for this special case.
|
||||
if ((u & ~kSignMask) == 0) {
|
||||
*shift = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Deal with NaNs and Infs, which are always indicated with a fixed pattern in
|
||||
// the exponent, and distinguished by whether the fractions are zero or
|
||||
// non-zero.
|
||||
const uint32_t exponent_part = ((u & kExponentMask) >> kExponentShift);
|
||||
if (exponent_part == kExponentIsBadNum) {
|
||||
*shift = std::numeric_limits<int>::max();
|
||||
if (u & kFractionMask) {
|
||||
// NaN, so just return zero (with the exponent set to INT_MAX).
|
||||
return 0;
|
||||
} else {
|
||||
// Infinity, so return +/- INT_MAX.
|
||||
if (u & kSignMask) {
|
||||
return std::numeric_limits<int64_t>::min();
|
||||
} else {
|
||||
return std::numeric_limits<int64_t>::max();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// The shift is fairly easy to extract from the high bits of the double value,
|
||||
// just by masking it out and applying a bias. The std::frexp() implementation
|
||||
// always returns values between 0.5 and 1.0 though, whereas the exponent
|
||||
// assumes 1.0 to 2.0 is the standard range, so I add on one to match that
|
||||
// interface.
|
||||
*shift = (exponent_part - kExponentBias) + 1;
|
||||
|
||||
// There's an implicit high bit in the double format definition, so make sure
|
||||
// we include that at the top, and then reconstruct the rest of the fractional
|
||||
// value from the remaining fragments.
|
||||
int64_t fraction = 0x40000000 + ((u & kFractionMask) >> kFractionShift);
|
||||
|
||||
// We're cutting off some bits at the bottom, so to exactly match the standard
|
||||
// frexp implementation here we'll apply rounding by adding one to the least
|
||||
// significant bit of the result if the discarded portion is over half of the
|
||||
// maximum.
|
||||
if ((u & kFractionRoundingMask) > kFractionRoundingThreshold) {
|
||||
fraction += 1;
|
||||
}
|
||||
// Negate the fraction if the sign bit was set.
|
||||
if (u & kSignMask) {
|
||||
fraction *= -1;
|
||||
}
|
||||
|
||||
return fraction;
|
||||
}
|
||||
|
||||
double DoubleFromFractionAndShift(int64_t fraction, int shift) {
|
||||
union {
|
||||
double double_value;
|
||||
uint64_t double_as_uint;
|
||||
} result;
|
||||
|
||||
// Detect NaNs and infinities.
|
||||
if (shift == std::numeric_limits<int>::max()) {
|
||||
if (fraction == 0) {
|
||||
return std::numeric_limits<double>::quiet_NaN();
|
||||
} else if (fraction > 0) {
|
||||
return std::numeric_limits<double>::infinity();
|
||||
} else {
|
||||
return -std::numeric_limits<double>::infinity();
|
||||
}
|
||||
}
|
||||
|
||||
// Return a normalized zero for a zero fraction.
|
||||
if (fraction == 0) {
|
||||
result.double_as_uint = 0;
|
||||
return result.double_value;
|
||||
}
|
||||
|
||||
bool is_negative = (fraction < 0);
|
||||
int64_t encoded_fraction = is_negative ? -fraction : fraction;
|
||||
int64_t encoded_shift = (shift - 1);
|
||||
while (encoded_fraction < 0x40000000) {
|
||||
encoded_fraction *= 2;
|
||||
encoded_shift -= 1;
|
||||
}
|
||||
while (encoded_fraction > 0x80000000) {
|
||||
encoded_fraction /= 2;
|
||||
encoded_shift += 1;
|
||||
}
|
||||
encoded_fraction -= 0x40000000;
|
||||
if (encoded_shift < -1022) {
|
||||
encoded_shift = -1023;
|
||||
} else if (encoded_shift > 1022) {
|
||||
encoded_shift = 1023;
|
||||
}
|
||||
encoded_shift += kExponentBias;
|
||||
uint64_t encoded_sign = is_negative ? kSignMask : 0;
|
||||
result.double_as_uint = encoded_sign | (encoded_shift << kExponentShift) |
|
||||
(encoded_fraction << kFractionShift);
|
||||
return result.double_value;
|
||||
}
|
||||
|
||||
double IntegerDoubleMultiply(double a, double b) {
|
||||
int a_shift;
|
||||
const int64_t a_fraction = IntegerFrExp(a, &a_shift);
|
||||
int b_shift;
|
||||
const int64_t b_fraction = IntegerFrExp(b, &b_shift);
|
||||
// Detect NaNs and infinities.
|
||||
if (a_shift == std::numeric_limits<int>::max() ||
|
||||
(b_shift == std::numeric_limits<int>::max())) {
|
||||
return std::numeric_limits<double>::quiet_NaN();
|
||||
}
|
||||
const int result_shift = a_shift + b_shift + 1;
|
||||
const int64_t result_fraction = (a_fraction * b_fraction) >> 32;
|
||||
return DoubleFromFractionAndShift(result_fraction, result_shift);
|
||||
}
|
||||
|
||||
int IntegerDoubleCompare(double a, double b) {
|
||||
int a_shift;
|
||||
const int64_t a_fraction = IntegerFrExp(a, &a_shift);
|
||||
int b_shift;
|
||||
const int64_t b_fraction = IntegerFrExp(b, &b_shift);
|
||||
|
||||
// Detect NaNs and infinities.
|
||||
if (a_shift == std::numeric_limits<int>::max() ||
|
||||
(b_shift == std::numeric_limits<int>::max())) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if ((a_fraction == 0) && (b_fraction < 0)) {
|
||||
return 1;
|
||||
} else if ((a_fraction < 0) && (b_fraction == 0)) {
|
||||
return -1;
|
||||
} else if (a_shift < b_shift) {
|
||||
return -1;
|
||||
} else if (a_shift > b_shift) {
|
||||
return 1;
|
||||
} else if (a_fraction < b_fraction) {
|
||||
return -1;
|
||||
} else if (a_fraction > b_fraction) {
|
||||
return 1;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
void PreprocessSoftmaxScaling(double beta, double input_scale,
|
||||
int input_integer_bits,
|
||||
int32_t* quantized_multiplier, int* left_shift) {
|
||||
// If the overall multiplier (input and beta) is large, then exp() of an
|
||||
// input difference of 1 scaled by this will be large. In other words, we
|
||||
// can cap the multiplier and know that, when it is used, the output will be
|
||||
// (round to) zero wherever the input is not at the maximum value.
|
||||
|
||||
// If the overall scale is less than one, and input_integer_bits=0, then the
|
||||
// result is double equivalent of Q0.31 (actually with more precision). Thus
|
||||
// this generates a Q(input_integer_bits).(31-input_integer_bits)
|
||||
// representation.
|
||||
#if TFLITE_SINGLE_ROUNDING
|
||||
const double max_real_multiplier = (1LL << 30) - 1.0;
|
||||
#else
|
||||
const double max_real_multiplier = (1LL << 31) - 1.0;
|
||||
#endif
|
||||
|
||||
#ifdef TFLITE_EMULATE_FLOAT
|
||||
const double input_beta = IntegerDoubleMultiply(beta, input_scale);
|
||||
int shift;
|
||||
int64_t fraction = IntegerFrExp(input_beta, &shift);
|
||||
shift += (31 - input_integer_bits);
|
||||
double input_beta_real_multiplier =
|
||||
DoubleFromFractionAndShift(fraction, shift);
|
||||
if (IntegerDoubleCompare(input_beta_real_multiplier, max_real_multiplier) >
|
||||
0) {
|
||||
input_beta_real_multiplier = max_real_multiplier;
|
||||
}
|
||||
#else // TFLITE_EMULATE_FLOAT
|
||||
const double input_beta_real_multiplier =
|
||||
std::min<double>(beta * input_scale * (1 << (31 - input_integer_bits)),
|
||||
max_real_multiplier);
|
||||
#endif // TFLITE_EMULATE_FLOAT
|
||||
|
||||
QuantizeMultiplierGreaterThanOne(input_beta_real_multiplier,
|
||||
quantized_multiplier, left_shift);
|
||||
}
|
||||
|
||||
void PreprocessLogSoftmaxScalingExp(double beta, double input_scale,
|
||||
int input_integer_bits,
|
||||
int32_t* quantized_multiplier,
|
||||
int* left_shift,
|
||||
int32_t* reverse_scaling_divisor,
|
||||
int* reverse_scaling_left_shift) {
|
||||
PreprocessSoftmaxScaling(beta, input_scale, input_integer_bits,
|
||||
quantized_multiplier, left_shift);
|
||||
|
||||
// Also calculate what amounts to the inverse scaling factor for the input.
|
||||
const double real_reverse_scaling_divisor =
|
||||
(1 << (31 - *left_shift)) / static_cast<double>(*quantized_multiplier);
|
||||
tflite::QuantizeMultiplierSmallerThanOneExp(real_reverse_scaling_divisor,
|
||||
reverse_scaling_divisor,
|
||||
reverse_scaling_left_shift);
|
||||
}
|
||||
|
||||
int CalculateInputRadius(int input_integer_bits, int input_left_shift,
|
||||
int total_signed_bits) {
|
||||
#ifdef TFLITE_EMULATE_FLOAT
|
||||
int64_t result = (1 << input_integer_bits) - 1;
|
||||
result <<= (total_signed_bits - input_integer_bits);
|
||||
result >>= input_left_shift;
|
||||
return result;
|
||||
#else // TFLITE_EMULATE_FLOAT
|
||||
const double max_input_rescaled =
|
||||
1.0 * ((1 << input_integer_bits) - 1) *
|
||||
(1LL << (total_signed_bits - input_integer_bits)) /
|
||||
(1LL << input_left_shift);
|
||||
// Tighten bound using floor. Suppose that we could use the exact value.
|
||||
// After scaling the difference, the result would be at the maximum. Thus we
|
||||
// must ensure that our value has lower magnitude.
|
||||
return static_cast<int>(std::floor(max_input_rescaled));
|
||||
#endif // TFLITE_EMULATE_FLOAT
|
||||
}
|
||||
|
||||
void NudgeQuantizationRange(const float min, const float max,
|
||||
const int quant_min, const int quant_max,
|
||||
float* nudged_min, float* nudged_max,
|
||||
float* nudged_scale) {
|
||||
// This code originates from tensorflow/core/kernels/fake_quant_ops_functor.h.
|
||||
const float quant_min_float = static_cast<float>(quant_min);
|
||||
const float quant_max_float = static_cast<float>(quant_max);
|
||||
*nudged_scale = (max - min) / (quant_max_float - quant_min_float);
|
||||
const float zero_point_from_min = quant_min_float - min / *nudged_scale;
|
||||
uint16_t nudged_zero_point;
|
||||
if (zero_point_from_min < quant_min_float) {
|
||||
nudged_zero_point = static_cast<uint16_t>(quant_min);
|
||||
} else if (zero_point_from_min > quant_max_float) {
|
||||
nudged_zero_point = static_cast<uint16_t>(quant_max);
|
||||
} else {
|
||||
nudged_zero_point = static_cast<uint16_t>(TfLiteRound(zero_point_from_min));
|
||||
}
|
||||
*nudged_min = (quant_min_float - nudged_zero_point) * (*nudged_scale);
|
||||
*nudged_max = (quant_max_float - nudged_zero_point) * (*nudged_scale);
|
||||
}
|
||||
|
||||
void FakeQuantizeArray(const float nudged_scale, const float nudged_min,
|
||||
const float nudged_max, const float* input_data,
|
||||
float* output_data, const float size) {
|
||||
// This code originates from tensorflow/core/kernels/fake_quant_ops_functor.h.
|
||||
const float inv_nudged_scale = 1.0f / nudged_scale;
|
||||
|
||||
for (int i = 0; i < size; i++) {
|
||||
const float src_val = input_data[i];
|
||||
const float clamped = std::min(nudged_max, std::max(nudged_min, src_val));
|
||||
const float clamped_shifted = clamped - nudged_min;
|
||||
const float dst_val =
|
||||
TfLiteRound(clamped_shifted * inv_nudged_scale) * nudged_scale +
|
||||
nudged_min;
|
||||
output_data[i] = dst_val;
|
||||
}
|
||||
}
|
||||
|
||||
bool CheckedLog2(const float x, int* log2_result) {
|
||||
// Using TfLiteRound instead of std::round and std::log instead of
|
||||
// std::log2 to work around these functions being missing in a toolchain
|
||||
// used in some TensorFlow tests as of May 2018.
|
||||
const float x_log2 = std::log(x) * (1.0f / std::log(2.0f));
|
||||
const float x_log2_rounded = TfLiteRound(x_log2);
|
||||
const float x_log2_fracpart = x_log2 - x_log2_rounded;
|
||||
|
||||
*log2_result = static_cast<int>(x_log2_rounded);
|
||||
return std::abs(x_log2_fracpart) < 1e-3f;
|
||||
}
|
||||
|
||||
void QuantizeMultiplierArray(const double* effective_scales, size_t size,
|
||||
int32_t* effective_scale_significand,
|
||||
int* effective_shift) {
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
QuantizeMultiplier(effective_scales[i], &effective_scale_significand[i],
|
||||
&effective_shift[i]);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace tflite
|
||||
@@ -0,0 +1,292 @@
|
||||
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/compatibility.h"
|
||||
#include "tensorflow/lite/kernels/internal/cppmath.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
// Given the min and max values of a float array, return
|
||||
// reasonable quantization parameters to use for this array.
|
||||
template <typename T>
|
||||
QuantizationParams ChooseQuantizationParams(double rmin, double rmax,
|
||||
bool narrow_range) {
|
||||
const T qmin = std::numeric_limits<T>::min() + (narrow_range ? 1 : 0);
|
||||
const T qmax = std::numeric_limits<T>::max();
|
||||
const double qmin_double = qmin;
|
||||
const double qmax_double = qmax;
|
||||
// 0 should always be a representable value. Let's assume that the initial
|
||||
// min,max range contains 0.
|
||||
TFLITE_CHECK_LE(rmin, 0.);
|
||||
TFLITE_CHECK_GE(rmax, 0.);
|
||||
if (rmin == rmax) {
|
||||
// Special case where the min,max range is a point. Should be {0}.
|
||||
TFLITE_CHECK_EQ(rmin, 0.);
|
||||
TFLITE_CHECK_EQ(rmax, 0.);
|
||||
QuantizationParams quantization_params;
|
||||
quantization_params.zero_point = 0;
|
||||
quantization_params.scale = 0.;
|
||||
return quantization_params;
|
||||
}
|
||||
|
||||
// General case.
|
||||
//
|
||||
// First determine the scale.
|
||||
const double scale = (rmax - rmin) / (qmax_double - qmin_double);
|
||||
|
||||
// Zero-point computation.
|
||||
// First the initial floating-point computation. The zero-point can be
|
||||
// determined from solving an affine equation for any known pair
|
||||
// (real value, corresponding quantized value).
|
||||
// We know two such pairs: (rmin, qmin) and (rmax, qmax).
|
||||
// The arithmetic error on the zero point computed from either pair
|
||||
// will be roughly machine_epsilon * (sum of absolute values of terms)
|
||||
// so we want to use the variant that adds the smaller terms.
|
||||
const double zero_point_from_min = qmin_double - rmin / scale;
|
||||
const double zero_point_from_max = qmax_double - rmax / scale;
|
||||
const double zero_point_from_min_error =
|
||||
std::abs(qmin_double) + std::abs(rmin / scale);
|
||||
const double zero_point_from_max_error =
|
||||
std::abs(qmax_double) + std::abs(rmax / scale);
|
||||
|
||||
const double zero_point_double =
|
||||
zero_point_from_min_error < zero_point_from_max_error
|
||||
? zero_point_from_min
|
||||
: zero_point_from_max;
|
||||
|
||||
// Now we need to nudge the zero point to be an integer
|
||||
// (our zero points are integer, and this is motivated by the requirement
|
||||
// to be able to represent the real value "0" exactly as a quantized value,
|
||||
// which is required in multiple places, for example in Im2col with SAME
|
||||
// padding).
|
||||
T nudged_zero_point = 0;
|
||||
if (zero_point_double < qmin_double) {
|
||||
nudged_zero_point = qmin;
|
||||
} else if (zero_point_double > qmax_double) {
|
||||
nudged_zero_point = qmax;
|
||||
} else {
|
||||
nudged_zero_point = static_cast<T>(round(zero_point_double));
|
||||
}
|
||||
// The zero point should always be in the range of quantized value,
|
||||
// [qmin, qmax].
|
||||
TFLITE_CHECK_GE(nudged_zero_point, qmin);
|
||||
TFLITE_CHECK_LE(nudged_zero_point, qmax);
|
||||
|
||||
// Finally, store the result nudged quantization params.
|
||||
QuantizationParams quantization_params;
|
||||
quantization_params.zero_point = nudged_zero_point;
|
||||
quantization_params.scale = scale;
|
||||
return quantization_params;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
QuantizationParams ChooseQuantizationParams(double rmin, double rmax) {
|
||||
return ChooseQuantizationParams<T>(rmin, rmax, false);
|
||||
}
|
||||
|
||||
// Converts a floating-point number to an integer. For all inputs x where
|
||||
// static_cast<IntOut>(x) is legal according to the C++ standard, the result
|
||||
// is identical to that cast (i.e. the result is x with its fractional part
|
||||
// truncated whenever that is representable as IntOut).
|
||||
//
|
||||
// static_cast would cause undefined behavior for the following cases, which
|
||||
// have well-defined behavior for this function:
|
||||
//
|
||||
// 1. If x is NaN, the result is zero.
|
||||
//
|
||||
// 2. If the truncated form of x is above the representable range of IntOut,
|
||||
// the result is std::numeric_limits<IntOut>::max().
|
||||
//
|
||||
// 3. If the truncated form of x is below the representable range of IntOut,
|
||||
// the result is std::numeric_limits<IntOut>::min().
|
||||
//
|
||||
// Note that cases #2 and #3 cover infinities as well as finite numbers.
|
||||
//
|
||||
// The range of FloatIn must include the range of IntOut, otherwise
|
||||
// the results are undefined.
|
||||
// TODO(sfeuz): Replace by absl::SafeCast once available.
|
||||
template <class IntOut, class FloatIn>
|
||||
IntOut SafeCast(FloatIn x) {
|
||||
static_assert(!std::numeric_limits<FloatIn>::is_integer,
|
||||
"FloatIn is integer");
|
||||
static_assert(std::numeric_limits<IntOut>::is_integer,
|
||||
"IntOut is not integer");
|
||||
static_assert(std::numeric_limits<IntOut>::radix == 2, "IntOut is base 2");
|
||||
|
||||
// Special case NaN, for which the logic below doesn't work.
|
||||
if (std::isnan(x)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Negative values all clip to zero for unsigned results.
|
||||
if (!std::numeric_limits<IntOut>::is_signed && x < 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Handle infinities.
|
||||
if (std::isinf(x)) {
|
||||
return x < 0 ? std::numeric_limits<IntOut>::min()
|
||||
: std::numeric_limits<IntOut>::max();
|
||||
}
|
||||
|
||||
// Set exp such that x == f * 2^exp for some f with |f| in [0.5, 1.0),
|
||||
// unless x is zero in which case exp == 0. Note that this implies that the
|
||||
// magnitude of x is strictly less than 2^exp.
|
||||
int exp = 0;
|
||||
std::frexp(x, &exp);
|
||||
|
||||
// Let N be the number of non-sign bits in the representation of IntOut. If
|
||||
// the magnitude of x is strictly less than 2^N, the truncated version of x
|
||||
// is representable as IntOut. The only representable integer for which this
|
||||
// is not the case is kMin for signed types (i.e. -2^N), but that is covered
|
||||
// by the fall-through below.
|
||||
if (exp <= std::numeric_limits<IntOut>::digits) {
|
||||
return x;
|
||||
}
|
||||
|
||||
// Handle numbers with magnitude >= 2^N.
|
||||
return x < 0 ? std::numeric_limits<IntOut>::min()
|
||||
: std::numeric_limits<IntOut>::max();
|
||||
}
|
||||
|
||||
// Decompose a double multiplier into a Q0.31 int32 representation of its
|
||||
// significand, and shift representation of NEGATIVE its exponent ---
|
||||
// this is intended as a RIGHT-shift.
|
||||
//
|
||||
// Restricted to the case where the multiplier < 1 (and non-negative).
|
||||
void QuantizeMultiplierSmallerThanOneExp(double double_multiplier,
|
||||
int32_t* quantized_multiplier,
|
||||
int* left_shift);
|
||||
|
||||
// Decompose a double multiplier into a Q0.31 int32 representation of its
|
||||
// significand, and shift representation of its exponent.
|
||||
//
|
||||
// Restricted to the case where the multiplier > 1.
|
||||
void QuantizeMultiplierGreaterThanOne(double double_multiplier,
|
||||
int32_t* quantized_multiplier,
|
||||
int* left_shift);
|
||||
|
||||
// Decompose a double multiplier into a Q0.31 int32 representation of its
|
||||
// significand, and shift representation of its exponent.
|
||||
//
|
||||
// Handles an arbitrary positive multiplier. The 'shift' output-value is
|
||||
// basically the 'floating-point exponent' of the multiplier:
|
||||
// Negative for a right-shift (when the multiplier is <1), positive for a
|
||||
// left-shift (when the multiplier is >1)
|
||||
void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier,
|
||||
int* shift);
|
||||
|
||||
// Splits a double input value into a returned fraction, and a shift value from
|
||||
// the exponent, using only bitwise and integer operations to support
|
||||
// microcontrollers and other environments without floating-point support.
|
||||
//
|
||||
// This is designed to be a replacement for how std::frexp() is used within the
|
||||
// QuantizeMultiplier() function, and so has a different signature than the
|
||||
// standard version, returning a 64-bit integer rather than a double. This
|
||||
// result has a maximum value of 1<<31, with the fraction expressed as a
|
||||
// proportion of that maximum.
|
||||
//
|
||||
// std::frexp() returns NaNs and infinities unmodified, but since we're
|
||||
// returning integers that can't represent those values, instead we return
|
||||
// a shift of std::numeric_limits<int>::max() for all bad numbers, with an int64
|
||||
// result of 0 for NaNs, std:numeric_limits<int64_t>::max() for +INFINITY, and
|
||||
// std::numeric_limits<int64_t>::min() for -INFINITY. Denormalized inputs will
|
||||
// result in return values that end up truncating some bits at the end,
|
||||
// reflecting the loss of precision inherent in denormalization.
|
||||
int64_t IntegerFrExp(double input, int* shift);
|
||||
|
||||
// Converts an integer fraction in the format produced by IntegerFrExp (where
|
||||
// 0x40000000 is 1.0) and an exponent shift (between -1022 and +1022) into an
|
||||
// IEEE binary64 double format result. The implementation uses only integer and
|
||||
// bitwise operators, so no floating point hardware support or emulation is
|
||||
// needed. This is here so quantized operations can run non-time-critical
|
||||
// preparation calculations on microcontrollers and other platforms without
|
||||
// float support.
|
||||
double DoubleFromFractionAndShift(int64_t fraction, int shift);
|
||||
|
||||
// Performs a multiplication of two numbers in double format, using only integer
|
||||
// and bitwise instructions. This is aimed at supporting housekeeping functions
|
||||
// for quantized operations on microcontrollers without floating-point hardware.
|
||||
double IntegerDoubleMultiply(double a, double b);
|
||||
|
||||
// Returns -1 if a is less than b, 0 if a and b are equal, and +1 if a is
|
||||
// greater than b. It is implemented using only integer and logical instructions
|
||||
// so that it can be easily run on microcontrollers for quantized operations.
|
||||
int IntegerDoubleCompare(double a, double b);
|
||||
|
||||
// This first creates a multiplier in a double equivalent of
|
||||
// Q(input_integer_bits).(31-input_integer_bits) representation, with extra
|
||||
// precision in the double's fractional bits. It then splits the result into
|
||||
// significand and exponent.
|
||||
void PreprocessSoftmaxScaling(double beta, double input_scale,
|
||||
int input_integer_bits,
|
||||
int32_t* quantized_multiplier, int* left_shift);
|
||||
// Like PreprocessSoftmaxScaling, but inverse scaling factors also calculated.
|
||||
void PreprocessLogSoftmaxScalingExp(double beta, double input_scale,
|
||||
int input_integer_bits,
|
||||
int32_t* quantized_multiplier,
|
||||
int* left_shift,
|
||||
int32_t* reverse_scaling_divisor,
|
||||
int* reverse_scaling_left_shift);
|
||||
// Calculate the largest input that will result in a within-bounds intermediate
|
||||
// result within MultiplyByQuantizedMultiplierGreaterThanOne. In other words,
|
||||
// it must not overflow before we reduce the value by multiplication by the
|
||||
// input multiplier. The negative radius is used as the minimum difference in
|
||||
// Softmax.
|
||||
int CalculateInputRadius(int input_integer_bits, int input_left_shift,
|
||||
int total_signed_bits = 31);
|
||||
|
||||
// Nudges a min/max quantization range to ensure zero is zero.
|
||||
// Gymnastics with nudged zero point is to ensure that real zero maps to
|
||||
// an integer, which is required for e.g. zero-padding in convolutional layers.
|
||||
// Outputs nudged_min, nudged_max, nudged_scale.
|
||||
void NudgeQuantizationRange(const float min, const float max,
|
||||
const int quant_min, const int quant_max,
|
||||
float* nudged_min, float* nudged_max,
|
||||
float* nudged_scale);
|
||||
|
||||
// Fake quantizes (quantizes and dequantizes) input_data using the scale,
|
||||
// nudged_min, and nudged_max from NudgeQuantizationRange. This matches the code
|
||||
// in TensorFlow's FakeQuantizeWithMinMaxVarsFunctor.
|
||||
void FakeQuantizeArray(const float nudged_scale, const float nudged_min,
|
||||
const float nudged_max, const float* input_data,
|
||||
float* output_data, const float size);
|
||||
|
||||
// If x is approximately a power of two (with any positive or negative
|
||||
// exponent), stores that exponent (i.e. log2(x)) in *log2_result, otherwise
|
||||
// returns false.
|
||||
bool CheckedLog2(const float x, int* log2_result);
|
||||
|
||||
// Decomposes an array of double multipliers into a Q0.31 int32 representation
|
||||
// of its significand, and shift representation of its exponent.
|
||||
//
|
||||
// Handles an arbitrary multiplier. The 'shift' output-value is
|
||||
// basically the 'floating-point exponent' of the multiplier:
|
||||
// Negative for a right-shift (when the multiplier is <1), positive for a
|
||||
// left-shift (when the multiplier is >1)
|
||||
void QuantizeMultiplierArray(const double* effective_scales, size_t size,
|
||||
int32_t* effective_scale_significand,
|
||||
int* effective_shift);
|
||||
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_
|
||||
@@ -0,0 +1,399 @@
|
||||
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_H_
|
||||
|
||||
#include <type_traits>
|
||||
|
||||
#include "fixedpoint/fixedpoint.h"
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
namespace reference_ops {
|
||||
|
||||
template <typename T>
|
||||
inline void Add(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape, const T* input1_data,
|
||||
const RuntimeShape& input2_shape, const T* input2_data,
|
||||
const RuntimeShape& output_shape, T* output_data) {
|
||||
T activation_min, activation_max;
|
||||
GetActivationParams(params, &activation_min, &activation_max);
|
||||
|
||||
const int flat_size =
|
||||
MatchingElementsSize(input1_shape, input2_shape, output_shape);
|
||||
for (int i = 0; i < flat_size; ++i) {
|
||||
output_data[i] = ActivationFunctionWithMinMax(
|
||||
input1_data[i] + input2_data[i], activation_min, activation_max);
|
||||
}
|
||||
}
|
||||
|
||||
// Element-wise add that can often be used for inner loop of broadcast add as
|
||||
// well as the non-broadcast add.
|
||||
|
||||
// This function is used for 8-bit as well as for 16-bit, but the accumulator
|
||||
// is 32-bit for both cases. The overflow does not happen due to the
|
||||
// choice of the shift (20 or 15, accordingly - see add.cc for more comments).
|
||||
template <typename T>
|
||||
inline void AddElementwise(int size, const ArithmeticParams& params,
|
||||
const T* input1_data, const T* input2_data,
|
||||
T* output_data) {
|
||||
TFLITE_DCHECK_GT(params.input1_offset, -std::numeric_limits<T>::max());
|
||||
TFLITE_DCHECK_GT(params.input2_offset, -std::numeric_limits<T>::max());
|
||||
TFLITE_DCHECK_LT(params.input1_offset, std::numeric_limits<T>::max());
|
||||
TFLITE_DCHECK_LT(params.input2_offset, std::numeric_limits<T>::max());
|
||||
|
||||
for (int i = 0; i < size; ++i) {
|
||||
const int32_t input1_val = params.input1_offset + input1_data[i];
|
||||
const int32_t input2_val = params.input2_offset + input2_data[i];
|
||||
const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
|
||||
const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
|
||||
const int32_t scaled_input1_val =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_input1_val, params.input1_multiplier, params.input1_shift);
|
||||
const int32_t scaled_input2_val =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_input2_val, params.input2_multiplier, params.input2_shift);
|
||||
const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
|
||||
const int32_t raw_output =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
raw_sum, params.output_multiplier, params.output_shift) +
|
||||
params.output_offset;
|
||||
const int32_t clamped_output =
|
||||
std::min(params.quantized_activation_max,
|
||||
std::max(params.quantized_activation_min, raw_output));
|
||||
output_data[i] = static_cast<T>(clamped_output);
|
||||
}
|
||||
}
|
||||
|
||||
// Scalar-broadcast add that can be used for inner loop of more general
|
||||
// broadcast add, so that, for example, scalar-broadcast with batch will still
|
||||
// be fast.
|
||||
inline void AddScalarBroadcast(int size, const ArithmeticParams& params,
|
||||
uint8_t input1_data, const uint8_t* input2_data,
|
||||
uint8_t* output_data) {
|
||||
TFLITE_DCHECK_GT(params.input1_offset, -256);
|
||||
TFLITE_DCHECK_GT(params.input2_offset, -256);
|
||||
TFLITE_DCHECK_LT(params.input1_offset, 256);
|
||||
TFLITE_DCHECK_LT(params.input2_offset, 256);
|
||||
|
||||
const int32_t input1_val = params.input1_offset + input1_data;
|
||||
const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
|
||||
const int32_t scaled_input1_val =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_input1_val, params.input1_multiplier, params.input1_shift);
|
||||
for (int i = 0; i < size; ++i) {
|
||||
const int32_t input2_val = params.input2_offset + input2_data[i];
|
||||
const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
|
||||
const int32_t scaled_input2_val =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_input2_val, params.input2_multiplier, params.input2_shift);
|
||||
const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
|
||||
const int32_t raw_output =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
raw_sum, params.output_multiplier, params.output_shift) +
|
||||
params.output_offset;
|
||||
const int32_t clamped_output =
|
||||
std::min(params.quantized_activation_max,
|
||||
std::max(params.quantized_activation_min, raw_output));
|
||||
output_data[i] = static_cast<uint8_t>(clamped_output);
|
||||
}
|
||||
}
|
||||
|
||||
inline void Add(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape, const uint8_t* input1_data,
|
||||
const RuntimeShape& input2_shape, const uint8_t* input2_data,
|
||||
const RuntimeShape& output_shape, uint8_t* output_data) {
|
||||
TFLITE_DCHECK_LE(params.quantized_activation_min,
|
||||
params.quantized_activation_max);
|
||||
const int flat_size =
|
||||
MatchingElementsSize(input1_shape, input2_shape, output_shape);
|
||||
|
||||
TFLITE_DCHECK_GT(params.input1_offset, -256);
|
||||
TFLITE_DCHECK_GT(params.input2_offset, -256);
|
||||
TFLITE_DCHECK_LT(params.input1_offset, 256);
|
||||
TFLITE_DCHECK_LT(params.input2_offset, 256);
|
||||
AddElementwise(flat_size, params, input1_data, input2_data, output_data);
|
||||
}
|
||||
|
||||
inline void AddGeneralParamScale(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape,
|
||||
const int16_t* input1_data,
|
||||
const RuntimeShape& input2_shape,
|
||||
const int16_t* input2_data,
|
||||
const RuntimeShape& output_shape,
|
||||
int16_t* output_data) {
|
||||
TFLITE_DCHECK_LE(params.quantized_activation_min,
|
||||
params.quantized_activation_max);
|
||||
const int flat_size =
|
||||
MatchingElementsSize(input1_shape, input2_shape, output_shape);
|
||||
|
||||
int max_value = std::numeric_limits<int16_t>::max();
|
||||
|
||||
TFLITE_DCHECK_GT(params.input1_offset, -max_value);
|
||||
TFLITE_DCHECK_GT(params.input2_offset, -max_value);
|
||||
TFLITE_DCHECK_LT(params.input1_offset, max_value);
|
||||
TFLITE_DCHECK_LT(params.input2_offset, max_value);
|
||||
AddElementwise(flat_size, params, input1_data, input2_data, output_data);
|
||||
}
|
||||
|
||||
inline void Add(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape, const int16_t* input1_data,
|
||||
const RuntimeShape& input2_shape, const int16_t* input2_data,
|
||||
const RuntimeShape& output_shape, int16_t* output_data,
|
||||
bool pot_scale = true) {
|
||||
if (!pot_scale) {
|
||||
AddGeneralParamScale(params, input1_shape, input1_data, input2_shape,
|
||||
input2_data, output_shape, output_data);
|
||||
return;
|
||||
}
|
||||
|
||||
TFLITE_DCHECK_LE(params.quantized_activation_min,
|
||||
params.quantized_activation_max);
|
||||
|
||||
const int input1_shift = params.input1_shift;
|
||||
const int flat_size =
|
||||
MatchingElementsSize(input1_shape, input2_shape, output_shape);
|
||||
const int16_t output_activation_min = params.quantized_activation_min;
|
||||
const int16_t output_activation_max = params.quantized_activation_max;
|
||||
|
||||
TFLITE_DCHECK(input1_shift == 0 || params.input2_shift == 0);
|
||||
TFLITE_DCHECK_LE(input1_shift, 0);
|
||||
TFLITE_DCHECK_LE(params.input2_shift, 0);
|
||||
const int16_t* not_shift_input =
|
||||
input1_shift == 0 ? input1_data : input2_data;
|
||||
const int16_t* shift_input = input1_shift == 0 ? input2_data : input1_data;
|
||||
const int input_right_shift =
|
||||
input1_shift == 0 ? -params.input2_shift : -input1_shift;
|
||||
|
||||
for (int i = 0; i < flat_size; i++) {
|
||||
// F0 uses 0 integer bits, range [-1, 1].
|
||||
using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
|
||||
|
||||
F0 input_ready_scaled = F0::FromRaw(not_shift_input[i]);
|
||||
F0 scaled_input = F0::FromRaw(
|
||||
gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift));
|
||||
F0 result = gemmlowp::SaturatingAdd(scaled_input, input_ready_scaled);
|
||||
const int16_t raw_output = result.raw();
|
||||
const int16_t clamped_output = std::min(
|
||||
output_activation_max, std::max(output_activation_min, raw_output));
|
||||
output_data[i] = clamped_output;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline typename std::enable_if<!is_small_integer<T>::value, void>::type
|
||||
BroadcastAdd4DSlow(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape, const T* input1_data,
|
||||
const RuntimeShape& input2_shape, const T* input2_data,
|
||||
const RuntimeShape& output_shape, T* output_data) {
|
||||
NdArrayDesc<4> desc1;
|
||||
NdArrayDesc<4> desc2;
|
||||
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
|
||||
&desc2);
|
||||
const RuntimeShape extended_output_shape =
|
||||
RuntimeShape::ExtendedShape(4, output_shape);
|
||||
|
||||
T activation_min, activation_max;
|
||||
GetActivationParams(params, &activation_min, &activation_max);
|
||||
|
||||
// In Tensorflow, the dimensions are canonically named (batch_number, row,
|
||||
// col, channel), with extents (batches, height, width, depth), with the
|
||||
// trailing dimension changing most rapidly (channels has the smallest stride,
|
||||
// typically 1 element).
|
||||
//
|
||||
// In generated C code, we store arrays with the dimensions reversed. The
|
||||
// first dimension has smallest stride.
|
||||
//
|
||||
// We name our variables by their Tensorflow convention, but generate C code
|
||||
// nesting loops such that the innermost loop has the smallest stride for the
|
||||
// best cache behavior.
|
||||
for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
|
||||
for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
|
||||
for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
|
||||
for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
|
||||
output_data[Offset(extended_output_shape, b, y, x, c)] =
|
||||
ActivationFunctionWithMinMax<T>(
|
||||
input1_data[SubscriptToIndex(desc1, b, y, x, c)] +
|
||||
input2_data[SubscriptToIndex(desc2, b, y, x, c)],
|
||||
activation_min, activation_max);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// This function is used for 8-bit as well as for 16-bit, but the accumulator
|
||||
// is 32-bit for both cases. The overflow does not happen due to the
|
||||
// choice of the shift (20 or 15, accordingly - see add.cc for more comments).
|
||||
template <typename T>
|
||||
inline typename std::enable_if<is_small_integer<T>::value, void>::type
|
||||
BroadcastAdd4DSlow(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape, const T* input1_data,
|
||||
const RuntimeShape& input2_shape, const T* input2_data,
|
||||
const RuntimeShape& output_shape, T* output_data) {
|
||||
NdArrayDesc<4> desc1;
|
||||
NdArrayDesc<4> desc2;
|
||||
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
|
||||
&desc2);
|
||||
const RuntimeShape extended_output_shape =
|
||||
RuntimeShape::ExtendedShape(4, output_shape);
|
||||
|
||||
// In Tensorflow, the dimensions are canonically named (batch_number, row,
|
||||
// col, channel), with extents (batches, height, width, depth), with the
|
||||
// trailing dimension changing most rapidly (channels has the smallest stride,
|
||||
// typically 1 element).
|
||||
//
|
||||
// In generated C code, we store arrays with the dimensions reversed. The
|
||||
// first dimension has smallest stride.
|
||||
//
|
||||
// We name our variables by their Tensorflow convention, but generate C code
|
||||
// nesting loops such that the innermost loop has the smallest stride for the
|
||||
// best cache behavior.
|
||||
for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
|
||||
for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
|
||||
for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
|
||||
for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
|
||||
const int32_t input1_val =
|
||||
params.input1_offset +
|
||||
input1_data[SubscriptToIndex(desc1, b, y, x, c)];
|
||||
const int32_t input2_val =
|
||||
params.input2_offset +
|
||||
input2_data[SubscriptToIndex(desc2, b, y, x, c)];
|
||||
const int32_t shifted_input1_val =
|
||||
input1_val * (1 << params.left_shift);
|
||||
const int32_t shifted_input2_val =
|
||||
input2_val * (1 << params.left_shift);
|
||||
const int32_t scaled_input1_val =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_input1_val, params.input1_multiplier,
|
||||
params.input1_shift);
|
||||
const int32_t scaled_input2_val =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_input2_val, params.input2_multiplier,
|
||||
params.input2_shift);
|
||||
const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
|
||||
const int32_t raw_output =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
raw_sum, params.output_multiplier, params.output_shift) +
|
||||
params.output_offset;
|
||||
const int32_t clamped_output =
|
||||
std::min(params.quantized_activation_max,
|
||||
std::max(params.quantized_activation_min, raw_output));
|
||||
output_data[Offset(extended_output_shape, b, y, x, c)] =
|
||||
static_cast<T>(clamped_output);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
|
||||
const RuntimeShape& unswitched_input1_shape,
|
||||
const uint8_t* unswitched_input1_data,
|
||||
const RuntimeShape& unswitched_input2_shape,
|
||||
const uint8_t* unswitched_input2_data,
|
||||
const RuntimeShape& output_shape,
|
||||
uint8_t* output_data) {
|
||||
ArithmeticParams switched_params = unswitched_params;
|
||||
switched_params.input1_offset = unswitched_params.input2_offset;
|
||||
switched_params.input1_multiplier = unswitched_params.input2_multiplier;
|
||||
switched_params.input1_shift = unswitched_params.input2_shift;
|
||||
switched_params.input2_offset = unswitched_params.input1_offset;
|
||||
switched_params.input2_multiplier = unswitched_params.input1_multiplier;
|
||||
switched_params.input2_shift = unswitched_params.input1_shift;
|
||||
|
||||
const bool use_unswitched =
|
||||
unswitched_params.broadcast_category ==
|
||||
tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;
|
||||
|
||||
const ArithmeticParams& params =
|
||||
use_unswitched ? unswitched_params : switched_params;
|
||||
const uint8_t* input1_data =
|
||||
use_unswitched ? unswitched_input1_data : unswitched_input2_data;
|
||||
const uint8_t* input2_data =
|
||||
use_unswitched ? unswitched_input2_data : unswitched_input1_data;
|
||||
|
||||
// Fivefold nested loops. The second input resets its position for each
|
||||
// iteration of the second loop. The first input resets its position at the
|
||||
// beginning of the fourth loop. The innermost loop is an elementwise add of
|
||||
// sections of the arrays.
|
||||
uint8_t* output_data_ptr = output_data;
|
||||
const uint8_t* input1_data_ptr = input1_data;
|
||||
const uint8_t* input2_data_reset = input2_data;
|
||||
// In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared
|
||||
// between input shapes. y3 for input 1 is always broadcast, and so the
|
||||
// dimension there is 1, whereas optionally y1 might be broadcast for input 2.
|
||||
// Put another way,
|
||||
// input1.shape.FlatSize = y0 * y1 * y2 * y4,
|
||||
// input2.shape.FlatSize = y0 * y2 * y3 * y4.
|
||||
int y0 = params.broadcast_shape[0];
|
||||
int y1 = params.broadcast_shape[1];
|
||||
int y2 = params.broadcast_shape[2];
|
||||
int y3 = params.broadcast_shape[3];
|
||||
int y4 = params.broadcast_shape[4];
|
||||
if (y4 > 1) {
|
||||
// General fivefold pattern, with y4 > 1 so there is a non-broadcast inner
|
||||
// dimension.
|
||||
for (int i0 = 0; i0 < y0; ++i0) {
|
||||
const uint8_t* input2_data_ptr;
|
||||
for (int i1 = 0; i1 < y1; ++i1) {
|
||||
input2_data_ptr = input2_data_reset;
|
||||
for (int i2 = 0; i2 < y2; ++i2) {
|
||||
for (int i3 = 0; i3 < y3; ++i3) {
|
||||
AddElementwise(y4, params, input1_data_ptr, input2_data_ptr,
|
||||
output_data_ptr);
|
||||
input2_data_ptr += y4;
|
||||
output_data_ptr += y4;
|
||||
}
|
||||
// We have broadcast y4 of input1 data y3 times, and now move on.
|
||||
input1_data_ptr += y4;
|
||||
}
|
||||
}
|
||||
// We have broadcast y2*y3*y4 of input2 data y1 times, and now move on.
|
||||
input2_data_reset = input2_data_ptr;
|
||||
}
|
||||
} else {
|
||||
// Special case of y4 == 1, in which the innermost loop is a single element
|
||||
// and can be combined with the next (y3) as an inner broadcast.
|
||||
//
|
||||
// Note that this handles the case of pure scalar broadcast when
|
||||
// y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar
|
||||
// broadcast with batch (as y2 > 1).
|
||||
//
|
||||
// NOTE The process is the same as the above general case except simplified
|
||||
// for y4 == 1 and the loop over y3 is contained within the
|
||||
// AddScalarBroadcast function.
|
||||
for (int i0 = 0; i0 < y0; ++i0) {
|
||||
const uint8_t* input2_data_ptr;
|
||||
for (int i1 = 0; i1 < y1; ++i1) {
|
||||
input2_data_ptr = input2_data_reset;
|
||||
for (int i2 = 0; i2 < y2; ++i2) {
|
||||
AddScalarBroadcast(y3, params, *input1_data_ptr, input2_data_ptr,
|
||||
output_data_ptr);
|
||||
input2_data_ptr += y3;
|
||||
output_data_ptr += y3;
|
||||
input1_data_ptr += 1;
|
||||
}
|
||||
}
|
||||
input2_data_reset = input2_data_ptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_H_
|
||||
@@ -0,0 +1,86 @@
|
||||
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_N_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_N_H_
|
||||
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_ops {
|
||||
|
||||
// T is expected to be either float or int.
|
||||
template <typename T>
|
||||
inline void AddN(const RuntimeShape& input_shape, const size_t num_inputs,
|
||||
const T* const* input_data, T* output_data) {
|
||||
// All inputs and output should have the same shape, this is checked during
|
||||
// Prepare stage.
|
||||
const size_t size = input_shape.FlatSize();
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
T x = 0;
|
||||
for (size_t j = 0; j < num_inputs; ++j) {
|
||||
x += input_data[j][i];
|
||||
}
|
||||
output_data[i] = x;
|
||||
}
|
||||
}
|
||||
|
||||
inline void AddN(const ArithmeticParams& params,
|
||||
const RuntimeShape& input_shape, const size_t num_inputs,
|
||||
const int8_t* const* input_data, int8_t* output_data) {
|
||||
TFLITE_DCHECK_LE(params.quantized_activation_min,
|
||||
params.quantized_activation_max);
|
||||
// Input offset is negative input zero point. Activation tensors are
|
||||
// asymmetric quantized so they span the full int8 range.
|
||||
// All inputs should have same zero-point and scale, this is checked during
|
||||
// Prepare stage.
|
||||
TFLITE_DCHECK_GE(-params.input1_offset, std::numeric_limits<int8_t>::min());
|
||||
TFLITE_DCHECK_LE(-params.input1_offset, std::numeric_limits<int8_t>::max());
|
||||
|
||||
// All inputs and output should have the same shape, this is checked during
|
||||
// Prepare stage.
|
||||
const size_t size = input_shape.FlatSize();
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
// accumulate in scaled_x before clamping to avoid overflow
|
||||
const int32_t x = params.input1_offset; // x = 0
|
||||
const int32_t shifted_x = x * (1 << params.left_shift);
|
||||
int32_t scaled_x = MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_x, params.input1_multiplier, params.input1_shift);
|
||||
|
||||
for (size_t j = 0; j < num_inputs; ++j) {
|
||||
const int32_t y = params.input1_offset + input_data[j][i];
|
||||
const int32_t shifted_y = y * (1 << params.left_shift);
|
||||
int32_t scaled_y = MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_y, params.input1_multiplier, params.input1_shift);
|
||||
scaled_x += scaled_y;
|
||||
}
|
||||
|
||||
const int32_t raw_output =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
scaled_x, params.output_multiplier, params.output_shift) +
|
||||
params.output_offset;
|
||||
const int32_t clamped_output =
|
||||
std::min(params.quantized_activation_max,
|
||||
std::max(params.quantized_activation_min, raw_output));
|
||||
output_data[i] = static_cast<int8_t>(clamped_output);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_N_H_
|
||||
@@ -0,0 +1,88 @@
|
||||
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ARG_MIN_MAX_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ARG_MIN_MAX_H_
|
||||
|
||||
#include <functional>
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
namespace reference_ops {
|
||||
|
||||
template <typename T>
|
||||
std::function<bool(T, T)> GetComparefunction(bool is_arg_max) {
|
||||
if (is_arg_max) {
|
||||
return std::greater<T>();
|
||||
} else {
|
||||
return std::less<T>();
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename T3, typename Cmp>
|
||||
void ArgMinMax(const RuntimeShape& input1_shape, const T1* input1_data,
|
||||
const T3* input2_data, const RuntimeShape& output_shape,
|
||||
T2* output_data, const Cmp& cmp) {
|
||||
TFLITE_DCHECK_GT(input1_shape.DimensionsCount(), 0);
|
||||
TFLITE_DCHECK_EQ(input1_shape.DimensionsCount() - 1,
|
||||
output_shape.DimensionsCount());
|
||||
int axis = input2_data[0];
|
||||
if (axis < 0) {
|
||||
axis += input1_shape.DimensionsCount();
|
||||
}
|
||||
const int axis_size = input1_shape.Dims(axis);
|
||||
|
||||
int outer_size = 1;
|
||||
for (int i = 0; i < axis; ++i) {
|
||||
TFLITE_DCHECK_EQ(input1_shape.Dims(i), output_shape.Dims(i));
|
||||
outer_size *= input1_shape.Dims(i);
|
||||
}
|
||||
|
||||
int inner_size = 1;
|
||||
const int dims_count = input1_shape.DimensionsCount();
|
||||
for (int i = axis + 1; i < dims_count; ++i) {
|
||||
TFLITE_DCHECK_EQ(input1_shape.Dims(i), output_shape.Dims(i - 1));
|
||||
inner_size *= input1_shape.Dims(i);
|
||||
}
|
||||
for (int outer = 0; outer < outer_size; ++outer) {
|
||||
for (int inner = 0; inner < inner_size; ++inner) {
|
||||
auto min_max_value = input1_data[outer * axis_size * inner_size + inner];
|
||||
T2 min_max_index = 0;
|
||||
for (int i = 1; i < axis_size; ++i) {
|
||||
const auto& curr_value =
|
||||
input1_data[(outer * axis_size + i) * inner_size + inner];
|
||||
if (cmp(curr_value, min_max_value)) {
|
||||
min_max_value = curr_value;
|
||||
min_max_index = static_cast<T2>(i);
|
||||
}
|
||||
}
|
||||
output_data[outer * inner_size + inner] = min_max_index;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename T3>
|
||||
void ArgMinMax(const RuntimeShape& input1_shape, const T1* input1_data,
|
||||
const T3* input2_data, const RuntimeShape& output_shape,
|
||||
T2* output_data, const bool is_arg_max) {
|
||||
ArgMinMax(input1_shape, input1_data, input2_data, output_shape, output_data,
|
||||
GetComparefunction<T1>(is_arg_max));
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ARG_MIN_MAX_H_
|
||||
@@ -0,0 +1,275 @@
|
||||
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BATCH_MATMUL_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BATCH_MATMUL_H_
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdint>
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/compatibility.h"
|
||||
#include "tensorflow/lite/kernels/internal/tensor_utils_common.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_ops {
|
||||
namespace batch_matmul {
|
||||
|
||||
// Determine which dimension is the broadcast dimension.
|
||||
inline int broadcast_dim(int lhs_dim, int rhs_dim) {
|
||||
if (lhs_dim == rhs_dim) return lhs_dim;
|
||||
if (lhs_dim == 1) return rhs_dim;
|
||||
TFLITE_DCHECK_EQ(rhs_dim, 1);
|
||||
return lhs_dim;
|
||||
}
|
||||
|
||||
// Compute the "extent" for iterating on this dimension.
|
||||
// If we are broadcasting, then don't advance (i.e return 0).
|
||||
inline int extent(const RuntimeShape& shape, int x) {
|
||||
if (shape.Dims(x) == 1) {
|
||||
return 0;
|
||||
}
|
||||
int prod = 1;
|
||||
for (int i = x + 1; i < shape.DimensionsCount(); ++i) {
|
||||
prod *= shape.Dims(i);
|
||||
}
|
||||
return prod;
|
||||
}
|
||||
|
||||
} // namespace batch_matmul
|
||||
|
||||
template <typename Ta, typename Tb, typename Tout>
|
||||
inline void BatchMatMul(const RuntimeShape& lhs_shape, const Ta* lhs_data,
|
||||
const RuntimeShape& rhs_shape, const Tb* rhs_data,
|
||||
const RuntimeShape& output_shape, Tout* output_data) {
|
||||
const RuntimeShape extended_lhs_shape =
|
||||
RuntimeShape::ExtendedShape(5, lhs_shape);
|
||||
const RuntimeShape extended_rhs_shape =
|
||||
RuntimeShape::ExtendedShape(5, rhs_shape);
|
||||
|
||||
const int batch_dim0 = batch_matmul::broadcast_dim(
|
||||
extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0));
|
||||
const int batch_dim1 = batch_matmul::broadcast_dim(
|
||||
extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1));
|
||||
const int batch_dim2 = batch_matmul::broadcast_dim(
|
||||
extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2));
|
||||
|
||||
const int lhs_ext0 = batch_matmul::extent(extended_lhs_shape, 0);
|
||||
const int lhs_ext1 = batch_matmul::extent(extended_lhs_shape, 1);
|
||||
const int lhs_ext2 = batch_matmul::extent(extended_lhs_shape, 2);
|
||||
const int rhs_ext0 = batch_matmul::extent(extended_rhs_shape, 0);
|
||||
const int rhs_ext1 = batch_matmul::extent(extended_rhs_shape, 1);
|
||||
const int rhs_ext2 = batch_matmul::extent(extended_rhs_shape, 2);
|
||||
|
||||
// Set params for each matrix multiply.
|
||||
const int lhs_rows = extended_lhs_shape.Dims(3);
|
||||
const int rhs_cols = extended_rhs_shape.Dims(4);
|
||||
const int accum_depth = extended_lhs_shape.Dims(4);
|
||||
|
||||
for (int b0 = 0; b0 < batch_dim0; ++b0) {
|
||||
const Ta* lhs_ptr0 = lhs_data + (b0 * lhs_ext0);
|
||||
const Tb* rhs_ptr0 = rhs_data + (b0 * rhs_ext0);
|
||||
for (int b1 = 0; b1 < batch_dim1; ++b1) {
|
||||
const Ta* lhs_ptr1 = lhs_ptr0 + b1 * lhs_ext1;
|
||||
const Tb* rhs_ptr1 = rhs_ptr0 + b1 * rhs_ext1;
|
||||
for (int b2 = 0; b2 < batch_dim2; ++b2) {
|
||||
const Ta* lhs_ptr2 = lhs_ptr1 + b2 * lhs_ext2;
|
||||
const Tb* rhs_ptr2 = rhs_ptr1 + b2 * rhs_ext2;
|
||||
Tout* out_ptr = output_data + ((b0 * batch_dim1 * batch_dim2) +
|
||||
b1 * batch_dim2 + b2) *
|
||||
lhs_rows * rhs_cols;
|
||||
for (int j = 0; j < rhs_cols; ++j) {
|
||||
for (int i = 0; i < lhs_rows; ++i) {
|
||||
Tout total = 0;
|
||||
for (int k = 0; k < accum_depth; ++k) {
|
||||
total += static_cast<Tout>(lhs_ptr2[accum_depth * i + k]) *
|
||||
static_cast<Tout>(rhs_ptr2[j * accum_depth + k]);
|
||||
}
|
||||
int idx = lhs_rows * j + i;
|
||||
out_ptr[idx] = total;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void BatchMatMul(const RuntimeShape& lhs_shape, const int8_t* lhs_data,
|
||||
const RuntimeShape& rhs_shape, const int8_t* rhs_data,
|
||||
const float* scaling_factors,
|
||||
const int32_t* input_offset, int32_t* row_sums,
|
||||
const RuntimeShape& output_shape, float* output_data,
|
||||
bool* compute_row_sums) {
|
||||
const RuntimeShape extended_lhs_shape =
|
||||
RuntimeShape::ExtendedShape(5, lhs_shape);
|
||||
const RuntimeShape extended_rhs_shape =
|
||||
RuntimeShape::ExtendedShape(5, rhs_shape);
|
||||
|
||||
const int batch_dim0 = batch_matmul::broadcast_dim(
|
||||
extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0));
|
||||
const int batch_dim1 = batch_matmul::broadcast_dim(
|
||||
extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1));
|
||||
const int batch_dim2 = batch_matmul::broadcast_dim(
|
||||
extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2));
|
||||
|
||||
const int lhs_ext0 = batch_matmul::extent(extended_lhs_shape, 0);
|
||||
const int lhs_ext1 = batch_matmul::extent(extended_lhs_shape, 1);
|
||||
const int lhs_ext2 = batch_matmul::extent(extended_lhs_shape, 2);
|
||||
const int rhs_ext0 = batch_matmul::extent(extended_rhs_shape, 0);
|
||||
const int rhs_ext1 = batch_matmul::extent(extended_rhs_shape, 1);
|
||||
const int rhs_ext2 = batch_matmul::extent(extended_rhs_shape, 2);
|
||||
|
||||
// Set params for each matrix multiply.
|
||||
const int lhs_rows = extended_lhs_shape.Dims(3);
|
||||
const int rhs_cols = extended_rhs_shape.Dims(4);
|
||||
const int accum_depth = extended_lhs_shape.Dims(4);
|
||||
|
||||
const int ioff_ext0 = rhs_ext0 == 0 ? 0 : rhs_cols;
|
||||
const int ioff_ext1 = rhs_ext1 == 0 ? 0 : rhs_cols;
|
||||
const int ioff_ext2 = rhs_ext2 == 0 ? 0 : rhs_cols;
|
||||
const int woff_ext0 = lhs_ext0 == 0 ? 0 : lhs_rows;
|
||||
const int woff_ext1 = lhs_ext1 == 0 ? 0 : lhs_rows;
|
||||
const int woff_ext2 = lhs_ext2 == 0 ? 0 : lhs_rows;
|
||||
|
||||
if (!compute_row_sums || *compute_row_sums) {
|
||||
int num_weights_matrices = 1;
|
||||
for (int i = 1; i < extended_lhs_shape.DimensionsCount() - 2; ++i) {
|
||||
num_weights_matrices *= extended_lhs_shape.Dims(i);
|
||||
}
|
||||
tensor_utils::ReductionSumVector(
|
||||
lhs_data, row_sums, num_weights_matrices * lhs_rows, accum_depth);
|
||||
if (compute_row_sums) {
|
||||
*compute_row_sums = false;
|
||||
}
|
||||
}
|
||||
|
||||
for (int b0 = 0; b0 < batch_dim0; ++b0) {
|
||||
const int8_t* lhs_ptr0 = lhs_data + (b0 * lhs_ext0);
|
||||
const int8_t* rhs_ptr0 = rhs_data + (b0 * rhs_ext0);
|
||||
const int32_t* ioff_ptr0 = input_offset + (b0 * ioff_ext0);
|
||||
const float* scale_ptr0 = scaling_factors + (b0 * ioff_ext0);
|
||||
const int32_t* woff_ptr0 = row_sums + (b0 * woff_ext0);
|
||||
for (int b1 = 0; b1 < batch_dim1; ++b1) {
|
||||
const int8_t* lhs_ptr1 = lhs_ptr0 + b1 * lhs_ext1;
|
||||
const int8_t* rhs_ptr1 = rhs_ptr0 + b1 * rhs_ext1;
|
||||
const int32_t* ioff_ptr1 = ioff_ptr0 + (b1 * ioff_ext1);
|
||||
const float* scale_ptr1 = scale_ptr0 + (b1 * ioff_ext1);
|
||||
const int32_t* woff_ptr1 = woff_ptr0 + (b1 * woff_ext1);
|
||||
for (int b2 = 0; b2 < batch_dim2; ++b2) {
|
||||
const int8_t* lhs_ptr2 = lhs_ptr1 + b2 * lhs_ext2;
|
||||
const int8_t* rhs_ptr2 = rhs_ptr1 + b2 * rhs_ext2;
|
||||
const int32_t* ioff_ptr2 = ioff_ptr1 + (b2 * ioff_ext2);
|
||||
const float* scale_ptr2 = scale_ptr1 + (b2 * ioff_ext2);
|
||||
const int32_t* woff_ptr2 = woff_ptr1 + (b2 * woff_ext2);
|
||||
float* out_ptr = output_data + ((b0 * batch_dim1 * batch_dim2) +
|
||||
b1 * batch_dim2 + b2) *
|
||||
lhs_rows * rhs_cols;
|
||||
for (int j = 0; j < rhs_cols; ++j) {
|
||||
const float batch_scaling_factor = scale_ptr2[j];
|
||||
const float batch_offset = static_cast<float>(ioff_ptr2[j]);
|
||||
for (int i = 0; i < lhs_rows; ++i) {
|
||||
int32_t total = 0;
|
||||
for (int k = 0; k < accum_depth; ++k) {
|
||||
total +=
|
||||
lhs_ptr2[accum_depth * i + k] * rhs_ptr2[j * accum_depth + k];
|
||||
}
|
||||
int32_t row_sum = woff_ptr2[i];
|
||||
total -= row_sum * batch_offset;
|
||||
int idx = lhs_rows * j + i;
|
||||
out_ptr[idx] += batch_scaling_factor * total;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, typename AccumT>
|
||||
inline void BatchMatMul(const FullyConnectedParams& params,
|
||||
const RuntimeShape& lhs_shape, const T* lhs_data,
|
||||
const RuntimeShape& rhs_shape, const T* rhs_data,
|
||||
const RuntimeShape& output_shape, T* output_data) {
|
||||
const RuntimeShape extended_lhs_shape =
|
||||
RuntimeShape::ExtendedShape(5, lhs_shape);
|
||||
const RuntimeShape extended_rhs_shape =
|
||||
RuntimeShape::ExtendedShape(5, rhs_shape);
|
||||
|
||||
const int batch_dim0 = batch_matmul::broadcast_dim(
|
||||
extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0));
|
||||
const int batch_dim1 = batch_matmul::broadcast_dim(
|
||||
extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1));
|
||||
const int batch_dim2 = batch_matmul::broadcast_dim(
|
||||
extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2));
|
||||
|
||||
const int lhs_ext0 = batch_matmul::extent(extended_lhs_shape, 0);
|
||||
const int lhs_ext1 = batch_matmul::extent(extended_lhs_shape, 1);
|
||||
const int lhs_ext2 = batch_matmul::extent(extended_lhs_shape, 2);
|
||||
const int rhs_ext0 = batch_matmul::extent(extended_rhs_shape, 0);
|
||||
const int rhs_ext1 = batch_matmul::extent(extended_rhs_shape, 1);
|
||||
const int rhs_ext2 = batch_matmul::extent(extended_rhs_shape, 2);
|
||||
|
||||
// Set params for each matrix multiply.
|
||||
const int lhs_rows = extended_lhs_shape.Dims(3);
|
||||
const int rhs_cols = extended_rhs_shape.Dims(4);
|
||||
const int accum_depth = extended_lhs_shape.Dims(4);
|
||||
|
||||
const int32_t input_offset = params.input_offset;
|
||||
const int32_t filter_offset = params.weights_offset;
|
||||
const int32_t output_offset = params.output_offset;
|
||||
const int32_t output_multiplier = params.output_multiplier;
|
||||
const int output_shift = params.output_shift;
|
||||
const int32_t output_activation_min = params.quantized_activation_min;
|
||||
const int32_t output_activation_max = params.quantized_activation_max;
|
||||
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
|
||||
|
||||
for (int b0 = 0; b0 < batch_dim0; ++b0) {
|
||||
const T* lhs_ptr0 = lhs_data + (b0 * lhs_ext0);
|
||||
const T* rhs_ptr0 = rhs_data + (b0 * rhs_ext0);
|
||||
for (int b1 = 0; b1 < batch_dim1; ++b1) {
|
||||
const T* lhs_ptr1 = lhs_ptr0 + b1 * lhs_ext1;
|
||||
const T* rhs_ptr1 = rhs_ptr0 + b1 * rhs_ext1;
|
||||
for (int b2 = 0; b2 < batch_dim2; ++b2) {
|
||||
const T* lhs_ptr2 = lhs_ptr1 + b2 * lhs_ext2;
|
||||
const T* rhs_ptr2 = rhs_ptr1 + b2 * rhs_ext2;
|
||||
T* out_ptr = output_data +
|
||||
((b0 * batch_dim1 * batch_dim2) + b1 * batch_dim2 + b2) *
|
||||
lhs_rows * rhs_cols;
|
||||
|
||||
for (int j = 0; j < rhs_cols; ++j) {
|
||||
for (int i = 0; i < lhs_rows; ++i) {
|
||||
AccumT total = 0;
|
||||
for (int k = 0; k < accum_depth; ++k) {
|
||||
AccumT lhs_val = lhs_ptr2[accum_depth * i + k];
|
||||
AccumT rhs_val = rhs_ptr2[accum_depth * j + k];
|
||||
total += (lhs_val + filter_offset) * (rhs_val + input_offset);
|
||||
}
|
||||
int32_t total_scaled = MultiplyByQuantizedMultiplier(
|
||||
total, output_multiplier, output_shift);
|
||||
total_scaled += output_offset;
|
||||
total_scaled = std::max(total_scaled, output_activation_min);
|
||||
total_scaled = std::min(total_scaled, output_activation_max);
|
||||
const int idx = lhs_rows * j + i;
|
||||
out_ptr[idx] = static_cast<T>(total_scaled);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BATCH_MATMUL_H_
|
||||
@@ -0,0 +1,101 @@
|
||||
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BATCH_TO_SPACE_ND_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BATCH_TO_SPACE_ND_H_
|
||||
|
||||
#include <cmath>
|
||||
|
||||
#include "ruy/profiler/instrumentation.h" // from @ruy
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_ops {
|
||||
|
||||
// TODO(b/135760455): Move this method anonymous namespace in a cc file.
|
||||
inline RuntimeShape ExtendShapeBatchToSpace(const RuntimeShape& shape) {
|
||||
if (shape.DimensionsCount() == 4) {
|
||||
return shape;
|
||||
}
|
||||
RuntimeShape new_shape(4, 1);
|
||||
new_shape.SetDim(0, shape.Dims(0));
|
||||
new_shape.SetDim(1, shape.Dims(1));
|
||||
new_shape.SetDim(3, shape.Dims(2));
|
||||
return new_shape;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void BatchToSpaceND(const RuntimeShape& unextended_input1_shape,
|
||||
const T* input1_data,
|
||||
const RuntimeShape& unextended_input2_shape,
|
||||
const int32_t* block_shape_data,
|
||||
const RuntimeShape& unextended_input3_shape,
|
||||
const int32_t* crops_data,
|
||||
const RuntimeShape& unextended_output_shape,
|
||||
T* output_data) {
|
||||
ruy::profiler::ScopeLabel label("BatchToSpaceND");
|
||||
TFLITE_DCHECK_GE(unextended_input1_shape.DimensionsCount(), 3);
|
||||
TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(unextended_input1_shape.DimensionsCount(),
|
||||
unextended_output_shape.DimensionsCount());
|
||||
|
||||
const RuntimeShape input1_shape =
|
||||
ExtendShapeBatchToSpace(unextended_input1_shape);
|
||||
const RuntimeShape output_shape =
|
||||
ExtendShapeBatchToSpace(unextended_output_shape);
|
||||
|
||||
const int output_width = output_shape.Dims(2);
|
||||
const int output_height = output_shape.Dims(1);
|
||||
const int output_batch_size = output_shape.Dims(0);
|
||||
|
||||
const int depth = input1_shape.Dims(3);
|
||||
const int input_width = input1_shape.Dims(2);
|
||||
const int input_height = input1_shape.Dims(1);
|
||||
const int input_batch_size = input1_shape.Dims(0);
|
||||
|
||||
const int block_shape_height = block_shape_data[0];
|
||||
const int block_shape_width =
|
||||
unextended_input1_shape.DimensionsCount() == 4 ? block_shape_data[1] : 1;
|
||||
const int crops_top = crops_data[0];
|
||||
const int crops_left =
|
||||
unextended_input1_shape.DimensionsCount() == 4 ? crops_data[2] : 0;
|
||||
for (int in_batch = 0; in_batch < input_batch_size; ++in_batch) {
|
||||
const int out_batch = in_batch % output_batch_size;
|
||||
const int spatial_offset = in_batch / output_batch_size;
|
||||
for (int in_h = 0; in_h < input_height; ++in_h) {
|
||||
const int out_h = in_h * block_shape_height +
|
||||
spatial_offset / block_shape_width - crops_top;
|
||||
if (out_h < 0 || out_h >= output_height) {
|
||||
continue;
|
||||
}
|
||||
for (int in_w = 0; in_w < input_width; ++in_w) {
|
||||
const int out_w = in_w * block_shape_width +
|
||||
spatial_offset % block_shape_width - crops_left;
|
||||
|
||||
if (out_w < 0 || out_w >= output_width) {
|
||||
continue;
|
||||
}
|
||||
T* out = output_data + Offset(output_shape, out_batch, out_h, out_w, 0);
|
||||
const T* in =
|
||||
input1_data + Offset(input1_shape, in_batch, in_h, in_w, 0);
|
||||
memcpy(out, in, depth * sizeof(T));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BATCH_TO_SPACE_ND_H_
|
||||
@@ -0,0 +1,91 @@
|
||||
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BINARY_FUNCTION_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BINARY_FUNCTION_H_
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/compatibility.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
namespace reference_ops {
|
||||
|
||||
// Also appears to duplicate MinimumMaximum.
|
||||
//
|
||||
// R: Result type. T1: Input 1 type. T2: Input 2 type.
|
||||
template <typename R, typename T1, typename T2>
|
||||
inline void BroadcastBinaryFunction4DSlow(
|
||||
const RuntimeShape& unextended_input1_shape, const T1* input1_data,
|
||||
const RuntimeShape& unextended_input2_shape, const T2* input2_data,
|
||||
const RuntimeShape& unextended_output_shape, R* output_data,
|
||||
R (*func)(T1, T2)) {
|
||||
TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
|
||||
const RuntimeShape output_shape =
|
||||
RuntimeShape::ExtendedShape(4, unextended_output_shape);
|
||||
|
||||
NdArrayDesc<4> desc1;
|
||||
NdArrayDesc<4> desc2;
|
||||
NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
|
||||
unextended_input2_shape, &desc1, &desc2);
|
||||
|
||||
const int* dims_data =
|
||||
reinterpret_cast<const int*>(output_shape.DimsDataUpTo5D());
|
||||
for (int b = 0; b < output_shape.Dims(0); ++b) {
|
||||
int out_idx_b = b * dims_data[1];
|
||||
int in_idx1_b = desc1.strides[0] * b;
|
||||
int in_idx2_b = desc2.strides[0] * b;
|
||||
for (int y = 0; y < output_shape.Dims(1); ++y) {
|
||||
int out_idx_y = (out_idx_b + y) * dims_data[2];
|
||||
int in_idx1_y = in_idx1_b + desc1.strides[1] * y;
|
||||
int in_idx2_y = in_idx2_b + desc2.strides[1] * y;
|
||||
for (int x = 0; x < output_shape.Dims(2); ++x) {
|
||||
int out_idx_x = (out_idx_y + x) * dims_data[3];
|
||||
int in1_idx = in_idx1_y + desc1.strides[2] * x;
|
||||
int in2_idx = in_idx2_y + desc2.strides[2] * x;
|
||||
for (int c = 0; c < output_shape.Dims(3); ++c) {
|
||||
auto out_idx = out_idx_x + c;
|
||||
auto in1_val = input1_data[in1_idx];
|
||||
auto in2_val = input2_data[in2_idx];
|
||||
output_data[out_idx] = func(in1_val, in2_val);
|
||||
in1_idx += desc1.strides[3];
|
||||
in2_idx += desc2.strides[3];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// R: Result type. T1: Input 1 type. T2: Input 2 type.
|
||||
template <typename R, typename T1, typename T2>
|
||||
inline void BinaryFunction(const RuntimeShape& input1_shape,
|
||||
const T1* input1_data,
|
||||
const RuntimeShape& input2_shape,
|
||||
const T2* input2_data,
|
||||
const RuntimeShape& output_shape, R* output_data,
|
||||
R (*func)(T1, T2)) {
|
||||
const int flat_size =
|
||||
MatchingFlatSize(input1_shape, input2_shape, output_shape);
|
||||
for (int i = 0; i < flat_size; ++i) {
|
||||
output_data[i] = func(input1_data[i], input2_data[i]);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BINARY_FUNCTION_H_
|
||||
@@ -0,0 +1,37 @@
|
||||
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CEIL_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CEIL_H_
|
||||
|
||||
#include <cmath>
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
namespace reference_ops {
|
||||
|
||||
inline void Ceil(const RuntimeShape& input_shape, const float* input_data,
|
||||
const RuntimeShape& output_shape, float* output_data) {
|
||||
const int flat_size = MatchingFlatSize(input_shape, output_shape);
|
||||
|
||||
for (int i = 0; i < flat_size; ++i) {
|
||||
output_data[i] = std::ceil(input_data[i]);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CEIL_H_
|
||||
@@ -0,0 +1,280 @@
|
||||
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_COMPARISONS_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_COMPARISONS_H_
|
||||
|
||||
#include "tensorflow/lite/c/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
namespace reference_ops {
|
||||
|
||||
template <typename T>
|
||||
inline bool EqualFn(T lhs, T rhs) {
|
||||
return lhs == rhs;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline bool NotEqualFn(T lhs, T rhs) {
|
||||
return lhs != rhs;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline bool GreaterFn(T lhs, T rhs) {
|
||||
return lhs > rhs;
|
||||
}
|
||||
template <typename T>
|
||||
inline bool GreaterEqualFn(T lhs, T rhs) {
|
||||
return lhs >= rhs;
|
||||
}
|
||||
template <typename T>
|
||||
inline bool LessFn(T lhs, T rhs) {
|
||||
return lhs < rhs;
|
||||
}
|
||||
template <typename T>
|
||||
inline bool LessEqualFn(T lhs, T rhs) {
|
||||
return lhs <= rhs;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
using ComparisonFn = bool (*)(T, T);
|
||||
|
||||
template <typename T, ComparisonFn<T> F>
|
||||
inline void ComparisonImpl(
|
||||
const ComparisonParams& op_params, const RuntimeShape& input1_shape,
|
||||
const T* input1_data, const RuntimeShape& input2_shape,
|
||||
const T* input2_data, const RuntimeShape& output_shape, bool* output_data) {
|
||||
const int64_t flatsize =
|
||||
MatchingFlatSize(input1_shape, input2_shape, output_shape);
|
||||
for (int64_t i = 0; i < flatsize; ++i) {
|
||||
output_data[i] = F(input1_data[i], input2_data[i]);
|
||||
}
|
||||
}
|
||||
|
||||
template <ComparisonFn<float> F>
|
||||
inline void Comparison(const ComparisonParams& op_params,
|
||||
const RuntimeShape& input1_shape,
|
||||
const float* input1_data,
|
||||
const RuntimeShape& input2_shape,
|
||||
const float* input2_data,
|
||||
const RuntimeShape& output_shape, bool* output_data) {
|
||||
ComparisonImpl<float, F>(op_params, input1_shape, input1_data, input2_shape,
|
||||
input2_data, output_shape, output_data);
|
||||
}
|
||||
|
||||
template <typename T, ComparisonFn<int32_t> F>
|
||||
inline void ComparisonWithScaling(
|
||||
const ComparisonParams& op_params, const RuntimeShape& input1_shape,
|
||||
const T* input1_data, const RuntimeShape& input2_shape,
|
||||
const T* input2_data, const RuntimeShape& output_shape, bool* output_data) {
|
||||
int left_shift = op_params.left_shift;
|
||||
int32_t input1_offset = op_params.input1_offset;
|
||||
int32_t input1_multiplier = op_params.input1_multiplier;
|
||||
int input1_shift = op_params.input1_shift;
|
||||
int32_t input2_offset = op_params.input2_offset;
|
||||
int32_t input2_multiplier = op_params.input2_multiplier;
|
||||
int input2_shift = op_params.input2_shift;
|
||||
|
||||
const int64_t flatsize =
|
||||
MatchingFlatSize(input1_shape, input2_shape, output_shape);
|
||||
for (int64_t i = 0; i < flatsize; ++i) {
|
||||
const int32_t input1_val = input1_offset + input1_data[i];
|
||||
const int32_t input2_val = input2_offset + input2_data[i];
|
||||
const int32_t shifted_input1_val = input1_val * (1 << left_shift);
|
||||
const int32_t shifted_input2_val = input2_val * (1 << left_shift);
|
||||
const int32_t scaled_input1_val =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_input1_val, input1_multiplier, input1_shift);
|
||||
const int32_t scaled_input2_val =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_input2_val, input2_multiplier, input2_shift);
|
||||
output_data[i] = F(scaled_input1_val, scaled_input2_val);
|
||||
}
|
||||
}
|
||||
|
||||
struct BroadcastComparison4DSlowCommon {
|
||||
const RuntimeShape output_shape;
|
||||
NdArrayDesc<4> desc1;
|
||||
NdArrayDesc<4> desc2;
|
||||
};
|
||||
|
||||
inline BroadcastComparison4DSlowCommon BroadcastComparison4DSlowPreprocess(
|
||||
const RuntimeShape& unextended_input1_shape,
|
||||
const RuntimeShape& unextended_input2_shape,
|
||||
const RuntimeShape& unextended_output_shape) {
|
||||
TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
|
||||
NdArrayDesc<4> desc1;
|
||||
NdArrayDesc<4> desc2;
|
||||
NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
|
||||
unextended_input2_shape, &desc1, &desc2);
|
||||
return {RuntimeShape::ExtendedShape(4, unextended_output_shape), desc1,
|
||||
desc2};
|
||||
}
|
||||
|
||||
template <typename T, ComparisonFn<T> F>
|
||||
inline void BroadcastComparison4DSlowImpl(
|
||||
const ComparisonParams& op_params,
|
||||
const RuntimeShape& unextended_input1_shape, const T* input1_data,
|
||||
const RuntimeShape& unextended_input2_shape, const T* input2_data,
|
||||
const RuntimeShape& unextended_output_shape, bool* output_data) {
|
||||
const BroadcastComparison4DSlowCommon dims =
|
||||
BroadcastComparison4DSlowPreprocess(unextended_input1_shape,
|
||||
unextended_input2_shape,
|
||||
unextended_output_shape);
|
||||
|
||||
for (int b = 0; b < dims.output_shape.Dims(0); ++b) {
|
||||
for (int y = 0; y < dims.output_shape.Dims(1); ++y) {
|
||||
for (int x = 0; x < dims.output_shape.Dims(2); ++x) {
|
||||
for (int c = 0; c < dims.output_shape.Dims(3); ++c) {
|
||||
output_data[Offset(dims.output_shape, b, y, x, c)] =
|
||||
F(input1_data[SubscriptToIndex(dims.desc1, b, y, x, c)],
|
||||
input2_data[SubscriptToIndex(dims.desc2, b, y, x, c)]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <ComparisonFn<float> F>
|
||||
inline void BroadcastComparison4DSlow(const ComparisonParams& op_params,
|
||||
const RuntimeShape& input1_shape,
|
||||
const float* input1_data,
|
||||
const RuntimeShape& input2_shape,
|
||||
const float* input2_data,
|
||||
const RuntimeShape& output_shape,
|
||||
bool* output_data) {
|
||||
BroadcastComparison4DSlowImpl<float, F>(op_params, input1_shape, input1_data,
|
||||
input2_shape, input2_data,
|
||||
output_shape, output_data);
|
||||
}
|
||||
|
||||
template <typename T, ComparisonFn<int32_t> F>
|
||||
inline void BroadcastComparison4DSlowWithScaling(
|
||||
const ComparisonParams& op_params,
|
||||
const RuntimeShape& unextended_input1_shape, const T* input1_data,
|
||||
const RuntimeShape& unextended_input2_shape, const T* input2_data,
|
||||
const RuntimeShape& unextended_output_shape, bool* output_data) {
|
||||
const BroadcastComparison4DSlowCommon dims =
|
||||
BroadcastComparison4DSlowPreprocess(unextended_input1_shape,
|
||||
unextended_input2_shape,
|
||||
unextended_output_shape);
|
||||
|
||||
int left_shift = op_params.left_shift;
|
||||
int32_t input1_offset = op_params.input1_offset;
|
||||
int32_t input1_multiplier = op_params.input1_multiplier;
|
||||
int input1_shift = op_params.input1_shift;
|
||||
int32_t input2_offset = op_params.input2_offset;
|
||||
int32_t input2_multiplier = op_params.input2_multiplier;
|
||||
int input2_shift = op_params.input2_shift;
|
||||
|
||||
for (int b = 0; b < dims.output_shape.Dims(0); ++b) {
|
||||
for (int y = 0; y < dims.output_shape.Dims(1); ++y) {
|
||||
for (int x = 0; x < dims.output_shape.Dims(2); ++x) {
|
||||
for (int c = 0; c < dims.output_shape.Dims(3); ++c) {
|
||||
const int32_t input1_val =
|
||||
input1_offset +
|
||||
input1_data[SubscriptToIndex(dims.desc1, b, y, x, c)];
|
||||
const int32_t input2_val =
|
||||
input2_offset +
|
||||
input2_data[SubscriptToIndex(dims.desc2, b, y, x, c)];
|
||||
const int32_t shifted_input1_val = input1_val * (1 << left_shift);
|
||||
const int32_t shifted_input2_val = input2_val * (1 << left_shift);
|
||||
const int32_t scaled_input1_val =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_input1_val, input1_multiplier, input1_shift);
|
||||
const int32_t scaled_input2_val =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_input2_val, input2_multiplier, input2_shift);
|
||||
output_data[Offset(dims.output_shape, b, y, x, c)] =
|
||||
F(scaled_input1_val, scaled_input2_val);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#define TFLITE_COMPARISON_OP(name) \
|
||||
inline void name(const ComparisonParams& op_params, \
|
||||
const RuntimeShape& input1_shape, const float* input1_data, \
|
||||
const RuntimeShape& input2_shape, const float* input2_data, \
|
||||
const RuntimeShape& output_shape, bool* output_data) { \
|
||||
Comparison<name##Fn>(op_params, input1_shape, input1_data, input2_shape, \
|
||||
input2_data, output_shape, output_data); \
|
||||
} \
|
||||
template <typename T> \
|
||||
inline void name##NoScaling( \
|
||||
const ComparisonParams& op_params, const RuntimeShape& input1_shape, \
|
||||
const T* input1_data, const RuntimeShape& input2_shape, \
|
||||
const T* input2_data, const RuntimeShape& output_shape, \
|
||||
bool* output_data) { \
|
||||
ComparisonImpl<T, name##Fn>(op_params, input1_shape, input1_data, \
|
||||
input2_shape, input2_data, output_shape, \
|
||||
output_data); \
|
||||
} \
|
||||
template <typename T> \
|
||||
inline void name##WithScaling( \
|
||||
const ComparisonParams& op_params, const RuntimeShape& input1_shape, \
|
||||
const T* input1_data, const RuntimeShape& input2_shape, \
|
||||
const T* input2_data, const RuntimeShape& output_shape, \
|
||||
bool* output_data) { \
|
||||
ComparisonWithScaling<T, name##Fn>(op_params, input1_shape, input1_data, \
|
||||
input2_shape, input2_data, \
|
||||
output_shape, output_data); \
|
||||
} \
|
||||
template <typename T> \
|
||||
inline void Broadcast4DSlow##name##NoScaling( \
|
||||
const ComparisonParams& op_params, const RuntimeShape& input1_shape, \
|
||||
const T* input1_data, const RuntimeShape& input2_shape, \
|
||||
const T* input2_data, const RuntimeShape& output_shape, \
|
||||
bool* output_data) { \
|
||||
BroadcastComparison4DSlowImpl<T, name##Fn>( \
|
||||
op_params, input1_shape, input1_data, input2_shape, input2_data, \
|
||||
output_shape, output_data); \
|
||||
} \
|
||||
inline void Broadcast4DSlow##name( \
|
||||
const ComparisonParams& op_params, const RuntimeShape& input1_shape, \
|
||||
const float* input1_data, const RuntimeShape& input2_shape, \
|
||||
const float* input2_data, const RuntimeShape& output_shape, \
|
||||
bool* output_data) { \
|
||||
BroadcastComparison4DSlow<name##Fn>(op_params, input1_shape, input1_data, \
|
||||
input2_shape, input2_data, \
|
||||
output_shape, output_data); \
|
||||
} \
|
||||
template <typename T> \
|
||||
inline void Broadcast4DSlow##name##WithScaling( \
|
||||
const ComparisonParams& op_params, const RuntimeShape& input1_shape, \
|
||||
const T* input1_data, const RuntimeShape& input2_shape, \
|
||||
const T* input2_data, const RuntimeShape& output_shape, \
|
||||
bool* output_data) { \
|
||||
BroadcastComparison4DSlowWithScaling<T, name##Fn>( \
|
||||
op_params, input1_shape, input1_data, input2_shape, input2_data, \
|
||||
output_shape, output_data); \
|
||||
}
|
||||
TFLITE_COMPARISON_OP(Equal);
|
||||
TFLITE_COMPARISON_OP(NotEqual);
|
||||
TFLITE_COMPARISON_OP(Greater);
|
||||
TFLITE_COMPARISON_OP(GreaterEqual);
|
||||
TFLITE_COMPARISON_OP(Less);
|
||||
TFLITE_COMPARISON_OP(LessEqual);
|
||||
#undef TFLITE_COMPARISON_OP
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_COMPARISONS_H_
|
||||
@@ -0,0 +1,139 @@
|
||||
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CONCATENATION_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CONCATENATION_H_
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/compatibility.h"
|
||||
#include "tensorflow/lite/kernels/internal/cppmath.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_ops {
|
||||
|
||||
template <typename Scalar>
|
||||
inline void Concatenation(const ConcatenationParams& params,
|
||||
const RuntimeShape* const* input_shapes,
|
||||
const Scalar* const* input_data,
|
||||
const RuntimeShape& output_shape,
|
||||
Scalar* output_data) {
|
||||
int axis = params.axis;
|
||||
int inputs_count = params.inputs_count;
|
||||
const int concat_dimensions = output_shape.DimensionsCount();
|
||||
TFLITE_DCHECK_LT(axis, concat_dimensions);
|
||||
|
||||
int64_t concat_size = 0;
|
||||
for (int i = 0; i < inputs_count; i++) {
|
||||
TFLITE_DCHECK_EQ(input_shapes[i]->DimensionsCount(), concat_dimensions);
|
||||
for (int j = 0; j < concat_dimensions; j++) {
|
||||
if (j != axis) {
|
||||
MatchingDim(*input_shapes[i], j, output_shape, j);
|
||||
}
|
||||
}
|
||||
concat_size += input_shapes[i]->Dims(axis);
|
||||
}
|
||||
TFLITE_DCHECK_EQ(concat_size, output_shape.Dims(axis));
|
||||
int64_t outer_size = 1;
|
||||
for (int i = 0; i < axis; ++i) {
|
||||
outer_size *= output_shape.Dims(i);
|
||||
}
|
||||
// For all input arrays,
|
||||
// FlatSize() = outer_size * Dims(axis) * base_inner_size;
|
||||
int64_t base_inner_size = 1;
|
||||
for (int i = axis + 1; i < concat_dimensions; ++i) {
|
||||
base_inner_size *= output_shape.Dims(i);
|
||||
}
|
||||
|
||||
Scalar* output_ptr = output_data;
|
||||
for (int k = 0; k < outer_size; k++) {
|
||||
for (int i = 0; i < inputs_count; ++i) {
|
||||
const int copy_size = input_shapes[i]->Dims(axis) * base_inner_size;
|
||||
const Scalar* input_ptr = input_data[i] + k * copy_size;
|
||||
memcpy(output_ptr, input_ptr, copy_size * sizeof(Scalar));
|
||||
output_ptr += copy_size;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO(b/174275780): The quantized implementation of concatentation isn't fully
|
||||
// quantized as it takes scale as a floating point value. This should be fixed
|
||||
// when optimizng this routine further.
|
||||
inline void ConcatenationWithScaling(const ConcatenationParams& params,
|
||||
const RuntimeShape* const* input_shapes,
|
||||
const uint8_t* const* input_data,
|
||||
const RuntimeShape& output_shape,
|
||||
uint8_t* output_data) {
|
||||
int axis = params.axis;
|
||||
const int32_t* input_zeropoint = params.input_zeropoint;
|
||||
const float* input_scale = params.input_scale;
|
||||
int inputs_count = params.inputs_count;
|
||||
const int32_t output_zeropoint = params.output_zeropoint;
|
||||
const float output_scale = params.output_scale;
|
||||
|
||||
const int concat_dimensions = output_shape.DimensionsCount();
|
||||
TFLITE_DCHECK_LT(axis, concat_dimensions);
|
||||
|
||||
int64_t concat_size = 0;
|
||||
for (int i = 0; i < inputs_count; i++) {
|
||||
TFLITE_DCHECK_EQ(input_shapes[i]->DimensionsCount(), concat_dimensions);
|
||||
for (int j = 0; j < concat_dimensions; j++) {
|
||||
if (j != axis) {
|
||||
MatchingDim(*input_shapes[i], j, output_shape, j);
|
||||
}
|
||||
}
|
||||
concat_size += input_shapes[i]->Dims(axis);
|
||||
}
|
||||
TFLITE_DCHECK_EQ(concat_size, output_shape.Dims(axis));
|
||||
int64_t outer_size = 1;
|
||||
for (int i = 0; i < axis; ++i) {
|
||||
outer_size *= output_shape.Dims(i);
|
||||
}
|
||||
// For all input arrays,
|
||||
// FlatSize() = outer_size * Dims(axis) * base_inner_size;
|
||||
int64_t base_inner_size = 1;
|
||||
for (int i = axis + 1; i < concat_dimensions; ++i) {
|
||||
base_inner_size *= output_shape.Dims(i);
|
||||
}
|
||||
|
||||
const float inverse_output_scale = 1.f / output_scale;
|
||||
uint8_t* output_ptr = output_data;
|
||||
for (int k = 0; k < outer_size; k++) {
|
||||
for (int i = 0; i < inputs_count; ++i) {
|
||||
const int copy_size = input_shapes[i]->Dims(axis) * base_inner_size;
|
||||
const uint8_t* input_ptr = input_data[i] + k * copy_size;
|
||||
if (input_zeropoint[i] == output_zeropoint &&
|
||||
input_scale[i] == output_scale) {
|
||||
memcpy(output_ptr, input_ptr, copy_size);
|
||||
} else {
|
||||
const float scale = input_scale[i] * inverse_output_scale;
|
||||
const float bias = -input_zeropoint[i] * scale;
|
||||
for (int j = 0; j < copy_size; ++j) {
|
||||
const int32_t value = static_cast<int32_t>(tflite::TfLiteRound(
|
||||
input_ptr[j] * scale + bias)) +
|
||||
output_zeropoint;
|
||||
output_ptr[j] = static_cast<uint8_t>(
|
||||
std::max<int32_t>(std::min<int32_t>(255, value), 0));
|
||||
}
|
||||
}
|
||||
output_ptr += copy_size;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CONCATENATION_H_
|
||||
@@ -0,0 +1,264 @@
|
||||
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CONV_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CONV_H_
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
namespace reference_ops {
|
||||
|
||||
inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
|
||||
const float* input_data, const RuntimeShape& filter_shape,
|
||||
const float* filter_data, const RuntimeShape& bias_shape,
|
||||
const float* bias_data, const RuntimeShape& output_shape,
|
||||
float* output_data, const RuntimeShape& im2col_shape,
|
||||
float* im2col_data) {
|
||||
const int stride_width = params.stride_width;
|
||||
const int stride_height = params.stride_height;
|
||||
const int dilation_width_factor = params.dilation_width_factor;
|
||||
const int dilation_height_factor = params.dilation_height_factor;
|
||||
const int pad_width = params.padding_values.width;
|
||||
const int pad_height = params.padding_values.height;
|
||||
const float output_activation_min = params.float_activation_min;
|
||||
const float output_activation_max = params.float_activation_max;
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
|
||||
|
||||
(void)im2col_data; // only used in optimized code.
|
||||
(void)im2col_shape; // only used in optimized code.
|
||||
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
|
||||
const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
|
||||
const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
|
||||
if (bias_data) {
|
||||
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
|
||||
}
|
||||
const int input_height = input_shape.Dims(1);
|
||||
const int input_width = input_shape.Dims(2);
|
||||
const int filter_height = filter_shape.Dims(1);
|
||||
const int filter_width = filter_shape.Dims(2);
|
||||
const int output_height = output_shape.Dims(1);
|
||||
const int output_width = output_shape.Dims(2);
|
||||
for (int batch = 0; batch < batches; ++batch) {
|
||||
for (int out_y = 0; out_y < output_height; ++out_y) {
|
||||
const int in_y_origin = (out_y * stride_height) - pad_height;
|
||||
for (int out_x = 0; out_x < output_width; ++out_x) {
|
||||
const int in_x_origin = (out_x * stride_width) - pad_width;
|
||||
for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
|
||||
float total = 0.f;
|
||||
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
|
||||
const int in_y = in_y_origin + dilation_height_factor * filter_y;
|
||||
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
|
||||
const int in_x = in_x_origin + dilation_width_factor * filter_x;
|
||||
|
||||
// Zero padding by omitting the areas outside the image.
|
||||
const bool is_point_inside_image =
|
||||
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
|
||||
(in_y < input_height);
|
||||
|
||||
if (!is_point_inside_image) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
|
||||
float input_value = input_data[Offset(input_shape, batch, in_y,
|
||||
in_x, in_channel)];
|
||||
float filter_value = filter_data[Offset(
|
||||
filter_shape, out_channel, filter_y, filter_x, in_channel)];
|
||||
total += (input_value * filter_value);
|
||||
}
|
||||
}
|
||||
}
|
||||
float bias_value = 0.0f;
|
||||
if (bias_data) {
|
||||
bias_value = bias_data[out_channel];
|
||||
}
|
||||
output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
|
||||
ActivationFunctionWithMinMax(total + bias_value,
|
||||
output_activation_min,
|
||||
output_activation_max);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
|
||||
const uint8_t* input_data, const RuntimeShape& filter_shape,
|
||||
const uint8_t* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32_t* bias_data, const RuntimeShape& output_shape,
|
||||
uint8_t* output_data, const RuntimeShape& im2col_shape,
|
||||
uint8_t* im2col_data, void* cpu_backend_context) {
|
||||
(void)cpu_backend_context; // only used in optimized code.
|
||||
(void)im2col_data; // only used in optimized code.
|
||||
(void)im2col_shape; // only used in optimized code.
|
||||
const int stride_width = params.stride_width;
|
||||
const int stride_height = params.stride_height;
|
||||
const int dilation_width_factor = params.dilation_width_factor;
|
||||
const int dilation_height_factor = params.dilation_height_factor;
|
||||
const int pad_width = params.padding_values.width;
|
||||
const int pad_height = params.padding_values.height;
|
||||
const int32_t input_offset = params.input_offset;
|
||||
const int32_t filter_offset = params.weights_offset;
|
||||
const int32_t output_offset = params.output_offset;
|
||||
const int32_t output_multiplier = params.output_multiplier;
|
||||
const int output_shift = params.output_shift;
|
||||
const int32_t output_activation_min = params.quantized_activation_min;
|
||||
const int32_t output_activation_max = params.quantized_activation_max;
|
||||
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
|
||||
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
|
||||
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
|
||||
const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
|
||||
const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
|
||||
if (bias_data) {
|
||||
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
|
||||
}
|
||||
const int input_height = input_shape.Dims(1);
|
||||
const int input_width = input_shape.Dims(2);
|
||||
const int filter_height = filter_shape.Dims(1);
|
||||
const int filter_width = filter_shape.Dims(2);
|
||||
const int output_height = output_shape.Dims(1);
|
||||
const int output_width = output_shape.Dims(2);
|
||||
for (int batch = 0; batch < batches; ++batch) {
|
||||
for (int out_y = 0; out_y < output_height; ++out_y) {
|
||||
const int in_y_origin = (out_y * stride_height) - pad_height;
|
||||
for (int out_x = 0; out_x < output_width; ++out_x) {
|
||||
const int in_x_origin = (out_x * stride_width) - pad_width;
|
||||
for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
|
||||
int32_t acc = 0;
|
||||
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
|
||||
const int in_y = in_y_origin + dilation_height_factor * filter_y;
|
||||
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
|
||||
const int in_x = in_x_origin + dilation_width_factor * filter_x;
|
||||
|
||||
// Zero padding by omitting the areas outside the image.
|
||||
const bool is_point_inside_image =
|
||||
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
|
||||
(in_y < input_height);
|
||||
|
||||
if (!is_point_inside_image) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
|
||||
int32_t input_val = input_data[Offset(input_shape, batch, in_y,
|
||||
in_x, in_channel)];
|
||||
int32_t filter_val = filter_data[Offset(
|
||||
filter_shape, out_channel, filter_y, filter_x, in_channel)];
|
||||
acc +=
|
||||
(filter_val + filter_offset) * (input_val + input_offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (bias_data) {
|
||||
acc += bias_data[out_channel];
|
||||
}
|
||||
acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
|
||||
output_shift);
|
||||
acc += output_offset;
|
||||
acc = std::max(acc, output_activation_min);
|
||||
acc = std::min(acc, output_activation_max);
|
||||
output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
|
||||
static_cast<uint8_t>(acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void HybridConvPerChannel(
|
||||
const ConvParams& params, float* scaling_factors_ptr,
|
||||
const RuntimeShape& input_shape, const int8_t* input_data,
|
||||
const RuntimeShape& filter_shape, const int8_t* filter_data,
|
||||
const RuntimeShape& bias_shape, const float* bias_data,
|
||||
const RuntimeShape& output_shape, float* output_data,
|
||||
const RuntimeShape& im2col_shape, int8_t* im2col_data,
|
||||
const float* per_channel_scale, int32_t* input_offset) {
|
||||
(void)im2col_data; // only used in optimized code.
|
||||
(void)im2col_shape; // only used in optimized code.
|
||||
const int stride_width = params.stride_width;
|
||||
const int stride_height = params.stride_height;
|
||||
const int dilation_width_factor = params.dilation_width_factor;
|
||||
const int dilation_height_factor = params.dilation_height_factor;
|
||||
const int pad_width = params.padding_values.width;
|
||||
const int pad_height = params.padding_values.height;
|
||||
const float output_activation_min = params.float_activation_min;
|
||||
const float output_activation_max = params.float_activation_max;
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
|
||||
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
|
||||
const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
|
||||
const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
|
||||
if (bias_data) {
|
||||
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
|
||||
}
|
||||
const int input_height = input_shape.Dims(1);
|
||||
const int input_width = input_shape.Dims(2);
|
||||
const int filter_height = filter_shape.Dims(1);
|
||||
const int filter_width = filter_shape.Dims(2);
|
||||
const int output_height = output_shape.Dims(1);
|
||||
const int output_width = output_shape.Dims(2);
|
||||
for (int batch = 0; batch < batches; ++batch) {
|
||||
for (int out_y = 0; out_y < output_height; ++out_y) {
|
||||
for (int out_x = 0; out_x < output_width; ++out_x) {
|
||||
for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
|
||||
const int in_x_origin = (out_x * stride_width) - pad_width;
|
||||
const int in_y_origin = (out_y * stride_height) - pad_height;
|
||||
int32_t acc = 0;
|
||||
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
|
||||
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
|
||||
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
|
||||
const int in_x = in_x_origin + dilation_width_factor * filter_x;
|
||||
const int in_y =
|
||||
in_y_origin + dilation_height_factor * filter_y;
|
||||
// If the location is outside the bounds of the input image,
|
||||
// use zero as a default value.
|
||||
if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
|
||||
(in_y < input_height)) {
|
||||
int32_t input_val = input_data[Offset(
|
||||
input_shape, batch, in_y, in_x, in_channel)];
|
||||
int32_t filter_val =
|
||||
filter_data[Offset(filter_shape, out_channel, filter_y,
|
||||
filter_x, in_channel)];
|
||||
acc += filter_val * (input_val - input_offset[batch]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
float acc_float =
|
||||
acc * per_channel_scale[out_channel] * scaling_factors_ptr[batch];
|
||||
if (bias_data) {
|
||||
acc_float += bias_data[out_channel];
|
||||
}
|
||||
output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
|
||||
ActivationFunctionWithMinMax(acc_float, output_activation_min,
|
||||
output_activation_max);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CONV_H_
|
||||
@@ -0,0 +1,175 @@
|
||||
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CUMSUM_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CUMSUM_H_
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/compatibility.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_ops {
|
||||
|
||||
template <typename T>
|
||||
inline void CumSum(const T* input_data, const RuntimeShape& shape, int32_t axis,
|
||||
bool exclusive, bool reverse, T* output_data) {
|
||||
const int32_t rank = shape.DimensionsCount();
|
||||
TFLITE_DCHECK_GE(rank, 1);
|
||||
TFLITE_DCHECK_GE(axis, 0);
|
||||
TFLITE_DCHECK_LT(axis, rank);
|
||||
|
||||
size_t inner = 1;
|
||||
size_t outer = 1;
|
||||
size_t depth = 1;
|
||||
for (int32_t i = 0; i < rank; i++) {
|
||||
if (i < axis)
|
||||
inner *= shape.Dims(i);
|
||||
else if (i > axis)
|
||||
outer *= shape.Dims(i);
|
||||
else
|
||||
depth = shape.Dims(i);
|
||||
}
|
||||
|
||||
for (size_t outer_index = 0; outer_index < outer; outer_index++) {
|
||||
size_t outer_index_adj;
|
||||
if (reverse)
|
||||
outer_index_adj = (outer - 1) - outer_index;
|
||||
else
|
||||
outer_index_adj = outer_index;
|
||||
for (size_t inner_index = 0; inner_index < inner; inner_index++) {
|
||||
T accumulator = 0;
|
||||
size_t inner_index_adj;
|
||||
if (reverse)
|
||||
inner_index_adj = (inner - 1) - inner_index;
|
||||
else
|
||||
inner_index_adj = inner_index;
|
||||
for (size_t depth_index = 0; depth_index < depth; depth_index++) {
|
||||
size_t depth_index_adj;
|
||||
if (reverse)
|
||||
depth_index_adj = (depth - 1) - depth_index;
|
||||
else
|
||||
depth_index_adj = depth_index;
|
||||
|
||||
size_t index = outer_index_adj;
|
||||
index += inner_index_adj * depth * outer;
|
||||
index += depth_index_adj * outer;
|
||||
|
||||
if (exclusive) {
|
||||
output_data[index] = accumulator;
|
||||
accumulator += input_data[index];
|
||||
} else {
|
||||
accumulator += input_data[index];
|
||||
output_data[index] = accumulator;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Quantized INT8 CUMSUM
|
||||
//
|
||||
inline void CumSum(const ArithmeticParams& params, const int8_t* input_data,
|
||||
const RuntimeShape& shape, int32_t axis, bool exclusive,
|
||||
bool reverse, int8_t* output_data) {
|
||||
TFLITE_DCHECK_LE(params.quantized_activation_min,
|
||||
params.quantized_activation_max);
|
||||
// Input offset is negative input zero point. Activation tensors are
|
||||
// asymmetric quantized so they span the full int8 range.
|
||||
// All inputs should have same zero-point and scale, this is checked during
|
||||
// Prepare stage.
|
||||
TFLITE_DCHECK_GE(-params.input1_offset, std::numeric_limits<int8_t>::min());
|
||||
TFLITE_DCHECK_LE(-params.input1_offset, std::numeric_limits<int8_t>::max());
|
||||
|
||||
const int32_t rank = shape.DimensionsCount();
|
||||
TFLITE_DCHECK_GE(rank, 1);
|
||||
TFLITE_DCHECK_GE(axis, 0);
|
||||
TFLITE_DCHECK_LT(axis, rank);
|
||||
|
||||
size_t inner = 1;
|
||||
size_t outer = 1;
|
||||
size_t depth = 1;
|
||||
for (int32_t i = 0; i < rank; i++) {
|
||||
if (i < axis)
|
||||
inner *= shape.Dims(i);
|
||||
else if (i > axis)
|
||||
outer *= shape.Dims(i);
|
||||
else
|
||||
depth = shape.Dims(i);
|
||||
}
|
||||
|
||||
for (size_t outer_index = 0; outer_index < outer; outer_index++) {
|
||||
size_t outer_index_adj;
|
||||
if (reverse)
|
||||
outer_index_adj = (outer - 1) - outer_index;
|
||||
else
|
||||
outer_index_adj = outer_index;
|
||||
for (size_t inner_index = 0; inner_index < inner; inner_index++) {
|
||||
int32_t accumulator = params.input1_offset; // accumulator = 0
|
||||
accumulator *= (1 << params.left_shift);
|
||||
accumulator = MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
accumulator, params.input1_multiplier, params.input1_shift);
|
||||
|
||||
size_t inner_index_adj;
|
||||
if (reverse)
|
||||
inner_index_adj = (inner - 1) - inner_index;
|
||||
else
|
||||
inner_index_adj = inner_index;
|
||||
|
||||
for (size_t depth_index = 0; depth_index < depth; depth_index++) {
|
||||
size_t depth_index_adj;
|
||||
if (reverse)
|
||||
depth_index_adj = (depth - 1) - depth_index;
|
||||
else
|
||||
depth_index_adj = depth_index;
|
||||
|
||||
size_t index = outer_index_adj;
|
||||
index += inner_index_adj * depth * outer;
|
||||
index += depth_index_adj * outer;
|
||||
|
||||
const int32_t y = params.input1_offset + input_data[index];
|
||||
const int32_t shifted_y = y * (1 << params.left_shift);
|
||||
const int32_t scaled_y = MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_y, params.input1_multiplier, params.input1_shift);
|
||||
|
||||
int32_t scaled_output;
|
||||
if (exclusive) {
|
||||
scaled_output = accumulator;
|
||||
accumulator += scaled_y;
|
||||
} else {
|
||||
accumulator += scaled_y;
|
||||
scaled_output = accumulator;
|
||||
}
|
||||
|
||||
const int32_t raw_output =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
scaled_output, params.output_multiplier, params.output_shift) +
|
||||
params.output_offset;
|
||||
const int32_t clamped_output =
|
||||
std::min(params.quantized_activation_max,
|
||||
std::max(params.quantized_activation_min, raw_output));
|
||||
output_data[index] = static_cast<int8_t>(clamped_output);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CUMSUM_H_
|
||||
@@ -0,0 +1,79 @@
|
||||
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTH_TO_SPACE_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTH_TO_SPACE_H_
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_ops {
|
||||
|
||||
template <typename T>
|
||||
inline void DepthToSpace(const tflite::DepthToSpaceParams& op_params,
|
||||
const RuntimeShape& unextended_input_shape,
|
||||
const T* input_data,
|
||||
const RuntimeShape& unextended_output_shape,
|
||||
T* output_data) {
|
||||
TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
|
||||
const RuntimeShape input_shape =
|
||||
RuntimeShape::ExtendedShape(4, unextended_input_shape);
|
||||
const RuntimeShape output_shape =
|
||||
RuntimeShape::ExtendedShape(4, unextended_output_shape);
|
||||
|
||||
const int input_depth = input_shape.Dims(3);
|
||||
const int input_width = input_shape.Dims(2);
|
||||
const int input_height = input_shape.Dims(1);
|
||||
const int input_batch = input_shape.Dims(0);
|
||||
|
||||
const int output_depth = output_shape.Dims(3);
|
||||
const int output_width = output_shape.Dims(2);
|
||||
const int output_height = output_shape.Dims(1);
|
||||
const int output_batch = output_shape.Dims(0);
|
||||
|
||||
const int32_t block_size = op_params.block_size;
|
||||
|
||||
TFLITE_DCHECK_EQ(input_width * block_size, output_width);
|
||||
TFLITE_DCHECK_EQ(input_height * block_size, output_height);
|
||||
TFLITE_DCHECK_EQ(input_depth, output_depth * block_size * block_size);
|
||||
TFLITE_DCHECK_EQ(input_batch, output_batch);
|
||||
|
||||
for (int out_b = 0; out_b < output_batch; ++out_b) {
|
||||
for (int out_h = 0; out_h < output_height; ++out_h) {
|
||||
for (int out_w = 0; out_w < output_width; ++out_w) {
|
||||
for (int out_d = 0; out_d < output_depth; ++out_d) {
|
||||
const int in_d =
|
||||
out_d + ((out_h % block_size) * block_size + out_w % block_size) *
|
||||
output_depth;
|
||||
|
||||
const int in_w = out_w / block_size;
|
||||
const int in_h = out_h / block_size;
|
||||
const int in_b = out_b;
|
||||
|
||||
const int input_index = Offset(input_shape, in_b, in_h, in_w, in_d);
|
||||
const int output_index =
|
||||
Offset(output_shape, out_b, out_h, out_w, out_d);
|
||||
|
||||
output_data[output_index] = input_data[input_index];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTH_TO_SPACE_H_
|
||||
@@ -0,0 +1,100 @@
|
||||
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_FLOAT_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_FLOAT_H_
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/compatibility.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_ops {
|
||||
|
||||
inline void DepthwiseConv(
|
||||
const DepthwiseParams& params, const RuntimeShape& input_shape,
|
||||
const float* input_data, const RuntimeShape& filter_shape,
|
||||
const float* filter_data, const RuntimeShape& bias_shape,
|
||||
const float* bias_data, const RuntimeShape& output_shape,
|
||||
float* output_data) {
|
||||
const int stride_width = params.stride_width;
|
||||
const int stride_height = params.stride_height;
|
||||
const int dilation_width_factor = params.dilation_width_factor;
|
||||
const int dilation_height_factor = params.dilation_height_factor;
|
||||
const int pad_width = params.padding_values.width;
|
||||
const int pad_height = params.padding_values.height;
|
||||
const int depth_multiplier = params.depth_multiplier;
|
||||
const float output_activation_min = params.float_activation_min;
|
||||
const float output_activation_max = params.float_activation_max;
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
|
||||
|
||||
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
|
||||
const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
|
||||
const int input_height = input_shape.Dims(1);
|
||||
const int input_width = input_shape.Dims(2);
|
||||
const int input_depth = input_shape.Dims(3);
|
||||
const int filter_height = filter_shape.Dims(1);
|
||||
const int filter_width = filter_shape.Dims(2);
|
||||
const int output_height = output_shape.Dims(1);
|
||||
const int output_width = output_shape.Dims(2);
|
||||
TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
|
||||
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
|
||||
|
||||
for (int b = 0; b < batches; ++b) {
|
||||
for (int out_y = 0; out_y < output_height; ++out_y) {
|
||||
for (int out_x = 0; out_x < output_width; ++out_x) {
|
||||
for (int ic = 0; ic < input_depth; ++ic) {
|
||||
for (int m = 0; m < depth_multiplier; m++) {
|
||||
const int oc = m + ic * depth_multiplier;
|
||||
const int in_x_origin = (out_x * stride_width) - pad_width;
|
||||
const int in_y_origin = (out_y * stride_height) - pad_height;
|
||||
float total = 0.f;
|
||||
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
|
||||
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
|
||||
const int in_x = in_x_origin + dilation_width_factor * filter_x;
|
||||
const int in_y =
|
||||
in_y_origin + dilation_height_factor * filter_y;
|
||||
// If the location is outside the bounds of the input image,
|
||||
// use zero as a default value.
|
||||
if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
|
||||
(in_y < input_height)) {
|
||||
float input_value =
|
||||
input_data[Offset(input_shape, b, in_y, in_x, ic)];
|
||||
float filter_value = filter_data[Offset(
|
||||
filter_shape, 0, filter_y, filter_x, oc)];
|
||||
total += (input_value * filter_value);
|
||||
}
|
||||
}
|
||||
}
|
||||
float bias_value = 0.0f;
|
||||
if (bias_data) {
|
||||
bias_value = bias_data[oc];
|
||||
}
|
||||
output_data[Offset(output_shape, b, out_y, out_x, oc)] =
|
||||
ActivationFunctionWithMinMax(total + bias_value,
|
||||
output_activation_min,
|
||||
output_activation_max);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // end namespace reference_ops
|
||||
} // end namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_FLOAT_H_
|
||||
@@ -0,0 +1,319 @@
|
||||
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
#include "fixedpoint/fixedpoint.h"
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/compatibility.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
// Used in tests and template parameters to control which version of depthwise
|
||||
// convolution is called. Primarily for reference code, and specializations
|
||||
// forced in tests.
|
||||
enum class DepthwiseConvImplementation {
|
||||
// Run all tests against kUseStandardEntry even if also testing another
|
||||
// kernel, since we need to be sure that the main DepthwiseConv() function in
|
||||
// optimized_ops.h dispatches to a correctly-executing kernel.
|
||||
kNone = 0, // The "default" option: use the normal
|
||||
// DepthwiseConv kernel (entry) function.
|
||||
kUseGenericKernel, // Forced use of generic kernel.
|
||||
kUseNeon3x3, // 3x3 kernel that uses NEON when available.
|
||||
kUseNeon3x3DotProduct, // 3x3 kernel that uses dot-product enabled NEON
|
||||
// when available.
|
||||
kUseCModel3x3DotProduct, // 3x3 kernel, reference C model that is intended
|
||||
// to match overall design NEON code.
|
||||
kUseUnwound3x3DotProduct, // 3x3 kernel, reference C model with unwound loops
|
||||
// and some arrays.
|
||||
kUseIntrinsics3x3DotProduct, // 3x3 kernel using NEON intrinsics.
|
||||
};
|
||||
|
||||
// Category of depthwise convolution output rounding.
|
||||
enum class DepthwiseConvOutputRounding {
|
||||
kNone = 0, // Invalid: specific method must be specified.
|
||||
kAwayFromZero, // Original method: exact halves rounded away from zero.
|
||||
kUpward, // Halves towards +infinity: adds 0.5 before truncate.
|
||||
// This is where a future kNearestEven would be placed.
|
||||
};
|
||||
|
||||
// Category of depthwise convolution depth multiplication.
|
||||
enum class DepthwiseConvDepthMultiplication {
|
||||
kNoMultiplication = 0, // Depth multiplier = 1.
|
||||
kUnitInputDepth, // Input depth = 1, output depth = depth multiplier.
|
||||
};
|
||||
|
||||
namespace reference_ops {
|
||||
namespace depthwise_conv {
|
||||
|
||||
template <DepthwiseConvOutputRounding output_rounding>
|
||||
inline int32_t DepthwiseConvRound(int32_t x, int32_t quantized_multiplier,
|
||||
int shift) {
|
||||
TFLITE_DCHECK_NE(output_rounding, DepthwiseConvOutputRounding::kNone);
|
||||
return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
|
||||
}
|
||||
|
||||
// Single-rounding MultiplyByQuantizedMultiplier
|
||||
#if TFLITE_SINGLE_ROUNDING
|
||||
template <>
|
||||
inline int32_t DepthwiseConvRound<DepthwiseConvOutputRounding::kAwayFromZero>(
|
||||
int32_t x, int32_t quantized_multiplier, int shift) {
|
||||
using gemmlowp::RoundingDivideByPOT;
|
||||
using gemmlowp::SaturatingRoundingDoublingHighMul;
|
||||
int left_shift = shift > 0 ? shift : 0;
|
||||
int right_shift = shift > 0 ? 0 : -shift;
|
||||
return RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(
|
||||
x * (1 << left_shift), quantized_multiplier),
|
||||
right_shift);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline int32_t DepthwiseConvRound<DepthwiseConvOutputRounding::kUpward>(
|
||||
int32_t x, int32_t quantized_multiplier, int shift) {
|
||||
return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
|
||||
}
|
||||
// Double-rounding MultiplyByQuantizedMultiplier
|
||||
#else
|
||||
template <>
|
||||
inline int32_t DepthwiseConvRound<DepthwiseConvOutputRounding::kAwayFromZero>(
|
||||
int32_t x, int32_t quantized_multiplier, int shift) {
|
||||
return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline int32_t DepthwiseConvRound<DepthwiseConvOutputRounding::kUpward>(
|
||||
int32_t x, int32_t quantized_multiplier, int shift) {
|
||||
using gemmlowp::SaturatingRoundingDoublingHighMul;
|
||||
const int left_shift = shift > 0 ? shift : 0;
|
||||
const int right_shift = shift > 0 ? 0 : -shift;
|
||||
const int rounding_offset = right_shift > 0 ? 1 << (right_shift - 1) : 0;
|
||||
return (SaturatingRoundingDoublingHighMul(x * (1 << left_shift),
|
||||
quantized_multiplier) +
|
||||
rounding_offset) >>
|
||||
right_shift;
|
||||
}
|
||||
#endif // TFLITE_SINGLE_ROUNDING
|
||||
|
||||
template <DepthwiseConvOutputRounding output_rounding>
|
||||
struct DepthwiseConvBasicKernel {
|
||||
static inline void Run(
|
||||
const DepthwiseParams& params, const RuntimeShape& input_shape,
|
||||
const uint8_t* input_data, const RuntimeShape& filter_shape,
|
||||
const uint8_t* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32_t* bias_data, const RuntimeShape& output_shape,
|
||||
uint8_t* output_data) {
|
||||
const int stride_width = params.stride_width;
|
||||
const int stride_height = params.stride_height;
|
||||
const int dilation_width_factor = params.dilation_width_factor;
|
||||
const int dilation_height_factor = params.dilation_height_factor;
|
||||
const int pad_width = params.padding_values.width;
|
||||
const int pad_height = params.padding_values.height;
|
||||
const int depth_multiplier = params.depth_multiplier;
|
||||
const int32_t output_activation_min = params.quantized_activation_min;
|
||||
const int32_t output_activation_max = params.quantized_activation_max;
|
||||
const int32_t input_offset = params.input_offset;
|
||||
const int32_t filter_offset = params.weights_offset;
|
||||
const int32_t output_offset = params.output_offset;
|
||||
const int32_t output_multiplier = params.output_multiplier;
|
||||
const int output_shift = params.output_shift;
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
|
||||
|
||||
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
|
||||
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
|
||||
const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
|
||||
const int input_height = input_shape.Dims(1);
|
||||
const int input_width = input_shape.Dims(2);
|
||||
const int input_depth = input_shape.Dims(3);
|
||||
const int filter_height = filter_shape.Dims(1);
|
||||
const int filter_width = filter_shape.Dims(2);
|
||||
const int output_height = output_shape.Dims(1);
|
||||
const int output_width = output_shape.Dims(2);
|
||||
TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
|
||||
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
|
||||
|
||||
for (int b = 0; b < batches; ++b) {
|
||||
for (int out_y = 0; out_y < output_height; ++out_y) {
|
||||
for (int out_x = 0; out_x < output_width; ++out_x) {
|
||||
for (int ic = 0; ic < input_depth; ++ic) {
|
||||
for (int m = 0; m < depth_multiplier; m++) {
|
||||
const int oc = m + ic * depth_multiplier;
|
||||
const int in_x_origin = (out_x * stride_width) - pad_width;
|
||||
const int in_y_origin = (out_y * stride_height) - pad_height;
|
||||
int32_t acc = 0;
|
||||
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
|
||||
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
|
||||
const int in_x =
|
||||
in_x_origin + dilation_width_factor * filter_x;
|
||||
const int in_y =
|
||||
in_y_origin + dilation_height_factor * filter_y;
|
||||
// If the location is outside the bounds of the input image,
|
||||
// use zero as a default value.
|
||||
if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
|
||||
(in_y < input_height)) {
|
||||
int32_t input_val =
|
||||
input_data[Offset(input_shape, b, in_y, in_x, ic)];
|
||||
int32_t filter_val = filter_data[Offset(
|
||||
filter_shape, 0, filter_y, filter_x, oc)];
|
||||
acc += (filter_val + filter_offset) *
|
||||
(input_val + input_offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (bias_data) {
|
||||
acc += bias_data[oc];
|
||||
}
|
||||
acc = DepthwiseConvRound<output_rounding>(acc, output_multiplier,
|
||||
output_shift);
|
||||
acc += output_offset;
|
||||
acc = std::max(acc, output_activation_min);
|
||||
acc = std::min(acc, output_activation_max);
|
||||
output_data[Offset(output_shape, b, out_y, out_x, oc)] =
|
||||
static_cast<uint8_t>(acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO(b/148596273): Reconcile reference versions, perhaps with common
|
||||
// MultiplyByQuantizedMultiplier or DepthwiseConvRound function.
|
||||
static inline void RunPerChannel(
|
||||
const DepthwiseParams& params, const RuntimeShape& input_shape,
|
||||
const int8_t* input_data, const RuntimeShape& filter_shape,
|
||||
const int8_t* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32_t* bias_data, const RuntimeShape& output_shape,
|
||||
int8_t* output_data) {
|
||||
// Get parameters.
|
||||
// TODO(b/141565753): Re-introduce ScopedProfilingLabel on Micro.
|
||||
const int stride_width = params.stride_width;
|
||||
const int stride_height = params.stride_height;
|
||||
const int dilation_width_factor = params.dilation_width_factor;
|
||||
const int dilation_height_factor = params.dilation_height_factor;
|
||||
const int pad_width = params.padding_values.width;
|
||||
const int pad_height = params.padding_values.height;
|
||||
const int depth_multiplier = params.depth_multiplier;
|
||||
const int32_t input_offset = params.input_offset;
|
||||
const int32_t output_offset = params.output_offset;
|
||||
const int32_t output_activation_min = params.quantized_activation_min;
|
||||
const int32_t output_activation_max = params.quantized_activation_max;
|
||||
const int32_t* output_multiplier = params.output_multiplier_per_channel;
|
||||
const int32_t* output_shift = params.output_shift_per_channel;
|
||||
|
||||
// Check dimensions of the tensors.
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
|
||||
|
||||
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
|
||||
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
|
||||
const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
|
||||
const int input_height = input_shape.Dims(1);
|
||||
const int input_width = input_shape.Dims(2);
|
||||
const int input_depth = input_shape.Dims(3);
|
||||
const int filter_height = filter_shape.Dims(1);
|
||||
const int filter_width = filter_shape.Dims(2);
|
||||
const int output_height = output_shape.Dims(1);
|
||||
const int output_width = output_shape.Dims(2);
|
||||
TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
|
||||
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
|
||||
|
||||
for (int batch = 0; batch < batches; ++batch) {
|
||||
for (int out_y = 0; out_y < output_height; ++out_y) {
|
||||
for (int out_x = 0; out_x < output_width; ++out_x) {
|
||||
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
|
||||
for (int m = 0; m < depth_multiplier; ++m) {
|
||||
const int output_channel = m + in_channel * depth_multiplier;
|
||||
const int in_x_origin = (out_x * stride_width) - pad_width;
|
||||
const int in_y_origin = (out_y * stride_height) - pad_height;
|
||||
int32_t acc = 0;
|
||||
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
|
||||
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
|
||||
const int in_x =
|
||||
in_x_origin + dilation_width_factor * filter_x;
|
||||
const int in_y =
|
||||
in_y_origin + dilation_height_factor * filter_y;
|
||||
// Zero padding by omitting the areas outside the image.
|
||||
const bool is_point_inside_image =
|
||||
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
|
||||
(in_y < input_height);
|
||||
if (is_point_inside_image) {
|
||||
int32_t input_val = input_data[Offset(
|
||||
input_shape, batch, in_y, in_x, in_channel)];
|
||||
int32_t filter_val = filter_data[Offset(
|
||||
filter_shape, 0, filter_y, filter_x, output_channel)];
|
||||
// Accumulate with 32 bits accumulator.
|
||||
// In the nudging process during model quantization, we
|
||||
// force real value of 0.0 be represented by a quantized
|
||||
// value. This guarantees that the input_offset is a int8_t,
|
||||
// even though it is represented using int32_t. int32_t +=
|
||||
// int8_t
|
||||
// * (int8_t - int8_t) so the highest value we can get from
|
||||
// each accumulation is [-127, 127] * ([-128, 127] -
|
||||
// [-128, 127]), which is [-32512, 32512]. log2(32512)
|
||||
// = 14.98, which means we can accumulate at least 2^16
|
||||
// multiplications without overflow. The accumulator is
|
||||
// applied to a filter so the accumulation logic will hold
|
||||
// as long as the filter size (filter_y * filter_x *
|
||||
// in_channel) does not exceed 2^16, which is the case in
|
||||
// all the models we have seen so far.
|
||||
acc += filter_val * (input_val + input_offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (bias_data) {
|
||||
acc += bias_data[output_channel];
|
||||
}
|
||||
acc = DepthwiseConvRound<output_rounding>(
|
||||
acc, output_multiplier[output_channel],
|
||||
output_shift[output_channel]);
|
||||
acc += output_offset;
|
||||
acc = std::max(acc, output_activation_min);
|
||||
acc = std::min(acc, output_activation_max);
|
||||
output_data[Offset(output_shape, batch, out_y, out_x,
|
||||
output_channel)] = static_cast<int8_t>(acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace depthwise_conv
|
||||
|
||||
inline void DepthwiseConv(
|
||||
const DepthwiseParams& params, const RuntimeShape& input_shape,
|
||||
const uint8_t* input_data, const RuntimeShape& filter_shape,
|
||||
const uint8_t* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32_t* bias_data, const RuntimeShape& output_shape,
|
||||
uint8_t* output_data) {
|
||||
return depthwise_conv::DepthwiseConvBasicKernel<
|
||||
DepthwiseConvOutputRounding::kAwayFromZero>::Run(params, input_shape,
|
||||
input_data, filter_shape,
|
||||
filter_data, bias_shape,
|
||||
bias_data, output_shape,
|
||||
output_data);
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // end namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
|
||||
@@ -0,0 +1,78 @@
|
||||
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEQUANTIZE_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEQUANTIZE_H_
|
||||
|
||||
#include <limits.h>
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
namespace reference_ops {
|
||||
|
||||
// Dequantizes into a float without rounding.
|
||||
template <typename InputT, typename OutputT>
|
||||
inline void Dequantize(const tflite::DequantizationParams& op_params,
|
||||
const RuntimeShape& input_shape,
|
||||
const InputT* input_data,
|
||||
const RuntimeShape& output_shape, OutputT* output_data) {
|
||||
int32_t zero_point = op_params.zero_point;
|
||||
const double scale = op_params.scale;
|
||||
const int flat_size = MatchingFlatSize(input_shape, output_shape);
|
||||
|
||||
for (int i = 0; i < flat_size; i++) {
|
||||
const int32_t val = input_data[i];
|
||||
const OutputT result = static_cast<OutputT>(scale * (val - zero_point));
|
||||
output_data[i] = result;
|
||||
}
|
||||
}
|
||||
|
||||
// Dequantizes per-channel quantized tensor to float.
|
||||
template <typename T>
|
||||
inline void PerChannelDequantize(
|
||||
const tflite::PerChannelDequantizationParams& op_params,
|
||||
const RuntimeShape& input_shape, const T* input_data,
|
||||
const RuntimeShape& output_shape, float* output_data) {
|
||||
// Ensure flat size is same.
|
||||
MatchingFlatSize(input_shape, output_shape);
|
||||
|
||||
const int32_t* zero_point = op_params.zero_point;
|
||||
const float* scale = op_params.scale;
|
||||
const int32_t quantized_dimension = op_params.quantized_dimension;
|
||||
const int32_t num_dims = input_shape.DimensionsCount();
|
||||
const int32_t* dims_data = input_shape.DimsData();
|
||||
std::vector<int> current_dim(num_dims, 0);
|
||||
|
||||
do {
|
||||
size_t offset =
|
||||
ReducedOutputOffset(num_dims, reinterpret_cast<const int*>(dims_data),
|
||||
current_dim.data(), 0, nullptr);
|
||||
const int channel = current_dim[quantized_dimension];
|
||||
const int32_t val = input_data[offset];
|
||||
const float result =
|
||||
static_cast<float>(scale[channel] * (val - zero_point[channel]));
|
||||
output_data[offset] = result;
|
||||
} while (NextIndex(num_dims, reinterpret_cast<const int*>(dims_data),
|
||||
current_dim.data()));
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
|
||||
} // namespace tflite
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEQUANTIZE_H_
|
||||
@@ -0,0 +1,37 @@
|
||||
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ELU_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ELU_H_
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/cppmath.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
namespace reference_ops {
|
||||
|
||||
inline void Elu(const RuntimeShape& input_shape, const float* input_data,
|
||||
const RuntimeShape& output_shape, float* output_data) {
|
||||
const int flat_size = MatchingFlatSize(input_shape, output_shape);
|
||||
for (int i = 0; i < flat_size; ++i) {
|
||||
const float val = input_data[i];
|
||||
output_data[i] = val < 0.0f ? TfLiteExpm1(val) : val;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ELU_H_
|
||||
@@ -0,0 +1,38 @@
|
||||
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_EXP_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_EXP_H_
|
||||
|
||||
#include <cmath>
|
||||
|
||||
#include "ruy/profiler/instrumentation.h" // from @ruy
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_ops {
|
||||
|
||||
template <typename T>
|
||||
inline void Exp(const T* input_data, const size_t num_elements,
|
||||
T* output_data) {
|
||||
ruy::profiler::ScopeLabel label("Exp");
|
||||
for (size_t idx = 0; idx < num_elements; ++idx) {
|
||||
output_data[idx] = std::exp(input_data[idx]);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_EXP_H_
|
||||
@@ -0,0 +1,38 @@
|
||||
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FILL_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FILL_H_
|
||||
|
||||
#include <cmath>
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_ops {
|
||||
|
||||
template <typename T>
|
||||
void Fill(const RuntimeShape& value_shape, const T* value_data,
|
||||
const RuntimeShape& output_shape, T* output_data) {
|
||||
TFLITE_DCHECK_EQ(value_shape.DimensionsCount(), 0);
|
||||
const int flat_size = output_shape.FlatSize();
|
||||
for (int i = 0; i < flat_size; ++i) {
|
||||
output_data[i] = *value_data;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FILL_H_
|
||||
@@ -0,0 +1,39 @@
|
||||
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FLOOR_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FLOOR_H_
|
||||
|
||||
#include <cmath>
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
namespace reference_ops {
|
||||
|
||||
inline void Floor(const RuntimeShape& input_shape, const float* input_data,
|
||||
const RuntimeShape& output_shape, float* output_data) {
|
||||
const int flat_size = MatchingFlatSize(input_shape, output_shape);
|
||||
|
||||
for (int i = 0; i < flat_size; i++) {
|
||||
int offset = i;
|
||||
output_data[offset] = std::floor(input_data[offset]);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FLOOR_H_
|
||||
@@ -0,0 +1,35 @@
|
||||
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FLOOR_DIV_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FLOOR_DIV_H_
|
||||
|
||||
#include <cmath>
|
||||
#include <functional>
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_ops {
|
||||
|
||||
template <typename T>
|
||||
T FloorDiv(T input1, T input2) {
|
||||
return std::floor(std::divides<double>()(static_cast<double>(input1),
|
||||
static_cast<double>(input2)));
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FLOOR_DIV_H_
|
||||
@@ -0,0 +1,44 @@
|
||||
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FLOOR_MOD_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FLOOR_MOD_H_
|
||||
|
||||
#include <cmath>
|
||||
#include <functional>
|
||||
|
||||
namespace tflite {
|
||||
|
||||
namespace reference_ops {
|
||||
|
||||
template <typename T>
|
||||
T FloorMod(T input1, T input2) {
|
||||
struct FloatMod {
|
||||
float operator()(const float lhs, const float rhs) const {
|
||||
return std::fmod(lhs, rhs);
|
||||
}
|
||||
};
|
||||
using ModFunc = typename std::conditional<std::is_integral<T>::value,
|
||||
std::modulus<T>, FloatMod>::type;
|
||||
ModFunc mod_func;
|
||||
T trunc_mod = mod_func(input1, input2);
|
||||
return (trunc_mod != 0) && ((input2 < 0) != (trunc_mod < 0))
|
||||
? (trunc_mod + input2)
|
||||
: trunc_mod;
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FLOOR_MOD_H_
|
||||
@@ -0,0 +1,321 @@
|
||||
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_
|
||||
|
||||
#include "ruy/profiler/instrumentation.h" // from @ruy
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/cppmath.h"
|
||||
#include "tensorflow/lite/kernels/internal/quantization_util.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_ops {
|
||||
|
||||
inline void FullyConnected(
|
||||
const FullyConnectedParams& params, const RuntimeShape& input_shape,
|
||||
const float* input_data, const RuntimeShape& weights_shape,
|
||||
const float* weights_data, const RuntimeShape& bias_shape,
|
||||
const float* bias_data, const RuntimeShape& output_shape,
|
||||
float* output_data) {
|
||||
const float output_activation_min = params.float_activation_min;
|
||||
const float output_activation_max = params.float_activation_max;
|
||||
// TODO(b/62193649): This really should be:
|
||||
// const int batches = ArraySize(output_dims, 1);
|
||||
// but the current --variable_batch hack consists in overwriting the 3rd
|
||||
// dimension with the runtime batch size, as we don't keep track for each
|
||||
// array of which dimension is the batch dimension in it.
|
||||
const int output_dims_count = output_shape.DimensionsCount();
|
||||
const int weights_dims_count = weights_shape.DimensionsCount();
|
||||
const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1);
|
||||
const int output_depth = MatchingDim(weights_shape, weights_dims_count - 2,
|
||||
output_shape, output_dims_count - 1);
|
||||
const int accum_depth = weights_shape.Dims(weights_dims_count - 1);
|
||||
for (int b = 0; b < batches; ++b) {
|
||||
for (int out_c = 0; out_c < output_depth; ++out_c) {
|
||||
float total = 0.f;
|
||||
for (int d = 0; d < accum_depth; ++d) {
|
||||
total += input_data[b * accum_depth + d] *
|
||||
weights_data[out_c * accum_depth + d];
|
||||
}
|
||||
float bias_value = 0.0f;
|
||||
if (bias_data) {
|
||||
bias_value = bias_data[out_c];
|
||||
}
|
||||
output_data[out_c + output_depth * b] = ActivationFunctionWithMinMax(
|
||||
total + bias_value, output_activation_min, output_activation_max);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void FullyConnected(
|
||||
const FullyConnectedParams& params, const RuntimeShape& input_shape,
|
||||
const uint8_t* input_data, const RuntimeShape& filter_shape,
|
||||
const uint8_t* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32_t* bias_data, const RuntimeShape& output_shape,
|
||||
uint8_t* output_data) {
|
||||
const int32_t input_offset = params.input_offset;
|
||||
const int32_t filter_offset = params.weights_offset;
|
||||
const int32_t output_offset = params.output_offset;
|
||||
const int32_t output_multiplier = params.output_multiplier;
|
||||
const int output_shift = params.output_shift;
|
||||
const int32_t output_activation_min = params.quantized_activation_min;
|
||||
const int32_t output_activation_max = params.quantized_activation_max;
|
||||
TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
|
||||
TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
|
||||
|
||||
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
|
||||
// TODO(b/62193649): This really should be:
|
||||
// const int batches = ArraySize(output_dims, 1);
|
||||
// but the current --variable_batch hack consists in overwriting the 3rd
|
||||
// dimension with the runtime batch size, as we don't keep track for each
|
||||
// array of which dimension is the batch dimension in it.
|
||||
const int output_dim_count = output_shape.DimensionsCount();
|
||||
const int filter_dim_count = filter_shape.DimensionsCount();
|
||||
const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
|
||||
const int output_depth = MatchingDim(filter_shape, filter_dim_count - 2,
|
||||
output_shape, output_dim_count - 1);
|
||||
const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
|
||||
for (int b = 0; b < batches; ++b) {
|
||||
for (int out_c = 0; out_c < output_depth; ++out_c) {
|
||||
int32_t acc = 0;
|
||||
for (int d = 0; d < accum_depth; ++d) {
|
||||
int32_t input_val = input_data[b * accum_depth + d];
|
||||
int32_t filter_val = filter_data[out_c * accum_depth + d];
|
||||
acc += (filter_val + filter_offset) * (input_val + input_offset);
|
||||
}
|
||||
if (bias_data) {
|
||||
acc += bias_data[out_c];
|
||||
}
|
||||
acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
|
||||
acc += output_offset;
|
||||
acc = std::max(acc, output_activation_min);
|
||||
acc = std::min(acc, output_activation_max);
|
||||
output_data[out_c + output_depth * b] = static_cast<uint8_t>(acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void FullyConnected(
|
||||
const FullyConnectedParams& params, const RuntimeShape& input_shape,
|
||||
const uint8_t* input_data, const RuntimeShape& filter_shape,
|
||||
const uint8_t* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32_t* bias_data, const RuntimeShape& output_shape,
|
||||
int16_t* output_data) {
|
||||
const int32_t input_offset = params.input_offset;
|
||||
const int32_t filter_offset = params.weights_offset;
|
||||
const int32_t output_offset = params.output_offset;
|
||||
const int32_t output_multiplier = params.output_multiplier;
|
||||
const int output_shift = params.output_shift;
|
||||
const int32_t output_activation_min = params.quantized_activation_min;
|
||||
const int32_t output_activation_max = params.quantized_activation_max;
|
||||
|
||||
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
|
||||
TFLITE_DCHECK_EQ(output_offset, 0);
|
||||
// TODO(b/62193649): This really should be:
|
||||
// const int batches = ArraySize(output_dims, 1);
|
||||
// but the current --variable_batch hack consists in overwriting the 3rd
|
||||
// dimension with the runtime batch size, as we don't keep track for each
|
||||
// array of which dimension is the batch dimension in it.
|
||||
const int output_dim_count = output_shape.DimensionsCount();
|
||||
const int filter_dim_count = filter_shape.DimensionsCount();
|
||||
const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
|
||||
const int output_depth = MatchingDim(filter_shape, filter_dim_count - 2,
|
||||
output_shape, output_dim_count - 1);
|
||||
const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
|
||||
for (int b = 0; b < batches; ++b) {
|
||||
for (int out_c = 0; out_c < output_depth; ++out_c) {
|
||||
// Internal accumulation.
|
||||
// Initialize accumulator with the bias-value.
|
||||
int32_t accum = bias_data[out_c];
|
||||
// Accumulation loop.
|
||||
for (int d = 0; d < accum_depth; ++d) {
|
||||
int16_t input_val = input_data[b * accum_depth + d] + input_offset;
|
||||
int16_t filter_val =
|
||||
filter_data[out_c * accum_depth + d] + filter_offset;
|
||||
accum += filter_val * input_val;
|
||||
}
|
||||
// Down-scale the final int32_t accumulator to the scale used by our
|
||||
// (16-bit, typically 3 integer bits) fixed-point format. The quantized
|
||||
// multiplier and shift here have been pre-computed offline
|
||||
// (e.g. by toco).
|
||||
accum =
|
||||
MultiplyByQuantizedMultiplier(accum, output_multiplier, output_shift);
|
||||
// Saturate, cast to int16_t, and store to output array.
|
||||
accum = std::max(accum, output_activation_min - output_offset);
|
||||
accum = std::min(accum, output_activation_max - output_offset);
|
||||
accum += output_offset;
|
||||
output_data[out_c + output_depth * b] = accum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void ShuffledFullyConnected(
|
||||
const FullyConnectedParams& params, const RuntimeShape& input_shape,
|
||||
const uint8_t* input_data, const RuntimeShape& weights_shape,
|
||||
const uint8_t* shuffled_weights_data, const RuntimeShape& bias_shape,
|
||||
const int32_t* bias_data, const RuntimeShape& output_shape,
|
||||
int16_t* output_data, uint8_t* shuffled_input_workspace_data) {
|
||||
const int32_t output_multiplier = params.output_multiplier;
|
||||
const int output_shift = params.output_shift;
|
||||
const int32_t output_activation_min = params.quantized_activation_min;
|
||||
const int32_t output_activation_max = params.quantized_activation_max;
|
||||
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
|
||||
|
||||
TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1);
|
||||
TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
|
||||
TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
|
||||
// TODO(b/62193649): This really should be:
|
||||
// const int batches = ArraySize(output_dims, 1);
|
||||
// but the current --variable_batch hack consists in overwriting the 3rd
|
||||
// dimension with the runtime batch size, as we don't keep track for each
|
||||
// array of which dimension is the batch dimension in it.
|
||||
const int output_dim_count = output_shape.DimensionsCount();
|
||||
const int weights_dim_count = weights_shape.DimensionsCount();
|
||||
const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
|
||||
const int output_depth = MatchingDim(weights_shape, weights_dim_count - 2,
|
||||
output_shape, output_dim_count - 1);
|
||||
const int accum_depth = weights_shape.Dims(weights_dim_count - 1);
|
||||
TFLITE_DCHECK((accum_depth % 16) == 0);
|
||||
TFLITE_DCHECK((output_depth % 4) == 0);
|
||||
|
||||
// Shuffling and xoring of input activations into the workspace buffer
|
||||
uint8_t* shuffled_input_workspace_ptr = shuffled_input_workspace_data;
|
||||
if (batches == 1) {
|
||||
for (int i = 0; i < accum_depth; i++) {
|
||||
shuffled_input_workspace_data[i] = input_data[i] ^ 0x80;
|
||||
}
|
||||
} else if (batches == 4) {
|
||||
for (int c = 0; c < accum_depth; c += 16) {
|
||||
for (int b = 0; b < 4; b++) {
|
||||
const uint8_t* src_data_ptr = input_data + b * accum_depth + c;
|
||||
for (int j = 0; j < 16; j++) {
|
||||
uint8_t src_val = *src_data_ptr++;
|
||||
// Flip the sign bit, so that the kernel will only need to
|
||||
// reinterpret these uint8_t values as int8_t, getting for free the
|
||||
// subtraction of the zero_point value 128.
|
||||
uint8_t dst_val = src_val ^ 0x80;
|
||||
*shuffled_input_workspace_ptr++ = dst_val;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
TFLITE_DCHECK(false);
|
||||
return;
|
||||
}
|
||||
|
||||
// Actual computation
|
||||
if (batches == 1) {
|
||||
int16_t* output_ptr = output_data;
|
||||
// Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
|
||||
// so that just reinterpreting them as int8_t values is equivalent to
|
||||
// subtracting 128 from them, thus implementing for free the subtraction of
|
||||
// the zero_point value 128.
|
||||
const int8_t* shuffled_weights_ptr =
|
||||
reinterpret_cast<const int8_t*>(shuffled_weights_data);
|
||||
// Likewise, we preshuffled and pre-xored the input data above.
|
||||
const int8_t* shuffled_input_data =
|
||||
reinterpret_cast<const int8_t*>(shuffled_input_workspace_data);
|
||||
for (int c = 0; c < output_depth; c += 4) {
|
||||
// Internal accumulation.
|
||||
// Initialize accumulator with the bias-value.
|
||||
int32_t accum[4] = {0};
|
||||
// Accumulation loop.
|
||||
for (int d = 0; d < accum_depth; d += 16) {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
for (int j = 0; j < 16; j++) {
|
||||
int8_t input_val = shuffled_input_data[d + j];
|
||||
int8_t weights_val = *shuffled_weights_ptr++;
|
||||
accum[i] += weights_val * input_val;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < 4; i++) {
|
||||
// Add bias value
|
||||
int32_t acc = accum[i] + bias_data[c + i];
|
||||
// Down-scale the final int32_t accumulator to the scale used by our
|
||||
// (16-bit, typically 3 integer bits) fixed-point format. The quantized
|
||||
// multiplier and shift here have been pre-computed offline
|
||||
// (e.g. by toco).
|
||||
acc =
|
||||
MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
|
||||
// Saturate, cast to int16_t, and store to output array.
|
||||
acc = std::max(acc, output_activation_min);
|
||||
acc = std::min(acc, output_activation_max);
|
||||
output_ptr[c + i] = acc;
|
||||
}
|
||||
}
|
||||
} else if (batches == 4) {
|
||||
int16_t* output_ptr = output_data;
|
||||
// Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
|
||||
// so that just reinterpreting them as int8_t values is equivalent to
|
||||
// subtracting 128 from them, thus implementing for free the subtraction of
|
||||
// the zero_point value 128.
|
||||
const int8_t* shuffled_weights_ptr =
|
||||
reinterpret_cast<const int8_t*>(shuffled_weights_data);
|
||||
// Likewise, we preshuffled and pre-xored the input data above.
|
||||
const int8_t* shuffled_input_data =
|
||||
reinterpret_cast<const int8_t*>(shuffled_input_workspace_data);
|
||||
for (int c = 0; c < output_depth; c += 4) {
|
||||
const int8_t* shuffled_input_ptr = shuffled_input_data;
|
||||
// Accumulation loop.
|
||||
// Internal accumulation.
|
||||
// Initialize accumulator with the bias-value.
|
||||
int32_t accum[4][4];
|
||||
for (int i = 0; i < 4; i++) {
|
||||
for (int b = 0; b < 4; b++) {
|
||||
accum[i][b] = 0;
|
||||
}
|
||||
}
|
||||
for (int d = 0; d < accum_depth; d += 16) {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
for (int b = 0; b < 4; b++) {
|
||||
for (int j = 0; j < 16; j++) {
|
||||
int8_t input_val = shuffled_input_ptr[16 * b + j];
|
||||
int8_t weights_val = shuffled_weights_ptr[16 * i + j];
|
||||
accum[i][b] += weights_val * input_val;
|
||||
}
|
||||
}
|
||||
}
|
||||
shuffled_input_ptr += 64;
|
||||
shuffled_weights_ptr += 64;
|
||||
}
|
||||
for (int i = 0; i < 4; i++) {
|
||||
for (int b = 0; b < 4; b++) {
|
||||
// Add bias value
|
||||
int32_t acc = accum[i][b] + bias_data[c + i];
|
||||
// Down-scale the final int32_t accumulator to the scale used by our
|
||||
// (16-bit, typically 3 integer bits) fixed-point format. The
|
||||
// quantized multiplier and shift here have been pre-computed offline
|
||||
// (e.g. by toco).
|
||||
acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
|
||||
output_shift);
|
||||
// Saturate, cast to int16_t, and store to output array.
|
||||
acc = std::max(acc, output_activation_min);
|
||||
acc = std::min(acc, output_activation_max);
|
||||
output_ptr[b * output_depth + c + i] = acc;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
TFLITE_DCHECK(false);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_
|
||||
@@ -0,0 +1,166 @@
|
||||
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ACTIVATIONS_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ACTIVATIONS_H_
|
||||
|
||||
#include "ruy/profiler/instrumentation.h" // from @ruy
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_ops {
|
||||
|
||||
inline int16_t SaturatingLeftShift(int16_t value, int amount) {
|
||||
int32_t result = static_cast<int32_t>(value) * (1 << amount);
|
||||
result = std::min<int32_t>(result, std::numeric_limits<int16_t>::max());
|
||||
result = std::max<int32_t>(result, std::numeric_limits<int16_t>::min());
|
||||
return result;
|
||||
}
|
||||
|
||||
// Similar to ARM instruction SQDMULH.
|
||||
// Similar to gemmlowp::SaturatingRoundingDoublingHighMul except
|
||||
// rounding to zero instead of to nearest (SQRDMULH).
|
||||
inline std::int16_t SaturatingDoublingHighMul(std::int16_t a, std::int16_t b) {
|
||||
bool overflow = a == b && a == std::numeric_limits<std::int16_t>::min();
|
||||
std::int32_t a_32(a);
|
||||
std::int32_t b_32(b);
|
||||
std::int32_t ab_32 = a_32 * b_32;
|
||||
std::int16_t ab_x2_high16 = static_cast<std::int16_t>((ab_32) / (1 << 15));
|
||||
return overflow ? std::numeric_limits<std::int16_t>::max() : ab_x2_high16;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void HardSwish(const RuntimeShape& input_shape, const T* input_data,
|
||||
const RuntimeShape& output_shape, T* output_data) {
|
||||
ruy::profiler::ScopeLabel label("ReferenceHardSwish/Float");
|
||||
auto matching_size = MatchingFlatSize(input_shape, output_shape);
|
||||
const T* in_end = input_data + matching_size;
|
||||
for (; input_data < in_end; input_data++, output_data++) {
|
||||
const float in = *input_data;
|
||||
*output_data =
|
||||
in * std::min(static_cast<T>(6), std::max(static_cast<T>(0), in + 3)) /
|
||||
6;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void HardSwish(const HardSwishParams& params,
|
||||
const RuntimeShape& input_shape, const T* input_data,
|
||||
const RuntimeShape& output_shape, T* output_data) {
|
||||
ruy::profiler::ScopeLabel label("ReferenceHardSwish/Quantized");
|
||||
|
||||
const int flat_size = MatchingFlatSize(input_shape, output_shape);
|
||||
|
||||
for (int i = 0; i < flat_size; i++) {
|
||||
const int16_t input_value = input_data[i] - params.input_zero_point;
|
||||
// Left-shift as much as we can without overflow/saturation to put
|
||||
// significant bits in the high bits of our 16-bit fixedpoint values, so
|
||||
// that fixed-point approximate computations below are as accurate as
|
||||
// possible.
|
||||
const int16_t input_value_on_hires_input_scale = input_value * (1 << 7);
|
||||
// Compute the input value on essentially the output scale, just not
|
||||
// right-shifted yet. This is the value that we'll use in the (x >= +3)
|
||||
// case, and that in the general case we'll multiply against the "relu-ish"
|
||||
// fixed-point multiplier in [0, 1].
|
||||
const int16_t input_value_on_preshift_output_scale =
|
||||
gemmlowp::SaturatingRoundingDoublingHighMul(
|
||||
input_value_on_hires_input_scale,
|
||||
params.output_multiplier_fixedpoint_int16);
|
||||
// Now compute the "relu-ish multiplier". In the (-3 <= x <= +3) case, that
|
||||
// is just an affine rescaling of x from [-3, 3] to [0, 1]. In the general
|
||||
// case, it is just that plus saturation at the boundaries of [-3, 3].
|
||||
// First, we rescale from [-3, 3] to [-1, 1], saturating.
|
||||
// That is done by rescaling the input value with a fixed-point multiplier
|
||||
// (reluish_multiplier_fixedpoint) and bit-shift such that we represent
|
||||
// that input value on the scale where the real value 3.0f is represented
|
||||
// by the quantized value 32768. (+32768 is actually not representable as
|
||||
// int16_t, so this saturates at +32767, and that is seen empirically to be
|
||||
// a negligible contribution to numerical error/bias).
|
||||
//
|
||||
// This code is careful to correctly implement any magnitude of multiplier,
|
||||
// involving either a right shift or a left shift, with correct saturation
|
||||
// behavior in the left-shift case. This forces this code to be more
|
||||
// complicated, but is necessary for real applications: a partially
|
||||
// trained quantized MobileNet v3-small model that motivated this code
|
||||
// exhibits some large [min, max] range boundaries, of the order of
|
||||
// magnitude of 10 or 100 depending on layers.
|
||||
//
|
||||
// The next few lines are basically just an ordinary
|
||||
// MultiplyByQuantizedMultiplier, except that we are more careful here
|
||||
// about the fine details of saturation when left-shifting, because here
|
||||
// overflow in left-shift is a common case, not an anomaly as
|
||||
// MultiplyByQuantizedMultiplier assumes.
|
||||
int16_t reluish_value = input_value_on_hires_input_scale;
|
||||
// Shift left, saturating, as much as we can while ensuring that this
|
||||
// saturation will not contribute to the result. That is, left shift amount
|
||||
// reduced by 1.
|
||||
if (params.reluish_multiplier_exponent > 0) {
|
||||
reluish_value = SaturatingLeftShift(
|
||||
reluish_value, params.reluish_multiplier_exponent - 1);
|
||||
}
|
||||
// Apply the fixed-point multiplier, dividing the value by a divisor
|
||||
// ranging in [1, 2].
|
||||
reluish_value = gemmlowp::SaturatingRoundingDoublingHighMul(
|
||||
reluish_value, params.reluish_multiplier_fixedpoint_int16);
|
||||
// Apply the last bit of left-shift. Thus, in the left-shifting case, if
|
||||
// any saturation affects the result, it is happening here --- any
|
||||
// saturation having occurred above is overwritten here, not affecting the
|
||||
// result.
|
||||
if (params.reluish_multiplier_exponent > 0) {
|
||||
reluish_value = SaturatingLeftShift(reluish_value, 1);
|
||||
}
|
||||
// Shift right, in the right-shifting case.
|
||||
if (params.reluish_multiplier_exponent < 0) {
|
||||
reluish_value = gemmlowp::RoundingDivideByPOT(
|
||||
reluish_value, -params.reluish_multiplier_exponent);
|
||||
}
|
||||
// At this point we have rescaled the value into a 16bit fixedpoint
|
||||
// reluish_value in [-1, 1].
|
||||
// We now convert that to a 16bit fixedpoint value in [0, 1].
|
||||
reluish_value = (reluish_value + (1 << 15)) >> 1;
|
||||
// Use of SaturatingDoublingHighMul here is important to cancel the biases
|
||||
// from the above SaturatingRoundingDoublingHighMul.
|
||||
//
|
||||
// On a partially trained MobileNet-v3-small,
|
||||
//
|
||||
// | bias on | ImageNet
|
||||
// | quantized | Top-1
|
||||
// Operation used here | values | accuracy (50k)
|
||||
// --------------------------------------+------------+-----------
|
||||
// SaturatingDoublingHighMul | -0.0024 | 58.920
|
||||
// SaturatingRoundingDoublingHighMul | -0.0067 | 58.064
|
||||
//
|
||||
// In activations_test, this is covered by this testcase:
|
||||
// QuantizedActivationsOpTest.HardSwishBias
|
||||
//
|
||||
const int16_t preshift_output_value = SaturatingDoublingHighMul(
|
||||
reluish_value, input_value_on_preshift_output_scale);
|
||||
// We were so far operating on the pre-shift output scale. Now we finally
|
||||
// apply that output shift, arriving at the final output scale.
|
||||
int16_t output_value = gemmlowp::RoundingDivideByPOT(
|
||||
preshift_output_value, -params.output_multiplier_exponent);
|
||||
output_value += params.output_zero_point;
|
||||
output_value =
|
||||
std::min<int16_t>(output_value, std::numeric_limits<T>::max());
|
||||
output_value =
|
||||
std::max<int16_t>(output_value, std::numeric_limits<T>::min());
|
||||
output_data[i] = output_value;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CONV_H_
|
||||
@@ -0,0 +1,144 @@
|
||||
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_ADD_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_ADD_H_
|
||||
|
||||
#include <limits>
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_integer_ops {
|
||||
|
||||
inline void CheckArithmeticParams(const ArithmeticParams& params) {
|
||||
TFLITE_DCHECK_LE(params.quantized_activation_min,
|
||||
params.quantized_activation_max);
|
||||
// Input offset is negative input zero point. Activation tensors are
|
||||
// asymmetric quantized so they span the full int8 range.
|
||||
TFLITE_DCHECK_GE(-params.input1_offset, std::numeric_limits<int8_t>::min());
|
||||
TFLITE_DCHECK_GE(-params.input2_offset, std::numeric_limits<int8_t>::min());
|
||||
TFLITE_DCHECK_LE(-params.input1_offset, std::numeric_limits<int8_t>::max());
|
||||
TFLITE_DCHECK_LE(-params.input2_offset, std::numeric_limits<int8_t>::max());
|
||||
}
|
||||
|
||||
inline void ElementWise(
|
||||
int size, const ArithmeticParams& params, const int8_t* input1_data,
|
||||
const int8_t* input2_data, int8_t* output_data,
|
||||
void (*check_arithmetic_params)(const ArithmeticParams&),
|
||||
int8_t (*binary_func)(int8_t, int8_t, const ArithmeticParams&)) {
|
||||
CheckArithmeticParams(params);
|
||||
for (int i = 0; i < size; ++i) {
|
||||
output_data[i] = binary_func(input1_data[i], input2_data[i], params);
|
||||
}
|
||||
}
|
||||
|
||||
inline void BroadcastBinaryFunction4DSlow(
|
||||
const ArithmeticParams& params, const RuntimeShape& input1_shape,
|
||||
const int8_t* input1_data, const RuntimeShape& input2_shape,
|
||||
const int8_t* input2_data, const RuntimeShape& output_shape,
|
||||
int8_t* output_data,
|
||||
void (*check_arithmetic_params)(const ArithmeticParams&),
|
||||
int8_t (*binary_func)(int8_t, int8_t, const ArithmeticParams&)) {
|
||||
NdArrayDesc<4> desc1;
|
||||
NdArrayDesc<4> desc2;
|
||||
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
|
||||
&desc2);
|
||||
const RuntimeShape extended_output_shape =
|
||||
RuntimeShape::ExtendedShape(4, output_shape);
|
||||
|
||||
// In Tensorflow, the dimensions are canonically named (batch_number, row,
|
||||
// col, channel), with extents (batches, height, width, depth), with the
|
||||
// trailing dimension changing most rapidly (channels has the smallest stride,
|
||||
// typically 1 element).
|
||||
//
|
||||
// In generated C code, we store arrays with the dimensions reversed. The
|
||||
// first dimension has smallest stride.
|
||||
//
|
||||
// We name our variables by their Tensorflow convention, but generate C code
|
||||
// nesting loops such that the innermost loop has the smallest stride for the
|
||||
// best cache behavior.
|
||||
for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
|
||||
for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
|
||||
for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
|
||||
for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
|
||||
output_data[Offset(extended_output_shape, b, y, x, c)] = binary_func(
|
||||
input1_data[SubscriptToIndex(desc1, b, y, x, c)],
|
||||
input2_data[SubscriptToIndex(desc2, b, y, x, c)], params);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline int8_t AddFunc(int8_t x, int8_t y, const ArithmeticParams& params) {
|
||||
const int32_t input1_val = params.input1_offset + x;
|
||||
const int32_t input2_val = params.input2_offset + y;
|
||||
const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
|
||||
const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
|
||||
const int32_t scaled_input1_val =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_input1_val, params.input1_multiplier, params.input1_shift);
|
||||
const int32_t scaled_input2_val =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_input2_val, params.input2_multiplier, params.input2_shift);
|
||||
const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
|
||||
const int32_t raw_output =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
raw_sum, params.output_multiplier, params.output_shift) +
|
||||
params.output_offset;
|
||||
const int32_t clamped_output =
|
||||
std::min(params.quantized_activation_max,
|
||||
std::max(params.quantized_activation_min, raw_output));
|
||||
return static_cast<int8_t>(clamped_output);
|
||||
}
|
||||
|
||||
// Element-wise add that can often be used for inner loop of broadcast add as
|
||||
// well as the non-broadcast add.
|
||||
inline void AddElementwise(int size, const ArithmeticParams& params,
|
||||
const int8_t* input1_data, const int8_t* input2_data,
|
||||
int8_t* output_data) {
|
||||
ElementWise(size, params, input1_data, input2_data, output_data,
|
||||
CheckArithmeticParams, AddFunc);
|
||||
}
|
||||
|
||||
inline void Add(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape, const int8_t* input1_data,
|
||||
const RuntimeShape& input2_shape, const int8_t* input2_data,
|
||||
const RuntimeShape& output_shape, int8_t* output_data) {
|
||||
CheckArithmeticParams(params);
|
||||
|
||||
const int flat_size =
|
||||
MatchingElementsSize(input1_shape, input2_shape, output_shape);
|
||||
|
||||
AddElementwise(flat_size, params, input1_data, input2_data, output_data);
|
||||
}
|
||||
|
||||
inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape,
|
||||
const int8_t* input1_data,
|
||||
const RuntimeShape& input2_shape,
|
||||
const int8_t* input2_data,
|
||||
const RuntimeShape& output_shape,
|
||||
int8_t* output_data) {
|
||||
BroadcastBinaryFunction4DSlow(params, input1_shape, input1_data, input2_shape,
|
||||
input2_data, output_shape, output_data,
|
||||
CheckArithmeticParams, AddFunc);
|
||||
}
|
||||
|
||||
} // namespace reference_integer_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_ADD_H_
|
||||
@@ -0,0 +1,221 @@
|
||||
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_CONV_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_CONV_H_
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_integer_ops {
|
||||
|
||||
// Fixed-point per-channel-quantization convolution reference kernel.
|
||||
inline void ConvPerChannel(
|
||||
const ConvParams& params, const int32_t* output_multiplier,
|
||||
const int32_t* output_shift, const RuntimeShape& input_shape,
|
||||
const int8_t* input_data, const RuntimeShape& filter_shape,
|
||||
const int8_t* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32_t* bias_data, const RuntimeShape& output_shape,
|
||||
int8_t* output_data) {
|
||||
// Get parameters.
|
||||
const int32_t input_offset = params.input_offset; // r = s(q - Z)
|
||||
const int stride_width = params.stride_width;
|
||||
const int stride_height = params.stride_height;
|
||||
const int dilation_width_factor = params.dilation_width_factor;
|
||||
const int dilation_height_factor = params.dilation_height_factor;
|
||||
const int pad_width = params.padding_values.width;
|
||||
const int pad_height = params.padding_values.height;
|
||||
const int32_t output_offset = params.output_offset;
|
||||
|
||||
// Set min and max value of the output.
|
||||
const int32_t output_activation_min = params.quantized_activation_min;
|
||||
const int32_t output_activation_max = params.quantized_activation_max;
|
||||
|
||||
// Consistency check.
|
||||
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
|
||||
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
|
||||
const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
|
||||
const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
|
||||
if (bias_data) {
|
||||
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
|
||||
}
|
||||
|
||||
// Check dimensions of the tensors.
|
||||
const int input_height = input_shape.Dims(1);
|
||||
const int input_width = input_shape.Dims(2);
|
||||
const int filter_height = filter_shape.Dims(1);
|
||||
const int filter_width = filter_shape.Dims(2);
|
||||
const int output_height = output_shape.Dims(1);
|
||||
const int output_width = output_shape.Dims(2);
|
||||
for (int batch = 0; batch < batches; ++batch) {
|
||||
for (int out_y = 0; out_y < output_height; ++out_y) {
|
||||
const int in_y_origin = (out_y * stride_height) - pad_height;
|
||||
for (int out_x = 0; out_x < output_width; ++out_x) {
|
||||
const int in_x_origin = (out_x * stride_width) - pad_width;
|
||||
for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
|
||||
int32_t acc = 0;
|
||||
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
|
||||
const int in_y = in_y_origin + dilation_height_factor * filter_y;
|
||||
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
|
||||
const int in_x = in_x_origin + dilation_width_factor * filter_x;
|
||||
|
||||
// Zero padding by omitting the areas outside the image.
|
||||
const bool is_point_inside_image =
|
||||
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
|
||||
(in_y < input_height);
|
||||
|
||||
if (!is_point_inside_image) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
|
||||
int32_t input_val = input_data[Offset(input_shape, batch, in_y,
|
||||
in_x, in_channel)];
|
||||
int32_t filter_val = filter_data[Offset(
|
||||
filter_shape, out_channel, filter_y, filter_x, in_channel)];
|
||||
// Accumulate with 32 bits accumulator.
|
||||
// In the nudging process during model quantization, we force
|
||||
// real value of 0.0 be represented by a quantized value. This
|
||||
// guarantees that the input_offset is a int8_t, even though
|
||||
// it is represented using int32_t. int32_t += int8_t *
|
||||
// (int8_t - int8_t) so the highest value we can get from each
|
||||
// accumulation is [-127, 127] * ([-128, 127] -
|
||||
// [-128, 127]), which is [-32512, 32512]. log2(32512)
|
||||
// = 14.98, which means we can accumulate at least 2^16
|
||||
// multiplications without overflow. The accumulator is
|
||||
// applied to a filter so the accumulation logic will hold as
|
||||
// long as the filter size (filter_y * filter_x * in_channel)
|
||||
// does not exceed 2^16, which is the case in all the models
|
||||
// we have seen so far.
|
||||
// TODO(b/174275578): Add a check to make sure the
|
||||
// accumulator depth is smaller than 2^16.
|
||||
acc += filter_val * (input_val + input_offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (bias_data) {
|
||||
acc += bias_data[out_channel];
|
||||
}
|
||||
acc = MultiplyByQuantizedMultiplier(
|
||||
acc, output_multiplier[out_channel], output_shift[out_channel]);
|
||||
acc += output_offset;
|
||||
acc = std::max(acc, output_activation_min);
|
||||
acc = std::min(acc, output_activation_max);
|
||||
output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
|
||||
static_cast<int8_t>(acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fixed-point per-channel-quantization convolution reference kernel.
|
||||
// 16-bit data and 8-bit filter
|
||||
inline void ConvPerChannel(
|
||||
const ConvParams& params, const int32_t* output_multiplier,
|
||||
const int32_t* output_shift, const RuntimeShape& input_shape,
|
||||
const int16_t* input_data, const RuntimeShape& filter_shape,
|
||||
const int8_t* filter_data, const RuntimeShape& bias_shape,
|
||||
const std::int64_t* bias_data, const RuntimeShape& output_shape,
|
||||
int16_t* output_data) {
|
||||
// Get parameters.
|
||||
const int stride_width = params.stride_width;
|
||||
const int stride_height = params.stride_height;
|
||||
const int dilation_width_factor = params.dilation_width_factor;
|
||||
const int dilation_height_factor = params.dilation_height_factor;
|
||||
const int pad_width = params.padding_values.width;
|
||||
const int pad_height = params.padding_values.height;
|
||||
|
||||
// Set min and max value of the output.
|
||||
const int32_t output_activation_min = params.quantized_activation_min;
|
||||
const int32_t output_activation_max = params.quantized_activation_max;
|
||||
|
||||
// Consistency check.
|
||||
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
|
||||
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
|
||||
const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
|
||||
const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
|
||||
if (bias_data) {
|
||||
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
|
||||
}
|
||||
|
||||
// Check dimensions of the tensors.
|
||||
const int input_height = input_shape.Dims(1);
|
||||
const int input_width = input_shape.Dims(2);
|
||||
const int filter_height = filter_shape.Dims(1);
|
||||
const int filter_width = filter_shape.Dims(2);
|
||||
const int output_height = output_shape.Dims(1);
|
||||
const int output_width = output_shape.Dims(2);
|
||||
for (int batch = 0; batch < batches; ++batch) {
|
||||
for (int out_y = 0; out_y < output_height; ++out_y) {
|
||||
const int in_y_origin = (out_y * stride_height) - pad_height;
|
||||
for (int out_x = 0; out_x < output_width; ++out_x) {
|
||||
const int in_x_origin = (out_x * stride_width) - pad_width;
|
||||
for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
|
||||
std::int64_t acc = 0;
|
||||
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
|
||||
const int in_y = in_y_origin + dilation_height_factor * filter_y;
|
||||
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
|
||||
const int in_x = in_x_origin + dilation_width_factor * filter_x;
|
||||
|
||||
// Zero padding by omitting the areas outside the image.
|
||||
const bool is_point_inside_image =
|
||||
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
|
||||
(in_y < input_height);
|
||||
|
||||
if (!is_point_inside_image) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
|
||||
int32_t input_val = input_data[Offset(input_shape, batch, in_y,
|
||||
in_x, in_channel)];
|
||||
int32_t filter_val = filter_data[Offset(
|
||||
filter_shape, out_channel, filter_y, filter_x, in_channel)];
|
||||
// Accumulate with 64 bits accumulator.
|
||||
// int64_t += int8_t * int16_t so the highest value we can
|
||||
// get from each accumulation is [-127, 127] * ([-32768,
|
||||
// 32767] -
|
||||
// [-32768, 32767]), which is [-8322945, 8322945].
|
||||
// log2(8322945) = 22.99.
|
||||
acc += filter_val * input_val;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (bias_data) {
|
||||
acc += bias_data[out_channel];
|
||||
}
|
||||
int32_t scaled_acc = MultiplyByQuantizedMultiplier(
|
||||
acc, output_multiplier[out_channel], output_shift[out_channel]);
|
||||
scaled_acc = std::max(scaled_acc, output_activation_min);
|
||||
scaled_acc = std::min(scaled_acc, output_activation_max);
|
||||
output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
|
||||
static_cast<int16_t>(scaled_acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_integer_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_CONV_H_
|
||||
@@ -0,0 +1,289 @@
|
||||
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_integer_ops {
|
||||
inline void DepthwiseConvPerChannel(
|
||||
const DepthwiseParams& params, const int32_t* output_multiplier,
|
||||
const int32_t* output_shift, const RuntimeShape& input_shape,
|
||||
const int8_t* input_data, const RuntimeShape& filter_shape,
|
||||
const int8_t* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32_t* bias_data, const RuntimeShape& output_shape,
|
||||
int8_t* output_data) {
|
||||
// Get parameters.
|
||||
// TODO(b/141565753): Re-introduce ScopedProfilingLabel on Micro.
|
||||
const int stride_width = params.stride_width;
|
||||
const int stride_height = params.stride_height;
|
||||
const int dilation_width_factor = params.dilation_width_factor;
|
||||
const int dilation_height_factor = params.dilation_height_factor;
|
||||
const int pad_width = params.padding_values.width;
|
||||
const int pad_height = params.padding_values.height;
|
||||
const int depth_multiplier = params.depth_multiplier;
|
||||
const int32_t input_offset = params.input_offset;
|
||||
const int32_t output_offset = params.output_offset;
|
||||
const int32_t output_activation_min = params.quantized_activation_min;
|
||||
const int32_t output_activation_max = params.quantized_activation_max;
|
||||
|
||||
// Check dimensions of the tensors.
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
|
||||
|
||||
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
|
||||
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
|
||||
const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
|
||||
const int input_height = input_shape.Dims(1);
|
||||
const int input_width = input_shape.Dims(2);
|
||||
const int input_depth = input_shape.Dims(3);
|
||||
const int filter_height = filter_shape.Dims(1);
|
||||
const int filter_width = filter_shape.Dims(2);
|
||||
const int output_height = output_shape.Dims(1);
|
||||
const int output_width = output_shape.Dims(2);
|
||||
TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
|
||||
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
|
||||
|
||||
for (int batch = 0; batch < batches; ++batch) {
|
||||
for (int out_y = 0; out_y < output_height; ++out_y) {
|
||||
for (int out_x = 0; out_x < output_width; ++out_x) {
|
||||
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
|
||||
for (int m = 0; m < depth_multiplier; ++m) {
|
||||
const int output_channel = m + in_channel * depth_multiplier;
|
||||
const int in_x_origin = (out_x * stride_width) - pad_width;
|
||||
const int in_y_origin = (out_y * stride_height) - pad_height;
|
||||
int32_t acc = 0;
|
||||
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
|
||||
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
|
||||
const int in_x = in_x_origin + dilation_width_factor * filter_x;
|
||||
const int in_y =
|
||||
in_y_origin + dilation_height_factor * filter_y;
|
||||
// Zero padding by omitting the areas outside the image.
|
||||
const bool is_point_inside_image =
|
||||
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
|
||||
(in_y < input_height);
|
||||
if (is_point_inside_image) {
|
||||
int32_t input_val = input_data[Offset(
|
||||
input_shape, batch, in_y, in_x, in_channel)];
|
||||
int32_t filter_val = filter_data[Offset(
|
||||
filter_shape, 0, filter_y, filter_x, output_channel)];
|
||||
// Accumulate with 32 bits accumulator.
|
||||
// In the nudging process during model quantization, we force
|
||||
// real value of 0.0 be represented by a quantized value. This
|
||||
// guarantees that the input_offset is a int8_t, even though
|
||||
// it is represented using int32_t. int32_t += int8_t *
|
||||
// (int8_t - int8_t) so the highest value we can get from each
|
||||
// accumulation is [-127, 127] * ([-128, 127] -
|
||||
// [-128, 127]), which is [-32512, 32512]. log2(32512)
|
||||
// = 14.98, which means we can accumulate at least 2^16
|
||||
// multiplications without overflow. The accumulator is
|
||||
// applied to a filter so the accumulation logic will hold as
|
||||
// long as the filter size (filter_y * filter_x * in_channel)
|
||||
// does not exceed 2^16, which is the case in all the models
|
||||
// we have seen so far.
|
||||
// TODO(b/174275578): Add a check to make sure the
|
||||
// accumulator depth is smaller than 2^16.
|
||||
acc += filter_val * (input_val + input_offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (bias_data) {
|
||||
acc += bias_data[output_channel];
|
||||
}
|
||||
acc = MultiplyByQuantizedMultiplier(
|
||||
acc, output_multiplier[output_channel],
|
||||
output_shift[output_channel]);
|
||||
acc += output_offset;
|
||||
acc = std::max(acc, output_activation_min);
|
||||
acc = std::min(acc, output_activation_max);
|
||||
output_data[Offset(output_shape, batch, out_y, out_x,
|
||||
output_channel)] = static_cast<int8_t>(acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void DepthwiseConvPerChannel(
|
||||
const DepthwiseParams& params, const int32_t* output_multiplier,
|
||||
const int32_t* output_shift, const RuntimeShape& input_shape,
|
||||
const int16_t* input_data, const RuntimeShape& filter_shape,
|
||||
const int8_t* filter_data, const RuntimeShape& bias_shape,
|
||||
const std::int64_t* bias_data, const RuntimeShape& output_shape,
|
||||
int16_t* output_data) {
|
||||
// Get parameters.
|
||||
const int stride_width = params.stride_width;
|
||||
const int stride_height = params.stride_height;
|
||||
const int dilation_width_factor = params.dilation_width_factor;
|
||||
const int dilation_height_factor = params.dilation_height_factor;
|
||||
const int pad_width = params.padding_values.width;
|
||||
const int pad_height = params.padding_values.height;
|
||||
const int depth_multiplier = params.depth_multiplier;
|
||||
const int32_t output_activation_min = params.quantized_activation_min;
|
||||
const int32_t output_activation_max = params.quantized_activation_max;
|
||||
|
||||
// Check dimensions of the tensors.
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
|
||||
|
||||
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
|
||||
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
|
||||
const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
|
||||
const int input_height = input_shape.Dims(1);
|
||||
const int input_width = input_shape.Dims(2);
|
||||
const int input_depth = input_shape.Dims(3);
|
||||
const int filter_height = filter_shape.Dims(1);
|
||||
const int filter_width = filter_shape.Dims(2);
|
||||
const int output_height = output_shape.Dims(1);
|
||||
const int output_width = output_shape.Dims(2);
|
||||
TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
|
||||
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
|
||||
|
||||
for (int batch = 0; batch < batches; ++batch) {
|
||||
for (int out_y = 0; out_y < output_height; ++out_y) {
|
||||
for (int out_x = 0; out_x < output_width; ++out_x) {
|
||||
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
|
||||
for (int m = 0; m < depth_multiplier; ++m) {
|
||||
const int output_channel = m + in_channel * depth_multiplier;
|
||||
const int in_x_origin = (out_x * stride_width) - pad_width;
|
||||
const int in_y_origin = (out_y * stride_height) - pad_height;
|
||||
std::int64_t acc = 0;
|
||||
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
|
||||
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
|
||||
const int in_x = in_x_origin + dilation_width_factor * filter_x;
|
||||
const int in_y =
|
||||
in_y_origin + dilation_height_factor * filter_y;
|
||||
// Zero padding by omitting the areas outside the image.
|
||||
const bool is_point_inside_image =
|
||||
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
|
||||
(in_y < input_height);
|
||||
if (is_point_inside_image) {
|
||||
int32_t input_val = input_data[Offset(
|
||||
input_shape, batch, in_y, in_x, in_channel)];
|
||||
int32_t filter_val = filter_data[Offset(
|
||||
filter_shape, 0, filter_y, filter_x, output_channel)];
|
||||
// Accumulate with 64 bits accumulator.
|
||||
// We assume maximum of 2^16 accumulations as with the 8-bit
|
||||
// case so actually the value in the accumulator should not
|
||||
// exceed 40 bits
|
||||
acc += static_cast<int64_t>(filter_val) *
|
||||
static_cast<int64_t>(input_val);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (bias_data) {
|
||||
acc += bias_data[output_channel];
|
||||
}
|
||||
int32_t scaled_acc = MultiplyByQuantizedMultiplier(
|
||||
acc, output_multiplier[output_channel],
|
||||
output_shift[output_channel]);
|
||||
scaled_acc = std::max(scaled_acc, output_activation_min);
|
||||
scaled_acc = std::min(scaled_acc, output_activation_max);
|
||||
output_data[Offset(output_shape, batch, out_y, out_x,
|
||||
output_channel)] =
|
||||
static_cast<int16_t>(scaled_acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void DepthwiseConvHybridPerChannel(
|
||||
const DepthwiseParams& params, float* scaling_factors_ptr,
|
||||
const RuntimeShape& input_shape, const int8_t* input_data,
|
||||
const RuntimeShape& filter_shape, const int8_t* filter_data,
|
||||
const RuntimeShape& bias_shape, const float* bias_data,
|
||||
const RuntimeShape& output_shape, float* output_data,
|
||||
const float* per_channel_scale, int32_t* input_offset) {
|
||||
const int stride_width = params.stride_width;
|
||||
const int stride_height = params.stride_height;
|
||||
const int dilation_width_factor = params.dilation_width_factor;
|
||||
const int dilation_height_factor = params.dilation_height_factor;
|
||||
const int pad_width = params.padding_values.width;
|
||||
const int pad_height = params.padding_values.height;
|
||||
const int depth_multiplier = params.depth_multiplier;
|
||||
const float output_activation_min = params.float_activation_min;
|
||||
const float output_activation_max = params.float_activation_max;
|
||||
// Check dimensions of the tensors.
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
|
||||
|
||||
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
|
||||
const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
|
||||
const int input_height = input_shape.Dims(1);
|
||||
const int input_width = input_shape.Dims(2);
|
||||
const int input_depth = input_shape.Dims(3);
|
||||
const int filter_height = filter_shape.Dims(1);
|
||||
const int filter_width = filter_shape.Dims(2);
|
||||
const int output_height = output_shape.Dims(1);
|
||||
const int output_width = output_shape.Dims(2);
|
||||
const int bias_depth = bias_shape.FlatSize();
|
||||
TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
|
||||
TFLITE_DCHECK_EQ(bias_depth, output_depth);
|
||||
|
||||
for (int batch = 0; batch < batches; ++batch) {
|
||||
for (int out_y = 0; out_y < output_height; ++out_y) {
|
||||
for (int out_x = 0; out_x < output_width; ++out_x) {
|
||||
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
|
||||
for (int m = 0; m < depth_multiplier; ++m) {
|
||||
const int output_channel = m + in_channel * depth_multiplier;
|
||||
const int in_x_origin = (out_x * stride_width) - pad_width;
|
||||
const int in_y_origin = (out_y * stride_height) - pad_height;
|
||||
int32_t acc = 0;
|
||||
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
|
||||
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
|
||||
const int in_x = in_x_origin + dilation_width_factor * filter_x;
|
||||
const int in_y =
|
||||
in_y_origin + dilation_height_factor * filter_y;
|
||||
// Zero padding by omitting the areas outside the image.
|
||||
const bool is_point_inside_image =
|
||||
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
|
||||
(in_y < input_height);
|
||||
if (is_point_inside_image) {
|
||||
int32_t input_val = input_data[Offset(
|
||||
input_shape, batch, in_y, in_x, in_channel)];
|
||||
int32_t filter_val = filter_data[Offset(
|
||||
filter_shape, 0, filter_y, filter_x, output_channel)];
|
||||
acc += filter_val * (input_val - input_offset[batch]);
|
||||
}
|
||||
}
|
||||
}
|
||||
float acc_float = static_cast<float>(acc);
|
||||
acc_float *=
|
||||
per_channel_scale[output_channel] * scaling_factors_ptr[batch];
|
||||
if (bias_data && output_channel < bias_depth) {
|
||||
acc_float += bias_data[output_channel];
|
||||
}
|
||||
output_data[Offset(output_shape, batch, out_y, out_x,
|
||||
output_channel)] =
|
||||
ActivationFunctionWithMinMax(acc_float, output_activation_min,
|
||||
output_activation_max);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_integer_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_
|
||||
@@ -0,0 +1,109 @@
|
||||
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_FULLY_CONNECTED_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_FULLY_CONNECTED_H_
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_integer_ops {
|
||||
|
||||
inline void FullyConnected(
|
||||
const FullyConnectedParams& params, const RuntimeShape& input_shape,
|
||||
const int8_t* input_data, const RuntimeShape& filter_shape,
|
||||
const int8_t* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32_t* bias_data, const RuntimeShape& output_shape,
|
||||
int8_t* output_data) {
|
||||
const int32_t input_offset = params.input_offset;
|
||||
const int32_t filter_offset = params.weights_offset;
|
||||
const int32_t output_offset = params.output_offset;
|
||||
const int32_t output_multiplier = params.output_multiplier;
|
||||
const int output_shift = params.output_shift;
|
||||
const int32_t output_activation_min = params.quantized_activation_min;
|
||||
const int32_t output_activation_max = params.quantized_activation_max;
|
||||
TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
|
||||
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 2);
|
||||
|
||||
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
|
||||
const int filter_dim_count = filter_shape.DimensionsCount();
|
||||
const int batches = output_shape.Dims(0);
|
||||
const int output_depth = output_shape.Dims(1);
|
||||
TFLITE_DCHECK_LE(output_depth, filter_shape.Dims(filter_dim_count - 2));
|
||||
const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
|
||||
for (int b = 0; b < batches; ++b) {
|
||||
for (int out_c = 0; out_c < output_depth; ++out_c) {
|
||||
int32_t acc = 0;
|
||||
for (int d = 0; d < accum_depth; ++d) {
|
||||
int32_t input_val = input_data[b * accum_depth + d];
|
||||
int32_t filter_val = filter_data[out_c * accum_depth + d];
|
||||
acc += (filter_val + filter_offset) * (input_val + input_offset);
|
||||
}
|
||||
if (bias_data) {
|
||||
acc += bias_data[out_c];
|
||||
}
|
||||
acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
|
||||
acc += output_offset;
|
||||
acc = std::max(acc, output_activation_min);
|
||||
acc = std::min(acc, output_activation_max);
|
||||
output_data[out_c + output_depth * b] = static_cast<int8_t>(acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void FullyConnected(
|
||||
const FullyConnectedParams& params, const RuntimeShape& input_shape,
|
||||
const int16_t* input_data, const RuntimeShape& filter_shape,
|
||||
const int8_t* filter_data, const RuntimeShape& bias_shape,
|
||||
const int64_t* bias_data, const RuntimeShape& output_shape,
|
||||
int16_t* output_data) {
|
||||
const int32_t filter_offset = params.weights_offset;
|
||||
const int32_t output_multiplier = params.output_multiplier;
|
||||
const int output_shift = params.output_shift;
|
||||
const int32_t output_activation_min = params.quantized_activation_min;
|
||||
const int32_t output_activation_max = params.quantized_activation_max;
|
||||
TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
|
||||
TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
|
||||
|
||||
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
|
||||
const int filter_dim_count = filter_shape.DimensionsCount();
|
||||
const int output_dim_count = output_shape.DimensionsCount();
|
||||
const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
|
||||
const int output_depth = output_shape.Dims(output_dim_count - 1);
|
||||
TFLITE_DCHECK_LE(output_depth, filter_shape.Dims(filter_dim_count - 2));
|
||||
const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
|
||||
for (int b = 0; b < batches; ++b) {
|
||||
for (int out_c = 0; out_c < output_depth; ++out_c) {
|
||||
int64_t acc = 0;
|
||||
for (int d = 0; d < accum_depth; ++d) {
|
||||
int32_t input_val = input_data[b * accum_depth + d];
|
||||
int32_t filter_val = filter_data[out_c * accum_depth + d];
|
||||
acc += (filter_val + filter_offset) * input_val;
|
||||
}
|
||||
if (bias_data) {
|
||||
acc += bias_data[out_c];
|
||||
}
|
||||
int32_t acc_scaled =
|
||||
MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
|
||||
acc_scaled = std::max(acc_scaled, output_activation_min);
|
||||
acc_scaled = std::min(acc_scaled, output_activation_max);
|
||||
output_data[out_c + output_depth * b] = static_cast<int16_t>(acc_scaled);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_integer_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_FULLY_CONNECTED_H_
|
||||
@@ -0,0 +1,65 @@
|
||||
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_L2NORMALIZATION_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_L2NORMALIZATION_H_
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_integer_ops {
|
||||
|
||||
inline void L2Normalization(int32_t input_zero_point, int32_t outer_size,
|
||||
int32_t depth, const int8_t* input_data,
|
||||
int8_t* output_data) {
|
||||
static constexpr int8_t kMinInt8 = std::numeric_limits<int8_t>::min();
|
||||
static constexpr int8_t kMaxInt8 = std::numeric_limits<int8_t>::max();
|
||||
// The output scale must be in sync with Prepare().
|
||||
// Output is in 1/128 scale so the actual output range is nudged from [-1, 1]
|
||||
// to [-1, 127/128].
|
||||
static constexpr int32_t kOutputScale = 7;
|
||||
for (int outer_index = 0; outer_index < outer_size; ++outer_index) {
|
||||
// int32_t = (int8_t - int8_t) ^ 2.
|
||||
// ([-128, 127] - [-128, 127]) ^ 2 = [0, (2^8 - 1)^2] so the accumulator is
|
||||
// safe from overflowing in at least 2^16 steps.
|
||||
int32_t acc = 0;
|
||||
for (int inner_index = 0; inner_index < depth; ++inner_index) {
|
||||
int32_t input =
|
||||
input_data[depth * outer_index + inner_index] - input_zero_point;
|
||||
acc += input * input;
|
||||
}
|
||||
int32_t inv_l2norm_multiplier;
|
||||
int inv_l2norm_shift;
|
||||
GetInvSqrtQuantizedMultiplierExp(acc, kReverseShift, &inv_l2norm_multiplier,
|
||||
&inv_l2norm_shift);
|
||||
|
||||
for (int inner_index = 0; inner_index < depth; ++inner_index) {
|
||||
int32_t input =
|
||||
input_data[depth * outer_index + inner_index] - input_zero_point;
|
||||
|
||||
// Rescale and downcast. Rescale is folded into the division.
|
||||
int32_t output_in_q24 = MultiplyByQuantizedMultiplier(
|
||||
input, inv_l2norm_multiplier, inv_l2norm_shift + kOutputScale);
|
||||
output_in_q24 =
|
||||
std::min(static_cast<int32_t>(kMaxInt8),
|
||||
std::max(static_cast<int32_t>(kMinInt8), output_in_q24));
|
||||
output_data[depth * outer_index + inner_index] =
|
||||
static_cast<int8_t>(output_in_q24);
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace reference_integer_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_L2NORMALIZATION_H_
|
||||
@@ -0,0 +1,119 @@
|
||||
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_LOGISTIC_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_LOGISTIC_H_
|
||||
|
||||
#include <limits>
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_integer_ops {
|
||||
|
||||
inline void Logistic(int32_t input_zero_point, int32_t input_range_radius,
|
||||
int32_t input_multiplier, int32_t input_left_shift,
|
||||
int32_t input_size, const int8_t* input_data,
|
||||
int8_t* output_data) {
|
||||
// Integer bits must be in sync with Prepare() function.
|
||||
static constexpr int32_t kInputIntegerBits = 4;
|
||||
static constexpr int32_t kOutputIntegerBits = 8;
|
||||
static constexpr int8_t kMinInt8 = std::numeric_limits<int8_t>::min();
|
||||
static constexpr int8_t kMaxInt8 = std::numeric_limits<int8_t>::max();
|
||||
static constexpr int32_t kOutputZeroPoint = -128;
|
||||
|
||||
for (int i = 0; i < input_size; ++i) {
|
||||
const int32_t input =
|
||||
static_cast<int32_t>(input_data[i]) - input_zero_point;
|
||||
if (input <= -input_range_radius) {
|
||||
output_data[i] = kMinInt8;
|
||||
} else if (input >= input_range_radius) {
|
||||
output_data[i] = kMaxInt8;
|
||||
} else {
|
||||
const int32_t input_in_q4 = MultiplyByQuantizedMultiplier(
|
||||
input, input_multiplier, input_left_shift);
|
||||
using FixedPoint4 = gemmlowp::FixedPoint<int32_t, kInputIntegerBits>;
|
||||
const int32_t output_in_q0 =
|
||||
gemmlowp::logistic(FixedPoint4::FromRaw(input_in_q4)).raw();
|
||||
|
||||
// Rescale and downcast.
|
||||
using gemmlowp::RoundingDivideByPOT;
|
||||
int32_t output_in_q23 =
|
||||
RoundingDivideByPOT(output_in_q0, 31 - kOutputIntegerBits);
|
||||
output_in_q23 = std::min(std::max(output_in_q23 + kOutputZeroPoint,
|
||||
static_cast<int32_t>(kMinInt8)),
|
||||
static_cast<int32_t>(kMaxInt8));
|
||||
output_data[i] = static_cast<int8_t>(output_in_q23);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void Logistic(int32_t input_multiplier, int32_t input_left_shift,
|
||||
int32_t input_size, const int16_t* ptr_input_data,
|
||||
int16_t* ptr_output_data) {
|
||||
// We use the LUT for sigmoid and take into account, that
|
||||
// tanh(x) = 2*sigmoid(2*x) - 1
|
||||
|
||||
// We scale by 3/4 to expand range [-8,8]->[-10.7,10.7].
|
||||
// In case of general parameter scale, multiplier 3 is taken into account
|
||||
// in TanhPrepare function and it is included in
|
||||
// input_multiplier already.
|
||||
|
||||
TFLITE_DCHECK_GE(input_left_shift, 0);
|
||||
if (input_multiplier == 0) { // power of two case
|
||||
input_multiplier = 3 << input_left_shift;
|
||||
input_left_shift = 0;
|
||||
}
|
||||
|
||||
int32_t round = (input_left_shift > 0) ? 1 << (input_left_shift - 1) : 0;
|
||||
|
||||
for (int i = 0; i < input_size; ++i, ptr_input_data++, ptr_output_data++) {
|
||||
int32_t input_data =
|
||||
((*ptr_input_data) * input_multiplier + round) >> input_left_shift;
|
||||
|
||||
// We do interpolation on unsigned values.
|
||||
uint32_t abs_input_data = abs(input_data);
|
||||
|
||||
// We divide by 2 power of 9, because
|
||||
// we need to divide by 2 in power of 7 for
|
||||
// the input conversion + 1/4 from the scale above.
|
||||
|
||||
// Define uh as uint32_t type not to make this function overflow.
|
||||
uint32_t uh = abs_input_data >> 9;
|
||||
uint32_t result;
|
||||
|
||||
if (uh >= 255) {
|
||||
// Saturate to maximum.
|
||||
result = 0x7FFF << 10;
|
||||
} else {
|
||||
uint32_t ua = sigmoid_table_uint16[uh];
|
||||
uint32_t ub = sigmoid_table_uint16[uh + 1];
|
||||
uint32_t ut = abs_input_data & 0x1ff;
|
||||
// Interpolation is done using the fractional bit.
|
||||
result = (ua << 9) + ut * (ub - ua);
|
||||
}
|
||||
|
||||
result = (input_data >= 0) ? (result + (1 << 9))
|
||||
: ((1 << (16 + 9)) - result + (1 << 9) - 1);
|
||||
|
||||
// Back to 16-bit.
|
||||
result >>= 10;
|
||||
|
||||
*ptr_output_data = result;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_integer_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_LOGISTIC_H_
|
||||
@@ -0,0 +1,77 @@
|
||||
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MEAN_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MEAN_H_
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_integer_ops {
|
||||
|
||||
template <typename integer_type>
|
||||
inline void Mean(const tflite::MeanParams& op_params, int32_t multiplier,
|
||||
int32_t shift, const RuntimeShape& unextended_input_shape,
|
||||
const integer_type* input_data, int32_t input_zero_point,
|
||||
const RuntimeShape& unextended_output_shape,
|
||||
integer_type* output_data, int32_t output_zero_point) {
|
||||
// Current implementation only supports dimension equals 4 and simultaneous
|
||||
// reduction over width and height.
|
||||
TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4);
|
||||
TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4);
|
||||
const RuntimeShape input_shape =
|
||||
RuntimeShape::ExtendedShape(4, unextended_input_shape);
|
||||
const RuntimeShape output_shape =
|
||||
RuntimeShape::ExtendedShape(4, unextended_output_shape);
|
||||
const int output_batch = output_shape.Dims(0);
|
||||
const int output_height = output_shape.Dims(1);
|
||||
const int output_width = output_shape.Dims(2);
|
||||
const int output_depth = output_shape.Dims(3);
|
||||
const int input_height = input_shape.Dims(1);
|
||||
const int input_width = input_shape.Dims(2);
|
||||
const int num_elements_in_axis = input_width * input_height;
|
||||
|
||||
TFLITE_CHECK_EQ(op_params.axis_count, 2);
|
||||
TFLITE_CHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
|
||||
(op_params.axis[0] == 2 && op_params.axis[1] == 1));
|
||||
TFLITE_CHECK_EQ(output_height, 1);
|
||||
TFLITE_CHECK_EQ(output_width, 1);
|
||||
|
||||
static constexpr int32_t kMinInt = std::numeric_limits<integer_type>::min();
|
||||
static constexpr int32_t kMaxInt = std::numeric_limits<integer_type>::max();
|
||||
|
||||
for (int out_b = 0; out_b < output_batch; ++out_b) {
|
||||
for (int out_d = 0; out_d < output_depth; ++out_d) {
|
||||
int32_t acc = 0;
|
||||
for (int in_h = 0; in_h < input_height; ++in_h) {
|
||||
for (int in_w = 0; in_w < input_width; ++in_w) {
|
||||
acc += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)] -
|
||||
input_zero_point;
|
||||
}
|
||||
}
|
||||
acc = MultiplyByQuantizedMultiplier(acc, multiplier, shift);
|
||||
acc = acc > 0 ? (acc + num_elements_in_axis / 2) / num_elements_in_axis
|
||||
: (acc - num_elements_in_axis / 2) / num_elements_in_axis;
|
||||
acc += output_zero_point;
|
||||
acc = std::min(std::max(acc, kMinInt), kMaxInt);
|
||||
output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
|
||||
static_cast<integer_type>(acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_integer_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MEAN_H_
|
||||
@@ -0,0 +1,131 @@
|
||||
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MUL_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MUL_H_
|
||||
|
||||
#include "fixedpoint/fixedpoint.h"
|
||||
#include "ruy/profiler/instrumentation.h" // from @ruy
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_integer_ops {
|
||||
|
||||
template <typename T>
|
||||
inline void MulElementwise(int size, const ArithmeticParams& params,
|
||||
const T* input1_data, const T* input2_data,
|
||||
T* output_data) {
|
||||
for (int i = 0; i < size; ++i) {
|
||||
const int32_t input1_val = params.input1_offset + input1_data[i];
|
||||
const int32_t input2_val = params.input2_offset + input2_data[i];
|
||||
const int32_t unclamped_result =
|
||||
params.output_offset +
|
||||
MultiplyByQuantizedMultiplier(input1_val * input2_val,
|
||||
params.output_multiplier,
|
||||
params.output_shift);
|
||||
const int32_t clamped_output =
|
||||
std::min(params.quantized_activation_max,
|
||||
std::max(params.quantized_activation_min, unclamped_result));
|
||||
output_data[i] = static_cast<T>(clamped_output);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void Mul(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape, const T* input1_data,
|
||||
const RuntimeShape& input2_shape, const T* input2_data,
|
||||
const RuntimeShape& output_shape, T* output_data) {
|
||||
TFLITE_DCHECK_LE(params.quantized_activation_min,
|
||||
params.quantized_activation_max);
|
||||
ruy::profiler::ScopeLabel label("Mul/8bit");
|
||||
const int flat_size =
|
||||
MatchingElementsSize(input1_shape, input2_shape, output_shape);
|
||||
|
||||
MulElementwise(flat_size, params, input1_data, input2_data, output_data);
|
||||
}
|
||||
|
||||
// Mul with 16 bit inputs and int8_t outputs.
|
||||
inline void Mul(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape, const int16_t* input1_data,
|
||||
const RuntimeShape& input2_shape, const int16_t* input2_data,
|
||||
const RuntimeShape& output_shape, int8_t* output_data) {
|
||||
ruy::profiler::ScopeLabel label("Mul/Int16Int8");
|
||||
int32_t output_offset = params.output_offset;
|
||||
int32_t output_activation_min = params.quantized_activation_min;
|
||||
int32_t output_activation_max = params.quantized_activation_max;
|
||||
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
|
||||
|
||||
const int flat_size =
|
||||
MatchingElementsSize(input1_shape, input2_shape, output_shape);
|
||||
|
||||
for (int i = 0; i < flat_size; i++) {
|
||||
// F0 uses 0 integer bits, range [-1, 1].
|
||||
using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
|
||||
|
||||
F0 unclamped_result =
|
||||
F0::FromRaw(input1_data[i]) * F0::FromRaw(input2_data[i]);
|
||||
int16_t rescaled_result =
|
||||
gemmlowp::RoundingDivideByPOT(unclamped_result.raw(), 8);
|
||||
int16_t clamped_result = std::min<int16_t>(
|
||||
output_activation_max - output_offset, rescaled_result);
|
||||
clamped_result = std::max<int16_t>(output_activation_min - output_offset,
|
||||
clamped_result);
|
||||
output_data[i] = output_offset + clamped_result;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void BroadcastMul4DSlow(
|
||||
const ArithmeticParams& params, const RuntimeShape& input1_shape,
|
||||
const T* input1_data, const RuntimeShape& input2_shape,
|
||||
const T* input2_data, const RuntimeShape& output_shape, T* output_data) {
|
||||
ruy::profiler::ScopeLabel label("BroadcastMul4DSlow");
|
||||
|
||||
NdArrayDesc<4> desc1;
|
||||
NdArrayDesc<4> desc2;
|
||||
// The input shapes are extended as part of NdArrayDesc initialization.
|
||||
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
|
||||
&desc2);
|
||||
const RuntimeShape extended_output_shape =
|
||||
RuntimeShape::ExtendedShape(4, output_shape);
|
||||
|
||||
for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
|
||||
for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
|
||||
for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
|
||||
for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
|
||||
const int32_t input1_val =
|
||||
params.input1_offset +
|
||||
input1_data[SubscriptToIndex(desc1, b, y, x, c)];
|
||||
const int32_t input2_val =
|
||||
params.input2_offset +
|
||||
input2_data[SubscriptToIndex(desc2, b, y, x, c)];
|
||||
const int32_t unclamped_result =
|
||||
params.output_offset +
|
||||
MultiplyByQuantizedMultiplier(input1_val * input2_val,
|
||||
params.output_multiplier,
|
||||
params.output_shift);
|
||||
const int32_t clamped_output = std::min(
|
||||
params.quantized_activation_max,
|
||||
std::max(params.quantized_activation_min, unclamped_result));
|
||||
output_data[Offset(extended_output_shape, b, y, x, c)] =
|
||||
static_cast<T>(clamped_output);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_integer_ops
|
||||
} // namespace tflite
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MUL_H_
|
||||
@@ -0,0 +1,262 @@
|
||||
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_POOLING_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_POOLING_H_
|
||||
|
||||
#include <limits>
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_integer_ops {
|
||||
|
||||
inline bool AveragePool(const PoolParams& params,
|
||||
const RuntimeShape& input_shape,
|
||||
const int8_t* input_data,
|
||||
const RuntimeShape& output_shape, int8_t* output_data) {
|
||||
TFLITE_DCHECK_LE(params.quantized_activation_min,
|
||||
params.quantized_activation_max);
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
|
||||
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
|
||||
const int depth = MatchingDim(input_shape, 3, output_shape, 3);
|
||||
const int input_height = input_shape.Dims(1);
|
||||
const int input_width = input_shape.Dims(2);
|
||||
const int output_height = output_shape.Dims(1);
|
||||
const int output_width = output_shape.Dims(2);
|
||||
const int stride_height = params.stride_height;
|
||||
const int stride_width = params.stride_width;
|
||||
for (int batch = 0; batch < batches; ++batch) {
|
||||
for (int out_y = 0; out_y < output_height; ++out_y) {
|
||||
for (int out_x = 0; out_x < output_width; ++out_x) {
|
||||
for (int channel = 0; channel < depth; ++channel) {
|
||||
const int in_x_origin =
|
||||
(out_x * stride_width) - params.padding_values.width;
|
||||
const int in_y_origin =
|
||||
(out_y * stride_height) - params.padding_values.height;
|
||||
// Compute the boundaries of the filter region clamped so as to
|
||||
// ensure that the filter window fits in the input array.
|
||||
const int filter_x_start = std::max(0, -in_x_origin);
|
||||
const int filter_x_end =
|
||||
std::min(params.filter_width, input_width - in_x_origin);
|
||||
const int filter_y_start = std::max(0, -in_y_origin);
|
||||
const int filter_y_end =
|
||||
std::min(params.filter_height, input_height - in_y_origin);
|
||||
int32_t acc = 0;
|
||||
int filter_count = 0;
|
||||
for (int filter_y = filter_y_start; filter_y < filter_y_end;
|
||||
++filter_y) {
|
||||
for (int filter_x = filter_x_start; filter_x < filter_x_end;
|
||||
++filter_x) {
|
||||
const int in_x = in_x_origin + filter_x;
|
||||
const int in_y = in_y_origin + filter_y;
|
||||
acc +=
|
||||
input_data[Offset(input_shape, batch, in_y, in_x, channel)];
|
||||
filter_count++;
|
||||
}
|
||||
}
|
||||
if (filter_count == 0) return false;
|
||||
// Round to the closest integer value.
|
||||
acc = acc > 0 ? (acc + filter_count / 2) / filter_count
|
||||
: (acc - filter_count / 2) / filter_count;
|
||||
acc = std::max(acc, params.quantized_activation_min);
|
||||
acc = std::min(acc, params.quantized_activation_max);
|
||||
output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
|
||||
static_cast<int8_t>(acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
|
||||
const int8_t* input_data, const RuntimeShape& output_shape,
|
||||
int8_t* output_data) {
|
||||
TFLITE_DCHECK_LE(params.quantized_activation_min,
|
||||
params.quantized_activation_max);
|
||||
TFLITE_DCHECK_GE(params.quantized_activation_min,
|
||||
std::numeric_limits<int8_t>::min());
|
||||
TFLITE_DCHECK_LE(params.quantized_activation_max,
|
||||
std::numeric_limits<int8_t>::max());
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
|
||||
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
|
||||
const int depth = MatchingDim(input_shape, 3, output_shape, 3);
|
||||
const int input_height = input_shape.Dims(1);
|
||||
const int input_width = input_shape.Dims(2);
|
||||
const int output_height = output_shape.Dims(1);
|
||||
const int output_width = output_shape.Dims(2);
|
||||
const int stride_height = params.stride_height;
|
||||
const int stride_width = params.stride_width;
|
||||
for (int batch = 0; batch < batches; ++batch) {
|
||||
for (int out_y = 0; out_y < output_height; ++out_y) {
|
||||
for (int out_x = 0; out_x < output_width; ++out_x) {
|
||||
for (int channel = 0; channel < depth; ++channel) {
|
||||
const int in_x_origin =
|
||||
(out_x * stride_width) - params.padding_values.width;
|
||||
const int in_y_origin =
|
||||
(out_y * stride_height) - params.padding_values.height;
|
||||
// Compute the boundaries of the filter region clamped so as to
|
||||
// ensure that the filter window fits in the input array.
|
||||
const int filter_x_start = std::max(0, -in_x_origin);
|
||||
const int filter_x_end =
|
||||
std::min(params.filter_width, input_width - in_x_origin);
|
||||
const int filter_y_start = std::max(0, -in_y_origin);
|
||||
const int filter_y_end =
|
||||
std::min(params.filter_height, input_height - in_y_origin);
|
||||
int8_t max = std::numeric_limits<int8_t>::lowest();
|
||||
for (int filter_y = filter_y_start; filter_y < filter_y_end;
|
||||
++filter_y) {
|
||||
for (int filter_x = filter_x_start; filter_x < filter_x_end;
|
||||
++filter_x) {
|
||||
const int in_x = in_x_origin + filter_x;
|
||||
const int in_y = in_y_origin + filter_y;
|
||||
max = std::max(
|
||||
max,
|
||||
input_data[Offset(input_shape, batch, in_y, in_x, channel)]);
|
||||
}
|
||||
}
|
||||
max = std::max<int8_t>(max, params.quantized_activation_min);
|
||||
max = std::min<int8_t>(max, params.quantized_activation_max);
|
||||
output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
|
||||
static_cast<int8_t>(max);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline bool AveragePool(const PoolParams& params,
|
||||
const RuntimeShape& input_shape,
|
||||
const int16_t* input_data,
|
||||
const RuntimeShape& output_shape,
|
||||
int16_t* output_data) {
|
||||
TFLITE_DCHECK_LE(params.quantized_activation_min,
|
||||
params.quantized_activation_max);
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
|
||||
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
|
||||
const int depth = MatchingDim(input_shape, 3, output_shape, 3);
|
||||
const int input_height = input_shape.Dims(1);
|
||||
const int input_width = input_shape.Dims(2);
|
||||
const int output_height = output_shape.Dims(1);
|
||||
const int output_width = output_shape.Dims(2);
|
||||
const int stride_height = params.stride_height;
|
||||
const int stride_width = params.stride_width;
|
||||
for (int batch = 0; batch < batches; ++batch) {
|
||||
for (int out_y = 0; out_y < output_height; ++out_y) {
|
||||
for (int out_x = 0; out_x < output_width; ++out_x) {
|
||||
for (int channel = 0; channel < depth; ++channel) {
|
||||
const int in_x_origin =
|
||||
(out_x * stride_width) - params.padding_values.width;
|
||||
const int in_y_origin =
|
||||
(out_y * stride_height) - params.padding_values.height;
|
||||
// Compute the boundaries of the filter region clamped so as to
|
||||
// ensure that the filter window fits in the input array.
|
||||
const int filter_x_start = std::max(0, -in_x_origin);
|
||||
const int filter_x_end =
|
||||
std::min(params.filter_width, input_width - in_x_origin);
|
||||
const int filter_y_start = std::max(0, -in_y_origin);
|
||||
const int filter_y_end =
|
||||
std::min(params.filter_height, input_height - in_y_origin);
|
||||
int32_t acc = 0;
|
||||
int filter_count = 0;
|
||||
for (int filter_y = filter_y_start; filter_y < filter_y_end;
|
||||
++filter_y) {
|
||||
for (int filter_x = filter_x_start; filter_x < filter_x_end;
|
||||
++filter_x) {
|
||||
const int in_x = in_x_origin + filter_x;
|
||||
const int in_y = in_y_origin + filter_y;
|
||||
acc +=
|
||||
input_data[Offset(input_shape, batch, in_y, in_x, channel)];
|
||||
filter_count++;
|
||||
}
|
||||
}
|
||||
if (filter_count == 0) return false;
|
||||
// Round to the closest integer value.
|
||||
acc = acc > 0 ? (acc + filter_count / 2) / filter_count
|
||||
: (acc - filter_count / 2) / filter_count;
|
||||
acc = std::max(acc, params.quantized_activation_min);
|
||||
acc = std::min(acc, params.quantized_activation_max);
|
||||
output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
|
||||
static_cast<int16_t>(acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
|
||||
const int16_t* input_data, const RuntimeShape& output_shape,
|
||||
int16_t* output_data) {
|
||||
TFLITE_DCHECK_LE(params.quantized_activation_min,
|
||||
params.quantized_activation_max);
|
||||
TFLITE_DCHECK_GE(params.quantized_activation_min,
|
||||
std::numeric_limits<int16_t>::min());
|
||||
TFLITE_DCHECK_LE(params.quantized_activation_max,
|
||||
std::numeric_limits<int16_t>::max());
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
|
||||
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
|
||||
const int depth = MatchingDim(input_shape, 3, output_shape, 3);
|
||||
const int input_height = input_shape.Dims(1);
|
||||
const int input_width = input_shape.Dims(2);
|
||||
const int output_height = output_shape.Dims(1);
|
||||
const int output_width = output_shape.Dims(2);
|
||||
const int stride_height = params.stride_height;
|
||||
const int stride_width = params.stride_width;
|
||||
for (int batch = 0; batch < batches; ++batch) {
|
||||
for (int out_y = 0; out_y < output_height; ++out_y) {
|
||||
for (int out_x = 0; out_x < output_width; ++out_x) {
|
||||
for (int channel = 0; channel < depth; ++channel) {
|
||||
const int in_x_origin =
|
||||
(out_x * stride_width) - params.padding_values.width;
|
||||
const int in_y_origin =
|
||||
(out_y * stride_height) - params.padding_values.height;
|
||||
// Compute the boundaries of the filter region clamped so as to
|
||||
// ensure that the filter window fits in the input array.
|
||||
const int filter_x_start = std::max(0, -in_x_origin);
|
||||
const int filter_x_end =
|
||||
std::min(params.filter_width, input_width - in_x_origin);
|
||||
const int filter_y_start = std::max(0, -in_y_origin);
|
||||
const int filter_y_end =
|
||||
std::min(params.filter_height, input_height - in_y_origin);
|
||||
int16_t max = std::numeric_limits<int16_t>::lowest();
|
||||
for (int filter_y = filter_y_start; filter_y < filter_y_end;
|
||||
++filter_y) {
|
||||
for (int filter_x = filter_x_start; filter_x < filter_x_end;
|
||||
++filter_x) {
|
||||
const int in_x = in_x_origin + filter_x;
|
||||
const int in_y = in_y_origin + filter_y;
|
||||
max = std::max(
|
||||
max,
|
||||
input_data[Offset(input_shape, batch, in_y, in_x, channel)]);
|
||||
}
|
||||
}
|
||||
max = std::max<int16_t>(max, params.quantized_activation_min);
|
||||
max = std::min<int16_t>(max, params.quantized_activation_max);
|
||||
output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
|
||||
static_cast<int16_t>(max);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_integer_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_POOLING_H_
|
||||
@@ -0,0 +1,116 @@
|
||||
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TANH_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TANH_H_
|
||||
|
||||
#include <limits>
|
||||
|
||||
#include "fixedpoint/fixedpoint.h"
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_integer_ops {
|
||||
|
||||
inline void Tanh(int32_t input_zero_point, int32_t input_range_radius,
|
||||
int32_t input_multiplier, int32_t input_shift,
|
||||
const RuntimeShape& input_shape, const int8_t* input_data,
|
||||
const RuntimeShape& output_shape, int8_t* output_data) {
|
||||
// Integer bits must be in sync with Prepare() function.
|
||||
static constexpr int32_t kInputIntegerBits = 4;
|
||||
static constexpr int32_t kOutputScale = 7;
|
||||
static constexpr int32_t kMinInt8 = std::numeric_limits<int8_t>::min();
|
||||
static constexpr int32_t kMaxInt8 = std::numeric_limits<int8_t>::max();
|
||||
using F4 = gemmlowp::FixedPoint<int32_t, kInputIntegerBits>;
|
||||
|
||||
const int flat_size = MatchingFlatSize(input_shape, output_shape);
|
||||
|
||||
for (int i = 0; i < flat_size; ++i) {
|
||||
const int32_t input =
|
||||
static_cast<int32_t>(input_data[i]) - input_zero_point;
|
||||
if (input <= -input_range_radius) {
|
||||
output_data[i] = kMinInt8;
|
||||
} else if (input >= input_range_radius) {
|
||||
output_data[i] = kMaxInt8;
|
||||
} else {
|
||||
const int32_t input_in_q4 =
|
||||
MultiplyByQuantizedMultiplier(input, input_multiplier, input_shift);
|
||||
const int32_t output_in_q0 =
|
||||
gemmlowp::tanh(F4::FromRaw(input_in_q4)).raw();
|
||||
|
||||
// Rescale and downcast.
|
||||
using gemmlowp::RoundingDivideByPOT;
|
||||
int32_t output_in_q24 =
|
||||
RoundingDivideByPOT(output_in_q0, 31 - kOutputScale);
|
||||
output_in_q24 = std::min(std::max(output_in_q24, kMinInt8), kMaxInt8);
|
||||
output_data[i] = static_cast<int8_t>(output_in_q24);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void Tanh(int32_t input_multiplier, int32_t input_left_shift,
|
||||
const RuntimeShape& input_shape, const int16_t* ptr_input_data,
|
||||
const RuntimeShape& output_shape, int16_t* ptr_output_data) {
|
||||
// We use the LUT for sigmoid and take into account, that
|
||||
// tanh(x) = 2*sigmoid(2*x) - 1
|
||||
|
||||
// We scale by 3/4 to expand range [-8,8]->[-10.7,10.7].
|
||||
// In case of general parameter scale, multiplier 3 is taken into account
|
||||
// in TanhPrepare function and it is included in
|
||||
// input_multiplier already.
|
||||
|
||||
if (input_multiplier == 0) { // power of two case
|
||||
input_multiplier = 3 << input_left_shift;
|
||||
input_left_shift = 0;
|
||||
}
|
||||
|
||||
int32_t round = (input_left_shift > 0) ? 1 << (input_left_shift - 1) : 0;
|
||||
|
||||
int flat_size = MatchingFlatSize(input_shape, output_shape);
|
||||
|
||||
for (int i = 0; i < flat_size; ++i, ptr_input_data++, ptr_output_data++) {
|
||||
int32_t input_data =
|
||||
((*ptr_input_data) * input_multiplier + round) >> input_left_shift;
|
||||
|
||||
uint32_t abs_input_data = abs(input_data);
|
||||
uint32_t uh = abs_input_data >> 8;
|
||||
int32_t result;
|
||||
|
||||
if (uh >= 255) {
|
||||
// Saturate to maximum.
|
||||
result = 0xFFFF << 8;
|
||||
} else {
|
||||
uint32_t ua = sigmoid_table_uint16[uh];
|
||||
uint32_t ub = sigmoid_table_uint16[uh + 1];
|
||||
|
||||
uint8_t ut = abs_input_data & 0xFF;
|
||||
|
||||
result = (ua << 8) + ut * (ub - ua);
|
||||
}
|
||||
|
||||
result = (input_data >= 0)
|
||||
? (result - (1 << (14 + 9)) + (1 << (9 - 2)))
|
||||
: (-result + (1 << (14 + 9)) + (1 << (9 - 2)) - 1);
|
||||
|
||||
// Convert back to 16-bit.
|
||||
result >>= (9 - 1);
|
||||
|
||||
*ptr_output_data = result;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_integer_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TANH_H_
|
||||
@@ -0,0 +1,221 @@
|
||||
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TRANSPOSE_CONV_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TRANSPOSE_CONV_H_
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_integer_ops {
|
||||
|
||||
// Fixed-point per-channel-quantization transpose convolution reference kernel.
|
||||
inline void TransposeConv(
|
||||
const ConvParams& params, const int32_t* output_multiplier,
|
||||
const int32_t* output_shift, const RuntimeShape& input_shape,
|
||||
const int8_t* input_data, const RuntimeShape& filter_shape,
|
||||
const int8_t* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32_t* bias_data, const RuntimeShape& output_shape,
|
||||
int8_t* output_data, const RuntimeShape& im2col_shape, int8_t* im2col_data,
|
||||
int32_t* scratch_buffer) {
|
||||
const int stride_width = params.stride_width;
|
||||
const int stride_height = params.stride_height;
|
||||
const int pad_width = params.padding_values.width;
|
||||
const int pad_height = params.padding_values.height;
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
|
||||
(void)im2col_data; // only used in optimized code.
|
||||
(void)im2col_shape; // only used in optimized code.
|
||||
|
||||
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
|
||||
const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
|
||||
const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
|
||||
if (bias_data) {
|
||||
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
|
||||
}
|
||||
const int input_height = input_shape.Dims(1);
|
||||
const int input_width = input_shape.Dims(2);
|
||||
const int filter_height = filter_shape.Dims(1);
|
||||
const int filter_width = filter_shape.Dims(2);
|
||||
const int output_height = output_shape.Dims(1);
|
||||
const int output_width = output_shape.Dims(2);
|
||||
const int32_t input_offset = params.input_offset;
|
||||
const int32_t output_offset = params.output_offset;
|
||||
const int32_t output_activation_min = std::numeric_limits<int8_t>::min();
|
||||
const int32_t output_activation_max = std::numeric_limits<int8_t>::max();
|
||||
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
|
||||
|
||||
const int num_elements = output_shape.FlatSize();
|
||||
// We need to initialize scratch_buffer to all 0s, as we apply the same
|
||||
// 'scatter' based trick as in float version.
|
||||
memset(scratch_buffer, 0, num_elements * sizeof(int32_t));
|
||||
|
||||
// Loop through input elements one at a time.
|
||||
for (int batch = 0; batch < batches; ++batch) {
|
||||
for (int in_y = 0; in_y < input_height; ++in_y) {
|
||||
for (int in_x = 0; in_x < input_width; ++in_x) {
|
||||
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
|
||||
// Loop through the output elements it will influence.
|
||||
const int out_x_origin = (in_x * stride_width) - pad_width;
|
||||
const int out_y_origin = (in_y * stride_height) - pad_height;
|
||||
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
|
||||
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
|
||||
for (int out_channel = 0; out_channel < output_depth;
|
||||
++out_channel) {
|
||||
// Compute output element location.
|
||||
const int out_x = out_x_origin + filter_x;
|
||||
const int out_y = out_y_origin + filter_y;
|
||||
// We cannot accumulate out of bounds.
|
||||
if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) &&
|
||||
(out_y < output_height)) {
|
||||
const int8_t input_value = input_data[Offset(
|
||||
input_shape, batch, in_y, in_x, in_channel)];
|
||||
const int8_t filter_value =
|
||||
filter_data[Offset(filter_shape, out_channel, filter_y,
|
||||
filter_x, in_channel)];
|
||||
scratch_buffer[Offset(output_shape, batch, out_y, out_x,
|
||||
out_channel)] +=
|
||||
(input_value + input_offset) * filter_value;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int batch = 0; batch < batches; ++batch) {
|
||||
for (int out_y = 0; out_y < output_height; ++out_y) {
|
||||
for (int out_x = 0; out_x < output_width; ++out_x) {
|
||||
for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
|
||||
int32_t acc = scratch_buffer[Offset(output_shape, batch, out_y, out_x,
|
||||
out_channel)];
|
||||
if (bias_data) {
|
||||
acc += bias_data[out_channel];
|
||||
}
|
||||
acc = MultiplyByQuantizedMultiplier(
|
||||
acc, output_multiplier[out_channel], output_shift[out_channel]);
|
||||
acc += output_offset;
|
||||
acc = std::max(acc, output_activation_min);
|
||||
acc = std::min(acc, output_activation_max);
|
||||
output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
|
||||
static_cast<int8_t>(acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// int16_t input (zero_point=0), int8_t filter, int64 accumulator
|
||||
inline void TransposeConv(
|
||||
const ConvParams& params, const int32_t* output_multiplier,
|
||||
const int32_t* output_shift, const RuntimeShape& input_shape,
|
||||
const int16_t* input_data, const RuntimeShape& filter_shape,
|
||||
const int8_t* filter_data, const RuntimeShape& bias_shape,
|
||||
const std::int64_t* bias_data, const RuntimeShape& output_shape,
|
||||
int16_t* output_data, const RuntimeShape& im2col_shape, int8_t* im2col_data,
|
||||
std::int64_t* scratch_buffer) {
|
||||
const int stride_width = params.stride_width;
|
||||
const int stride_height = params.stride_height;
|
||||
const int pad_width = params.padding_values.width;
|
||||
const int pad_height = params.padding_values.height;
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
|
||||
(void)im2col_data; // only used in optimized code.
|
||||
(void)im2col_shape; // only used in optimized code.
|
||||
|
||||
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
|
||||
const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
|
||||
const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
|
||||
if (bias_data) {
|
||||
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
|
||||
}
|
||||
const int input_height = input_shape.Dims(1);
|
||||
const int input_width = input_shape.Dims(2);
|
||||
const int filter_height = filter_shape.Dims(1);
|
||||
const int filter_width = filter_shape.Dims(2);
|
||||
const int output_height = output_shape.Dims(1);
|
||||
const int output_width = output_shape.Dims(2);
|
||||
const int32_t output_activation_min = std::numeric_limits<int16_t>::min();
|
||||
const int32_t output_activation_max = std::numeric_limits<int16_t>::max();
|
||||
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
|
||||
|
||||
const int num_elements = output_shape.FlatSize();
|
||||
// We need to initialize scratch_buffer to all 0s, as we apply the same
|
||||
// 'scatter' based trick as in float version.
|
||||
memset(scratch_buffer, 0, num_elements * sizeof(std::int64_t));
|
||||
|
||||
// Loop through input elements one at a time.
|
||||
for (int batch = 0; batch < batches; ++batch) {
|
||||
for (int in_y = 0; in_y < input_height; ++in_y) {
|
||||
for (int in_x = 0; in_x < input_width; ++in_x) {
|
||||
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
|
||||
// Loop through the output elements it will influence.
|
||||
const int out_x_origin = (in_x * stride_width) - pad_width;
|
||||
const int out_y_origin = (in_y * stride_height) - pad_height;
|
||||
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
|
||||
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
|
||||
for (int out_channel = 0; out_channel < output_depth;
|
||||
++out_channel) {
|
||||
// Compute output element location.
|
||||
const int out_x = out_x_origin + filter_x;
|
||||
const int out_y = out_y_origin + filter_y;
|
||||
// We cannot accumulate out of bounds.
|
||||
if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) &&
|
||||
(out_y < output_height)) {
|
||||
const int32_t input_value = input_data[Offset(
|
||||
input_shape, batch, in_y, in_x, in_channel)];
|
||||
const int32_t filter_value =
|
||||
filter_data[Offset(filter_shape, out_channel, filter_y,
|
||||
filter_x, in_channel)];
|
||||
scratch_buffer[Offset(output_shape, batch, out_y, out_x,
|
||||
out_channel)] +=
|
||||
input_value * filter_value;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int batch = 0; batch < batches; ++batch) {
|
||||
for (int out_y = 0; out_y < output_height; ++out_y) {
|
||||
for (int out_x = 0; out_x < output_width; ++out_x) {
|
||||
for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
|
||||
std::int64_t acc = scratch_buffer[Offset(output_shape, batch, out_y,
|
||||
out_x, out_channel)];
|
||||
if (bias_data) {
|
||||
acc += bias_data[out_channel];
|
||||
}
|
||||
int32_t scaled_acc = MultiplyByQuantizedMultiplier(
|
||||
acc, output_multiplier[out_channel], output_shift[out_channel]);
|
||||
scaled_acc = std::max(scaled_acc, output_activation_min);
|
||||
scaled_acc = std::min(scaled_acc, output_activation_max);
|
||||
output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
|
||||
static_cast<int16_t>(scaled_acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_integer_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TRANSPOSE_CONV_H_
|
||||
@@ -0,0 +1,90 @@
|
||||
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_L2NORMALIZATION_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_L2NORMALIZATION_H_
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
|
||||
#include "tensorflow/lite/c/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
namespace reference_ops {
|
||||
|
||||
inline void L2Normalization(const tflite::L2NormalizationParams& op_params,
|
||||
const RuntimeShape& input_shape,
|
||||
const float* input_data,
|
||||
const RuntimeShape& output_shape,
|
||||
float* output_data, float epsilon = 1e-6) {
|
||||
const int trailing_dim = input_shape.DimensionsCount() - 1;
|
||||
const int outer_size =
|
||||
MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
|
||||
const int depth =
|
||||
MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
|
||||
for (int i = 0; i < outer_size; ++i) {
|
||||
float squared_l2_norm = 0;
|
||||
for (int c = 0; c < depth; ++c) {
|
||||
const float val = input_data[depth * i + c];
|
||||
squared_l2_norm += val * val;
|
||||
}
|
||||
float l2_norm = std::sqrt(squared_l2_norm);
|
||||
l2_norm = std::max(l2_norm, epsilon);
|
||||
for (int c = 0; c < depth; ++c) {
|
||||
output_data[depth * i + c] = input_data[depth * i + c] / l2_norm;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void L2Normalization(const tflite::L2NormalizationParams& op_params,
|
||||
const RuntimeShape& input_shape,
|
||||
const uint8_t* input_data,
|
||||
const RuntimeShape& output_shape,
|
||||
uint8_t* output_data) {
|
||||
const int trailing_dim = input_shape.DimensionsCount() - 1;
|
||||
const int depth =
|
||||
MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
|
||||
const int outer_size =
|
||||
MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
|
||||
const int32_t input_zero_point = op_params.input_zero_point;
|
||||
|
||||
for (int i = 0; i < outer_size; ++i) {
|
||||
int32_t square_l2_norm = 0;
|
||||
for (int c = 0; c < depth; c++) {
|
||||
int32_t diff = input_data[depth * i + c] - input_zero_point;
|
||||
square_l2_norm += diff * diff;
|
||||
}
|
||||
int32_t inv_l2norm_multiplier;
|
||||
int inv_l2norm_shift;
|
||||
GetInvSqrtQuantizedMultiplierExp(square_l2_norm, kReverseShift,
|
||||
&inv_l2norm_multiplier, &inv_l2norm_shift);
|
||||
for (int c = 0; c < depth; c++) {
|
||||
int32_t diff = input_data[depth * i + c] - input_zero_point;
|
||||
int32_t rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
128 * diff, inv_l2norm_multiplier, inv_l2norm_shift);
|
||||
int32_t unclamped_output_val = 128 + rescaled_diff;
|
||||
int32_t output_val =
|
||||
std::min(static_cast<int32_t>(255),
|
||||
std::max(static_cast<int32_t>(0), unclamped_output_val));
|
||||
output_data[depth * i + c] = static_cast<uint8_t>(output_val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_L2NORMALIZATION_H_
|
||||
@@ -0,0 +1,69 @@
|
||||
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LEAKY_RELU_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LEAKY_RELU_H_
|
||||
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_ops {
|
||||
|
||||
inline void LeakyRelu(const tflite::LeakyReluParams& params,
|
||||
const RuntimeShape& input_shape, const float* input_data,
|
||||
const RuntimeShape& output_shape, float* output_data) {
|
||||
const int flat_size = MatchingFlatSize(input_shape, output_shape);
|
||||
for (int i = 0; i < flat_size; ++i) {
|
||||
const float val = input_data[i];
|
||||
// Note that alpha might be > 1 or < 0, so we don't use std::max here.
|
||||
output_data[i] = val > 0 ? val : val * params.alpha;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void QuantizeLeakyRelu(const LeakyReluParams& params,
|
||||
const RuntimeShape& input_shape,
|
||||
const T* input_data,
|
||||
const RuntimeShape& output_shape,
|
||||
T* output_data) {
|
||||
const int flat_size = MatchingFlatSize(input_shape, output_shape);
|
||||
static const int32_t quantized_min = std::numeric_limits<T>::min();
|
||||
static const int32_t quantized_max = std::numeric_limits<T>::max();
|
||||
for (int i = 0; i < flat_size; ++i) {
|
||||
const int32_t input_value = input_data[i] - params.input_offset;
|
||||
int32_t unclamped_output;
|
||||
if (input_value >= 0) {
|
||||
unclamped_output = params.output_offset +
|
||||
MultiplyByQuantizedMultiplier(
|
||||
input_value, params.output_multiplier_identity,
|
||||
params.output_shift_identity);
|
||||
} else {
|
||||
unclamped_output = params.output_offset +
|
||||
MultiplyByQuantizedMultiplier(
|
||||
input_value, params.output_multiplier_alpha,
|
||||
params.output_shift_alpha);
|
||||
}
|
||||
const T clamped_output =
|
||||
std::min(quantized_max, std::max(quantized_min, unclamped_output));
|
||||
output_data[i] = static_cast<T>(clamped_output);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LEAKY_RELU_H_
|
||||
@@ -0,0 +1,256 @@
|
||||
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LOG_SOFTMAX_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LOG_SOFTMAX_H_
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstddef>
|
||||
#include <limits>
|
||||
|
||||
#include "fixedpoint/fixedpoint.h"
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_ops {
|
||||
|
||||
inline void LogSoftmax(const SoftmaxParams& params,
|
||||
const RuntimeShape& input_shape, const float* input_data,
|
||||
const RuntimeShape& output_shape, float* output_data) {
|
||||
const int trailing_dim = input_shape.DimensionsCount() - 1;
|
||||
const int outer_size =
|
||||
MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
|
||||
const int depth =
|
||||
MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
|
||||
|
||||
for (int i = 0; i < outer_size; ++i) {
|
||||
// Find max element value which we'll use to ensure numerical stability
|
||||
// taking advantage of the following equality:
|
||||
// log(exp(x[i])/sum(exp(x[i]))) == log(exp(x[i]+C)/sum(exp(x[i]+C)))
|
||||
float max = std::numeric_limits<float>::lowest();
|
||||
for (int c = 0; c < depth; ++c) {
|
||||
max = std::max(max, input_data[i * depth + c]);
|
||||
}
|
||||
|
||||
// Compute sum.
|
||||
float sum = 0.f;
|
||||
for (int c = 0; c < depth; ++c) {
|
||||
sum += std::exp(input_data[i * depth + c] - max);
|
||||
}
|
||||
|
||||
// Compute result.
|
||||
const float log_sum = std::log(sum);
|
||||
for (int c = 0; c < depth; ++c) {
|
||||
output_data[i * depth + c] = input_data[i * depth + c] - max - log_sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void LogSoftmax(const SoftmaxParams& params,
|
||||
const RuntimeShape& input_shape,
|
||||
const uint8_t* input_data,
|
||||
const RuntimeShape& output_shape, uint8_t* output_data) {
|
||||
const int32_t input_multiplier = params.input_multiplier;
|
||||
const int32_t input_left_shift = params.input_left_shift;
|
||||
const int32_t reverse_scaling_divisor = params.reverse_scaling_divisor;
|
||||
const int32_t reverse_scaling_right_shift =
|
||||
params.reverse_scaling_right_shift;
|
||||
const int diff_min = params.diff_min;
|
||||
// The representation chosen for the input to the exp() function is Q5.26.
|
||||
// We need to leave extra space since values that we skip might be as large
|
||||
// as -32 before multiplying by input_beta_multiplier, and therefore as
|
||||
// large as -16 afterwards. Note that exp(-8) is definitely not
|
||||
// insignificant to accumulation, but exp(-16) definitely is.
|
||||
static constexpr int kScaledDiffIntegerBits = 5;
|
||||
static constexpr int kAccumulationIntegerBits = 12;
|
||||
static constexpr int kOutputIntegerBits = 4;
|
||||
using FixedPointScaledDiff =
|
||||
gemmlowp::FixedPoint<int32_t, kScaledDiffIntegerBits>;
|
||||
using FixedPointAccum =
|
||||
gemmlowp::FixedPoint<int32_t, kAccumulationIntegerBits>;
|
||||
|
||||
const int trailing_dim = input_shape.DimensionsCount() - 1;
|
||||
const int outer_size =
|
||||
MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
|
||||
const int depth =
|
||||
MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
|
||||
|
||||
for (int i = 0; i < outer_size; ++i) {
|
||||
uint8_t max_in_row = 0;
|
||||
for (int c = 0; c < depth; ++c) {
|
||||
max_in_row = std::max(max_in_row, input_data[i * depth + c]);
|
||||
}
|
||||
|
||||
FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
|
||||
for (int c = 0; c < depth; ++c) {
|
||||
int32_t input_diff =
|
||||
static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
|
||||
if (input_diff >= diff_min) {
|
||||
const int32_t input_diff_rescaled =
|
||||
MultiplyByQuantizedMultiplierGreaterThanOne(
|
||||
input_diff, input_multiplier, input_left_shift);
|
||||
const FixedPointScaledDiff scaled_diff_f8 =
|
||||
FixedPointScaledDiff::FromRaw(input_diff_rescaled);
|
||||
sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
|
||||
exp_on_negative_values(scaled_diff_f8));
|
||||
}
|
||||
}
|
||||
|
||||
const int32_t fixed_log_sum_of_exps =
|
||||
log_x_for_x_greater_than_or_equal_to_1<kScaledDiffIntegerBits>(
|
||||
sum_of_exps)
|
||||
.raw();
|
||||
|
||||
// rescaled_diff_min is smallest representable in
|
||||
// Q(kScaledDiffIntegerBits).(31-kScaledDiffIntegerBits) plus the
|
||||
// log-sub-exps that will be subtracted in the loop.
|
||||
//
|
||||
// The thresholds diff_min, etc are negative.
|
||||
const int rescaled_diff_min =
|
||||
fixed_log_sum_of_exps + std::numeric_limits<int32_t>::lowest();
|
||||
const int adjusted_diff_min =
|
||||
std::max(static_cast<int32_t>(
|
||||
diff_min - 1), // Note use of > below instead of >= above.
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
rescaled_diff_min, reverse_scaling_divisor,
|
||||
-reverse_scaling_right_shift));
|
||||
|
||||
for (int c = 0; c < depth; ++c) {
|
||||
int32_t input_diff =
|
||||
static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
|
||||
if (input_diff > adjusted_diff_min) {
|
||||
const int32_t input_diff_rescaled =
|
||||
MultiplyByQuantizedMultiplierGreaterThanOne(
|
||||
input_diff, input_multiplier, input_left_shift);
|
||||
int32_t unsat_output =
|
||||
gemmlowp::RoundingDivideByPOT(
|
||||
(input_diff_rescaled - fixed_log_sum_of_exps),
|
||||
31 - kScaledDiffIntegerBits - kOutputIntegerBits) +
|
||||
255;
|
||||
|
||||
output_data[i * depth + c] = static_cast<uint8_t>(
|
||||
std::max(std::min(unsat_output, static_cast<int32_t>(255)),
|
||||
static_cast<int32_t>(0)));
|
||||
} else {
|
||||
// Set output to smallest value.
|
||||
output_data[i * depth + c] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void LogSoftmaxQuantized(const SoftmaxParams& params,
|
||||
const size_t outer_size, const size_t depth,
|
||||
const RuntimeShape& input_shape,
|
||||
const T* input_data,
|
||||
const RuntimeShape& output_shape,
|
||||
T* output_data) {
|
||||
const int32_t input_multiplier = params.input_multiplier;
|
||||
const int32_t input_left_shift = params.input_left_shift;
|
||||
const int32_t reverse_scaling_divisor = params.reverse_scaling_divisor;
|
||||
const int32_t reverse_scaling_right_shift =
|
||||
params.reverse_scaling_right_shift;
|
||||
const int diff_min = params.diff_min;
|
||||
|
||||
static constexpr T kMinT8 = std::numeric_limits<T>::min();
|
||||
static constexpr T kMaxT8 = std::numeric_limits<T>::max();
|
||||
static constexpr int32_t kMinInt32 = std::numeric_limits<int32_t>::min();
|
||||
|
||||
// All IntegerBits must agree with Prepare function.
|
||||
// Input is chosen as Q5.26 so exp(-1 * 2^5 * 2^-1) = exp(-16) is negligible.
|
||||
static constexpr int kInputIntegerBits = 5;
|
||||
static constexpr int kAccumulationIntegerBits = 12;
|
||||
static constexpr int kOutputIntegerBits = 4;
|
||||
using F5 = gemmlowp::FixedPoint<int32_t, kInputIntegerBits>;
|
||||
using F12 = gemmlowp::FixedPoint<int32_t, kAccumulationIntegerBits>;
|
||||
|
||||
for (size_t outer_index = 0; outer_index < outer_size; ++outer_index) {
|
||||
T max_in_row = kMinT8;
|
||||
for (size_t inner_index = 0; inner_index < depth; ++inner_index) {
|
||||
max_in_row =
|
||||
std::max(max_in_row, input_data[outer_index * depth + inner_index]);
|
||||
}
|
||||
|
||||
// Accumulator "sum_of_exps_in_q12" is safe from overflowing in 2^12 steps.
|
||||
F12 sum_of_exps_in_q12 = F12::FromRaw(0);
|
||||
for (size_t inner_index = 0; inner_index < depth; ++inner_index) {
|
||||
int32_t input_diff =
|
||||
static_cast<int32_t>(input_data[outer_index * depth + inner_index]) -
|
||||
max_in_row;
|
||||
if (input_diff >= diff_min) {
|
||||
const int32_t input_diff_in_q5 = MultiplyByQuantizedMultiplier(
|
||||
input_diff, input_multiplier, input_left_shift);
|
||||
sum_of_exps_in_q12 =
|
||||
sum_of_exps_in_q12 +
|
||||
gemmlowp::Rescale<kAccumulationIntegerBits>(
|
||||
exp_on_negative_values(F5::FromRaw(input_diff_in_q5)));
|
||||
}
|
||||
}
|
||||
|
||||
const int32_t log_sum_of_exps_in_q5 =
|
||||
log_x_for_x_greater_than_or_equal_to_1<kInputIntegerBits>(
|
||||
sum_of_exps_in_q12)
|
||||
.raw();
|
||||
|
||||
// Potentially reduced the valid range. shifted_log_sum_of_exps_in_q5 is
|
||||
// smallest representable in Q5.26 plus the log_sum_of_exps.
|
||||
const int32_t shifted_log_sum_of_exps_in_q5 =
|
||||
log_sum_of_exps_in_q5 + kMinInt32;
|
||||
const int32_t adjusted_diff_min =
|
||||
std::max(static_cast<int32_t>(diff_min - 1),
|
||||
MultiplyByQuantizedMultiplier(shifted_log_sum_of_exps_in_q5,
|
||||
reverse_scaling_divisor,
|
||||
-reverse_scaling_right_shift));
|
||||
|
||||
for (size_t inner_index = 0; inner_index < depth; ++inner_index) {
|
||||
int32_t input_diff =
|
||||
static_cast<int32_t>(input_data[outer_index * depth + inner_index]) -
|
||||
max_in_row;
|
||||
// Note use of > below instead of >= above.
|
||||
if (input_diff > adjusted_diff_min) {
|
||||
const int32_t input_diff_in_q5 = MultiplyByQuantizedMultiplier(
|
||||
input_diff, input_multiplier, input_left_shift);
|
||||
|
||||
// Rescale and downcast.
|
||||
int32_t output_in_q27 =
|
||||
gemmlowp::RoundingDivideByPOT(
|
||||
(input_diff_in_q5 - log_sum_of_exps_in_q5),
|
||||
31 - kInputIntegerBits - kOutputIntegerBits) +
|
||||
kMaxT8;
|
||||
|
||||
output_in_q27 =
|
||||
std::max(std::min(output_in_q27, static_cast<int32_t>(kMaxT8)),
|
||||
static_cast<int32_t>(kMinT8));
|
||||
output_data[outer_index * depth + inner_index] =
|
||||
static_cast<T>(output_in_q27);
|
||||
} else {
|
||||
output_data[outer_index * depth + inner_index] = kMinT8;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void LogSoftmax(const SoftmaxParams& params, const size_t outer_size,
|
||||
const size_t depth, const RuntimeShape& input_shape,
|
||||
const int8_t* input_data,
|
||||
const RuntimeShape& output_shape, int8_t* output_data) {
|
||||
LogSoftmaxQuantized(params, outer_size, depth, input_shape, input_data,
|
||||
output_shape, output_data);
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LOG_SOFTMAX_H_
|
||||
@@ -0,0 +1,132 @@
|
||||
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LOGISTIC_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LOGISTIC_H_
|
||||
|
||||
#include <cmath>
|
||||
|
||||
#include "fixedpoint/fixedpoint.h"
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/cppmath.h"
|
||||
#include "tensorflow/lite/kernels/internal/quantization_util.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
#include "tensorflow/lite/kernels/op_macros.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_ops {
|
||||
|
||||
inline void Logistic(const RuntimeShape& input_shape, const float* input_data,
|
||||
const RuntimeShape& output_shape, float* output_data) {
|
||||
const float cutoff_upper = 16.619047164916992188f;
|
||||
const float cutoff_lower = -9.f;
|
||||
|
||||
const int flat_size = MatchingFlatSize(input_shape, output_shape);
|
||||
|
||||
// Rational for using approximation in reference kernel.
|
||||
// 0. This approximation gives enough precision for float.
|
||||
// 1. This works around an issue on an embedded chipset where exp() does not
|
||||
// return correctly as expected - exp(x) should return inf when overflown
|
||||
// not 1.701417 IEEE 754 defines representation for inf.
|
||||
// 2. This will speed up calculation and is matching the behavior in the
|
||||
// optimized kernels. (check the definition of scalar_logistic_op<float>)
|
||||
|
||||
for (int i = 0; i < flat_size; i++) {
|
||||
float val = input_data[i];
|
||||
float result;
|
||||
if (val > cutoff_upper) {
|
||||
result = 1.0f;
|
||||
} else if (val < cutoff_lower) {
|
||||
result = std::exp(val);
|
||||
} else {
|
||||
result = 1.f / (1.f + std::exp(-val));
|
||||
}
|
||||
output_data[i] = result;
|
||||
}
|
||||
}
|
||||
|
||||
// Convenience version that allows, for example, generated-code calls to be
|
||||
// uniform between data types.
|
||||
inline void Logistic(const LogisticParams&, const RuntimeShape& input_shape,
|
||||
const float* input_data, const RuntimeShape& output_shape,
|
||||
float* output_data) {
|
||||
// Drop params: not needed.
|
||||
Logistic(input_shape, input_data, output_shape, output_data);
|
||||
}
|
||||
|
||||
inline void Logistic(const LogisticParams& params,
|
||||
const RuntimeShape& input_shape, const int16_t* input_data,
|
||||
const RuntimeShape& output_shape, int16_t* output_data) {
|
||||
const int flat_size = MatchingFlatSize(input_shape, output_shape);
|
||||
|
||||
for (int i = 0; i < flat_size; i++) {
|
||||
// F0 uses 0 integer bits, range [-1, 1].
|
||||
// This is the return type of math functions such as tanh, logistic,
|
||||
// whose range is in [-1, 1].
|
||||
using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
|
||||
// F3 uses 3 integer bits, range [-8, 8], the input range expected here.
|
||||
using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
|
||||
|
||||
const F3 input = F3::FromRaw(input_data[i]);
|
||||
F0 output = gemmlowp::logistic(input);
|
||||
output_data[i] = output.raw();
|
||||
}
|
||||
}
|
||||
|
||||
// Quantized int8_t logistic activation. Cheats by dequantizing and
|
||||
// requantizing around the floating point logistic method. This implementation
|
||||
// is slow on platforms without a floating point unit.
|
||||
|
||||
// TODO(b/141211002): Delete this int8_t implementation once we can reuse the
|
||||
// approach used in TFLite for int8_t Logistic.
|
||||
inline void Logistic(const RuntimeShape& input_shape, const int8_t* input_data,
|
||||
float input_scale, int input_zero_point,
|
||||
const RuntimeShape& output_shape, int8_t* output_data,
|
||||
float output_scale, int output_zero_point) {
|
||||
const float cutoff_upper = 16.619047164916992188f;
|
||||
const float cutoff_lower = -9.f;
|
||||
|
||||
const int flat_size = MatchingFlatSize(input_shape, output_shape);
|
||||
|
||||
// Rational for using approximation in reference kernel.
|
||||
// 0. This approximation gives enough precision for float.
|
||||
// 1. This works around an issue on an embedded chipset where exp() does not
|
||||
// return correctly as expected - exp(x) should return inf when overflown
|
||||
// not 1.701417 IEEE 754 defines representation for inf.
|
||||
// 2. This will speed up calculation and is matching the behavior in the
|
||||
// optimized kernels. (check the definition of scalar_logistic_op<float>)
|
||||
|
||||
for (int i = 0; i < flat_size; i++) {
|
||||
// Dequantize.
|
||||
float val =
|
||||
static_cast<float>((input_data[i] - input_zero_point) * input_scale);
|
||||
float result;
|
||||
if (val > cutoff_upper) {
|
||||
result = 1.0f;
|
||||
} else if (val < cutoff_lower) {
|
||||
result = std::exp(val);
|
||||
} else {
|
||||
result = 1.f / (1.f + std::exp(-val));
|
||||
}
|
||||
// Requantize
|
||||
int8_t output =
|
||||
static_cast<int8_t>(result / output_scale + output_zero_point);
|
||||
output_data[i] = output;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LOGISTIC_H_
|
||||
@@ -0,0 +1,64 @@
|
||||
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_MAXIMUM_MINIMUM_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_MAXIMUM_MINIMUM_H_
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_ops {
|
||||
|
||||
template <typename T, typename Op, int N = 5>
|
||||
void MaximumMinimumBroadcastSlow(const RuntimeShape& unextended_input1_shape,
|
||||
const T* input1_data,
|
||||
const RuntimeShape& unextended_input2_shape,
|
||||
const T* input2_data,
|
||||
const RuntimeShape& unextended_output_shape,
|
||||
T* output_data, Op op) {
|
||||
// Uses element-wise calculation if broadcast is not required.
|
||||
if (unextended_input1_shape == unextended_input2_shape) {
|
||||
const int flat_size =
|
||||
MatchingElementsSize(unextended_input1_shape, unextended_input2_shape,
|
||||
unextended_output_shape);
|
||||
for (int i = 0; i < flat_size; ++i) {
|
||||
output_data[i] = op(input1_data[i], input2_data[i]);
|
||||
}
|
||||
} else {
|
||||
TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), N);
|
||||
TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), N);
|
||||
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), N);
|
||||
|
||||
NdArrayDesc<N> desc1;
|
||||
NdArrayDesc<N> desc2;
|
||||
NdArrayDesc<N> output_desc;
|
||||
NdArrayDescsForElementwiseBroadcast(
|
||||
unextended_input1_shape, unextended_input2_shape, &desc1, &desc2);
|
||||
CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape),
|
||||
&output_desc);
|
||||
|
||||
auto maxmin_func = [&](int indexes[N]) {
|
||||
output_data[SubscriptToIndex(output_desc, indexes)] =
|
||||
op(input1_data[SubscriptToIndex(desc1, indexes)],
|
||||
input2_data[SubscriptToIndex(desc2, indexes)]);
|
||||
};
|
||||
NDOpsHelper<N>(output_desc, maxmin_func);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_MAXIMUM_MINIMUM_H_
|
||||
@@ -0,0 +1,166 @@
|
||||
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_MUL_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_MUL_H_
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
namespace reference_ops {
|
||||
|
||||
// Element-wise mul that can often be used for inner loop of broadcast Mul as
|
||||
// well as the non-broadcast Mul.
|
||||
inline void MulElementwise(int size, const ArithmeticParams& params,
|
||||
const uint8_t* input1_data,
|
||||
const uint8_t* input2_data, uint8_t* output_data) {
|
||||
for (int i = 0; i < size; ++i) {
|
||||
const int32_t input1_val = params.input1_offset + input1_data[i];
|
||||
const int32_t input2_val = params.input2_offset + input2_data[i];
|
||||
const int32_t unclamped_result =
|
||||
params.output_offset +
|
||||
MultiplyByQuantizedMultiplier(input1_val * input2_val,
|
||||
params.output_multiplier,
|
||||
params.output_shift);
|
||||
const int32_t clamped_output =
|
||||
std::min(params.quantized_activation_max,
|
||||
std::max(params.quantized_activation_min, unclamped_result));
|
||||
output_data[i] = static_cast<uint8_t>(clamped_output);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void Mul(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape, const T* input1_data,
|
||||
const RuntimeShape& input2_shape, const T* input2_data,
|
||||
const RuntimeShape& output_shape, T* output_data) {
|
||||
T output_activation_min;
|
||||
T output_activation_max;
|
||||
GetActivationParams(params, &output_activation_min, &output_activation_max);
|
||||
|
||||
const int flat_size =
|
||||
MatchingExtendedShapeFlatSize(input1_shape, input2_shape, output_shape);
|
||||
for (int i = 0; i < flat_size; ++i) {
|
||||
output_data[i] = ActivationFunctionWithMinMax(
|
||||
input1_data[i] * input2_data[i], output_activation_min,
|
||||
output_activation_max);
|
||||
}
|
||||
}
|
||||
|
||||
inline void Mul(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape, const uint8_t* input1_data,
|
||||
const RuntimeShape& input2_shape, const uint8_t* input2_data,
|
||||
const RuntimeShape& output_shape, uint8_t* output_data) {
|
||||
TFLITE_DCHECK_LE(params.quantized_activation_min,
|
||||
params.quantized_activation_max);
|
||||
const int flat_size =
|
||||
MatchingExtendedShapeFlatSize(input1_shape, input2_shape, output_shape);
|
||||
|
||||
MulElementwise(flat_size, params, input1_data, input2_data, output_data);
|
||||
}
|
||||
|
||||
inline void BroadcastMul4DSlow(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape,
|
||||
const uint8_t* input1_data,
|
||||
const RuntimeShape& input2_shape,
|
||||
const uint8_t* input2_data,
|
||||
const RuntimeShape& output_shape,
|
||||
uint8_t* output_data) {
|
||||
NdArrayDesc<4> desc1;
|
||||
NdArrayDesc<4> desc2;
|
||||
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
|
||||
&desc2);
|
||||
const RuntimeShape extended_output_shape =
|
||||
RuntimeShape::ExtendedShape(4, output_shape);
|
||||
|
||||
for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
|
||||
for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
|
||||
for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
|
||||
for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
|
||||
const int32_t input1_val =
|
||||
params.input1_offset +
|
||||
input1_data[SubscriptToIndex(desc1, b, y, x, c)];
|
||||
const int32_t input2_val =
|
||||
params.input2_offset +
|
||||
input2_data[SubscriptToIndex(desc2, b, y, x, c)];
|
||||
const int32_t unclamped_result =
|
||||
params.output_offset +
|
||||
MultiplyByQuantizedMultiplier(input1_val * input2_val,
|
||||
params.output_multiplier,
|
||||
params.output_shift);
|
||||
const int32_t clamped_output = std::min(
|
||||
params.quantized_activation_max,
|
||||
std::max(params.quantized_activation_min, unclamped_result));
|
||||
output_data[Offset(extended_output_shape, b, y, x, c)] =
|
||||
static_cast<uint8_t>(clamped_output);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void BroadcastMul4DSlow(const ArithmeticParams& params,
|
||||
const RuntimeShape& unextended_input1_shape,
|
||||
const T* input1_data,
|
||||
const RuntimeShape& unextended_input2_shape,
|
||||
const T* input2_data,
|
||||
const RuntimeShape& unextended_output_shape,
|
||||
T* output_data) {
|
||||
T output_activation_min;
|
||||
T output_activation_max;
|
||||
GetActivationParams(params, &output_activation_min, &output_activation_max);
|
||||
|
||||
TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
|
||||
const RuntimeShape output_shape =
|
||||
RuntimeShape::ExtendedShape(4, unextended_output_shape);
|
||||
|
||||
NdArrayDesc<4> desc1;
|
||||
NdArrayDesc<4> desc2;
|
||||
NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
|
||||
unextended_input2_shape, &desc1, &desc2);
|
||||
|
||||
// In Tensorflow, the dimensions are canonically named (batch_number, row,
|
||||
// col, channel), with extents (batches, height, width, depth), with the
|
||||
// trailing dimension changing most rapidly (channels has the smallest stride,
|
||||
// typically 1 element).
|
||||
//
|
||||
// In generated C code, we store arrays with the dimensions reversed. The
|
||||
// first dimension has smallest stride.
|
||||
//
|
||||
// We name our variables by their Tensorflow convention, but generate C code
|
||||
// nesting loops such that the innermost loop has the smallest stride for the
|
||||
// best cache behavior.
|
||||
for (int b = 0; b < output_shape.Dims(0); ++b) {
|
||||
for (int y = 0; y < output_shape.Dims(1); ++y) {
|
||||
for (int x = 0; x < output_shape.Dims(2); ++x) {
|
||||
for (int c = 0; c < output_shape.Dims(3); ++c) {
|
||||
output_data[Offset(output_shape, b, y, x, c)] =
|
||||
ActivationFunctionWithMinMax(
|
||||
input1_data[SubscriptToIndex(desc1, b, y, x, c)] *
|
||||
input2_data[SubscriptToIndex(desc2, b, y, x, c)],
|
||||
output_activation_min, output_activation_max);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_MUL_H_
|
||||
@@ -0,0 +1,37 @@
|
||||
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_NEG_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_NEG_H_
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
namespace reference_ops {
|
||||
|
||||
template <typename T>
|
||||
inline void Negate(const RuntimeShape& input_shape, const T* input_data,
|
||||
const RuntimeShape& output_shape, T* output_data) {
|
||||
const int flat_size = MatchingFlatSize(input_shape, output_shape);
|
||||
|
||||
for (int i = 0; i < flat_size; ++i) {
|
||||
output_data[i] = -input_data[i];
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_NEG_H_
|
||||
@@ -0,0 +1,169 @@
|
||||
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PAD_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PAD_H_
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
namespace reference_ops {
|
||||
|
||||
// TFLite Pad supports activation tensors with up to 5 dimensions.
|
||||
constexpr int PadKernelMaxDimensionCount() { return 5; }
|
||||
|
||||
// There are two versions of pad: Pad and PadV2. In PadV2 there is a second
|
||||
// scalar input that provides the padding value. Therefore pad_value_ptr can be
|
||||
// equivalent to a simple input1_data. For Pad, it should point to a zero
|
||||
// value.
|
||||
//
|
||||
// Note that two typenames are required, so that T=P=int32_t is considered a
|
||||
// specialization distinct from P=int32_t.
|
||||
template <typename T, typename P>
|
||||
inline void PadImpl(const tflite::PadParams& op_params,
|
||||
const RuntimeShape& input_shape, const T* input_data,
|
||||
const P* pad_value_ptr, const RuntimeShape& output_shape,
|
||||
T* output_data) {
|
||||
const RuntimeShape ext_input_shape =
|
||||
RuntimeShape::ExtendedShape(PadKernelMaxDimensionCount(), input_shape);
|
||||
const RuntimeShape ext_output_shape =
|
||||
RuntimeShape::ExtendedShape(PadKernelMaxDimensionCount(), output_shape);
|
||||
TFLITE_DCHECK_LE(op_params.left_padding_count, PadKernelMaxDimensionCount());
|
||||
TFLITE_DCHECK_LE(op_params.right_padding_count, PadKernelMaxDimensionCount());
|
||||
|
||||
// Runtime calls are currently fixed at 5 dimensions. Copy inputs so we can
|
||||
// pad them to 5 dims (yes, we are "padding the padding").
|
||||
int left_padding_copy[PadKernelMaxDimensionCount()];
|
||||
for (int i = 0; i < PadKernelMaxDimensionCount(); i++) {
|
||||
left_padding_copy[i] = 0;
|
||||
}
|
||||
for (int i = 0; i < op_params.left_padding_count; ++i) {
|
||||
left_padding_copy[i + PadKernelMaxDimensionCount() -
|
||||
op_params.left_padding_count] = op_params.left_padding[i];
|
||||
}
|
||||
int right_padding_copy[PadKernelMaxDimensionCount()];
|
||||
for (int i = 0; i < PadKernelMaxDimensionCount(); i++) {
|
||||
right_padding_copy[i] = 0;
|
||||
}
|
||||
for (int i = 0; i < op_params.right_padding_count; ++i) {
|
||||
right_padding_copy[i + PadKernelMaxDimensionCount() -
|
||||
op_params.right_padding_count] =
|
||||
op_params.right_padding[i];
|
||||
}
|
||||
|
||||
const int output_batch = ext_output_shape.Dims(0);
|
||||
const int output_plane = ext_output_shape.Dims(1);
|
||||
const int output_height = ext_output_shape.Dims(2);
|
||||
const int output_width = ext_output_shape.Dims(3);
|
||||
const int output_depth = ext_output_shape.Dims(4);
|
||||
|
||||
const int left_b_padding = left_padding_copy[0];
|
||||
const int left_p_padding = left_padding_copy[1];
|
||||
const int left_h_padding = left_padding_copy[2];
|
||||
const int left_w_padding = left_padding_copy[3];
|
||||
const int left_d_padding = left_padding_copy[4];
|
||||
|
||||
const int right_b_padding = right_padding_copy[0];
|
||||
const int right_p_padding = right_padding_copy[1];
|
||||
const int right_h_padding = right_padding_copy[2];
|
||||
const int right_w_padding = right_padding_copy[3];
|
||||
const int right_d_padding = right_padding_copy[4];
|
||||
|
||||
const T pad_value = *pad_value_ptr;
|
||||
|
||||
const T* in_ptr = input_data;
|
||||
T* out_ptr = output_data;
|
||||
for (int out_b = 0; out_b < output_batch; ++out_b) {
|
||||
for (int out_p = 0; out_p < output_plane; ++out_p) {
|
||||
for (int out_h = 0; out_h < output_height; ++out_h) {
|
||||
for (int out_w = 0; out_w < output_width; ++out_w) {
|
||||
for (int out_d = 0; out_d < output_depth; ++out_d) {
|
||||
if (out_b < left_b_padding ||
|
||||
out_b >= output_batch - right_b_padding ||
|
||||
out_p < left_p_padding ||
|
||||
out_p >= output_plane - right_p_padding ||
|
||||
out_h < left_h_padding ||
|
||||
out_h >= output_height - right_h_padding ||
|
||||
out_w < left_w_padding ||
|
||||
out_w >= output_width - right_w_padding ||
|
||||
out_d < left_d_padding ||
|
||||
out_d >= output_depth - right_d_padding) {
|
||||
*out_ptr++ = pad_value;
|
||||
} else {
|
||||
*out_ptr++ = *in_ptr++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, typename P>
|
||||
inline void Pad(const tflite::PadParams& op_params,
|
||||
const RuntimeShape& input_shape, const T* input_data,
|
||||
const P* pad_value_ptr, const RuntimeShape& output_shape,
|
||||
T* output_data) {
|
||||
PadImpl(op_params, input_shape, input_data, pad_value_ptr, output_shape,
|
||||
output_data);
|
||||
}
|
||||
|
||||
// The second (pad-value) input can be int32_t when, say, the first is uint8_t.
|
||||
template <typename T>
|
||||
inline void Pad(const tflite::PadParams& op_params,
|
||||
const RuntimeShape& input_shape, const T* input_data,
|
||||
const int32_t* pad_value_ptr, const RuntimeShape& output_shape,
|
||||
T* output_data) {
|
||||
const T converted_pad_value = static_cast<T>(*pad_value_ptr);
|
||||
PadImpl(op_params, input_shape, input_data, &converted_pad_value,
|
||||
output_shape, output_data);
|
||||
}
|
||||
|
||||
// This version avoids conflicting template matching.
|
||||
template <>
|
||||
inline void Pad(const tflite::PadParams& op_params,
|
||||
const RuntimeShape& input_shape, const int32_t* input_data,
|
||||
const int32_t* pad_value_ptr, const RuntimeShape& output_shape,
|
||||
int32_t* output_data) {
|
||||
PadImpl(op_params, input_shape, input_data, pad_value_ptr, output_shape,
|
||||
output_data);
|
||||
}
|
||||
|
||||
template <typename T, typename P>
|
||||
inline void PadImageStyle(const tflite::PadParams& op_params,
|
||||
const RuntimeShape& input_shape, const T* input_data,
|
||||
const P* pad_value_ptr,
|
||||
const RuntimeShape& output_shape, T* output_data) {
|
||||
Pad(op_params, input_shape, input_data, pad_value_ptr, output_shape,
|
||||
output_data);
|
||||
}
|
||||
|
||||
template <typename P>
|
||||
inline void PadImageStyle(const tflite::PadParams& op_params,
|
||||
const RuntimeShape& input_shape,
|
||||
const float* input_data, const P* pad_value_ptr,
|
||||
const RuntimeShape& output_shape,
|
||||
float* output_data) {
|
||||
Pad(op_params, input_shape, input_data, pad_value_ptr, output_shape,
|
||||
output_data);
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PAD_H_
|
||||
@@ -0,0 +1,301 @@
|
||||
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_POOLING_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_POOLING_H_
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/cppmath.h"
|
||||
#include "tensorflow/lite/kernels/internal/quantization_util.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_ops {
|
||||
|
||||
inline bool AveragePool(const PoolParams& params,
|
||||
const RuntimeShape& input_shape,
|
||||
const float* input_data,
|
||||
const RuntimeShape& output_shape, float* output_data) {
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
|
||||
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
|
||||
const int depth = MatchingDim(input_shape, 3, output_shape, 3);
|
||||
const int input_height = input_shape.Dims(1);
|
||||
const int input_width = input_shape.Dims(2);
|
||||
const int output_height = output_shape.Dims(1);
|
||||
const int output_width = output_shape.Dims(2);
|
||||
const int stride_height = params.stride_height;
|
||||
const int stride_width = params.stride_width;
|
||||
for (int batch = 0; batch < batches; ++batch) {
|
||||
for (int out_y = 0; out_y < output_height; ++out_y) {
|
||||
for (int out_x = 0; out_x < output_width; ++out_x) {
|
||||
for (int channel = 0; channel < depth; ++channel) {
|
||||
const int in_x_origin =
|
||||
(out_x * stride_width) - params.padding_values.width;
|
||||
const int in_y_origin =
|
||||
(out_y * stride_height) - params.padding_values.height;
|
||||
// Compute the boundaries of the filter region clamped so as to
|
||||
// ensure that the filter window fits in the input array.
|
||||
const int filter_x_start = std::max(0, -in_x_origin);
|
||||
const int filter_x_end =
|
||||
std::min(params.filter_width, input_width - in_x_origin);
|
||||
const int filter_y_start = std::max(0, -in_y_origin);
|
||||
const int filter_y_end =
|
||||
std::min(params.filter_height, input_height - in_y_origin);
|
||||
float total = 0.f;
|
||||
float filter_count = 0;
|
||||
for (int filter_y = filter_y_start; filter_y < filter_y_end;
|
||||
++filter_y) {
|
||||
for (int filter_x = filter_x_start; filter_x < filter_x_end;
|
||||
++filter_x) {
|
||||
const int in_x = in_x_origin + filter_x;
|
||||
const int in_y = in_y_origin + filter_y;
|
||||
total +=
|
||||
input_data[Offset(input_shape, batch, in_y, in_x, channel)];
|
||||
filter_count++;
|
||||
}
|
||||
}
|
||||
if (filter_count == 0) return false;
|
||||
const float average = total / filter_count;
|
||||
output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
|
||||
ActivationFunctionWithMinMax(average, params.float_activation_min,
|
||||
params.float_activation_max);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
inline bool AveragePool(const PoolParams& params,
|
||||
const RuntimeShape& input_shape,
|
||||
const uint8_t* input_data,
|
||||
const RuntimeShape& output_shape,
|
||||
uint8_t* output_data) {
|
||||
TFLITE_DCHECK_LE(params.quantized_activation_min,
|
||||
params.quantized_activation_max);
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
|
||||
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
|
||||
const int depth = MatchingDim(input_shape, 3, output_shape, 3);
|
||||
const int input_height = input_shape.Dims(1);
|
||||
const int input_width = input_shape.Dims(2);
|
||||
const int output_height = output_shape.Dims(1);
|
||||
const int output_width = output_shape.Dims(2);
|
||||
const int stride_height = params.stride_height;
|
||||
const int stride_width = params.stride_width;
|
||||
for (int batch = 0; batch < batches; ++batch) {
|
||||
for (int out_y = 0; out_y < output_height; ++out_y) {
|
||||
for (int out_x = 0; out_x < output_width; ++out_x) {
|
||||
for (int channel = 0; channel < depth; ++channel) {
|
||||
const int in_x_origin =
|
||||
(out_x * stride_width) - params.padding_values.width;
|
||||
const int in_y_origin =
|
||||
(out_y * stride_height) - params.padding_values.height;
|
||||
// Compute the boundaries of the filter region clamped so as to
|
||||
// ensure that the filter window fits in the input array.
|
||||
const int filter_x_start = std::max(0, -in_x_origin);
|
||||
const int filter_x_end =
|
||||
std::min(params.filter_width, input_width - in_x_origin);
|
||||
const int filter_y_start = std::max(0, -in_y_origin);
|
||||
const int filter_y_end =
|
||||
std::min(params.filter_height, input_height - in_y_origin);
|
||||
int32_t acc = 0;
|
||||
int filter_count = 0;
|
||||
for (int filter_y = filter_y_start; filter_y < filter_y_end;
|
||||
++filter_y) {
|
||||
for (int filter_x = filter_x_start; filter_x < filter_x_end;
|
||||
++filter_x) {
|
||||
const int in_x = in_x_origin + filter_x;
|
||||
const int in_y = in_y_origin + filter_y;
|
||||
acc +=
|
||||
input_data[Offset(input_shape, batch, in_y, in_x, channel)];
|
||||
filter_count++;
|
||||
}
|
||||
}
|
||||
if (filter_count == 0) return false;
|
||||
acc = (acc + filter_count / 2) / filter_count;
|
||||
acc = std::max(acc, params.quantized_activation_min);
|
||||
acc = std::min(acc, params.quantized_activation_max);
|
||||
output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
|
||||
static_cast<uint8_t>(acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
inline void L2Pool(const PoolParams& params, const RuntimeShape& input_shape,
|
||||
const float* input_data, const RuntimeShape& output_shape,
|
||||
float* output_data) {
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
|
||||
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
|
||||
const int depth = MatchingDim(input_shape, 3, output_shape, 3);
|
||||
const int input_height = input_shape.Dims(1);
|
||||
const int input_width = input_shape.Dims(2);
|
||||
const int output_height = output_shape.Dims(1);
|
||||
const int output_width = output_shape.Dims(2);
|
||||
const int stride_height = params.stride_height;
|
||||
const int stride_width = params.stride_width;
|
||||
for (int batch = 0; batch < batches; ++batch) {
|
||||
for (int out_y = 0; out_y < output_height; ++out_y) {
|
||||
for (int out_x = 0; out_x < output_width; ++out_x) {
|
||||
for (int channel = 0; channel < depth; ++channel) {
|
||||
const int in_x_origin =
|
||||
(out_x * stride_width) - params.padding_values.width;
|
||||
const int in_y_origin =
|
||||
(out_y * stride_height) - params.padding_values.height;
|
||||
// Compute the boundaries of the filter region clamped so as to
|
||||
// ensure that the filter window fits in the input array.
|
||||
const int filter_x_start = std::max(0, -in_x_origin);
|
||||
const int filter_x_end =
|
||||
std::min(params.filter_width, input_width - in_x_origin);
|
||||
const int filter_y_start = std::max(0, -in_y_origin);
|
||||
const int filter_y_end =
|
||||
std::min(params.filter_height, input_height - in_y_origin);
|
||||
float sum_squares = 0.f;
|
||||
int filter_count = 0;
|
||||
for (int filter_y = filter_y_start; filter_y < filter_y_end;
|
||||
++filter_y) {
|
||||
for (int filter_x = filter_x_start; filter_x < filter_x_end;
|
||||
++filter_x) {
|
||||
const int in_x = in_x_origin + filter_x;
|
||||
const int in_y = in_y_origin + filter_y;
|
||||
const float val =
|
||||
input_data[Offset(input_shape, batch, in_y, in_x, channel)];
|
||||
sum_squares += val * val;
|
||||
filter_count++;
|
||||
}
|
||||
}
|
||||
const float l2pool_result = std::sqrt(sum_squares / filter_count);
|
||||
output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
|
||||
ActivationFunctionWithMinMax(l2pool_result,
|
||||
params.float_activation_min,
|
||||
params.float_activation_max);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
|
||||
const float* input_data, const RuntimeShape& output_shape,
|
||||
float* output_data) {
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
|
||||
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
|
||||
const int depth = MatchingDim(input_shape, 3, output_shape, 3);
|
||||
const int input_height = input_shape.Dims(1);
|
||||
const int input_width = input_shape.Dims(2);
|
||||
const int output_height = output_shape.Dims(1);
|
||||
const int output_width = output_shape.Dims(2);
|
||||
const int stride_height = params.stride_height;
|
||||
const int stride_width = params.stride_width;
|
||||
for (int batch = 0; batch < batches; ++batch) {
|
||||
for (int out_y = 0; out_y < output_height; ++out_y) {
|
||||
for (int out_x = 0; out_x < output_width; ++out_x) {
|
||||
for (int channel = 0; channel < depth; ++channel) {
|
||||
const int in_x_origin =
|
||||
(out_x * stride_width) - params.padding_values.width;
|
||||
const int in_y_origin =
|
||||
(out_y * stride_height) - params.padding_values.height;
|
||||
// Compute the boundaries of the filter region clamped so as to
|
||||
// ensure that the filter window fits in the input array.
|
||||
const int filter_x_start = std::max(0, -in_x_origin);
|
||||
const int filter_x_end =
|
||||
std::min(params.filter_width, input_width - in_x_origin);
|
||||
const int filter_y_start = std::max(0, -in_y_origin);
|
||||
const int filter_y_end =
|
||||
std::min(params.filter_height, input_height - in_y_origin);
|
||||
float max = std::numeric_limits<float>::lowest();
|
||||
for (int filter_y = filter_y_start; filter_y < filter_y_end;
|
||||
++filter_y) {
|
||||
for (int filter_x = filter_x_start; filter_x < filter_x_end;
|
||||
++filter_x) {
|
||||
const int in_x = in_x_origin + filter_x;
|
||||
const int in_y = in_y_origin + filter_y;
|
||||
max = std::max(
|
||||
max,
|
||||
input_data[Offset(input_shape, batch, in_y, in_x, channel)]);
|
||||
}
|
||||
}
|
||||
output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
|
||||
ActivationFunctionWithMinMax(max, params.float_activation_min,
|
||||
params.float_activation_max);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
|
||||
const uint8_t* input_data, const RuntimeShape& output_shape,
|
||||
uint8_t* output_data) {
|
||||
TFLITE_DCHECK_LE(params.quantized_activation_min,
|
||||
params.quantized_activation_max);
|
||||
TFLITE_DCHECK_GE(params.quantized_activation_min, 0);
|
||||
TFLITE_DCHECK_LE(params.quantized_activation_max, 255);
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
|
||||
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
|
||||
const int depth = MatchingDim(input_shape, 3, output_shape, 3);
|
||||
const int input_height = input_shape.Dims(1);
|
||||
const int input_width = input_shape.Dims(2);
|
||||
const int output_height = output_shape.Dims(1);
|
||||
const int output_width = output_shape.Dims(2);
|
||||
const int stride_height = params.stride_height;
|
||||
const int stride_width = params.stride_width;
|
||||
for (int batch = 0; batch < batches; ++batch) {
|
||||
for (int out_y = 0; out_y < output_height; ++out_y) {
|
||||
for (int out_x = 0; out_x < output_width; ++out_x) {
|
||||
for (int channel = 0; channel < depth; ++channel) {
|
||||
const int in_x_origin =
|
||||
(out_x * stride_width) - params.padding_values.width;
|
||||
const int in_y_origin =
|
||||
(out_y * stride_height) - params.padding_values.height;
|
||||
// Compute the boundaries of the filter region clamped so as to
|
||||
// ensure that the filter window fits in the input array.
|
||||
const int filter_x_start = std::max(0, -in_x_origin);
|
||||
const int filter_x_end =
|
||||
std::min(params.filter_width, input_width - in_x_origin);
|
||||
const int filter_y_start = std::max(0, -in_y_origin);
|
||||
const int filter_y_end =
|
||||
std::min(params.filter_height, input_height - in_y_origin);
|
||||
uint8_t max = 0;
|
||||
for (int filter_y = filter_y_start; filter_y < filter_y_end;
|
||||
++filter_y) {
|
||||
for (int filter_x = filter_x_start; filter_x < filter_x_end;
|
||||
++filter_x) {
|
||||
const int in_x = in_x_origin + filter_x;
|
||||
const int in_y = in_y_origin + filter_y;
|
||||
max = std::max(
|
||||
max,
|
||||
input_data[Offset(input_shape, batch, in_y, in_x, channel)]);
|
||||
}
|
||||
}
|
||||
max = std::max<uint8_t>(max, params.quantized_activation_min);
|
||||
max = std::min<uint8_t>(max, params.quantized_activation_max);
|
||||
output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
|
||||
static_cast<uint8_t>(max);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_POOLING_H_
|
||||
@@ -0,0 +1,774 @@
|
||||
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <limits>
|
||||
#include <utility>
|
||||
|
||||
#include "fixedpoint/fixedpoint.h"
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/compatibility.h"
|
||||
#include "tensorflow/lite/kernels/internal/cppmath.h"
|
||||
#include "tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h"
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#define __restrict__ __restrict
|
||||
#endif
|
||||
|
||||
namespace tflite {
|
||||
namespace tensor_utils {
|
||||
|
||||
namespace {
|
||||
const int32_t kInt16Max = std::numeric_limits<int16_t>::max();
|
||||
const int32_t kInt16Min = std::numeric_limits<int16_t>::min();
|
||||
} // namespace
|
||||
|
||||
void PortableSymmetricQuantizeFloats(const float* values, const int size,
|
||||
int8_t* quantized_values, float* min_value,
|
||||
float* max_value, float* scaling_factor) {
|
||||
auto minmax = std::minmax_element(values, values + size);
|
||||
*min_value = *minmax.first;
|
||||
*max_value = *minmax.second;
|
||||
|
||||
PortableSymmetricQuantizeFloats(values, size, quantized_values, *min_value,
|
||||
*max_value, scaling_factor);
|
||||
}
|
||||
|
||||
void PortableSymmetricQuantizeFloats(const float* values, const int size,
|
||||
int8_t* quantized_values, float min_value,
|
||||
float max_value, float* scaling_factor) {
|
||||
const int32_t kScale = 127;
|
||||
const float range = std::max(std::abs(min_value), std::abs(max_value));
|
||||
if (range == 0) {
|
||||
memset(quantized_values, 0, size * sizeof(int8_t));
|
||||
*scaling_factor = 1;
|
||||
return;
|
||||
}
|
||||
*scaling_factor = range / kScale;
|
||||
const float scaling_factor_inv = kScale / range;
|
||||
for (int i = 0; i < size; ++i) {
|
||||
const int32_t quantized_value =
|
||||
static_cast<int32_t>(TfLiteRound(values[i] * scaling_factor_inv));
|
||||
// Clamp: just in case some odd numeric offset.
|
||||
quantized_values[i] = static_cast<int8_t>(
|
||||
std::min(kScale, std::max(-kScale, quantized_value)));
|
||||
}
|
||||
}
|
||||
|
||||
void PortableAsymmetricQuantizeFloats(const float* values, const int size,
|
||||
int8_t* quantized_values,
|
||||
float* scaling_factor, int32_t* offset) {
|
||||
const int32_t kMinScale = -128;
|
||||
const int32_t kMaxScale = 127;
|
||||
const double qmin_double = kMinScale;
|
||||
const double qmax_double = kMaxScale;
|
||||
const auto minmax = std::minmax_element(values, values + size);
|
||||
const double rmin = static_cast<double>(std::min(0.0f, *minmax.first));
|
||||
const double rmax = static_cast<double>(std::max(0.0f, *minmax.second));
|
||||
if (rmin == rmax) {
|
||||
memset(quantized_values, 0, size * sizeof(int8_t));
|
||||
*scaling_factor = 1;
|
||||
*offset = 0;
|
||||
return;
|
||||
} else {
|
||||
double scale = (rmax - rmin) / (qmax_double - qmin_double);
|
||||
const double zero_point_from_min = qmin_double - rmin / scale;
|
||||
const double zero_point_from_max = qmax_double - rmax / scale;
|
||||
const double zero_point_from_min_error =
|
||||
std::abs(qmin_double) + std::abs(rmin / scale);
|
||||
const double zero_point_from_max_error =
|
||||
std::abs(qmax_double) + std::abs(rmax / scale);
|
||||
const double zero_point_double =
|
||||
zero_point_from_min_error < zero_point_from_max_error
|
||||
? zero_point_from_min
|
||||
: zero_point_from_max;
|
||||
int8_t nudged_zero_point = 0;
|
||||
if (zero_point_double <= qmin_double) {
|
||||
nudged_zero_point = kMinScale;
|
||||
} else if (zero_point_double >= qmax_double) {
|
||||
nudged_zero_point = kMaxScale;
|
||||
} else {
|
||||
nudged_zero_point = static_cast<int8_t>(round(zero_point_double));
|
||||
}
|
||||
*scaling_factor = scale;
|
||||
*offset = nudged_zero_point;
|
||||
}
|
||||
const float scaling_factor_inv = 1.0f / *scaling_factor;
|
||||
for (int i = 0; i < size; ++i) {
|
||||
const int32_t quantized_value = static_cast<int32_t>(
|
||||
TfLiteRound(*offset + values[i] * scaling_factor_inv));
|
||||
quantized_values[i] =
|
||||
std::min(kMaxScale, std::max(kMinScale, quantized_value));
|
||||
}
|
||||
}
|
||||
|
||||
void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
|
||||
int m_rows, int m_cols,
|
||||
const float* vector,
|
||||
int n_batch, float* result) {
|
||||
float* result_in_batch = result;
|
||||
for (int b = 0; b < n_batch; b++) {
|
||||
const float* matrix_ptr = matrix;
|
||||
for (int r = 0; r < m_rows; r++) {
|
||||
float dot_prod = 0.0f;
|
||||
const float* vector_in_batch = vector + b * m_cols;
|
||||
for (int c = 0; c < m_cols; c++) {
|
||||
dot_prod += *matrix_ptr++ * *vector_in_batch++;
|
||||
}
|
||||
*result_in_batch += dot_prod;
|
||||
++result_in_batch;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void PortableMatrixBatchVectorMultiplyAccumulate(
|
||||
const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
|
||||
const int8_t* __restrict__ vectors, const float* scaling_factors,
|
||||
int n_batch, float* __restrict__ result) {
|
||||
for (int batch = 0; batch < n_batch; ++batch, vectors += m_cols) {
|
||||
const float batch_scaling_factor = scaling_factors[batch];
|
||||
// Get the address of the first row.
|
||||
const int8_t* row_ptr = matrix;
|
||||
for (int row = 0; row < m_rows; ++row) {
|
||||
// Initialize the dot product sum for the row to 0.
|
||||
int32_t dotprod = 0;
|
||||
#if defined(__GNUC__)
|
||||
// Prefetch the row to cache.
|
||||
__builtin_prefetch(row_ptr, 0 /* prefetch for read */,
|
||||
3 /* temporal locality */);
|
||||
#endif
|
||||
for (int col = 0; col < m_cols; ++col, ++row_ptr) {
|
||||
dotprod += (*row_ptr) * (vectors[col]);
|
||||
} // for col
|
||||
*result += dotprod * batch_scaling_factor;
|
||||
++result;
|
||||
} // for row
|
||||
} // for batch
|
||||
}
|
||||
|
||||
void PortableMatrixBatchVectorMultiplyAccumulate(
|
||||
const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
|
||||
const int8_t* __restrict__ vectors, const float* scaling_factors,
|
||||
int n_batch, float* __restrict__ result, const float* per_channel_scale,
|
||||
const int32_t* input_offset, int32_t* scratch, int32_t* row_sums,
|
||||
bool* compute_row_sums, CpuBackendContext* context) {
|
||||
if (input_offset == nullptr) {
|
||||
PortableMatrixBatchVectorMultiplyAccumulate(
|
||||
matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result);
|
||||
return;
|
||||
}
|
||||
if (!compute_row_sums || *compute_row_sums) {
|
||||
PortableReductionSumVector(matrix, row_sums, m_rows, m_cols);
|
||||
if (compute_row_sums) {
|
||||
*compute_row_sums = false;
|
||||
}
|
||||
}
|
||||
|
||||
for (int batch = 0; batch < n_batch; ++batch, vectors += m_cols) {
|
||||
const float batch_scaling_factor = scaling_factors[batch];
|
||||
const int32_t batch_offset = input_offset[batch];
|
||||
const int8_t* row_ptr = matrix;
|
||||
for (int row = 0; row < m_rows; ++row) {
|
||||
int32_t dotprod = 0;
|
||||
float scale = batch_scaling_factor;
|
||||
if (per_channel_scale) {
|
||||
scale *= per_channel_scale[row];
|
||||
}
|
||||
#if defined(__GNUC__)
|
||||
// Prefetch the row to cache.
|
||||
__builtin_prefetch(row_ptr, 0 /* prefetch for read */,
|
||||
3 /* temporal locality */);
|
||||
#endif
|
||||
for (int col = 0; col < m_cols; ++col, ++row_ptr) {
|
||||
dotprod += (*row_ptr) * vectors[col];
|
||||
} // for col
|
||||
dotprod -= row_sums[row] * batch_offset;
|
||||
*result += dotprod * scale;
|
||||
++result;
|
||||
} // for row
|
||||
} // for batch
|
||||
}
|
||||
|
||||
void PortableSparseMatrixBatchVectorMultiplyAccumulate1x4(
|
||||
const float* __restrict__ matrix, const int32_t* __restrict__ segments,
|
||||
const int32_t* __restrict__ indices, int m_rows, int m_cols,
|
||||
const float* __restrict__ vector, int n_batch, float* __restrict__ result) {
|
||||
const int kBlockSize = 4;
|
||||
TFLITE_DCHECK_EQ(m_cols % kBlockSize, 0);
|
||||
for (int batch = 0; batch < n_batch; batch++) {
|
||||
const float* matrix_ptr = matrix;
|
||||
for (int row = 0; row < m_rows; row++) {
|
||||
float dot_prod = 0.0f;
|
||||
const float* vector_in_batch = vector + batch * m_cols;
|
||||
for (int i = segments[row]; i < segments[row + 1]; i++) {
|
||||
const int block_start_index = indices[i] * kBlockSize;
|
||||
const float* vector_block_in_batch_ptr =
|
||||
vector_in_batch + block_start_index;
|
||||
for (int c = 0; c < kBlockSize; c++) {
|
||||
dot_prod += *matrix_ptr++ * *vector_block_in_batch_ptr++;
|
||||
}
|
||||
}
|
||||
result[batch * m_rows + row] += dot_prod;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void PortableSparseMatrixBatchVectorMultiplyAccumulate(
|
||||
const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
|
||||
int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
|
||||
float* __restrict__ result) {
|
||||
const int kBlockSize = 16;
|
||||
TFLITE_DCHECK_EQ( // NOLINT
|
||||
m_cols % kBlockSize, 0);
|
||||
for (int batch = 0; batch < n_batch; batch++) {
|
||||
const float* matrix_ptr = matrix;
|
||||
const uint8_t* ledger_ptr = ledger;
|
||||
for (int row = 0; row < m_rows; row++) {
|
||||
float dot_prod = 0.0f;
|
||||
int num_nonzero_blocks = *ledger_ptr++;
|
||||
if (num_nonzero_blocks > 0) {
|
||||
const float* vector_in_batch = vector + batch * m_cols;
|
||||
for (int i = 0; i < num_nonzero_blocks; i++) {
|
||||
const int block_start_index = *ledger_ptr++ * kBlockSize;
|
||||
const float* vector_block_in_batch_ptr =
|
||||
vector_in_batch + block_start_index;
|
||||
for (int c = 0; c < kBlockSize; c++) {
|
||||
dot_prod += *matrix_ptr++ * *vector_block_in_batch_ptr++;
|
||||
}
|
||||
}
|
||||
}
|
||||
result[batch * m_rows + row] += dot_prod;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void PortableSparseMatrixBatchVectorMultiplyAccumulate(
|
||||
const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
|
||||
const int m_cols, const int8_t* __restrict__ vectors,
|
||||
const float* scaling_factors, int n_batch, float* __restrict__ result) {
|
||||
static const int kBlockSize = 16;
|
||||
TFLITE_DCHECK_EQ( // NOLINT
|
||||
m_cols % kBlockSize, 0);
|
||||
for (int batch = 0; batch < n_batch; ++batch, vectors += m_cols) {
|
||||
const float batch_scaling_factor = scaling_factors[batch];
|
||||
const uint8_t* ledger_ptr = ledger;
|
||||
// Get the address of the first row.
|
||||
const int8_t* row_ptr = matrix;
|
||||
for (int row = 0; row < m_rows; ++row) {
|
||||
// Initialize the dot product sum for the row to 0.
|
||||
int32_t dotprod = 0;
|
||||
#if defined(__GNUC__)
|
||||
// Prefetch the row to cache.
|
||||
__builtin_prefetch(row_ptr, 0 /* prefetch for read */,
|
||||
3 /* temporal locality */);
|
||||
#endif
|
||||
int num_nonzero_blocks = *ledger_ptr++;
|
||||
for (int i = 0; i < num_nonzero_blocks; i++) {
|
||||
const int block_start_index = *ledger_ptr++ * kBlockSize;
|
||||
const int8_t* vector_block_ptr = vectors + block_start_index;
|
||||
for (int c = 0; c < kBlockSize; c++) {
|
||||
dotprod += (*row_ptr++) * (*vector_block_ptr++);
|
||||
} // for block
|
||||
} // for num_nonzero_blocks
|
||||
result[batch * m_rows + row] += dotprod * batch_scaling_factor;
|
||||
} // for row
|
||||
} // for batch
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void PortableMatrixBatchVectorMultiplyAccumulateImpl(
|
||||
const int8_t* input, const int32_t* bias,
|
||||
const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
|
||||
int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
|
||||
T* output) {
|
||||
const int16_t output_max = std::numeric_limits<T>::max();
|
||||
const int16_t output_min = std::numeric_limits<T>::min();
|
||||
for (int batch = 0; batch < n_batch; ++batch) {
|
||||
for (int row = 0; row < n_output; ++row) {
|
||||
int32_t acc = bias[row];
|
||||
for (int col = 0; col < n_input; ++col) {
|
||||
int8_t input_val = input[batch * n_input + col];
|
||||
int8_t weights_val = input_to_gate_weights[row * n_input + col];
|
||||
acc += input_val * weights_val;
|
||||
}
|
||||
acc = MultiplyByQuantizedMultiplier(acc, multiplier, shift);
|
||||
acc += output_zp;
|
||||
acc += output[batch * n_output + row];
|
||||
if (acc > output_max) {
|
||||
acc = output_max;
|
||||
}
|
||||
if (acc < output_min) {
|
||||
acc = output_min;
|
||||
}
|
||||
output[batch * n_output + row] = static_cast<T>(acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void PortableMatrixBatchVectorMultiplyAccumulate(
|
||||
const int8_t* input, const int32_t* bias,
|
||||
const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
|
||||
int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
|
||||
int32_t* scratch, int16_t* output, CpuBackendContext* context) {
|
||||
PortableMatrixBatchVectorMultiplyAccumulateImpl(
|
||||
input, bias, input_to_gate_weights, multiplier, shift, n_batch, n_input,
|
||||
n_output, output_zp, output);
|
||||
}
|
||||
|
||||
void PortableMatrixBatchVectorMultiplyAccumulate(
|
||||
const int8_t* input, const int32_t* bias,
|
||||
const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
|
||||
int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
|
||||
int32_t* scratch, int8_t* output, CpuBackendContext* context) {
|
||||
PortableMatrixBatchVectorMultiplyAccumulateImpl(
|
||||
input, bias, input_to_gate_weights, multiplier, shift, n_batch, n_input,
|
||||
n_output, output_zp, output);
|
||||
}
|
||||
|
||||
void PortableMatrixBatchVectorMultiply(const int8_t* input,
|
||||
int32_t input_zeropoint,
|
||||
const int8_t* input_to_gate_weights,
|
||||
int32_t input_to_gate_effective_scale_a,
|
||||
int32_t input_to_gate_effective_scale_b,
|
||||
int32_t n_batch, int32_t n_input,
|
||||
int32_t n_cell, int8_t* gate_output,
|
||||
int8_t gate_output_zp) {
|
||||
const int32_t int8_max = std::numeric_limits<int8_t>::max();
|
||||
const int32_t int8_min = std::numeric_limits<int8_t>::min();
|
||||
for (int batch = 0; batch < n_batch; ++batch) {
|
||||
for (int row = 0; row < n_cell; ++row) {
|
||||
int32_t acc = 0;
|
||||
for (int col = 0; col < n_input; ++col) {
|
||||
int32_t input_val = input[batch * n_input + col];
|
||||
int8_t weights_val = input_to_gate_weights[row * n_input + col];
|
||||
acc += (input_val - input_zeropoint) * weights_val;
|
||||
}
|
||||
acc = MultiplyByQuantizedMultiplier(acc, input_to_gate_effective_scale_a,
|
||||
input_to_gate_effective_scale_b);
|
||||
acc += gate_output_zp;
|
||||
if (acc > int8_max) {
|
||||
acc = int8_max;
|
||||
}
|
||||
if (acc < int8_min) {
|
||||
acc = int8_min;
|
||||
}
|
||||
gate_output[batch * n_cell + row] = static_cast<int8_t>(acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void PortableMatrixBatchVectorMultiply(
|
||||
const int16_t* hidden, const int8_t* hidden_to_output_weights,
|
||||
int32_t proj_effective_scale_a, int32_t proj_effective_scale_b,
|
||||
const int32_t* gate_bias, int32_t n_batch, int32_t n_hidden,
|
||||
int32_t n_output, int32_t output_zp, int8_t* proj_output) {
|
||||
const int16_t int8_max = std::numeric_limits<int8_t>::max();
|
||||
const int16_t int8_min = std::numeric_limits<int8_t>::min();
|
||||
for (int batch = 0; batch < n_batch; ++batch) {
|
||||
for (int row = 0; row < n_output; ++row) {
|
||||
int64_t acc = gate_bias[row];
|
||||
for (int col = 0; col < n_hidden; ++col) {
|
||||
int16_t input_val = hidden[batch * n_hidden + col];
|
||||
int8_t weights_val = hidden_to_output_weights[row * n_hidden + col];
|
||||
int64_t curr = acc;
|
||||
acc += input_val * weights_val;
|
||||
if (input_val * weights_val > 0 && acc < curr) {
|
||||
acc = std::numeric_limits<int32_t>::max();
|
||||
}
|
||||
if (input_val * weights_val < 0 && acc > curr) {
|
||||
acc = std::numeric_limits<int32_t>::min();
|
||||
}
|
||||
}
|
||||
acc = MultiplyByQuantizedMultiplier(acc, proj_effective_scale_a,
|
||||
proj_effective_scale_b);
|
||||
acc += output_zp;
|
||||
if (acc > int8_max) {
|
||||
acc = int8_max;
|
||||
}
|
||||
if (acc < int8_min) {
|
||||
acc = int8_min;
|
||||
}
|
||||
proj_output[batch * n_output + row] = acc;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void PortableApplyLayerNorm(const int16_t* input,
|
||||
const int16_t* layer_norm_weights,
|
||||
const int32_t* bias, int32_t layer_norm_scale_a,
|
||||
int32_t layer_norm_scale_b, int32_t variance_limit,
|
||||
int n_batch, int n_input, int16_t* output) {
|
||||
// The square of std::pow(2, 10), which is the extra factor that makes sure
|
||||
// normalized values has enough resolution.
|
||||
static const int kTwoToPower20 = 1 << 20;
|
||||
for (int i = 0; i < n_batch; ++i) {
|
||||
int64_t sum = 0;
|
||||
int64_t sum_sq = 0;
|
||||
for (int j = 0; j < n_input; ++j) {
|
||||
const int32_t index = i * n_input + j;
|
||||
int32_t val = static_cast<int32_t>(input[index]);
|
||||
sum += val;
|
||||
sum_sq += val * val;
|
||||
}
|
||||
int32_t mean =
|
||||
static_cast<int32_t>(static_cast<int64_t>(sum) * 1024 / n_input);
|
||||
// TODO(b/173994730): Avoids overflow but only works for POT n_input.
|
||||
int32_t temp = kTwoToPower20 / n_input;
|
||||
int64_t variance =
|
||||
sum_sq * temp - static_cast<int64_t>(mean) * static_cast<int64_t>(mean);
|
||||
int32_t variance2 = static_cast<int32_t>(variance / kTwoToPower20);
|
||||
if (variance2 < 1) {
|
||||
variance2 = variance_limit;
|
||||
}
|
||||
int32_t stddev_inverse_a;
|
||||
int stddev_inverse_b;
|
||||
GetInvSqrtQuantizedMultiplierExp(variance2, /*reverse_shift*/ -1,
|
||||
&stddev_inverse_a, &stddev_inverse_b);
|
||||
|
||||
for (int j = 0; j < n_input; ++j) {
|
||||
const int32_t index = i * n_input + j;
|
||||
int32_t val = static_cast<int32_t>(input[index]);
|
||||
int32_t shifted = 1024 * val - mean;
|
||||
int32_t rescaled = MultiplyByQuantizedMultiplier(
|
||||
shifted, stddev_inverse_a, stddev_inverse_b);
|
||||
// TODO(jianlijianli): Saturate this.
|
||||
int64_t val3 = rescaled * layer_norm_weights[j] + bias[j];
|
||||
int32_t val4 =
|
||||
static_cast<int32_t>((val3 > 0 ? val3 + 512 : val3 - 512) / 1024);
|
||||
int32_t val5 = MultiplyByQuantizedMultiplier(val4, layer_norm_scale_a,
|
||||
layer_norm_scale_b + 12);
|
||||
val5 = std::min(std::max(kInt16Min, val5), kInt16Max);
|
||||
output[index] = static_cast<int16_t>(val5);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void PortableApplyLayerNormFloat(const int16_t* input,
|
||||
const int16_t* layer_norm_weights,
|
||||
int32_t layer_norm_scale_a,
|
||||
int32_t layer_norm_scale_b,
|
||||
const int32_t* bias, int n_batch, int n_input,
|
||||
int16_t* output) {
|
||||
const int32_t int16_max = std::numeric_limits<int16_t>::max();
|
||||
const int32_t int16_min = std::numeric_limits<int16_t>::min();
|
||||
const float layer_norm_scale =
|
||||
layer_norm_scale_a *
|
||||
std::pow(2.0, static_cast<double>(layer_norm_scale_b - 31));
|
||||
const float bias_scale =
|
||||
static_cast<float>(std::pow(2.0, -10)) * layer_norm_scale;
|
||||
|
||||
for (int batch = 0; batch < n_batch; ++batch) {
|
||||
float sum = 0.0f;
|
||||
float sum_sq = 0.0f;
|
||||
for (int i = 0; i < n_input; ++i) {
|
||||
const int index = batch * n_input + i;
|
||||
const float value = static_cast<float>(input[index]);
|
||||
sum += value;
|
||||
sum_sq += value * value;
|
||||
}
|
||||
const float mean = sum / n_input;
|
||||
float stddev_inv = 0.0f;
|
||||
const float variance = sum_sq / n_input - mean * mean;
|
||||
if (variance == 0) {
|
||||
stddev_inv = 1.0f / std::sqrt(1e-8f);
|
||||
} else {
|
||||
stddev_inv = 1.0f / std::sqrt(variance);
|
||||
}
|
||||
for (int i = 0; i < n_input; ++i) {
|
||||
const int index = batch * n_input + i;
|
||||
const float normalized_value =
|
||||
(static_cast<float>(input[index]) - mean) * stddev_inv;
|
||||
const float weighted_normalized_value =
|
||||
normalized_value * layer_norm_weights[i] * layer_norm_scale +
|
||||
bias[i] * bias_scale;
|
||||
const int32_t quant_output = static_cast<int32_t>(round(
|
||||
weighted_normalized_value * static_cast<float>(std::pow(2, 12))));
|
||||
output[index] = std::min(int16_max, std::max(int16_min, quant_output));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void PortableMatrixScalarMultiplyAccumulate(const int8_t* matrix,
|
||||
int32_t scalar, int32_t n_row,
|
||||
int32_t n_col, int32_t* output) {
|
||||
for (int i = 0; i < n_row; ++i) {
|
||||
int32_t row_sum = 0;
|
||||
for (int j = 0; j < n_col; ++j) {
|
||||
row_sum += *matrix++;
|
||||
}
|
||||
output[i] += row_sum * scalar;
|
||||
}
|
||||
}
|
||||
|
||||
void PortableApplySigmoid(const int16_t* input, int32_t n_batch,
|
||||
int32_t n_input, int16_t* output) {
|
||||
for (int batch = 0; batch < n_batch; ++batch) {
|
||||
for (int c = 0; c < n_input; c++) {
|
||||
using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
|
||||
using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
|
||||
const int index = batch * n_input + c;
|
||||
F3 sigmoid_input = F3::FromRaw(input[index]);
|
||||
F0 sigmoid_output = gemmlowp::logistic(sigmoid_input);
|
||||
output[index] = sigmoid_output.raw();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void PortableApplySigmoidFloat(const int16_t* input, int32_t n_batch,
|
||||
int32_t n_input, int16_t* output) {
|
||||
const int32_t int16_max = std::numeric_limits<int16_t>::max();
|
||||
const int32_t int16_min = std::numeric_limits<int16_t>::min();
|
||||
for (int batch = 0; batch < n_batch; ++batch) {
|
||||
for (int i = 0; i < n_input; ++i) {
|
||||
const int index = batch * n_input + i;
|
||||
const float float_input =
|
||||
input[index] * static_cast<float>(std::pow(2, -12));
|
||||
const float float_output = 1.0f / (1.0f + std::exp(-float_input));
|
||||
const int32_t quant_output = static_cast<int32_t>(
|
||||
float_output * static_cast<float>(std::pow(2, 15)));
|
||||
const int32_t quant_output_clamped =
|
||||
std::min(int16_max, std::max(int16_min, quant_output));
|
||||
output[index] = static_cast<int16_t>(quant_output_clamped);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int IntegerBits>
|
||||
void PortableApplyTanhImpl(const int16_t* input, int32_t n_batch,
|
||||
int32_t n_input, int16_t* output) {
|
||||
using FX = gemmlowp::FixedPoint<std::int16_t, IntegerBits>;
|
||||
using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
|
||||
for (int batch = 0; batch < n_batch; ++batch) {
|
||||
for (int i = 0; i < n_input; ++i) {
|
||||
const int index = batch * n_input + i;
|
||||
FX tanh_input = FX::FromRaw(input[index]);
|
||||
F0 tanh_output = gemmlowp::tanh(tanh_input);
|
||||
output[index] = tanh_output.raw();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void PortableApplyTanh(int32_t integer_bits, const int16_t* input,
|
||||
int32_t n_batch, int32_t n_input, int16_t* output) {
|
||||
assert(integer_bits <= 6);
|
||||
#define DISPATCH_TANH(i) \
|
||||
case i: \
|
||||
PortableApplyTanhImpl<i>(input, n_batch, n_input, output); \
|
||||
break;
|
||||
switch (integer_bits) {
|
||||
DISPATCH_TANH(0);
|
||||
DISPATCH_TANH(1);
|
||||
DISPATCH_TANH(2);
|
||||
DISPATCH_TANH(3);
|
||||
DISPATCH_TANH(4);
|
||||
DISPATCH_TANH(5);
|
||||
DISPATCH_TANH(6);
|
||||
default:
|
||||
return;
|
||||
}
|
||||
#undef DISPATCH_TANH
|
||||
}
|
||||
|
||||
void PortableApplyTanhFloat(const int16_t* input, int32_t n_batch,
|
||||
int32_t n_input, int32_t integer_bits,
|
||||
int16_t* output) {
|
||||
const int32_t int16_max = std::numeric_limits<int16_t>::max();
|
||||
const int32_t int16_min = std::numeric_limits<int16_t>::min();
|
||||
const double two = 2.0;
|
||||
for (int batch = 0; batch < n_batch; ++batch) {
|
||||
for (int i = 0; i < n_input; ++i) {
|
||||
const int index = batch * n_input + i;
|
||||
const float float_input =
|
||||
input[index] * std::pow(two, static_cast<double>(integer_bits));
|
||||
const float float_output = std::tanh(float_input);
|
||||
const int32_t quant_output = static_cast<int32_t>(
|
||||
float_output * static_cast<float>(std::pow(2, 15)));
|
||||
const int32_t quant_output_clamped =
|
||||
std::min(int16_max, std::max(int16_min, quant_output));
|
||||
output[index] = static_cast<int16_t>(quant_output_clamped);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2,
|
||||
int n_batch, int n_input, int shift, int16_t* output) {
|
||||
for (int batch = 0; batch < n_batch; ++batch) {
|
||||
for (int i = 0; i < n_input; ++i) {
|
||||
const int index = batch * n_input + i;
|
||||
const int16_t a = input_1[index];
|
||||
const int16_t b = input_2[index];
|
||||
const int32_t value = static_cast<int32_t>(a) * static_cast<int32_t>(b);
|
||||
output[index] =
|
||||
static_cast<int16_t>(gemmlowp::RoundingDivideByPOT(value, shift));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2,
|
||||
int32_t multiplier, int32_t shift, int32_t n_batch,
|
||||
int32_t n_input, int32_t output_zp, int8_t* output) {
|
||||
for (int batch = 0; batch < n_batch; ++batch) {
|
||||
for (int i = 0; i < n_input; ++i) {
|
||||
const int index = batch * n_input + i;
|
||||
const int16_t a = input_1[index];
|
||||
const int16_t b = input_2[index];
|
||||
int32_t value = static_cast<int32_t>(a) * static_cast<int32_t>(b);
|
||||
value = MultiplyByQuantizedMultiplier(value, multiplier, shift);
|
||||
value -= output_zp;
|
||||
value = std::min(std::max(static_cast<int32_t>(-128), value),
|
||||
static_cast<int32_t>(127));
|
||||
|
||||
output[index] = static_cast<int8_t>(value);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void PortableCwiseAdd(const int16_t* input_1, const int16_t* input_2,
|
||||
int n_batch, int n_input, int16_t* output) {
|
||||
for (int batch = 0; batch < n_batch; ++batch) {
|
||||
for (int i = 0; i < n_input; ++i) {
|
||||
const int index = batch * n_input + i;
|
||||
int32_t sum = input_1[index] + input_2[index];
|
||||
const int32_t sum_clamped = std::min(kInt16Max, std::max(kInt16Min, sum));
|
||||
output[index] = static_cast<int16_t>(sum_clamped);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
float PortableVectorVectorDotProduct(const float* vector1, const float* vector2,
|
||||
int v_size) {
|
||||
float result = 0.0;
|
||||
for (int v = 0; v < v_size; v++) {
|
||||
result += *vector1++ * *vector2++;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
namespace {
|
||||
inline int32_t VectorVectorDotProduct(const int16_t* vector1,
|
||||
const int16_t* vector2, int v_size) {
|
||||
int32_t result = 0;
|
||||
for (int v = 0; v < v_size; v++) {
|
||||
result += *vector1++ * *vector2++;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
void PortableBatchVectorBatchVectorDotProduct(const int16_t* vector1,
|
||||
const int16_t* vector2,
|
||||
int v_size, int n_batch,
|
||||
int32_t* result) {
|
||||
for (int b = 0; b < n_batch; b++) {
|
||||
result[b] = VectorVectorDotProduct(vector1, vector2, v_size);
|
||||
vector1 += v_size;
|
||||
vector2 += v_size;
|
||||
}
|
||||
}
|
||||
|
||||
void PortableVectorBatchVectorCwiseProductAccumulate(
|
||||
const int16_t* vector, int v_size, const int16_t* batch_vector, int n_batch,
|
||||
int32_t multiplier, int shift, int16_t* result) {
|
||||
for (int b = 0; b < n_batch; b++) {
|
||||
for (int v = 0; v < v_size; v++) {
|
||||
int32_t prod = vector[v] * *batch_vector++;
|
||||
prod = MultiplyByQuantizedMultiplier(prod, multiplier, shift);
|
||||
int32_t output = prod + *result;
|
||||
output = std::max(std::min(static_cast<int32_t>(32767), output),
|
||||
static_cast<int32_t>(-32768));
|
||||
*result++ = output;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void PortableSub1Vector(const float* vector, int v_size, float* result) {
|
||||
for (int v = 0; v < v_size; v++) {
|
||||
*result++ = 1.0f - *vector++;
|
||||
}
|
||||
}
|
||||
|
||||
void PortableSub1Vector(const int16_t* vector, int v_size, int16_t* result) {
|
||||
static const int16_t kOne = 32767;
|
||||
for (int v = 0; v < v_size; v++) {
|
||||
*result++ = kOne - *vector++;
|
||||
}
|
||||
}
|
||||
|
||||
void PortableVectorScalarMultiply(const int8_t* vector, const int v_size,
|
||||
const float scale, float* result) {
|
||||
for (int v = 0; v < v_size; ++v) {
|
||||
*result++ = scale * *vector++;
|
||||
}
|
||||
}
|
||||
|
||||
void PortableMeanStddevNormalization(const float* __restrict__ input_vector,
|
||||
float* __restrict__ output_vector,
|
||||
int v_size, int n_batch) {
|
||||
for (int batch = 0; batch < n_batch; ++batch) {
|
||||
float sum = 0.0f;
|
||||
for (int i = 0; i < v_size; ++i) {
|
||||
sum += input_vector[i];
|
||||
}
|
||||
const float mean = sum / v_size;
|
||||
float sum_diff_sq = 0.0f;
|
||||
for (int i = 0; i < v_size; ++i) {
|
||||
const float diff = input_vector[i] - mean;
|
||||
sum_diff_sq += diff * diff;
|
||||
}
|
||||
const float variance = sum_diff_sq / v_size;
|
||||
constexpr float kNormalizationConstant = 1e-8f;
|
||||
const float stddev_inv =
|
||||
1.0f / std::sqrt(variance + kNormalizationConstant);
|
||||
for (int i = 0; i < v_size; ++i) {
|
||||
output_vector[i] = (input_vector[i] - mean) * stddev_inv;
|
||||
}
|
||||
input_vector += v_size;
|
||||
output_vector += v_size;
|
||||
}
|
||||
}
|
||||
|
||||
void PortableTwoGateSaturatingAdd(const int8_t* input, int8_t input_zp,
|
||||
const int8_t* recurrent, int8_t recurrent_zp,
|
||||
int32_t input_effective_scale_a,
|
||||
int32_t input_effective_scale_b,
|
||||
int32_t recurrent_effective_scale_a,
|
||||
int32_t recurrent_effective_scale_b,
|
||||
int32_t n_batch, int32_t n_cell,
|
||||
int16_t* output) {
|
||||
const int32_t int16_max = std::numeric_limits<int16_t>::max();
|
||||
const int32_t int16_min = std::numeric_limits<int16_t>::min();
|
||||
for (int i = 0; i < n_batch * n_cell; ++i) {
|
||||
int32_t x = static_cast<int32_t>(input[i]) - static_cast<int32_t>(input_zp);
|
||||
int32_t h =
|
||||
static_cast<int32_t>(recurrent[i]) - static_cast<int32_t>(recurrent_zp);
|
||||
int32_t x_scaled = MultiplyByQuantizedMultiplier(x, input_effective_scale_a,
|
||||
input_effective_scale_b);
|
||||
int32_t h_scaled = MultiplyByQuantizedMultiplier(
|
||||
h, recurrent_effective_scale_a, recurrent_effective_scale_b);
|
||||
int32_t y = h_scaled + x_scaled;
|
||||
if (y > int16_max) {
|
||||
y = int16_max;
|
||||
}
|
||||
if (y < int16_min) {
|
||||
y = int16_min;
|
||||
}
|
||||
output[i] = static_cast<int16_t>(y);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace tensor_utils
|
||||
} // namespace tflite
|
||||
@@ -0,0 +1,235 @@
|
||||
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_IMPL_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_IMPL_H_
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdint>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#define __restrict__ __restrict
|
||||
#endif
|
||||
|
||||
namespace tflite {
|
||||
|
||||
// Not all backends support CpuBackendContext usage, so forward declare to avoid
|
||||
// pulling in its implementation.
|
||||
class CpuBackendContext;
|
||||
|
||||
namespace tensor_utils {
|
||||
|
||||
template <typename T>
|
||||
bool PortableIsZeroVector(const T* vector, int v_size) {
|
||||
for (int i = 0; i < v_size; ++i) {
|
||||
if (vector[i] != 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void PortableSymmetricQuantizeFloats(const float* values, const int size,
|
||||
int8_t* quantized_values, float* min_value,
|
||||
float* max_value, float* scaling_factor);
|
||||
|
||||
void PortableSymmetricQuantizeFloats(const float* values, const int size,
|
||||
int8_t* quantized_values, float min_value,
|
||||
float max_value, float* scaling_factor);
|
||||
|
||||
void PortableAsymmetricQuantizeFloats(const float* values, const int size,
|
||||
int8_t* quantized_values,
|
||||
float* scaling_factor, int32_t* offset);
|
||||
|
||||
// Multiply a matrix by a batch vector, and store results in a batch-size
|
||||
// vector.
|
||||
void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
|
||||
int m_rows, int m_cols,
|
||||
const float* vector,
|
||||
int n_batch, float* result);
|
||||
|
||||
void PortableMatrixBatchVectorMultiplyAccumulate(
|
||||
const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
|
||||
const int8_t* __restrict__ vectors, const float* scaling_factors,
|
||||
int n_batch, float* __restrict__ result);
|
||||
|
||||
void PortableMatrixBatchVectorMultiplyAccumulate(
|
||||
const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
|
||||
const int8_t* __restrict__ vectors, const float* scaling_factors,
|
||||
int n_batch, float* __restrict__ result, const float* per_channel_scale,
|
||||
const int32_t* input_offset, int32_t* scratch, int32_t* row_sums,
|
||||
bool* compute_row_sums, CpuBackendContext* context);
|
||||
|
||||
void PortableMatrixBatchVectorMultiplyAccumulate(
|
||||
const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
|
||||
const int8_t* __restrict__ vector, const float* scaling_factors,
|
||||
int n_batch, int32_t* scratch, float* __restrict__ result,
|
||||
CpuBackendContext* context);
|
||||
|
||||
void PortableSparseMatrixBatchVectorMultiplyAccumulate1x4(
|
||||
const float* __restrict__ matrix, const int32_t* __restrict__ segments,
|
||||
const int32_t* __restrict__ indices, int m_rows, int m_cols,
|
||||
const float* __restrict__ vector, int n_batch, float* __restrict__ result);
|
||||
|
||||
void PortableSparseMatrixBatchVectorMultiplyAccumulate(
|
||||
const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
|
||||
int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
|
||||
float* __restrict__ result);
|
||||
|
||||
void PortableSparseMatrixBatchVectorMultiplyAccumulate(
|
||||
const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
|
||||
const int m_cols, const int8_t* __restrict__ vectors,
|
||||
const float* scaling_factors, int n_batch, float* __restrict__ result);
|
||||
|
||||
// Dot product of two vectors.
|
||||
float PortableVectorVectorDotProduct(const float* vector1, const float* vector2,
|
||||
int v_size);
|
||||
|
||||
void PortableBatchVectorBatchVectorDotProduct(const int16_t* vector1,
|
||||
const int16_t* vector2,
|
||||
int v_size, int n_batch,
|
||||
int32_t* result);
|
||||
|
||||
void PortableVectorBatchVectorCwiseProductAccumulate(
|
||||
const int16_t* vector, int v_size, const int16_t* batch_vector, int n_batch,
|
||||
int32_t multiplier, int shift, int16_t* result);
|
||||
|
||||
void PortableMatrixBatchVectorMultiplyAccumulate(
|
||||
const int8_t* input, const int32_t* bias,
|
||||
const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
|
||||
int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
|
||||
int32_t* scratch, int16_t* output, CpuBackendContext* context);
|
||||
|
||||
void PortableMatrixBatchVectorMultiplyAccumulate(
|
||||
const int8_t* input, const int32_t* bias,
|
||||
const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
|
||||
int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
|
||||
int32_t* scratch, int8_t* output, CpuBackendContext* context);
|
||||
|
||||
void PortableMatrixBatchVectorMultiply(const int8_t* input,
|
||||
int32_t input_zeropoint,
|
||||
const int8_t* input_to_gate_weights,
|
||||
int32_t input_to_gate_effective_scale_a,
|
||||
int32_t input_to_gate_effective_scale_b,
|
||||
int32_t n_batch, int32_t n_input,
|
||||
int32_t n_cell, int8_t* gate_output,
|
||||
int8_t gate_output_zp);
|
||||
|
||||
void PortableMatrixBatchVectorMultiply(
|
||||
const int16_t* hidden, const int8_t* hidden_to_output_weights,
|
||||
int32_t proj_effective_scale_a, int32_t proj_effective_scale_b,
|
||||
const int32_t* gate_bias, int32_t n_batch, int32_t n_hidden,
|
||||
int32_t n_output, int32_t output_zp, int8_t* proj_output);
|
||||
|
||||
void PortableMatrixScalarMultiplyAccumulate(const int8_t* matrix,
|
||||
int32_t scalar, int32_t n_row,
|
||||
int32_t n_col, int32_t* output);
|
||||
|
||||
void PortableApplyLayerNorm(const int16_t* input,
|
||||
const int16_t* layer_norm_weights,
|
||||
const int32_t* bias, int32_t layer_norm_scale_a,
|
||||
int32_t layer_norm_scale_b, int32_t variance_limit,
|
||||
int n_batch, int n_input, int16_t* output);
|
||||
|
||||
void PortableApplyLayerNormFloat(const int16_t* input,
|
||||
const int16_t* layer_norm_weights,
|
||||
int32_t layer_norm_scale_a,
|
||||
int32_t layer_norm_scale_b,
|
||||
const int32_t* bias, int n_batch, int n_input,
|
||||
int16_t* output);
|
||||
|
||||
void PortableApplySigmoid(const int16_t* input, int32_t n_batch,
|
||||
int32_t n_input, int16_t* output);
|
||||
|
||||
void PortableApplySigmoidFloat(const int16_t* input, int32_t n_batch,
|
||||
int32_t n_input, int16_t* output);
|
||||
|
||||
void PortableApplyTanh(int32_t integer_bits, const int16_t* input,
|
||||
int32_t n_batch, int32_t n_input, int16_t* output);
|
||||
|
||||
void PortableApplyTanhFloat(const int16_t* input, int32_t n_batch,
|
||||
int32_t n_input, int32_t integer_bits,
|
||||
int16_t* output);
|
||||
|
||||
void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2,
|
||||
int n_batch, int n_input, int shift, int16_t* output);
|
||||
|
||||
void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2,
|
||||
int32_t multiplier, int32_t shift, int32_t n_batch,
|
||||
int32_t n_input, int32_t output_zp, int8_t* output);
|
||||
|
||||
void PortableCwiseAdd(const int16_t* input_1, const int16_t* input_2,
|
||||
int n_batch, int n_input, int16_t* output);
|
||||
|
||||
template <typename T>
|
||||
void PortableCwiseClipping(T* vector, const int v_size,
|
||||
const T& clipping_value) {
|
||||
for (int i = 0; i < v_size; i++) {
|
||||
vector[i] = std::max(std::min(clipping_value, vector[i]),
|
||||
static_cast<T>(-clipping_value));
|
||||
}
|
||||
}
|
||||
|
||||
// Batch vector initialization with another vector.
|
||||
void PortableVectorBatchVectorAssign(const float* vector, int v_size,
|
||||
int n_batch, float* batch_vector);
|
||||
|
||||
// Compute "1.0f - elements of vector" (used in CIFG).
|
||||
void PortableSub1Vector(const float* vector, int v_size, float* result);
|
||||
|
||||
void PortableSub1Vector(const int16_t* vector, int v_size, int16_t* result);
|
||||
|
||||
// Multiply all elements of vector with a scalar.
|
||||
void PortableVectorScalarMultiply(const int8_t* vector, int v_size, float scale,
|
||||
float* result);
|
||||
|
||||
// Reduce-sum on a vector:
|
||||
// input_vector: pointer to input vector.
|
||||
// output_vector: pointer to vector.
|
||||
// output_size: output vector size.
|
||||
// reduction_size: number of consecutive elements from input vector which are
|
||||
// added to get one element of output.
|
||||
template <typename INPUT, typename OUTPUT>
|
||||
void PortableReductionSumVector(const INPUT* input_vector,
|
||||
OUTPUT* output_vector, int output_size,
|
||||
int reduction_size) {
|
||||
for (int o = 0; o < output_size; o++) {
|
||||
OUTPUT result = 0;
|
||||
for (int r = 0; r < reduction_size; r++) {
|
||||
result += input_vector[r];
|
||||
}
|
||||
output_vector[o] = result;
|
||||
input_vector += reduction_size;
|
||||
}
|
||||
}
|
||||
|
||||
// Layer norm for each batch.
|
||||
void PortableMeanStddevNormalization(const float* __restrict__ input_vector,
|
||||
float* __restrict__ output_vector,
|
||||
int v_size, int n_batch);
|
||||
|
||||
// Saturate Add.
|
||||
void PortableTwoGateSaturatingAdd(const int8_t* input, int8_t input_zp,
|
||||
const int8_t* recurrent, int8_t recurrent_zp,
|
||||
int32_t input_effective_scale_a,
|
||||
int32_t input_effective_scale_b,
|
||||
int32_t recurrent_effective_scale_a,
|
||||
int32_t recurrent_effective_scale_b,
|
||||
int32_t n_batch, int32_t n_cell,
|
||||
int16_t* output);
|
||||
|
||||
} // namespace tensor_utils
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_IMPL_H_
|
||||
@@ -0,0 +1,109 @@
|
||||
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PRELU_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PRELU_H_
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/compatibility.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
namespace reference_ops {
|
||||
|
||||
// Broadcast prelu to output_shape for quantized uint8_t/int8_t data.
|
||||
template <typename T>
|
||||
inline void BroadcastPrelu4DSlow(
|
||||
const PreluParams& params, const RuntimeShape& input_shape,
|
||||
const T* input_data, const RuntimeShape& alpha_shape, const T* alpha_data,
|
||||
const RuntimeShape& output_shape, T* output_data) {
|
||||
TFLITE_DCHECK_LE(input_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_LE(alpha_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_LE(output_shape.DimensionsCount(), 4);
|
||||
const RuntimeShape extended_output_shape =
|
||||
RuntimeShape::ExtendedShape(4, output_shape);
|
||||
NdArrayDesc<4> desc1;
|
||||
NdArrayDesc<4> desc2;
|
||||
NdArrayDescsForElementwiseBroadcast(input_shape, alpha_shape, &desc1, &desc2);
|
||||
|
||||
for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
|
||||
for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
|
||||
for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
|
||||
for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
|
||||
int output_index = Offset(extended_output_shape, b, y, x, c);
|
||||
int input_index = SubscriptToIndex(desc1, b, y, x, c);
|
||||
const int32_t input_value =
|
||||
params.input_offset + input_data[input_index];
|
||||
int32_t output_value;
|
||||
if (input_value >= 0) {
|
||||
output_value = MultiplyByQuantizedMultiplier(
|
||||
input_value, params.output_multiplier_1, params.output_shift_1);
|
||||
} else {
|
||||
auto alpha_index = SubscriptToIndex(desc2, b, y, x, c);
|
||||
const int32_t alpha_value =
|
||||
params.alpha_offset + alpha_data[alpha_index];
|
||||
|
||||
output_value = MultiplyByQuantizedMultiplier(
|
||||
input_value * alpha_value, params.output_multiplier_2,
|
||||
params.output_shift_2);
|
||||
}
|
||||
output_value += params.output_offset;
|
||||
|
||||
const int32_t quantized_min = std::numeric_limits<T>::min();
|
||||
const int32_t quantized_max = std::numeric_limits<T>::max();
|
||||
const int32_t clamped_output =
|
||||
std::min(quantized_max, std::max(quantized_min, output_value));
|
||||
output_data[output_index] = static_cast<T>(clamped_output);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void Prelu(const PreluParams& params, const RuntimeShape& input_shape,
|
||||
const T* input_data, const RuntimeShape& alpha_shape,
|
||||
const T* alpha_data, const RuntimeShape& output_shape,
|
||||
T* output_data) {
|
||||
const int32_t quantized_min = std::numeric_limits<T>::min();
|
||||
const int32_t quantized_max = std::numeric_limits<T>::max();
|
||||
|
||||
const int flat_size =
|
||||
MatchingElementsSize(input_shape, alpha_shape, output_shape);
|
||||
for (int i = 0; i < flat_size; ++i) {
|
||||
const int32_t input_value = params.input_offset + input_data[i];
|
||||
int32_t output_value;
|
||||
if (input_value >= 0) {
|
||||
output_value = MultiplyByQuantizedMultiplier(
|
||||
input_value, params.output_multiplier_1, params.output_shift_1);
|
||||
} else {
|
||||
const int32_t alpha_value = params.alpha_offset + alpha_data[i];
|
||||
|
||||
output_value = MultiplyByQuantizedMultiplier(input_value * alpha_value,
|
||||
params.output_multiplier_2,
|
||||
params.output_shift_2);
|
||||
}
|
||||
output_value += params.output_offset;
|
||||
|
||||
const int32_t clamped_output =
|
||||
std::min(quantized_max, std::max(quantized_min, output_value));
|
||||
output_data[i] = static_cast<T>(clamped_output);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PRELU_H_
|
||||
@@ -0,0 +1,138 @@
|
||||
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PROCESS_BROADCAST_SHAPES_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PROCESS_BROADCAST_SHAPES_H_
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
namespace reference_ops {
|
||||
|
||||
// Consolidates dimensions in broadcast inputs, checks for five-fold pattern.
|
||||
//
|
||||
// For example, if sequence of dimensions of one input is
|
||||
// ..., 1, 3, 1, 7, 9, 5,... and the other is ..., 2, 3, 1, 7, 1, 1, ...
|
||||
// we can consolidate these as
|
||||
// ..., 1, 3*7, 9*5, ... and 2, 3*7, 1.
|
||||
//
|
||||
// The category is updated in the less-frequent case of shapes that are
|
||||
// not suited to a fivefold-loop broadcast.
|
||||
//
|
||||
// Falls back to generic pattern when it does not know how to process properly.
|
||||
//
|
||||
// Returns true iff there is some sort of broadcast, which includes five-fold
|
||||
// patterns and falling back to generic broadcast.
|
||||
inline bool ProcessBroadcastShapes(const RuntimeShape& shape0,
|
||||
const RuntimeShape& shape1,
|
||||
tflite::ArithmeticParams* params) {
|
||||
const int dims_count =
|
||||
std::max(shape0.DimensionsCount(), shape1.DimensionsCount());
|
||||
|
||||
params->broadcast_category = BroadcastableOpCategory::kGenericBroadcast;
|
||||
RuntimeShape scalar_shape(dims_count, 1);
|
||||
|
||||
auto extended_shape0 = RuntimeShape::ExtendedShape(dims_count, shape0);
|
||||
auto extended_shape1 = RuntimeShape::ExtendedShape(dims_count, shape1);
|
||||
|
||||
// Check for "exact" match, implicitly accepting any scalar shapes.
|
||||
if (extended_shape0 == extended_shape1) {
|
||||
params->broadcast_category = BroadcastableOpCategory::kNonBroadcast;
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int i = dims_count - 1; i >= 0; --i) {
|
||||
if (extended_shape0.Dims(i) == extended_shape1.Dims(i)) {
|
||||
continue;
|
||||
} else if (extended_shape0.Dims(i) == 1) {
|
||||
params->broadcast_category =
|
||||
BroadcastableOpCategory::kFirstInputBroadcastsFast;
|
||||
break;
|
||||
} else if (extended_shape1.Dims(i) == 1) {
|
||||
params->broadcast_category =
|
||||
BroadcastableOpCategory::kSecondInputBroadcastsFast;
|
||||
break;
|
||||
} else {
|
||||
// This case is erroneous: there is a dimension that does not match and
|
||||
// is not a broadcast from one shape to the other.
|
||||
params->broadcast_category = BroadcastableOpCategory::kGenericBroadcast;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
if (params->broadcast_category !=
|
||||
BroadcastableOpCategory::kFirstInputBroadcastsFast &&
|
||||
params->broadcast_category !=
|
||||
BroadcastableOpCategory::kSecondInputBroadcastsFast) {
|
||||
// This is unreachable because at least one else clause in the above loop
|
||||
// must be reached.
|
||||
TFLITE_DCHECK(false);
|
||||
params->broadcast_category = BroadcastableOpCategory::kNonBroadcast;
|
||||
return false;
|
||||
}
|
||||
|
||||
// From this point it is assumed contractually that corresponding dimensions
|
||||
// in shape0 and shape1 are either (a) equal or (b) one or other equals 1.
|
||||
const bool swap_inputs = params->broadcast_category ==
|
||||
BroadcastableOpCategory::kSecondInputBroadcastsFast;
|
||||
const RuntimeShape* shape_a =
|
||||
swap_inputs ? &extended_shape1 : &extended_shape0;
|
||||
const RuntimeShape* shape_b =
|
||||
swap_inputs ? &extended_shape0 : &extended_shape1;
|
||||
|
||||
int i = dims_count - 1;
|
||||
params->broadcast_shape[0] = 1;
|
||||
params->broadcast_shape[1] = 1;
|
||||
params->broadcast_shape[2] = 1;
|
||||
params->broadcast_shape[3] = 1;
|
||||
params->broadcast_shape[4] = 1;
|
||||
// y_0 is greedy: include dims if both or neither equal 1: in other words,
|
||||
// test for equality rather than (shape_a->Dims(i) != 1).
|
||||
while (i >= 0 && shape_a->Dims(i) == shape_b->Dims(i)) {
|
||||
params->broadcast_shape[4] *= shape_b->Dims(i);
|
||||
--i;
|
||||
}
|
||||
// Here either input_a or input_b has dim of 1 (if i >= 0). If it is input_b
|
||||
// that has the unit dimension, the next two loops are not entered.
|
||||
while (i >= 0 && shape_a->Dims(i) == 1) {
|
||||
params->broadcast_shape[3] *= shape_b->Dims(i);
|
||||
--i;
|
||||
}
|
||||
while (i >= 0 && shape_a->Dims(i) == shape_b->Dims(i)) {
|
||||
params->broadcast_shape[2] *= shape_a->Dims(i);
|
||||
--i;
|
||||
}
|
||||
// Here either input_a or input_b has dim of 1 (if i >= 0).
|
||||
while (i >= 0 && shape_b->Dims(i) == 1) {
|
||||
params->broadcast_shape[1] *= shape_a->Dims(i);
|
||||
--i;
|
||||
}
|
||||
while (i >= 0 && shape_a->Dims(i) == shape_b->Dims(i)) {
|
||||
params->broadcast_shape[0] *= shape_b->Dims(i);
|
||||
--i;
|
||||
}
|
||||
|
||||
// Rarer case is when the broadcast dimensions cannot be handled by a fivefold
|
||||
// loop.
|
||||
if (i >= 0) {
|
||||
params->broadcast_category = BroadcastableOpCategory::kGenericBroadcast;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PROCESS_BROADCAST_SHAPES_H_
|
||||
@@ -0,0 +1,89 @@
|
||||
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_QUANTIZE_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_QUANTIZE_H_
|
||||
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
#include <vector>
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/compatibility.h"
|
||||
#include "tensorflow/lite/kernels/internal/cppmath.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
namespace reference_ops {
|
||||
|
||||
template <typename InputT, typename OutputT>
|
||||
inline void AffineQuantize(const tflite::QuantizationParams& op_params,
|
||||
const RuntimeShape& input_shape,
|
||||
const InputT* input_data,
|
||||
const RuntimeShape& output_shape,
|
||||
OutputT* output_data) {
|
||||
const int32_t zero_point = op_params.zero_point;
|
||||
const double scale = op_params.scale;
|
||||
const int flat_size = MatchingFlatSize(input_shape, output_shape);
|
||||
static constexpr int32_t min_val = std::numeric_limits<OutputT>::min();
|
||||
static constexpr int32_t max_val = std::numeric_limits<OutputT>::max();
|
||||
|
||||
for (int i = 0; i < flat_size; i++) {
|
||||
const InputT val = input_data[i];
|
||||
int32_t unclamped =
|
||||
static_cast<int32_t>(TfLiteRound(val / static_cast<float>(scale))) +
|
||||
zero_point;
|
||||
int32_t clamped = std::min(std::max(unclamped, min_val), max_val);
|
||||
output_data[i] = clamped;
|
||||
}
|
||||
}
|
||||
|
||||
// Quantizes per-channel.
|
||||
template <typename InputT, typename OutputT>
|
||||
inline void PerChannelQuantize(
|
||||
const tflite::PerChannelQuantizationParams& op_params,
|
||||
const RuntimeShape& input_shape, const InputT* input_data,
|
||||
const RuntimeShape& output_shape, OutputT* output_data) {
|
||||
// Ensure flat size is same.
|
||||
MatchingFlatSize(input_shape, output_shape);
|
||||
|
||||
const int32_t* zero_point = op_params.zero_point;
|
||||
const float* scale = op_params.scale;
|
||||
const int32_t quantized_dimension = op_params.quantized_dimension;
|
||||
const int32_t num_dims = input_shape.DimensionsCount();
|
||||
const int32_t* dims_data = input_shape.DimsData();
|
||||
std::vector<int> current_dim(num_dims, 0);
|
||||
static constexpr int32_t min_val = std::numeric_limits<OutputT>::min();
|
||||
static constexpr int32_t max_val = std::numeric_limits<OutputT>::max();
|
||||
|
||||
do {
|
||||
size_t offset =
|
||||
ReducedOutputOffset(num_dims, reinterpret_cast<const int*>(dims_data),
|
||||
current_dim.data(), 0, nullptr);
|
||||
const InputT val = input_data[offset];
|
||||
const int channel = current_dim[quantized_dimension];
|
||||
int32_t unclamped = static_cast<int32_t>(TfLiteRound(
|
||||
val / static_cast<float>(scale[channel]))) +
|
||||
zero_point[channel];
|
||||
int32_t clamped = std::min(std::max(unclamped, min_val), max_val);
|
||||
output_data[offset] = static_cast<OutputT>(clamped);
|
||||
} while (NextIndex(num_dims, reinterpret_cast<const int*>(dims_data),
|
||||
current_dim.data()));
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
|
||||
} // namespace tflite
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_QUANTIZE_H_
|
||||
@@ -0,0 +1,524 @@
|
||||
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REDUCE_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REDUCE_H_
|
||||
|
||||
#include "ruy/profiler/instrumentation.h" // from @ruy
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/cppmath.h"
|
||||
#include "tensorflow/lite/kernels/internal/max.h"
|
||||
#include "tensorflow/lite/kernels/internal/min.h"
|
||||
#include "tensorflow/lite/kernels/internal/quantization_util.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
// Check if the reduction at index is the first one along the dimensions given
|
||||
// in axis.
|
||||
inline bool IsFirstReduction(const int* index, const int num_axis,
|
||||
const int* axis) {
|
||||
if (num_axis == 0) {
|
||||
return true;
|
||||
}
|
||||
|
||||
TFLITE_DCHECK(index != nullptr);
|
||||
TFLITE_DCHECK(axis != nullptr);
|
||||
for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx) {
|
||||
if (index[axis[axis_idx]] != 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
namespace tflite {
|
||||
|
||||
namespace reference_ops {
|
||||
|
||||
// A generic reduce method that can be used for reduce_sum, reduce_mean, etc.
|
||||
// This method iterates through input data and reduce elements along the
|
||||
// dimensions given in axis.
|
||||
template <typename In, typename Out>
|
||||
inline bool Reduce(const In* input_data, const int* input_dims,
|
||||
const int* output_dims, const int input_num_dims,
|
||||
const int output_num_dims, const int* axis,
|
||||
const int num_axis, int* input_iter,
|
||||
Out reducer(Out current, const In in), Out* output_data) {
|
||||
// Reset input iterator.
|
||||
for (int idx = 0; idx < input_num_dims; ++idx) {
|
||||
input_iter[idx] = 0;
|
||||
}
|
||||
// Iterate through input_data.
|
||||
do {
|
||||
size_t input_offset =
|
||||
ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr);
|
||||
size_t output_offset = ReducedOutputOffset(input_num_dims, input_dims,
|
||||
input_iter, num_axis, axis);
|
||||
output_data[output_offset] =
|
||||
reducer(output_data[output_offset], input_data[input_offset]);
|
||||
} while (NextIndex(input_num_dims, input_dims, input_iter));
|
||||
return true;
|
||||
}
|
||||
|
||||
// Similar to above Reduce function but takes two reducer functions.
|
||||
// The 'reducer_first' is called with the first value of the reduction,
|
||||
// 'reducer_next' is then called for all the others.
|
||||
template <typename In, typename Out>
|
||||
inline bool Reduce(const In* input_data, const int* input_dims,
|
||||
const int* output_dims, const int input_num_dims,
|
||||
const int output_num_dims, const int* axis,
|
||||
const int num_axis, int* input_iter,
|
||||
const std::function<Out(In in)>& reducer_first,
|
||||
const std::function<Out(Out current, In in)>& reducer_next,
|
||||
Out* output_data) {
|
||||
// Reset input iterator.
|
||||
for (int idx = 0; idx < input_num_dims; ++idx) {
|
||||
input_iter[idx] = 0;
|
||||
}
|
||||
// Iterate through input_data.
|
||||
do {
|
||||
size_t input_offset =
|
||||
ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr);
|
||||
size_t output_offset = ReducedOutputOffset(input_num_dims, input_dims,
|
||||
input_iter, num_axis, axis);
|
||||
if (IsFirstReduction(input_iter, num_axis, axis)) {
|
||||
output_data[output_offset] = reducer_first(input_data[input_offset]);
|
||||
} else {
|
||||
output_data[output_offset] =
|
||||
reducer_next(output_data[output_offset], input_data[input_offset]);
|
||||
}
|
||||
} while (NextIndex(input_num_dims, input_dims, input_iter));
|
||||
return true;
|
||||
}
|
||||
|
||||
// This method parses the input 'axis' to remove duplicates and handle negative
|
||||
// values, and returns a valid 'out_axis'
|
||||
inline bool ResolveAxis(const int num_dims, const int* axis,
|
||||
const int64_t num_axis, int* out_axis,
|
||||
int* out_num_axis) {
|
||||
*out_num_axis = 0; // Just in case.
|
||||
// Short-circuit axis resolution for scalars; the axis will go unused.
|
||||
if (num_dims == 0) {
|
||||
return true;
|
||||
}
|
||||
// o(n^2) is fine since out_num_axis should be really small, mostly <= 4
|
||||
for (int64_t idx = 0; idx < num_axis; ++idx) {
|
||||
// Handle negative index. A positive index 'p_idx' can be represented as a
|
||||
// negative index 'n_idx' as: n_idx = p_idx-num_dims
|
||||
// eg: For num_dims=3, [0, 1, 2] is the same as [-3, -2, -1] */
|
||||
int current = axis[idx] < 0 ? (axis[idx] + num_dims) : axis[idx];
|
||||
TFLITE_DCHECK(current >= 0 && current < num_dims);
|
||||
if (current < 0 || current >= num_dims) {
|
||||
return false;
|
||||
}
|
||||
bool is_dup = false;
|
||||
for (int j = 0; j < *out_num_axis; ++j) {
|
||||
if (out_axis[j] == current) {
|
||||
is_dup = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!is_dup) {
|
||||
out_axis[*out_num_axis] = current;
|
||||
*out_num_axis += 1;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// This method expects that output_data has been initialized.
|
||||
template <typename In, typename Out>
|
||||
inline bool ReduceSumImpl(const In* input_data, const int* input_dims,
|
||||
const int* output_dims, const int input_num_dims,
|
||||
const int output_num_dims, const int* axis,
|
||||
const int num_axis, int* input_iter,
|
||||
Out* output_data) {
|
||||
auto reducer = [](const Out current, const In in) -> Out {
|
||||
const Out actual_in = static_cast<Out>(in);
|
||||
return current + actual_in;
|
||||
};
|
||||
return Reduce<In, Out>(input_data, input_dims, output_dims, input_num_dims,
|
||||
output_num_dims, axis, num_axis, input_iter, reducer,
|
||||
output_data);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline bool InitTensorDataForReduce(const int* dims, const int num_dims,
|
||||
const T init_value, T* data) {
|
||||
size_t num_elements = 1;
|
||||
for (int idx = 0; idx < num_dims; ++idx) {
|
||||
size_t current = static_cast<size_t>(dims[idx]);
|
||||
// Overflow prevention.
|
||||
if (current > 0 &&
|
||||
num_elements > std::numeric_limits<size_t>::max() / current) {
|
||||
return false;
|
||||
}
|
||||
num_elements *= current;
|
||||
}
|
||||
for (size_t idx = 0; idx < num_elements; ++idx) {
|
||||
data[idx] = init_value;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Computes the generic value (i.e., sum/max/min/prod) of elements across
|
||||
// dimensions given in axis. It needs to pass in init_value and reducer.
|
||||
template <typename T>
|
||||
inline bool ReduceGeneric(const T* input_data, const int* input_dims,
|
||||
const int input_num_dims, T* output_data,
|
||||
const int* output_dims, const int output_num_dims,
|
||||
const int* axis, const int64_t num_axis_dimensions,
|
||||
bool keep_dims, int* temp_index, int* resolved_axis,
|
||||
T init_value,
|
||||
T reducer(const T current, const T in)) {
|
||||
// Reset output data.
|
||||
if (!InitTensorDataForReduce(output_dims, output_num_dims, init_value,
|
||||
output_data)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Return early when input shape has zero dim. This is done after initializing
|
||||
// data for output tensor because there are cases that the input tensor is
|
||||
// empty but output tensor is not. In that case, output tensor should be
|
||||
// filled with init_value.
|
||||
for (int i = 0; i < input_num_dims; ++i) {
|
||||
if (input_dims[i] == 0) return true;
|
||||
}
|
||||
|
||||
// Resolve axis.
|
||||
int num_resolved_axis = 0;
|
||||
if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis,
|
||||
&num_resolved_axis)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return Reduce<T, T>(input_data, input_dims, output_dims, input_num_dims,
|
||||
output_num_dims, resolved_axis, num_resolved_axis,
|
||||
temp_index, reducer, output_data);
|
||||
}
|
||||
|
||||
// Computes the mean of elements across dimensions given in axis.
|
||||
// It does so in two stages, first calculates the sum of elements along the axis
|
||||
// then divides it by the number of element in axis.
|
||||
template <typename T, typename U>
|
||||
inline bool Mean(const T* input_data, const int* input_dims,
|
||||
const int input_num_dims, T* output_data,
|
||||
const int* output_dims, const int output_num_dims,
|
||||
const int* axis, const int num_axis_dimensions, bool keep_dims,
|
||||
int* temp_index, int* resolved_axis, U* temp_sum) {
|
||||
ruy::profiler::ScopeLabel label("Mean");
|
||||
// Reset output data.
|
||||
size_t num_outputs = 1;
|
||||
for (int idx = 0; idx < output_num_dims; ++idx) {
|
||||
size_t current = static_cast<size_t>(output_dims[idx]);
|
||||
// Overflow prevention.
|
||||
if (num_outputs > std::numeric_limits<size_t>::max() / current) {
|
||||
return false;
|
||||
}
|
||||
num_outputs *= current;
|
||||
}
|
||||
for (size_t idx = 0; idx < num_outputs; ++idx) {
|
||||
output_data[idx] = T();
|
||||
temp_sum[idx] = U();
|
||||
}
|
||||
|
||||
// Resolve axis.
|
||||
int num_resolved_axis = 0;
|
||||
if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis,
|
||||
&num_resolved_axis)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!ReduceSumImpl<T, U>(input_data, input_dims, output_dims, input_num_dims,
|
||||
output_num_dims, resolved_axis, num_resolved_axis,
|
||||
temp_index, temp_sum)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Calculate mean by dividing output_data by num of aggregated element.
|
||||
size_t num_elements_in_axis = 1;
|
||||
for (int idx = 0; idx < num_resolved_axis; ++idx) {
|
||||
size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]);
|
||||
// Overflow prevention.
|
||||
if (current > (std::numeric_limits<size_t>::max() / num_elements_in_axis)) {
|
||||
return false;
|
||||
}
|
||||
num_elements_in_axis *= current;
|
||||
}
|
||||
|
||||
if (num_elements_in_axis > 0) {
|
||||
for (size_t idx = 0; idx < num_outputs; ++idx) {
|
||||
output_data[idx] =
|
||||
static_cast<T>(temp_sum[idx] / static_cast<U>(num_elements_in_axis));
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void Mean(const tflite::MeanParams& op_params,
|
||||
const RuntimeShape& unextended_input_shape,
|
||||
const T* input_data,
|
||||
const RuntimeShape& unextended_output_shape, T* output_data) {
|
||||
ruy::profiler::ScopeLabel label("Mean4D");
|
||||
|
||||
// Current implementation only supports dimension equals 4 and simultaneous
|
||||
// reduction over width and height.
|
||||
TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4);
|
||||
TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4);
|
||||
const RuntimeShape input_shape =
|
||||
RuntimeShape::ExtendedShape(4, unextended_input_shape);
|
||||
const RuntimeShape output_shape =
|
||||
RuntimeShape::ExtendedShape(4, unextended_output_shape);
|
||||
|
||||
const int output_batch = output_shape.Dims(0);
|
||||
const int output_height = output_shape.Dims(1);
|
||||
const int output_width = output_shape.Dims(2);
|
||||
const int output_depth = output_shape.Dims(3);
|
||||
|
||||
const int input_height = input_shape.Dims(1);
|
||||
const int input_width = input_shape.Dims(2);
|
||||
|
||||
TFLITE_CHECK_EQ(op_params.axis_count, 2);
|
||||
TFLITE_CHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
|
||||
(op_params.axis[0] == 2 && op_params.axis[1] == 1));
|
||||
TFLITE_CHECK_EQ(output_height, 1);
|
||||
TFLITE_CHECK_EQ(output_width, 1);
|
||||
|
||||
for (int out_b = 0; out_b < output_batch; ++out_b) {
|
||||
for (int out_d = 0; out_d < output_depth; ++out_d) {
|
||||
float value = 0;
|
||||
for (int in_h = 0; in_h < input_height; ++in_h) {
|
||||
for (int in_w = 0; in_w < input_width; ++in_w) {
|
||||
value += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)];
|
||||
}
|
||||
}
|
||||
output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
|
||||
value / (input_width * input_height);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void Mean(const tflite::MeanParams& op_params,
|
||||
const RuntimeShape& unextended_input_shape,
|
||||
const uint8_t* input_data, int32_t input_zero_point,
|
||||
float input_scale, const RuntimeShape& unextended_output_shape,
|
||||
uint8_t* output_data, int32_t output_zero_point,
|
||||
float output_scale) {
|
||||
ruy::profiler::ScopeLabel label("Mean4D/Uint8");
|
||||
|
||||
// Current implementation only supports dimension equals 4 and simultaneous
|
||||
// reduction over width and height.
|
||||
TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4);
|
||||
TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4);
|
||||
const RuntimeShape input_shape =
|
||||
RuntimeShape::ExtendedShape(4, unextended_input_shape);
|
||||
const RuntimeShape output_shape =
|
||||
RuntimeShape::ExtendedShape(4, unextended_output_shape);
|
||||
const int output_batch = output_shape.Dims(0);
|
||||
const int output_height = output_shape.Dims(1);
|
||||
const int output_width = output_shape.Dims(2);
|
||||
const int output_depth = output_shape.Dims(3);
|
||||
const int input_height = input_shape.Dims(1);
|
||||
const int input_width = input_shape.Dims(2);
|
||||
const float num_elements_in_axis = input_width * input_height;
|
||||
|
||||
TFLITE_CHECK_EQ(op_params.axis_count, 2);
|
||||
TFLITE_CHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
|
||||
(op_params.axis[0] == 2 && op_params.axis[1] == 1));
|
||||
TFLITE_CHECK_EQ(output_height, 1);
|
||||
TFLITE_CHECK_EQ(output_width, 1);
|
||||
|
||||
constexpr int32_t kMinValue = std::numeric_limits<uint8_t>::min();
|
||||
constexpr int32_t kMaxValue = std::numeric_limits<uint8_t>::max();
|
||||
|
||||
float temp = input_zero_point * input_scale / output_scale;
|
||||
temp = temp > 0 ? temp + 0.5f : temp - 0.5f;
|
||||
int32_t bias = output_zero_point - static_cast<int32_t>(temp);
|
||||
double real_scale =
|
||||
static_cast<double>(input_scale / (num_elements_in_axis * output_scale));
|
||||
|
||||
int32_t multiplier;
|
||||
int shift;
|
||||
QuantizeMultiplier(real_scale, &multiplier, &shift);
|
||||
for (int out_b = 0; out_b < output_batch; ++out_b) {
|
||||
for (int out_d = 0; out_d < output_depth; ++out_d) {
|
||||
int32_t acc = 0;
|
||||
for (int in_h = 0; in_h < input_height; ++in_h) {
|
||||
for (int in_w = 0; in_w < input_width; ++in_w) {
|
||||
acc += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)];
|
||||
}
|
||||
}
|
||||
acc = MultiplyByQuantizedMultiplier(acc, multiplier, shift);
|
||||
acc += bias;
|
||||
acc = std::min(std::max(acc, kMinValue), kMaxValue);
|
||||
output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
|
||||
static_cast<uint8_t>(acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Computes the mean of elements across dimensions given in axis.
|
||||
// It does so in two stages, first calculates the sum of elements along the axis
|
||||
// then divides it by the number of element in axis for quantized values.
|
||||
template <typename T, typename U>
|
||||
inline bool QuantizedMeanOrSum(const T* input_data, int32_t input_zero_point,
|
||||
float input_scale, const int* input_dims,
|
||||
const int input_num_dims, T* output_data,
|
||||
int32_t output_zero_point, float output_scale,
|
||||
const int* output_dims,
|
||||
const int output_num_dims, const int* axis,
|
||||
const int num_axis_dimensions, bool keep_dims,
|
||||
int* temp_index, int* resolved_axis, U* temp_sum,
|
||||
bool compute_sum) {
|
||||
const bool uint8_case = std::is_same<T, uint8_t>::value;
|
||||
const bool int16_case = std::is_same<T, int16_t>::value;
|
||||
if (uint8_case) {
|
||||
ruy::profiler::ScopeLabel label(compute_sum ? "Sum/Uint8" : "Mean/Uint8");
|
||||
} else if (int16_case) {
|
||||
ruy::profiler::ScopeLabel label(compute_sum ? "Sum/Int16" : "Mean/Int16");
|
||||
} else {
|
||||
ruy::profiler::ScopeLabel label(compute_sum ? "Sum/Int8" : "Mean/Int8");
|
||||
}
|
||||
// Reset output data.
|
||||
size_t num_outputs = 1;
|
||||
for (int idx = 0; idx < output_num_dims; ++idx) {
|
||||
size_t current = static_cast<size_t>(output_dims[idx]);
|
||||
// Overflow prevention.
|
||||
if (num_outputs > std::numeric_limits<size_t>::max() / current) {
|
||||
return false;
|
||||
}
|
||||
num_outputs *= current;
|
||||
}
|
||||
for (size_t idx = 0; idx < num_outputs; ++idx) {
|
||||
output_data[idx] = T();
|
||||
temp_sum[idx] = U();
|
||||
}
|
||||
|
||||
// Return early when input shape has zero dim. This is done after initializing
|
||||
// data for output tensor because there are cases that the input tensor is
|
||||
// empty but output tensor is not. In that case, output tensor should be
|
||||
// filled with init_value.
|
||||
for (int i = 0; i < input_num_dims; ++i) {
|
||||
if (input_dims[i] == 0) return true;
|
||||
}
|
||||
|
||||
// Resolve axis.
|
||||
int num_resolved_axis = 0;
|
||||
if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis,
|
||||
&num_resolved_axis)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!ReduceSumImpl<T, U>(input_data, input_dims, output_dims, input_num_dims,
|
||||
output_num_dims, resolved_axis, num_resolved_axis,
|
||||
temp_index, temp_sum)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Calculate mean by dividing output_data by num of aggregated element.
|
||||
size_t num_elements_in_axis = 1;
|
||||
for (int idx = 0; idx < num_resolved_axis; ++idx) {
|
||||
size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]);
|
||||
// Overflow prevention.
|
||||
if (current > (std::numeric_limits<size_t>::max() / num_elements_in_axis)) {
|
||||
return false;
|
||||
}
|
||||
num_elements_in_axis *= current;
|
||||
}
|
||||
|
||||
if (num_elements_in_axis > 0) {
|
||||
const float scale = input_scale / output_scale;
|
||||
if (compute_sum) {
|
||||
// TODO(b/116341117): Eliminate float and do this completely in 8bit.
|
||||
const float bias = -input_zero_point * scale * num_elements_in_axis;
|
||||
for (size_t idx = 0; idx < num_outputs; ++idx) {
|
||||
const U value =
|
||||
static_cast<U>(TfLiteRound(temp_sum[idx] * scale + bias)) +
|
||||
output_zero_point;
|
||||
output_data[idx] = static_cast<T>(value);
|
||||
}
|
||||
} else {
|
||||
const float bias = -input_zero_point * scale;
|
||||
for (size_t idx = 0; idx < num_outputs; ++idx) {
|
||||
float float_mean = static_cast<float>(temp_sum[idx]) /
|
||||
static_cast<float>(num_elements_in_axis);
|
||||
float result = TfLiteMin(
|
||||
TfLiteRound(float_mean * scale + bias) + output_zero_point,
|
||||
static_cast<float>(std::numeric_limits<T>::max()));
|
||||
result = TfLiteMax(result,
|
||||
static_cast<float>(std::numeric_limits<T>::min()));
|
||||
output_data[idx] = static_cast<T>(result);
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline bool QuantizedReduceProd(const T* input_data, int32_t input_zero_point,
|
||||
const RuntimeShape& input_shape, T* output_data,
|
||||
int32_t output_zero_point,
|
||||
const RuntimeShape& output_shape,
|
||||
const int* axis,
|
||||
const int64_t num_axis_dimensions,
|
||||
bool keep_dims, int* temp_index,
|
||||
int* resolved_axis, int32_t* temp_prod,
|
||||
int32_t scaling_multiplier, int scaling_shift) {
|
||||
const int32_t kMinValue = std::numeric_limits<T>::min();
|
||||
const int32_t kMaxValue = std::numeric_limits<T>::max();
|
||||
|
||||
// Resolve axis.
|
||||
int num_resolved_axis = 0;
|
||||
if (!ResolveAxis(input_shape.DimensionsCount(), axis, num_axis_dimensions,
|
||||
resolved_axis, &num_resolved_axis)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Calculate the reduced product by rescaling each multiplication step to
|
||||
// avoid an overflow.
|
||||
auto reducer_first = [&](T in) -> int32_t { return in - input_zero_point; };
|
||||
|
||||
auto reducer_next = [&](int32_t current, T in) -> int32_t {
|
||||
const int64_t result =
|
||||
static_cast<int64_t>(current) * (in - input_zero_point);
|
||||
return MultiplyByQuantizedMultiplier(result, scaling_multiplier,
|
||||
scaling_shift);
|
||||
};
|
||||
|
||||
if (!Reduce<T, int32_t>(
|
||||
input_data, input_shape.DimsData(), output_shape.DimsData(),
|
||||
input_shape.DimensionsCount(), output_shape.DimensionsCount(),
|
||||
resolved_axis, num_resolved_axis, temp_index, reducer_first,
|
||||
reducer_next, temp_prod)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int i = 0; i < output_shape.FlatSize(); i++) {
|
||||
int32_t result =
|
||||
MultiplyByQuantizedMultiplier(static_cast<int64_t>(temp_prod[i]),
|
||||
scaling_multiplier, scaling_shift) +
|
||||
output_zero_point;
|
||||
result = std::min(std::max(result, kMinValue), kMaxValue);
|
||||
output_data[i] = static_cast<T>(result);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REDUCE_H_
|
||||
@@ -0,0 +1,68 @@
|
||||
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REQUANTIZE_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REQUANTIZE_H_
|
||||
|
||||
#include "ruy/profiler/instrumentation.h" // from @ruy
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_ops {
|
||||
|
||||
template <typename input_type, typename output_type>
|
||||
inline void Requantize(const input_type* input_data, int32_t size,
|
||||
int32_t effective_scale_multiplier,
|
||||
int32_t effective_scale_shift, int32_t input_zeropoint,
|
||||
int32_t output_zeropoint, output_type* output_data) {
|
||||
ruy::profiler::ScopeLabel label("Requantize");
|
||||
const bool same_scale =
|
||||
(effective_scale_multiplier == 1 << 30 && effective_scale_shift == 1);
|
||||
if (same_scale) {
|
||||
const bool mixed_type_int8_uint8 =
|
||||
std::is_same<input_type, int8_t>::value &&
|
||||
std::is_same<output_type, uint8_t>::value;
|
||||
const bool mixed_type_uint8_int8 =
|
||||
std::is_same<input_type, uint8_t>::value &&
|
||||
std::is_same<output_type, int8_t>::value;
|
||||
const int32_t zero_point_diff = input_zeropoint - output_zeropoint;
|
||||
// Fast path to do requantization for the case when just a shift of 128 is
|
||||
// needed.
|
||||
if ((mixed_type_int8_uint8 && zero_point_diff == -128) ||
|
||||
(mixed_type_uint8_int8 && zero_point_diff == 128)) {
|
||||
for (int i = 0; i < size; ++i) {
|
||||
output_data[i] = input_data[i] ^ 0x80;
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
static constexpr int32_t kMinOutput = std::numeric_limits<output_type>::min();
|
||||
static constexpr int32_t kMaxOutput = std::numeric_limits<output_type>::max();
|
||||
for (int i = 0; i < size; ++i) {
|
||||
const int32_t input = input_data[i] - input_zeropoint;
|
||||
const int32_t output =
|
||||
MultiplyByQuantizedMultiplier(input, effective_scale_multiplier,
|
||||
effective_scale_shift) +
|
||||
output_zeropoint;
|
||||
const int32_t clamped_output =
|
||||
std::max(std::min(output, kMaxOutput), kMinOutput);
|
||||
output_data[i] = static_cast<output_type>(clamped_output);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REQUANTIZE_H_
|
||||
@@ -0,0 +1,228 @@
|
||||
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_RESIZE_BILINEAR_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_RESIZE_BILINEAR_H_
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/cppmath.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_ops {
|
||||
|
||||
inline void ComputeInterpolationValues(const float value, const float scale,
|
||||
const bool half_pixel_centers,
|
||||
int32_t input_size, float* scaled_value,
|
||||
int32_t* lower_bound,
|
||||
int32_t* upper_bound) {
|
||||
if (half_pixel_centers) {
|
||||
*scaled_value = (value + 0.5f) * scale - 0.5f;
|
||||
} else {
|
||||
*scaled_value = value * scale;
|
||||
}
|
||||
float scaled_value_floor = std::floor(*scaled_value);
|
||||
*lower_bound = std::max(static_cast<int32_t>(scaled_value_floor),
|
||||
static_cast<int32_t>(0));
|
||||
*upper_bound =
|
||||
std::min(static_cast<int32_t>(std::ceil(*scaled_value)), input_size - 1);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void ResizeBilinear(const tflite::ResizeBilinearParams& op_params,
|
||||
const RuntimeShape& unextended_input_shape,
|
||||
const T* input_data,
|
||||
const RuntimeShape& unextended_output_size_shape,
|
||||
const int32_t* output_size_data,
|
||||
const RuntimeShape& unextended_output_shape,
|
||||
T* output_data) {
|
||||
// If half_pixel_centers is True, align_corners must be False.
|
||||
TFLITE_DCHECK(!op_params.half_pixel_centers || !op_params.align_corners);
|
||||
TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_LE(unextended_output_size_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
|
||||
const RuntimeShape input_shape =
|
||||
RuntimeShape::ExtendedShape(4, unextended_input_shape);
|
||||
const RuntimeShape output_size_shape =
|
||||
RuntimeShape::ExtendedShape(4, unextended_output_size_shape);
|
||||
const RuntimeShape output_shape =
|
||||
RuntimeShape::ExtendedShape(4, unextended_output_shape);
|
||||
|
||||
int32_t batches = MatchingDim(input_shape, 0, output_shape, 0);
|
||||
int32_t input_height = input_shape.Dims(1);
|
||||
int32_t input_width = input_shape.Dims(2);
|
||||
int32_t depth = MatchingDim(input_shape, 3, output_shape, 3);
|
||||
|
||||
TFLITE_DCHECK_EQ(output_size_shape.Dims(0), 1);
|
||||
TFLITE_DCHECK_EQ(output_size_shape.Dims(1), 1);
|
||||
TFLITE_DCHECK_EQ(output_size_shape.Dims(2), 1);
|
||||
TFLITE_DCHECK_EQ(output_size_shape.Dims(3), 2);
|
||||
int32_t output_height =
|
||||
output_size_data[Offset(output_size_shape, 0, 0, 0, 0)];
|
||||
int32_t output_width =
|
||||
output_size_data[Offset(output_size_shape, 0, 0, 0, 1)];
|
||||
|
||||
float height_scale = static_cast<float>(input_height) / output_height;
|
||||
float width_scale = static_cast<float>(input_width) / output_width;
|
||||
if (op_params.align_corners && output_height > 1) {
|
||||
height_scale = static_cast<float>(input_height - 1) / (output_height - 1);
|
||||
}
|
||||
if (op_params.align_corners && output_width > 1) {
|
||||
width_scale = static_cast<float>(input_width - 1) / (output_width - 1);
|
||||
}
|
||||
const float rounding_offset = std::numeric_limits<T>::is_integer ? .5f : .0f;
|
||||
|
||||
for (int b = 0; b < batches; ++b) {
|
||||
for (int y = 0; y < output_height; ++y) {
|
||||
float input_y;
|
||||
int32_t y0, y1;
|
||||
ComputeInterpolationValues(y, height_scale, op_params.half_pixel_centers,
|
||||
input_height, &input_y, &y0, &y1);
|
||||
for (int x = 0; x < output_width; ++x) {
|
||||
float input_x;
|
||||
int32_t x0, x1;
|
||||
ComputeInterpolationValues(x, width_scale, op_params.half_pixel_centers,
|
||||
input_width, &input_x, &x0, &x1);
|
||||
for (int c = 0; c < depth; ++c) {
|
||||
T interpolation =
|
||||
static_cast<T>(input_data[Offset(input_shape, b, y0, x0, c)] *
|
||||
(1 - (input_y - y0)) * (1 - (input_x - x0)) +
|
||||
input_data[Offset(input_shape, b, y1, x0, c)] *
|
||||
(input_y - y0) * (1 - (input_x - x0)) +
|
||||
input_data[Offset(input_shape, b, y0, x1, c)] *
|
||||
(1 - (input_y - y0)) * (input_x - x0) +
|
||||
input_data[Offset(input_shape, b, y1, x1, c)] *
|
||||
(input_y - y0) * (input_x - x0) +
|
||||
rounding_offset);
|
||||
output_data[Offset(output_shape, b, y, x, c)] = interpolation;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void ComputeInterpolationValuesInteger(
|
||||
const int32_t value, const int32_t scale_10, const bool half_pixel_centers,
|
||||
int32_t input_size, int32_t* scaled_value, int32_t* lower_bound,
|
||||
int32_t* upper_bound) {
|
||||
if (half_pixel_centers) {
|
||||
*scaled_value = value * scale_10 + scale_10 / 2 - (1 << 9);
|
||||
} else {
|
||||
*scaled_value = value * scale_10;
|
||||
}
|
||||
constexpr int32_t zero = 0;
|
||||
*lower_bound = std::max(*scaled_value / (1 << 10), zero);
|
||||
*upper_bound =
|
||||
std::min((*scaled_value + (1 << 10) - 1) / (1 << 10), input_size - 1);
|
||||
}
|
||||
|
||||
// Same as above but doesn't use any floating-point for the resize
|
||||
template <typename T>
|
||||
inline void ResizeBilinearInteger(
|
||||
const tflite::ResizeBilinearParams& op_params,
|
||||
const RuntimeShape& unextended_input_shape, const T* input_data,
|
||||
const RuntimeShape& unextended_output_size_shape,
|
||||
const int32_t* output_size_data,
|
||||
const RuntimeShape& unextended_output_shape, T* output_data) {
|
||||
// If half_pixel_centers is True, align_corners must be False.
|
||||
TFLITE_DCHECK(!op_params.half_pixel_centers || !op_params.align_corners);
|
||||
TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_LE(unextended_output_size_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
|
||||
const RuntimeShape input_shape =
|
||||
RuntimeShape::ExtendedShape(4, unextended_input_shape);
|
||||
const RuntimeShape output_size_shape =
|
||||
RuntimeShape::ExtendedShape(4, unextended_output_size_shape);
|
||||
const RuntimeShape output_shape =
|
||||
RuntimeShape::ExtendedShape(4, unextended_output_shape);
|
||||
|
||||
const int32_t batches = MatchingDim(input_shape, 0, output_shape, 0);
|
||||
const int32_t input_height = input_shape.Dims(1);
|
||||
const int32_t input_width = input_shape.Dims(2);
|
||||
const int32_t depth = MatchingDim(input_shape, 3, output_shape, 3);
|
||||
|
||||
TFLITE_DCHECK_EQ(output_size_shape.Dims(0), 1);
|
||||
TFLITE_DCHECK_EQ(output_size_shape.Dims(1), 1);
|
||||
TFLITE_DCHECK_EQ(output_size_shape.Dims(2), 1);
|
||||
TFLITE_DCHECK_EQ(output_size_shape.Dims(3), 2);
|
||||
const int32_t output_height =
|
||||
output_size_data[Offset(output_size_shape, 0, 0, 0, 0)];
|
||||
const int32_t output_width =
|
||||
output_size_data[Offset(output_size_shape, 0, 0, 0, 1)];
|
||||
|
||||
int32_t height_scale_10 =
|
||||
((1 << 10) * input_height + output_height / 2) / output_height;
|
||||
int32_t width_scale_10 =
|
||||
((1 << 10) * input_width + output_width / 2) / output_width;
|
||||
if (op_params.align_corners && output_height > 1) {
|
||||
height_scale_10 =
|
||||
((1 << 10) * (input_height - 1) + (output_height - 1) / 2) /
|
||||
(output_height - 1);
|
||||
}
|
||||
if (op_params.align_corners && output_width > 1) {
|
||||
width_scale_10 = ((1 << 10) * (input_width - 1) + (output_width - 1) / 2) /
|
||||
(output_width - 1);
|
||||
}
|
||||
|
||||
for (int b = 0; b < batches; ++b) {
|
||||
for (int y = 0; y < output_height; ++y) {
|
||||
int32_t input_y, y0, y1;
|
||||
ComputeInterpolationValuesInteger(y, height_scale_10,
|
||||
op_params.half_pixel_centers,
|
||||
input_height, &input_y, &y0, &y1);
|
||||
for (int x = 0; x < output_width; ++x) {
|
||||
int32_t input_x, x0, x1;
|
||||
ComputeInterpolationValuesInteger(x, width_scale_10,
|
||||
op_params.half_pixel_centers,
|
||||
input_width, &input_x, &x0, &x1);
|
||||
for (int c = 0; c < depth; ++c) {
|
||||
const int64_t output_20_ll =
|
||||
static_cast<int64_t>(
|
||||
input_data[Offset(input_shape, b, y0, x0, c)]) *
|
||||
((1 << 10) - (input_y - (1 << 10) * y0)) *
|
||||
((1 << 10) - (input_x - (1 << 10) * x0));
|
||||
const int64_t output_20_lu =
|
||||
static_cast<int64_t>(
|
||||
input_data[Offset(input_shape, b, y1, x0, c)]) *
|
||||
(input_y - (1 << 10) * y0) *
|
||||
((1 << 10) - (input_x - (1 << 10) * x0));
|
||||
const int64_t output_20_rl =
|
||||
static_cast<int64_t>(
|
||||
input_data[Offset(input_shape, b, y0, x1, c)]) *
|
||||
((1 << 10) - (input_y - (1 << 10) * y0)) *
|
||||
(input_x - (1 << 10) * x0);
|
||||
const int64_t output_20_ru =
|
||||
static_cast<int64_t>(
|
||||
input_data[Offset(input_shape, b, y1, x1, c)]) *
|
||||
(input_y - (1 << 10) * y0) * (input_x - (1 << 10) * x0);
|
||||
const int64_t output_20 =
|
||||
output_20_ll + output_20_lu + output_20_rl + output_20_ru;
|
||||
const int64_t round = (output_20 > 0) ? (1 << 19) : -(1 << 19);
|
||||
const T interpolation =
|
||||
static_cast<T>((output_20 + round) / (1 << 20));
|
||||
output_data[Offset(output_shape, b, y, x, c)] = interpolation;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_RESIZE_BILINEAR_H_
|
||||
@@ -0,0 +1,101 @@
|
||||
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_RESIZE_NEAREST_NEIGHBOR_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_RESIZE_NEAREST_NEIGHBOR_H_
|
||||
|
||||
#include <cmath>
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/cppmath.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
namespace reference_ops {
|
||||
|
||||
inline int32_t GetNearestNeighbor(const int input_value,
|
||||
const int32_t input_size,
|
||||
const int32_t output_size,
|
||||
const bool align_corners,
|
||||
const bool half_pixel_centers) {
|
||||
const float scale =
|
||||
(align_corners && output_size > 1)
|
||||
? (input_size - 1) / static_cast<float>(output_size - 1)
|
||||
: input_size / static_cast<float>(output_size);
|
||||
const float offset = half_pixel_centers ? 0.5f : 0.0f;
|
||||
int32_t output_value = std::min(
|
||||
align_corners
|
||||
? static_cast<int32_t>(TfLiteRound((input_value + offset) * scale))
|
||||
: static_cast<int32_t>(std::floor((input_value + offset) * scale)),
|
||||
input_size - 1);
|
||||
if (half_pixel_centers) {
|
||||
output_value = std::max(static_cast<int32_t>(0), output_value);
|
||||
}
|
||||
return output_value;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void ResizeNearestNeighbor(
|
||||
const tflite::ResizeNearestNeighborParams& op_params,
|
||||
const RuntimeShape& unextended_input_shape, const T* input_data,
|
||||
const RuntimeShape& output_size_shape, const int32_t* output_size_data,
|
||||
const RuntimeShape& unextended_output_shape, T* output_data) {
|
||||
TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
|
||||
|
||||
const RuntimeShape input_shape =
|
||||
RuntimeShape::ExtendedShape(4, unextended_input_shape);
|
||||
const RuntimeShape output_shape =
|
||||
RuntimeShape::ExtendedShape(4, unextended_output_shape);
|
||||
|
||||
int32_t batches = MatchingDim(input_shape, 0, output_shape, 0);
|
||||
int32_t input_height = input_shape.Dims(1);
|
||||
int32_t input_width = input_shape.Dims(2);
|
||||
int32_t depth = MatchingDim(input_shape, 3, output_shape, 3);
|
||||
|
||||
// The Tensorflow version of this op allows resize on the width and height
|
||||
// axis only.
|
||||
TFLITE_DCHECK_EQ(output_size_shape.FlatSize(), 2);
|
||||
int32_t output_height = output_size_data[0];
|
||||
int32_t output_width = output_size_data[1];
|
||||
|
||||
const int col_offset = input_shape.Dims(3);
|
||||
const int row_offset = input_shape.Dims(2) * col_offset;
|
||||
const int batch_offset = input_shape.Dims(1) * row_offset;
|
||||
|
||||
const T* input_ptr = input_data;
|
||||
T* output_ptr = output_data;
|
||||
for (int b = 0; b < batches; ++b) {
|
||||
for (int y = 0; y < output_height; ++y) {
|
||||
int32_t in_y = GetNearestNeighbor(y, input_height, output_height,
|
||||
op_params.align_corners,
|
||||
op_params.half_pixel_centers);
|
||||
const T* y_input_ptr = input_ptr + in_y * row_offset;
|
||||
for (int x = 0; x < output_width; ++x) {
|
||||
int32_t in_x = GetNearestNeighbor(x, input_width, output_width,
|
||||
op_params.align_corners,
|
||||
op_params.half_pixel_centers);
|
||||
const T* x_input_ptr = y_input_ptr + in_x * col_offset;
|
||||
memcpy(output_ptr, x_input_ptr, depth * sizeof(T));
|
||||
output_ptr += depth;
|
||||
}
|
||||
}
|
||||
input_ptr += batch_offset;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_RESIZE_NEAREST_NEIGHBOR_H_
|
||||
@@ -0,0 +1,51 @@
|
||||
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ROUND_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ROUND_H_
|
||||
|
||||
#include <cmath>
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
namespace reference_ops {
|
||||
|
||||
inline float RoundToNearest(float value) {
|
||||
auto floor_val = std::floor(value);
|
||||
auto diff = value - floor_val;
|
||||
if ((diff < 0.5f) ||
|
||||
((diff == 0.5f) && (static_cast<int>(floor_val) % 2 == 0))) {
|
||||
return floor_val;
|
||||
} else {
|
||||
return floor_val = floor_val + 1.0f;
|
||||
}
|
||||
}
|
||||
|
||||
inline void Round(const RuntimeShape& input_shape, const float* input_data,
|
||||
const RuntimeShape& output_shape, float* output_data) {
|
||||
const int flat_size = MatchingFlatSize(input_shape, output_shape);
|
||||
for (int i = 0; i < flat_size; ++i) {
|
||||
// Note that this implementation matches that of tensorFlow tf.round
|
||||
// and corresponds to the bankers rounding method.
|
||||
// cfenv (for fesetround) is not yet supported universally on Android, so
|
||||
// using a work around.
|
||||
output_data[i] = RoundToNearest(input_data[i]);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ROUND_H_
|
||||
@@ -0,0 +1,80 @@
|
||||
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SLICE_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SLICE_H_
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/portable_tensor.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
namespace reference_ops {
|
||||
|
||||
template <typename T>
|
||||
inline void Slice(const tflite::SliceParams& op_params,
|
||||
const RuntimeShape& input_shape,
|
||||
const RuntimeShape& output_shape,
|
||||
SequentialTensorWriter<T>* writer) {
|
||||
const RuntimeShape ext_shape = RuntimeShape::ExtendedShape(5, input_shape);
|
||||
TFLITE_DCHECK_LE(op_params.begin_count, 5);
|
||||
TFLITE_DCHECK_LE(op_params.size_count, 5);
|
||||
const int begin_count = op_params.begin_count;
|
||||
const int size_count = op_params.size_count;
|
||||
// We front-pad the begin and size vectors.
|
||||
int start[5];
|
||||
int stop[5];
|
||||
for (int i = 0; i < 5; ++i) {
|
||||
int padded_i = 5 - i;
|
||||
start[i] =
|
||||
begin_count < padded_i ? 0 : op_params.begin[begin_count - padded_i];
|
||||
stop[i] =
|
||||
(size_count < padded_i || op_params.size[size_count - padded_i] == -1)
|
||||
? ext_shape.Dims(i)
|
||||
: start[i] + op_params.size[size_count - padded_i];
|
||||
}
|
||||
|
||||
for (int i0 = start[0]; i0 < stop[0]; ++i0) {
|
||||
for (int i1 = start[1]; i1 < stop[1]; ++i1) {
|
||||
for (int i2 = start[2]; i2 < stop[2]; ++i2) {
|
||||
for (int i3 = start[3]; i3 < stop[3]; ++i3) {
|
||||
for (int i4 = start[4]; i4 < stop[4]; ++i4) {
|
||||
writer->Write(Offset(ext_shape, i0, i1, i2, i3, i4));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void Slice(const tflite::SliceParams& op_params,
|
||||
const RuntimeShape& input_shape, const T* input_data,
|
||||
const RuntimeShape& output_shape, T* output_data) {
|
||||
SequentialTensorWriter<T> writer(input_data, output_data);
|
||||
return Slice(op_params, input_shape, output_shape, &writer);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void Slice(const tflite::SliceParams& op_params,
|
||||
const RuntimeShape& input_shape, const TfLiteTensor* input,
|
||||
const RuntimeShape& output_shape, TfLiteTensor* output) {
|
||||
SequentialTensorWriter<T> writer(input, output);
|
||||
return Slice(op_params, input_shape, output_shape, &writer);
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SLICE_H_
|
||||
@@ -0,0 +1,232 @@
|
||||
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SOFTMAX_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SOFTMAX_H_
|
||||
|
||||
#include <limits>
|
||||
|
||||
#include "fixedpoint/fixedpoint.h"
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/cppmath.h"
|
||||
#include "tensorflow/lite/kernels/internal/quantization_util.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
#include "tensorflow/lite/kernels/op_macros.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_ops {
|
||||
|
||||
inline void Softmax(const SoftmaxParams& params,
|
||||
const RuntimeShape& input_shape, const float* input_data,
|
||||
const RuntimeShape& output_shape, float* output_data) {
|
||||
const int trailing_dim = input_shape.DimensionsCount() - 1;
|
||||
const int outer_size =
|
||||
MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
|
||||
const int depth =
|
||||
MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
|
||||
|
||||
for (int i = 0; i < outer_size; ++i) {
|
||||
// Find max element value which we'll use to ensure numerical stability
|
||||
// taking advantage of the following equality:
|
||||
// exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))
|
||||
float max = std::numeric_limits<float>::lowest();
|
||||
for (int c = 0; c < depth; ++c) {
|
||||
max = std::max(max, input_data[i * depth + c]);
|
||||
}
|
||||
|
||||
// Compute sum.
|
||||
float sum = 0.f;
|
||||
for (int c = 0; c < depth; ++c) {
|
||||
const float exp_c = std::exp((input_data[i * depth + c] - max) *
|
||||
static_cast<float>(params.beta));
|
||||
output_data[i * depth + c] = exp_c;
|
||||
sum += exp_c;
|
||||
}
|
||||
|
||||
// Compute result.
|
||||
for (int c = 0; c < depth; ++c) {
|
||||
output_data[i * depth + c] = output_data[i * depth + c] / sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Quantized softmax with int8_t/uint8_t input and int8_t/uint8_t/int16_t
|
||||
// output.
|
||||
template <typename InputT, typename OutputT>
|
||||
inline void Softmax(const SoftmaxParams& params,
|
||||
const RuntimeShape& input_shape, const InputT* input_data,
|
||||
const RuntimeShape& output_shape, OutputT* output_data) {
|
||||
const int32_t input_beta_multiplier = params.input_multiplier;
|
||||
const int32_t input_beta_left_shift = params.input_left_shift;
|
||||
const int diff_min = params.diff_min;
|
||||
// The representation chosen for the input to the exp() function is Q5.26.
|
||||
// We need to leave extra space since values that we skip might be as large as
|
||||
// -32 before multiplying by input_beta_multiplier, and therefore as large as
|
||||
// -16 afterwards. Note that exp(-8) is definitely not insignificant to
|
||||
// accumulation, but exp(-16) definitely is.
|
||||
static const int kScaledDiffIntegerBits = 5;
|
||||
static const int kAccumulationIntegerBits = 12;
|
||||
using FixedPointScaledDiff =
|
||||
gemmlowp::FixedPoint<int32_t, kScaledDiffIntegerBits>;
|
||||
using FixedPointAccum =
|
||||
gemmlowp::FixedPoint<int32_t, kAccumulationIntegerBits>;
|
||||
using FixedPoint0 = gemmlowp::FixedPoint<int32_t, 0>;
|
||||
|
||||
const int trailing_dim = input_shape.DimensionsCount() - 1;
|
||||
const int outer_size =
|
||||
MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
|
||||
const int depth =
|
||||
MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
|
||||
|
||||
for (int i = 0; i < outer_size; ++i) {
|
||||
InputT max_in_row = std::numeric_limits<InputT>::min();
|
||||
for (int c = 0; c < depth; ++c) {
|
||||
max_in_row = std::max(max_in_row, input_data[i * depth + c]);
|
||||
}
|
||||
|
||||
FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
|
||||
for (int c = 0; c < depth; ++c) {
|
||||
int32_t input_diff =
|
||||
static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
|
||||
if (input_diff >= diff_min) {
|
||||
const int32_t input_diff_rescaled =
|
||||
MultiplyByQuantizedMultiplierGreaterThanOne(
|
||||
input_diff, input_beta_multiplier, input_beta_left_shift);
|
||||
const FixedPointScaledDiff scaled_diff_f8 =
|
||||
FixedPointScaledDiff::FromRaw(input_diff_rescaled);
|
||||
sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
|
||||
exp_on_negative_values(scaled_diff_f8));
|
||||
}
|
||||
}
|
||||
|
||||
int num_bits_over_unit;
|
||||
FixedPoint0 shifted_scale = FixedPoint0::FromRaw(GetReciprocal(
|
||||
sum_of_exps.raw(), kAccumulationIntegerBits, &num_bits_over_unit));
|
||||
|
||||
for (int c = 0; c < depth; ++c) {
|
||||
int32_t input_diff =
|
||||
static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
|
||||
if (input_diff >= diff_min) {
|
||||
const int32_t input_diff_rescaled =
|
||||
MultiplyByQuantizedMultiplierGreaterThanOne(
|
||||
input_diff, input_beta_multiplier, input_beta_left_shift);
|
||||
const FixedPointScaledDiff scaled_diff_f8 =
|
||||
FixedPointScaledDiff::FromRaw(input_diff_rescaled);
|
||||
|
||||
FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
|
||||
int32_t unsat_output = gemmlowp::RoundingDivideByPOT(
|
||||
(shifted_scale * exp_in_0).raw(),
|
||||
num_bits_over_unit + 31 - (sizeof(OutputT) * 8));
|
||||
|
||||
const int32_t shifted_output =
|
||||
unsat_output +
|
||||
static_cast<int32_t>(std::numeric_limits<OutputT>::min());
|
||||
|
||||
output_data[i * depth + c] = static_cast<OutputT>(std::max(
|
||||
std::min(shifted_output,
|
||||
static_cast<int32_t>(std::numeric_limits<OutputT>::max())),
|
||||
static_cast<int32_t>(std::numeric_limits<OutputT>::min())));
|
||||
} else {
|
||||
output_data[i * depth + c] = std::numeric_limits<OutputT>::min();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Computes exp(input - max_input)
|
||||
inline int16_t SoftMaxCalculateExp(const SoftmaxParams& params,
|
||||
const int16_t* input_data, const int depth,
|
||||
int16_t max_in_row, int i, int c) {
|
||||
int32_t input_diff = input_data[i * depth + c] - max_in_row;
|
||||
// scale the input_diff such that [-65535, 0] correspond to [-10.0, 0.0]
|
||||
// exp lut generated with range [-10, 0], as exp(-10) is negligible.
|
||||
int32_t scaled_diff = MultiplyByQuantizedMultiplier(
|
||||
input_diff, params.input_multiplier, params.input_left_shift);
|
||||
// recenter to [-32768, 32767]
|
||||
int32_t sym_scaled_diff = scaled_diff + 32767;
|
||||
int16_t sat_sym_scaled_diff =
|
||||
std::min(std::max(sym_scaled_diff, static_cast<int32_t>(-32768)),
|
||||
static_cast<int32_t>(32767));
|
||||
// apply the exp() LUT activation function
|
||||
return lut_lookup(sat_sym_scaled_diff, params.exp_lut);
|
||||
}
|
||||
// Quantized softmax with int16_t input and int16_t output.
|
||||
inline void SoftmaxInt16(const SoftmaxParams& params,
|
||||
const RuntimeShape& input_shape,
|
||||
const int16_t* input_data,
|
||||
const RuntimeShape& output_shape,
|
||||
int16_t* output_data) {
|
||||
const int trailing_dim = input_shape.DimensionsCount() - 1;
|
||||
const int outer_size =
|
||||
MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
|
||||
const int depth =
|
||||
MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
|
||||
|
||||
for (int i = 0; i < outer_size; ++i) {
|
||||
// Find the largest element
|
||||
int16_t max_in_row = std::numeric_limits<int16_t>::min();
|
||||
for (int c = 0; c < depth; ++c) {
|
||||
max_in_row = std::max(max_in_row, input_data[i * depth + c]);
|
||||
}
|
||||
|
||||
// This loops computes the exp values and their sum. We will need the exp
|
||||
// values later on in the function so we cache them in the output_data
|
||||
// buffer. This is an optimization done to avoid calculating the exp values
|
||||
// twice making use of the output_data buffer as scratch memory.
|
||||
int32_t sum_of_exps = 0; // Q16.15 fixed point format.
|
||||
int16_t* exp_results_Q015 = output_data + i * depth;
|
||||
for (int c = 0; c < depth; ++c) {
|
||||
exp_results_Q015[c] =
|
||||
SoftMaxCalculateExp(params, input_data, depth, max_in_row, i, c);
|
||||
sum_of_exps += exp_results_Q015[c];
|
||||
}
|
||||
|
||||
// Compute the reciprocal 1/sum_of_exps
|
||||
uint8_t headroom_plus_one =
|
||||
CountLeadingZeros(static_cast<uint32_t>(sum_of_exps));
|
||||
int32_t shifted_sum =
|
||||
((static_cast<int64_t>(sum_of_exps) << (headroom_plus_one - 1)) +
|
||||
(1 << 13)) >>
|
||||
14;
|
||||
// since the LUT computes 1/(1 + x) we need to first compute x = (sum - 1).
|
||||
// also, the LUT expects a symmetrical input, so we must also recenter x
|
||||
// from [0, 65535] to [-32768, 32767].
|
||||
int32_t sym_shifted_sum = shifted_sum + (-((1 << 15) + (1 << 16)));
|
||||
int16_t sat_sym_shifted_sum = static_cast<int16_t>(
|
||||
std::min(std::max(sym_shifted_sum, static_cast<int32_t>(-32768)),
|
||||
static_cast<int32_t>(32767)));
|
||||
// apply 1/(1 + x) LUT activation function
|
||||
int16_t reciprocal_scale_Q015 =
|
||||
lut_lookup(sat_sym_shifted_sum, params.one_over_one_plus_x_lut);
|
||||
|
||||
// Rescale the exp_result with reciprocal
|
||||
// range of output is [0, 32767] correspond to [0.0, 1.0]
|
||||
for (int c = 0; c < depth; ++c) {
|
||||
uint8_t right_shift = 31 - headroom_plus_one;
|
||||
int64_t round = 1 << (right_shift - 1);
|
||||
int32_t result = (static_cast<int64_t>(exp_results_Q015[c]) *
|
||||
static_cast<int64_t>(reciprocal_scale_Q015) +
|
||||
round) >>
|
||||
right_shift;
|
||||
output_data[i * depth + c] = static_cast<int16_t>(
|
||||
std::min(std::max(result, static_cast<int32_t>(0)),
|
||||
static_cast<int32_t>(32767)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SOFTMAX_H_
|
||||
@@ -0,0 +1,109 @@
|
||||
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SPACE_TO_BATCH_ND_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SPACE_TO_BATCH_ND_H_
|
||||
|
||||
#include <cmath>
|
||||
|
||||
#include "ruy/profiler/instrumentation.h" // from @ruy
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_ops {
|
||||
|
||||
// TODO(b/135760455): Move this method anonymous namespace in a cc file.
|
||||
inline RuntimeShape ExtendShapeSpaceToBatch(const RuntimeShape& shape) {
|
||||
if (shape.DimensionsCount() == 4) {
|
||||
return shape;
|
||||
}
|
||||
RuntimeShape new_shape(4, 1);
|
||||
new_shape.SetDim(0, shape.Dims(0));
|
||||
new_shape.SetDim(1, shape.Dims(1));
|
||||
new_shape.SetDim(3, shape.Dims(2));
|
||||
return new_shape;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void SpaceToBatchND(const SpaceToBatchParams& params,
|
||||
const RuntimeShape& unextended_input1_shape,
|
||||
const T* input1_data,
|
||||
const RuntimeShape& unextended_input2_shape,
|
||||
const int32_t* block_shape_data,
|
||||
const RuntimeShape& unextended_input3_shape,
|
||||
const int32_t* paddings_data,
|
||||
const RuntimeShape& unextended_output_shape,
|
||||
T* output_data) {
|
||||
ruy::profiler::ScopeLabel label("SpaceToBatchND");
|
||||
TFLITE_DCHECK_GE(unextended_input1_shape.DimensionsCount(), 3);
|
||||
TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(unextended_input1_shape.DimensionsCount(),
|
||||
unextended_output_shape.DimensionsCount());
|
||||
|
||||
// Extends the input/output shape from 3D to 4D if needed, NHC -> NH1C.
|
||||
const RuntimeShape input1_shape =
|
||||
ExtendShapeSpaceToBatch(unextended_input1_shape);
|
||||
const RuntimeShape output_shape =
|
||||
ExtendShapeSpaceToBatch(unextended_output_shape);
|
||||
|
||||
const int depth = input1_shape.Dims(3);
|
||||
const int input_width = input1_shape.Dims(2);
|
||||
const int input_height = input1_shape.Dims(1);
|
||||
const int input_batch_size = input1_shape.Dims(0);
|
||||
|
||||
const int output_width = output_shape.Dims(2);
|
||||
const int output_height = output_shape.Dims(1);
|
||||
const int output_batch_size = output_shape.Dims(0);
|
||||
|
||||
const int block_shape_height = block_shape_data[0];
|
||||
const int block_shape_width =
|
||||
unextended_input1_shape.DimensionsCount() == 4 ? block_shape_data[1] : 1;
|
||||
const int padding_top = paddings_data[0];
|
||||
const int padding_left =
|
||||
unextended_input1_shape.DimensionsCount() == 4 ? paddings_data[2] : 0;
|
||||
|
||||
// For uint8 quantized, the correct padding "zero value" is the output offset.
|
||||
const int32_t pad_value = params.output_offset;
|
||||
for (int out_b = 0; out_b < output_batch_size; ++out_b) {
|
||||
int input_batch = out_b % input_batch_size;
|
||||
int shift_w = (out_b / input_batch_size) % block_shape_width;
|
||||
int shift_h = (out_b / input_batch_size) / block_shape_width;
|
||||
for (int out_h = 0; out_h < output_height; ++out_h) {
|
||||
for (int out_w = 0; out_w < output_width; ++out_w) {
|
||||
T* out = output_data + Offset(output_shape, out_b, out_h, out_w, 0);
|
||||
if (out_h * block_shape_height + shift_h < padding_top ||
|
||||
out_h * block_shape_height + shift_h >=
|
||||
padding_top + input_height ||
|
||||
out_w * block_shape_width + shift_w < padding_left ||
|
||||
out_w * block_shape_width + shift_w >= padding_left + input_width) {
|
||||
// This may not execute correctly when pad_value != 0 and T != uint8.
|
||||
memset(out, pad_value, depth * sizeof(T));
|
||||
} else {
|
||||
const T* in =
|
||||
input1_data +
|
||||
Offset(input1_shape, input_batch,
|
||||
(out_h * block_shape_height + shift_h) - padding_top,
|
||||
(out_w * block_shape_width + shift_w) - padding_left, 0);
|
||||
memcpy(out, in, depth * sizeof(T));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SPACE_TO_BATCH_ND_H_
|
||||
@@ -0,0 +1,80 @@
|
||||
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SPACE_TO_DEPTH_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SPACE_TO_DEPTH_H_
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_ops {
|
||||
|
||||
template <typename T>
|
||||
inline void SpaceToDepth(const tflite::SpaceToDepthParams& op_params,
|
||||
const RuntimeShape& unextended_input_shape,
|
||||
const T* input_data,
|
||||
const RuntimeShape& unextended_output_shape,
|
||||
T* output_data) {
|
||||
TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
|
||||
const RuntimeShape input_shape =
|
||||
RuntimeShape::ExtendedShape(4, unextended_input_shape);
|
||||
const RuntimeShape output_shape =
|
||||
RuntimeShape::ExtendedShape(4, unextended_output_shape);
|
||||
|
||||
const int input_depth = input_shape.Dims(3);
|
||||
const int input_width = input_shape.Dims(2);
|
||||
const int input_height = input_shape.Dims(1);
|
||||
const int input_batch = input_shape.Dims(0);
|
||||
|
||||
const int output_depth = output_shape.Dims(3);
|
||||
const int output_width = output_shape.Dims(2);
|
||||
const int output_height = output_shape.Dims(1);
|
||||
const int output_batch = output_shape.Dims(0);
|
||||
|
||||
const int32_t block_size = op_params.block_size;
|
||||
|
||||
TFLITE_DCHECK_EQ(input_width, output_width * block_size);
|
||||
TFLITE_DCHECK_EQ(input_height, output_height * block_size);
|
||||
TFLITE_DCHECK_EQ(input_depth * block_size * block_size, output_depth);
|
||||
TFLITE_DCHECK_EQ(input_batch, output_batch);
|
||||
|
||||
for (int in_b = 0; in_b < input_batch; ++in_b) {
|
||||
for (int in_h = 0; in_h < input_height; ++in_h) {
|
||||
for (int in_w = 0; in_w < input_width; ++in_w) {
|
||||
for (int in_d = 0; in_d < input_depth; ++in_d) {
|
||||
const int out_d =
|
||||
in_d + ((in_h % block_size) * block_size + in_w % block_size) *
|
||||
input_depth;
|
||||
const int out_w = in_w / block_size;
|
||||
const int out_h = in_h / block_size;
|
||||
const int out_b = in_b;
|
||||
|
||||
const int input_index = Offset(input_shape, in_b, in_h, in_w, in_d);
|
||||
const int output_index =
|
||||
Offset(output_shape, out_b, out_h, out_w, out_d);
|
||||
|
||||
output_data[output_index] = input_data[input_index];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SPACE_TO_DEPTH_H_
|
||||
@@ -0,0 +1,121 @@
|
||||
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_STRIDED_SLICE_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_STRIDED_SLICE_H_
|
||||
|
||||
#include "ruy/profiler/instrumentation.h" // from @ruy
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/compatibility.h"
|
||||
#include "tensorflow/lite/kernels/internal/portable_tensor.h"
|
||||
#include "tensorflow/lite/kernels/internal/strided_slice_logic.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
namespace reference_ops {
|
||||
|
||||
template <typename T>
|
||||
inline void StridedSlice(const tflite::StridedSliceParams& op_params,
|
||||
const RuntimeShape& unextended_input_shape,
|
||||
const RuntimeShape& unextended_output_shape,
|
||||
SequentialTensorWriter<T>* writer) {
|
||||
using strided_slice::LoopCondition;
|
||||
using strided_slice::StartForAxis;
|
||||
using strided_slice::StopForAxis;
|
||||
|
||||
ruy::profiler::ScopeLabel label("StridedSlice");
|
||||
|
||||
// Note that the output_shape is not used herein.
|
||||
tflite::StridedSliceParams params_copy = op_params;
|
||||
|
||||
TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 5);
|
||||
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 5);
|
||||
const RuntimeShape input_shape =
|
||||
RuntimeShape::ExtendedShape(5, unextended_input_shape);
|
||||
const RuntimeShape output_shape =
|
||||
RuntimeShape::ExtendedShape(5, unextended_output_shape);
|
||||
|
||||
// Reverse and pad to 5 dimensions because that is what the runtime code
|
||||
// requires (ie. all shapes must be 5D and are given backwards).
|
||||
strided_slice::StridedSlicePadIndices(¶ms_copy, 5);
|
||||
|
||||
const int start_0 = StartForAxis(params_copy, input_shape, 0);
|
||||
const int stop_0 = StopForAxis(params_copy, input_shape, 0, start_0);
|
||||
const int start_1 = StartForAxis(params_copy, input_shape, 1);
|
||||
const int stop_1 = StopForAxis(params_copy, input_shape, 1, start_1);
|
||||
const int start_2 = StartForAxis(params_copy, input_shape, 2);
|
||||
const int stop_2 = StopForAxis(params_copy, input_shape, 2, start_2);
|
||||
const int start_3 = StartForAxis(params_copy, input_shape, 3);
|
||||
const int stop_3 = StopForAxis(params_copy, input_shape, 3, start_3);
|
||||
const int start_4 = StartForAxis(params_copy, input_shape, 4);
|
||||
const int stop_4 = StopForAxis(params_copy, input_shape, 4, start_4);
|
||||
|
||||
for (int offset_0 = start_0 * input_shape.Dims(1),
|
||||
end_0 = stop_0 * input_shape.Dims(1),
|
||||
step_0 = params_copy.strides[0] * input_shape.Dims(1);
|
||||
!LoopCondition(offset_0, end_0, params_copy.strides[0]);
|
||||
offset_0 += step_0) {
|
||||
for (int offset_1 = (offset_0 + start_1) * input_shape.Dims(2),
|
||||
end_1 = (offset_0 + stop_1) * input_shape.Dims(2),
|
||||
step_1 = params_copy.strides[1] * input_shape.Dims(2);
|
||||
!LoopCondition(offset_1, end_1, params_copy.strides[1]);
|
||||
offset_1 += step_1) {
|
||||
for (int offset_2 = (offset_1 + start_2) * input_shape.Dims(3),
|
||||
end_2 = (offset_1 + stop_2) * input_shape.Dims(3),
|
||||
step_2 = params_copy.strides[2] * input_shape.Dims(3);
|
||||
!LoopCondition(offset_2, end_2, params_copy.strides[2]);
|
||||
offset_2 += step_2) {
|
||||
for (int offset_3 = (offset_2 + start_3) * input_shape.Dims(4),
|
||||
end_3 = (offset_2 + stop_3) * input_shape.Dims(4),
|
||||
step_3 = params_copy.strides[3] * input_shape.Dims(4);
|
||||
!LoopCondition(offset_3, end_3, params_copy.strides[3]);
|
||||
offset_3 += step_3) {
|
||||
for (int offset_4 = offset_3 + start_4, end_4 = offset_3 + stop_4;
|
||||
!LoopCondition(offset_4, end_4, params_copy.strides[4]);
|
||||
offset_4 += params_copy.strides[4]) {
|
||||
writer->Write(offset_4);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void StridedSlice(const tflite::StridedSliceParams& op_params,
|
||||
const RuntimeShape& unextended_input_shape,
|
||||
const T* input_data,
|
||||
const RuntimeShape& unextended_output_shape,
|
||||
T* output_data) {
|
||||
SequentialTensorWriter<T> writer(input_data, output_data);
|
||||
StridedSlice<T>(op_params, unextended_input_shape, unextended_output_shape,
|
||||
&writer);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void StridedSlice(const tflite::StridedSliceParams& op_params,
|
||||
const RuntimeShape& unextended_input_shape,
|
||||
const TfLiteTensor* input,
|
||||
const RuntimeShape& unextended_output_shape,
|
||||
TfLiteTensor* output) {
|
||||
SequentialTensorWriter<T> writer(input, output);
|
||||
StridedSlice<T>(op_params, unextended_input_shape, unextended_output_shape,
|
||||
&writer);
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_STRIDED_SLICE_H_
|
||||
@@ -0,0 +1,476 @@
|
||||
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SUB_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SUB_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
|
||||
#include "ruy/profiler/instrumentation.h" // from @ruy
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/compatibility.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
namespace reference_ops {
|
||||
|
||||
inline void SubNonBroadcast(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape,
|
||||
const float* input1_data,
|
||||
const RuntimeShape& input2_shape,
|
||||
const float* input2_data,
|
||||
const RuntimeShape& output_shape,
|
||||
float* output_data) {
|
||||
const int flat_size =
|
||||
MatchingElementsSize(input1_shape, input2_shape, output_shape);
|
||||
for (int i = 0; i < flat_size; ++i) {
|
||||
output_data[i] = ActivationFunctionWithMinMax(
|
||||
input1_data[i] - input2_data[i], params.float_activation_min,
|
||||
params.float_activation_max);
|
||||
}
|
||||
}
|
||||
|
||||
inline void SubNonBroadcast(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape,
|
||||
const int32_t* input1_data,
|
||||
const RuntimeShape& input2_shape,
|
||||
const int32_t* input2_data,
|
||||
const RuntimeShape& output_shape,
|
||||
int32_t* output_data) {
|
||||
const int flat_size =
|
||||
MatchingElementsSize(input1_shape, input2_shape, output_shape);
|
||||
for (int i = 0; i < flat_size; ++i) {
|
||||
output_data[i] = ActivationFunctionWithMinMax(
|
||||
input1_data[i] - input2_data[i], params.quantized_activation_min,
|
||||
params.quantized_activation_max);
|
||||
}
|
||||
}
|
||||
|
||||
// TODO(b/151345304): We can implement BroadcastSub on buffers of arbitrary
|
||||
// dimensionality if the runtime code does a single loop over one dimension
|
||||
// that handles broadcasting as the base case. The code generator would then
|
||||
// generate max(D1, D2) nested for loops.
|
||||
template <int N = 5>
|
||||
inline void BroadcastSubSlow(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape,
|
||||
const float* input1_data,
|
||||
const RuntimeShape& input2_shape,
|
||||
const float* input2_data,
|
||||
const RuntimeShape& output_shape,
|
||||
float* output_data) {
|
||||
ruy::profiler::ScopeLabel label("BroadcastSubSlow/float");
|
||||
TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
|
||||
TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
|
||||
TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
|
||||
NdArrayDesc<N> desc1;
|
||||
NdArrayDesc<N> desc2;
|
||||
NdArrayDesc<N> output_desc;
|
||||
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
|
||||
&desc2);
|
||||
CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
|
||||
|
||||
// In Tensorflow, the dimensions are canonically named (batch_number, row,
|
||||
// col, channel), with extents (batches, height, width, depth), with the
|
||||
// trailing dimension changing most rapidly (channels has the smallest stride,
|
||||
// typically 1 element).
|
||||
//
|
||||
// In generated C code, we store arrays with the dimensions reversed. The
|
||||
// first dimension has smallest stride.
|
||||
//
|
||||
// We name our variables by their Tensorflow convention, but generate C code
|
||||
// nesting loops such that the innermost loop has the smallest stride for the
|
||||
// best cache behavior.
|
||||
auto sub_func = [&](int indexes[N]) {
|
||||
output_data[SubscriptToIndex(output_desc, indexes)] =
|
||||
ActivationFunctionWithMinMax(
|
||||
input1_data[SubscriptToIndex(desc1, indexes)] -
|
||||
input2_data[SubscriptToIndex(desc2, indexes)],
|
||||
params.float_activation_min, params.float_activation_max);
|
||||
};
|
||||
NDOpsHelper<N>(output_desc, sub_func);
|
||||
}
|
||||
|
||||
template <int N = 5>
|
||||
inline void BroadcastSubSlow(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape,
|
||||
const int32_t* input1_data,
|
||||
const RuntimeShape& input2_shape,
|
||||
const int32_t* input2_data,
|
||||
const RuntimeShape& output_shape,
|
||||
int32_t* output_data) {
|
||||
ruy::profiler::ScopeLabel label("BroadcastSubSlow/int32_t");
|
||||
TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
|
||||
TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
|
||||
TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
|
||||
NdArrayDesc<N> desc1;
|
||||
NdArrayDesc<N> desc2;
|
||||
NdArrayDesc<N> output_desc;
|
||||
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
|
||||
&desc2);
|
||||
CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
|
||||
|
||||
// In Tensorflow, the dimensions are canonically named (batch_number, row,
|
||||
// col, channel), with extents (batches, height, width, depth), with the
|
||||
// trailing dimension changing most rapidly (channels has the smallest stride,
|
||||
// typically 1 element).
|
||||
//
|
||||
// In generated C code, we store arrays with the dimensions reversed. The
|
||||
// first dimension has smallest stride.
|
||||
//
|
||||
// We name our variables by their Tensorflow convention, but generate C code
|
||||
// nesting loops such that the innermost loop has the smallest stride for the
|
||||
// best cache behavior.
|
||||
auto sub_func = [&](int indexes[N]) {
|
||||
output_data[SubscriptToIndex(output_desc, indexes)] =
|
||||
ActivationFunctionWithMinMax(
|
||||
input1_data[SubscriptToIndex(desc1, indexes)] -
|
||||
input2_data[SubscriptToIndex(desc2, indexes)],
|
||||
params.quantized_activation_min, params.quantized_activation_max);
|
||||
};
|
||||
NDOpsHelper<N>(output_desc, sub_func);
|
||||
}
|
||||
|
||||
template <int N = 5>
|
||||
void BroadcastSubSlow(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape,
|
||||
const int64_t* input1_data,
|
||||
const RuntimeShape& input2_shape,
|
||||
const int64_t* input2_data,
|
||||
const RuntimeShape& output_shape, int64_t* output_data) {
|
||||
ruy::profiler::ScopeLabel label("BroadcastSubSlow/int64_t");
|
||||
TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
|
||||
TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
|
||||
TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
|
||||
NdArrayDesc<N> desc1;
|
||||
NdArrayDesc<N> desc2;
|
||||
NdArrayDesc<N> output_desc;
|
||||
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
|
||||
&desc2);
|
||||
CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
|
||||
|
||||
// In Tensorflow, the dimensions are canonically named (batch_number, row,
|
||||
// col, channel), with extents (batches, height, width, depth), with the
|
||||
// trailing dimension changing most rapidly (channels has the smallest stride,
|
||||
// typically 1 element).
|
||||
//
|
||||
// In generated C code, we store arrays with the dimensions reversed. The
|
||||
// first dimension has smallest stride.
|
||||
//
|
||||
// We name our variables by their Tensorflow convention, but generate C code
|
||||
// nesting loops such that the innermost loop has the smallest stride for the
|
||||
// best cache behavior.
|
||||
auto sub_func = [&](int indexes[N]) {
|
||||
output_data[SubscriptToIndex(output_desc, indexes)] =
|
||||
ActivationFunctionWithMinMax(
|
||||
input1_data[SubscriptToIndex(desc1, indexes)] -
|
||||
input2_data[SubscriptToIndex(desc2, indexes)],
|
||||
params.int64_activation_min, params.int64_activation_max);
|
||||
};
|
||||
NDOpsHelper<N>(output_desc, sub_func);
|
||||
}
|
||||
|
||||
template <typename T, int N = 5>
|
||||
void BroadcastSubSlow(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape, const T* input1_data,
|
||||
const RuntimeShape& input2_shape, const T* input2_data,
|
||||
const RuntimeShape& output_shape, T* output_data) {
|
||||
ruy::profiler::ScopeLabel label("BroadcastSubSlow/templated");
|
||||
TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
|
||||
TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
|
||||
TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
|
||||
NdArrayDesc<N> desc1;
|
||||
NdArrayDesc<N> desc2;
|
||||
NdArrayDesc<N> output_desc;
|
||||
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
|
||||
&desc2);
|
||||
CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
|
||||
|
||||
// In Tensorflow, the dimensions are canonically named (batch_number, row,
|
||||
// col, channel), with extents (batches, height, width, depth), with the
|
||||
// trailing dimension changing most rapidly (channels has the smallest stride,
|
||||
// typically 1 element).
|
||||
//
|
||||
// In generated C code, we store arrays with the dimensions reversed. The
|
||||
// first dimension has smallest stride.
|
||||
//
|
||||
// We name our variables by their Tensorflow convention, but generate C code
|
||||
// nesting loops such that the innermost loop has the smallest stride for the
|
||||
// best cache behavior.
|
||||
auto sub_func = [&](int indexes[N]) {
|
||||
output_data[SubscriptToIndex(output_desc, indexes)] =
|
||||
ActivationFunctionWithMinMax(
|
||||
input1_data[SubscriptToIndex(desc1, indexes)] -
|
||||
input2_data[SubscriptToIndex(desc2, indexes)],
|
||||
params.quantized_activation_min, params.quantized_activation_max);
|
||||
};
|
||||
NDOpsHelper<N>(output_desc, sub_func);
|
||||
}
|
||||
|
||||
template <int N = 5>
|
||||
inline void BroadcastSub16POTSlow(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape,
|
||||
const int16_t* input1_data,
|
||||
const RuntimeShape& input2_shape,
|
||||
const int16_t* input2_data,
|
||||
const RuntimeShape& output_shape,
|
||||
int16_t* output_data) {
|
||||
ruy::profiler::ScopeLabel label("BroadcastSub16POTSlow/int16_t");
|
||||
NdArrayDesc<N> desc1;
|
||||
NdArrayDesc<N> desc2;
|
||||
NdArrayDesc<N> output_desc;
|
||||
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
|
||||
&desc2);
|
||||
CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
|
||||
|
||||
// In Tensorflow, the dimensions are canonically named (batch_number, row,
|
||||
// col, channel), with extents (batches, height, width, depth), with the
|
||||
// trailing dimension changing most rapidly (channels has the smallest stride,
|
||||
// typically 1 element).
|
||||
//
|
||||
// In generated C code, we store arrays with the dimensions reversed. The
|
||||
// first dimension has smallest stride.
|
||||
//
|
||||
// We name our variables by their Tensorflow convention, but generate C code
|
||||
// nesting loops such that the innermost loop has the smallest stride for the
|
||||
// best cache behavior.
|
||||
auto sub_func = [&](int indexes[N]) {
|
||||
const int32_t input1_val = input1_data[SubscriptToIndex(desc1, indexes)];
|
||||
const int32_t input2_val = input2_data[SubscriptToIndex(desc2, indexes)];
|
||||
const int32_t scaled_input1_val =
|
||||
gemmlowp::RoundingDivideByPOT(input1_val, -params.input1_shift);
|
||||
const int32_t scaled_input2_val =
|
||||
gemmlowp::RoundingDivideByPOT(input2_val, -params.input2_shift);
|
||||
const int32_t raw_output = scaled_input1_val - scaled_input2_val;
|
||||
const int32_t clamped_output =
|
||||
std::min(params.quantized_activation_max,
|
||||
std::max(params.quantized_activation_min, raw_output));
|
||||
output_data[SubscriptToIndex(output_desc, indexes)] =
|
||||
static_cast<int16_t>(clamped_output);
|
||||
};
|
||||
NDOpsHelper<N>(output_desc, sub_func);
|
||||
}
|
||||
|
||||
template <typename T, int N = 5>
|
||||
void BroadcastQuantSubSlow(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape,
|
||||
const T* input1_data,
|
||||
const RuntimeShape& input2_shape,
|
||||
const T* input2_data,
|
||||
const RuntimeShape& output_shape, T* output_data) {
|
||||
ruy::profiler::ScopeLabel label("BroadcastQuantSubSlow/T");
|
||||
NdArrayDesc<N> desc1;
|
||||
NdArrayDesc<N> desc2;
|
||||
NdArrayDesc<N> output_desc;
|
||||
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
|
||||
&desc2);
|
||||
CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
|
||||
|
||||
// In Tensorflow, the dimensions are canonically named (batch_number, row,
|
||||
// col, channel), with extents (batches, height, width, depth), with the
|
||||
// trailing dimension changing most rapidly (channels has the smallest stride,
|
||||
// typically 1 element).
|
||||
//
|
||||
// In generated C code, we store arrays with the dimensions reversed. The
|
||||
// first dimension has smallest stride.
|
||||
//
|
||||
// We name our variables by their Tensorflow convention, but generate C code
|
||||
// nesting loops such that the innermost loop has the smallest stride for the
|
||||
// best cache behavior.
|
||||
auto sub_func = [&](int indexes[N]) {
|
||||
const int32_t input1_val =
|
||||
params.input1_offset + input1_data[SubscriptToIndex(desc1, indexes)];
|
||||
const int32_t input2_val =
|
||||
params.input2_offset + input2_data[SubscriptToIndex(desc2, indexes)];
|
||||
const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
|
||||
const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
|
||||
const int32_t scaled_input1_val =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_input1_val, params.input1_multiplier, params.input1_shift);
|
||||
const int32_t scaled_input2_val =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_input2_val, params.input2_multiplier, params.input2_shift);
|
||||
const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
|
||||
const int32_t raw_output =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
raw_sub, params.output_multiplier, params.output_shift) +
|
||||
params.output_offset;
|
||||
const int32_t clamped_output =
|
||||
std::min(params.quantized_activation_max,
|
||||
std::max(params.quantized_activation_min, raw_output));
|
||||
output_data[SubscriptToIndex(output_desc, indexes)] =
|
||||
static_cast<T>(clamped_output);
|
||||
};
|
||||
NDOpsHelper<N>(output_desc, sub_func);
|
||||
}
|
||||
|
||||
// Element-wise add that can often be used for inner loop of broadcast add as
|
||||
// well as the non-broadcast add.
|
||||
template <typename T>
|
||||
inline void SubElementwise(int size, const ArithmeticParams& params,
|
||||
const T* input1_data, const T* input2_data,
|
||||
T* output_data) {
|
||||
for (int i = 0; i < size; ++i) {
|
||||
const int32_t input1_val = params.input1_offset + input1_data[i];
|
||||
const int32_t input2_val = params.input2_offset + input2_data[i];
|
||||
const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
|
||||
const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
|
||||
const int32_t scaled_input1_val =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_input1_val, params.input1_multiplier, params.input1_shift);
|
||||
const int32_t scaled_input2_val =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_input2_val, params.input2_multiplier, params.input2_shift);
|
||||
const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
|
||||
const int32_t raw_output =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
raw_sub, params.output_multiplier, params.output_shift) +
|
||||
params.output_offset;
|
||||
const int32_t clamped_output =
|
||||
std::min(params.quantized_activation_max,
|
||||
std::max(params.quantized_activation_min, raw_output));
|
||||
output_data[i] = static_cast<T>(clamped_output);
|
||||
}
|
||||
}
|
||||
|
||||
inline void Sub(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape, const uint8_t* input1_data,
|
||||
const RuntimeShape& input2_shape, const uint8_t* input2_data,
|
||||
const RuntimeShape& output_shape, uint8_t* output_data) {
|
||||
TFLITE_DCHECK_LE(params.quantized_activation_min,
|
||||
params.quantized_activation_max);
|
||||
const int flat_size =
|
||||
MatchingElementsSize(input1_shape, input2_shape, output_shape);
|
||||
|
||||
TFLITE_DCHECK_GT(params.input1_offset, -256);
|
||||
TFLITE_DCHECK_GT(params.input2_offset, -256);
|
||||
TFLITE_DCHECK_LT(params.input1_offset, 256);
|
||||
TFLITE_DCHECK_LT(params.input2_offset, 256);
|
||||
SubElementwise(flat_size, params, input1_data, input2_data, output_data);
|
||||
}
|
||||
|
||||
inline void Sub(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape, const int8_t* input1_data,
|
||||
const RuntimeShape& input2_shape, const int8_t* input2_data,
|
||||
const RuntimeShape& output_shape, int8_t* output_data) {
|
||||
TFLITE_DCHECK_LE(params.quantized_activation_min,
|
||||
params.quantized_activation_max);
|
||||
|
||||
const int flat_size =
|
||||
MatchingElementsSize(input1_shape, input2_shape, output_shape);
|
||||
|
||||
TFLITE_DCHECK_GE(params.input1_offset, -128);
|
||||
TFLITE_DCHECK_GE(params.input2_offset, -128);
|
||||
// offset = -quantization_params.zero_point in PrepareGeneralSubOp().
|
||||
// So it's maximum can be 128 not 127.
|
||||
TFLITE_DCHECK_LE(params.input1_offset, 128);
|
||||
TFLITE_DCHECK_LE(params.input2_offset, 128);
|
||||
SubElementwise(flat_size, params, input1_data, input2_data, output_data);
|
||||
}
|
||||
|
||||
inline void Sub(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape, const int16_t* input1_data,
|
||||
const RuntimeShape& input2_shape, const int16_t* input2_data,
|
||||
const RuntimeShape& output_shape, int16_t* output_data) {
|
||||
TFLITE_DCHECK_LE(params.quantized_activation_min,
|
||||
params.quantized_activation_max);
|
||||
|
||||
const int flat_size =
|
||||
MatchingElementsSize(input1_shape, input2_shape, output_shape);
|
||||
|
||||
TFLITE_DCHECK_EQ(params.input1_offset, 0);
|
||||
TFLITE_DCHECK_EQ(params.input2_offset, 0);
|
||||
SubElementwise(flat_size, params, input1_data, input2_data, output_data);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void Sub(const ArithmeticParams& params, const RuntimeShape& input1_shape,
|
||||
const T* input1_data, const RuntimeShape& input2_shape,
|
||||
const T* input2_data, const RuntimeShape& output_shape,
|
||||
T* output_data) {
|
||||
NdArrayDesc<4> desc1;
|
||||
NdArrayDesc<4> desc2;
|
||||
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
|
||||
&desc2);
|
||||
const RuntimeShape extended_output_shape =
|
||||
RuntimeShape::ExtendedShape(4, output_shape);
|
||||
|
||||
// In Tensorflow, the dimensions are canonically named (batch_number, row,
|
||||
// col, channel), with extents (batches, height, width, depth), with the
|
||||
// trailing dimension changing most rapidly (channels has the smallest stride,
|
||||
// typically 1 element).
|
||||
//
|
||||
// In generated C code, we store arrays with the dimensions reversed. The
|
||||
// first dimension has smallest stride.
|
||||
//
|
||||
// We name our variables by their Tensorflow convention, but generate C code
|
||||
// nesting loops such that the innermost loop has the smallest stride for the
|
||||
// best cache behavior.
|
||||
for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
|
||||
for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
|
||||
for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
|
||||
for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
|
||||
output_data[Offset(extended_output_shape, b, y, x, c)] =
|
||||
input1_data[SubscriptToIndex(desc1, b, y, x, c)] -
|
||||
input2_data[SubscriptToIndex(desc2, b, y, x, c)];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void SetActivationMinMax(const ArithmeticParams& params,
|
||||
int32_t* activation_min,
|
||||
int32_t* activation_max) {
|
||||
*activation_min = params.quantized_activation_min;
|
||||
*activation_max = params.quantized_activation_max;
|
||||
}
|
||||
|
||||
inline void SetActivationMinMax(const ArithmeticParams& params,
|
||||
float* activation_min, float* activation_max) {
|
||||
*activation_min = params.float_activation_min;
|
||||
*activation_max = params.float_activation_max;
|
||||
}
|
||||
|
||||
inline void SetActivationMinMax(const ArithmeticParams& params,
|
||||
int64_t* activation_min,
|
||||
int64_t* activation_max) {
|
||||
*activation_min = params.int64_activation_min;
|
||||
*activation_max = params.int64_activation_max;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void SubWithActivation(
|
||||
const ArithmeticParams& params, const RuntimeShape& input1_shape,
|
||||
const T* input1_data, const RuntimeShape& input2_shape,
|
||||
const T* input2_data, const RuntimeShape& output_shape, T* output_data) {
|
||||
ruy::profiler::ScopeLabel label("SubWithActivation");
|
||||
const int flat_size =
|
||||
MatchingElementsSize(input1_shape, input2_shape, output_shape);
|
||||
T activation_min, activation_max;
|
||||
SetActivationMinMax(params, &activation_min, &activation_max);
|
||||
|
||||
for (int i = 0; i < flat_size; ++i) {
|
||||
output_data[i] = ActivationFunctionWithMinMax(
|
||||
input1_data[i] - input2_data[i], activation_min, activation_max);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SUB_H_
|
||||
@@ -0,0 +1,129 @@
|
||||
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TANH_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TANH_H_
|
||||
|
||||
#include <cmath>
|
||||
|
||||
#include "fixedpoint/fixedpoint.h"
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/cppmath.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
#include "tensorflow/lite/kernels/op_macros.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace reference_ops {
|
||||
|
||||
inline void Tanh(const RuntimeShape& input_shape, const float* input_data,
|
||||
const RuntimeShape& output_shape, float* output_data) {
|
||||
const int flat_size = MatchingFlatSize(input_shape, output_shape);
|
||||
|
||||
for (int i = 0; i < flat_size; i++) {
|
||||
float val = input_data[i];
|
||||
float result = std::tanh(val);
|
||||
output_data[i] = result;
|
||||
}
|
||||
}
|
||||
|
||||
// Convenience version that allows, for example, generated-code calls to be
|
||||
// uniform between data types.
|
||||
inline void Tanh(const TanhParams&, const RuntimeShape& input_shape,
|
||||
const float* input_data, const RuntimeShape& output_shape,
|
||||
float* output_data) {
|
||||
// Drop params: not needed.
|
||||
Tanh(input_shape, input_data, output_shape, output_data);
|
||||
}
|
||||
|
||||
inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape,
|
||||
const int16_t* input_data, const RuntimeShape& output_shape,
|
||||
int16_t* output_data) {
|
||||
const int input_left_shift = params.input_left_shift;
|
||||
// Support for shifts is limited until we have a parameterized version of
|
||||
// SaturatingRoundingMultiplyByPOT().
|
||||
TFLITE_DCHECK_GE(input_left_shift, 0);
|
||||
TFLITE_DCHECK_LE(input_left_shift, 1);
|
||||
|
||||
const int flat_size = MatchingFlatSize(input_shape, output_shape);
|
||||
|
||||
// F0 uses 0 integer bits, range [-1, 1].
|
||||
// This is the return type of math functions such as tanh, logistic,
|
||||
// whose range is in [-1, 1].
|
||||
using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
|
||||
// F3 uses 3 integer bits, range [-8, 8], the input range expected here.
|
||||
using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
|
||||
|
||||
if (input_left_shift == 0) {
|
||||
for (int i = 0; i < flat_size; i++) {
|
||||
F3 input = F3::FromRaw(input_data[i]);
|
||||
F0 output = gemmlowp::tanh(input);
|
||||
output_data[i] = output.raw();
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < flat_size; i++) {
|
||||
F3 input = F3::FromRaw(
|
||||
gemmlowp::SaturatingRoundingMultiplyByPOT<1>(input_data[i]));
|
||||
F0 output = gemmlowp::tanh(input);
|
||||
output_data[i] = output.raw();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape,
|
||||
const uint8_t* input_data, const RuntimeShape& output_shape,
|
||||
uint8_t* output_data) {
|
||||
const int32_t input_zero_point = params.input_zero_point;
|
||||
const int32_t input_range_radius = params.input_range_radius;
|
||||
const int32_t input_multiplier = params.input_multiplier;
|
||||
const int input_left_shift = params.input_left_shift;
|
||||
const int32_t output_zero_point = 128;
|
||||
const int flat_size = MatchingFlatSize(input_shape, output_shape);
|
||||
|
||||
for (int i = 0; i < flat_size; i++) {
|
||||
const uint8_t input_val_u8 = input_data[i];
|
||||
const int32_t input_val_centered =
|
||||
static_cast<int32_t>(input_val_u8) - input_zero_point;
|
||||
uint8_t output_val;
|
||||
if (input_val_centered <= -input_range_radius) {
|
||||
output_val = 0;
|
||||
} else if (input_val_centered >= input_range_radius) {
|
||||
output_val = 255;
|
||||
} else {
|
||||
const int32_t input_val_rescaled =
|
||||
MultiplyByQuantizedMultiplierGreaterThanOne(
|
||||
input_val_centered, input_multiplier, input_left_shift);
|
||||
using FixedPoint4 = gemmlowp::FixedPoint<int32_t, 4>;
|
||||
using FixedPoint0 = gemmlowp::FixedPoint<int32_t, 0>;
|
||||
const FixedPoint4 input_val_f4 = FixedPoint4::FromRaw(input_val_rescaled);
|
||||
const FixedPoint0 output_val_f0 = gemmlowp::tanh(input_val_f4);
|
||||
// Convert from Q0.31 to Q24.7.
|
||||
using gemmlowp::RoundingDivideByPOT;
|
||||
int32_t output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 24);
|
||||
output_val_s32 += output_zero_point;
|
||||
if (output_val_s32 == 256) {
|
||||
output_val_s32 = 255;
|
||||
}
|
||||
// Reinterpret as Q0.7, encoded in uint8_t.
|
||||
TFLITE_DCHECK_GE(output_val_s32, 0);
|
||||
TFLITE_DCHECK_LE(output_val_s32, 255);
|
||||
output_val = static_cast<uint8_t>(output_val_s32);
|
||||
}
|
||||
output_data[i] = output_val;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TANH_H_
|
||||
@@ -0,0 +1,111 @@
|
||||
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TRANSPOSE_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TRANSPOSE_H_
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
namespace reference_ops {
|
||||
|
||||
template <typename T, int N>
|
||||
void TransposeImpl(const TransposeParams& params,
|
||||
const RuntimeShape& unextended_input_shape,
|
||||
const T* input_data,
|
||||
const RuntimeShape& unextended_output_shape,
|
||||
T* output_data) {
|
||||
const int unextended_input_size = unextended_input_shape.DimensionsCount();
|
||||
const int unextended_output_size = unextended_output_shape.DimensionsCount();
|
||||
TFLITE_DCHECK_LE(unextended_input_size, N);
|
||||
TFLITE_DCHECK_LE(unextended_output_size, N);
|
||||
TFLITE_DCHECK_EQ(unextended_output_size, params.perm_count);
|
||||
const int input_ext_size = N - unextended_input_size;
|
||||
const int output_ext_size = N - unextended_output_size;
|
||||
NdArrayDesc<N> input_desc;
|
||||
NdArrayDesc<N> output_desc;
|
||||
CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_input_shape),
|
||||
&input_desc);
|
||||
CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape),
|
||||
&output_desc);
|
||||
|
||||
// The perm data is extended to match the output, each index incremented by
|
||||
// the amount of front padding of the input shape.
|
||||
int extended_perm[N];
|
||||
for (int i = 0; i < N; ++i) {
|
||||
extended_perm[i] = i < output_ext_size
|
||||
? i
|
||||
: params.perm[i - output_ext_size] + input_ext_size;
|
||||
}
|
||||
|
||||
// Permutes the input shape so we don't need to permute the indexes inside
|
||||
// the loop. Check to make sure output_dims is matching input_dims.
|
||||
NdArrayDesc<N> perm_input_desc;
|
||||
for (int k = 0; k < N; ++k) {
|
||||
TFLITE_DCHECK_EQ(input_desc.extents[extended_perm[k]],
|
||||
output_desc.extents[k]);
|
||||
perm_input_desc.extents[k] = input_desc.extents[extended_perm[k]];
|
||||
perm_input_desc.strides[k] = input_desc.strides[extended_perm[k]];
|
||||
}
|
||||
|
||||
// Naive transpose loop (iterate on output index and compute input index).
|
||||
auto tranpose_func = [&](int indexes[N]) {
|
||||
output_data[SubscriptToIndex(output_desc, indexes)] =
|
||||
input_data[SubscriptToIndex(perm_input_desc, indexes)];
|
||||
};
|
||||
NDOpsHelper<N>(output_desc, tranpose_func);
|
||||
}
|
||||
|
||||
template <typename T, int N = 5>
|
||||
void Transpose(const TransposeParams& params,
|
||||
const RuntimeShape& unextended_input_shape, const T* input_data,
|
||||
const RuntimeShape& unextended_output_shape, T* output_data) {
|
||||
// Transpose kernel only does rearranging values not numeric evaluations on
|
||||
// each cell. It's safe to implement per size of scalar type and this trick
|
||||
// keeps the total code size in a reasonable range.
|
||||
switch (sizeof(T)) {
|
||||
case 1:
|
||||
TransposeImpl<int8_t, N>(params, unextended_input_shape,
|
||||
reinterpret_cast<const int8_t*>(input_data),
|
||||
unextended_output_shape,
|
||||
reinterpret_cast<int8_t*>(output_data));
|
||||
break;
|
||||
case 2:
|
||||
TransposeImpl<int16_t, N>(params, unextended_input_shape,
|
||||
reinterpret_cast<const int16_t*>(input_data),
|
||||
unextended_output_shape,
|
||||
reinterpret_cast<int16_t*>(output_data));
|
||||
break;
|
||||
|
||||
case 4:
|
||||
TransposeImpl<int32_t, N>(params, unextended_input_shape,
|
||||
reinterpret_cast<const int32_t*>(input_data),
|
||||
unextended_output_shape,
|
||||
reinterpret_cast<int32_t*>(output_data));
|
||||
break;
|
||||
case 8:
|
||||
TransposeImpl<int64_t, N>(params, unextended_input_shape,
|
||||
reinterpret_cast<const int64_t*>(input_data),
|
||||
unextended_output_shape,
|
||||
reinterpret_cast<int64_t*>(output_data));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TRANSPOSE_H_
|
||||
@@ -0,0 +1,217 @@
|
||||
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TRANSPOSE_CONV_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TRANSPOSE_CONV_H_
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
namespace reference_ops {
|
||||
|
||||
inline void TransposeConv(
|
||||
const ConvParams& params, const RuntimeShape& input_shape,
|
||||
const float* input_data, const RuntimeShape& filter_shape,
|
||||
const float* filter_data, const RuntimeShape& bias_shape,
|
||||
const float* bias_data, const RuntimeShape& output_shape,
|
||||
float* output_data, const RuntimeShape& im2col_shape, float* im2col_data) {
|
||||
const int stride_width = params.stride_width;
|
||||
const int stride_height = params.stride_height;
|
||||
const int pad_width = params.padding_values.width;
|
||||
const int pad_height = params.padding_values.height;
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
|
||||
(void)im2col_data; // only used in optimized code.
|
||||
(void)im2col_shape; // only used in optimized code.
|
||||
|
||||
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
|
||||
const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
|
||||
const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
|
||||
const int input_height = input_shape.Dims(1);
|
||||
const int input_width = input_shape.Dims(2);
|
||||
const int filter_height = filter_shape.Dims(1);
|
||||
const int filter_width = filter_shape.Dims(2);
|
||||
const int output_height = output_shape.Dims(1);
|
||||
const int output_width = output_shape.Dims(2);
|
||||
if (bias_data) {
|
||||
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
|
||||
}
|
||||
|
||||
// Although transpose convolution simplifies to convolution with transposed
|
||||
// weights for strides of 1, non-unitary striding complicates matters. To
|
||||
// keep this reference implementation as clear as possible, we use a
|
||||
// "scatter" access pattern, where we loop through all the input elements,
|
||||
// computing their influence on the output, rather than looping through the
|
||||
// output elements in the typical "gather" access pattern of a conv. We
|
||||
// therefore must initialize the output array to zero.
|
||||
const int num_elements = output_shape.FlatSize();
|
||||
for (int i = 0; i < num_elements; i++) {
|
||||
output_data[i] = 0.0f;
|
||||
}
|
||||
|
||||
// Loop through input elements one at a time.
|
||||
for (int batch = 0; batch < batches; ++batch) {
|
||||
for (int in_y = 0; in_y < input_height; ++in_y) {
|
||||
for (int in_x = 0; in_x < input_width; ++in_x) {
|
||||
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
|
||||
// Loop through the output elements it will influence
|
||||
const int out_x_origin = (in_x * stride_width) - pad_width;
|
||||
const int out_y_origin = (in_y * stride_height) - pad_height;
|
||||
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
|
||||
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
|
||||
for (int out_channel = 0; out_channel < output_depth;
|
||||
++out_channel) {
|
||||
// Compute output element location
|
||||
const int out_x = out_x_origin + filter_x;
|
||||
const int out_y = out_y_origin + filter_y;
|
||||
// We cannot accumulate out of bounds
|
||||
if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) &&
|
||||
(out_y < output_height)) {
|
||||
float input_value = input_data[Offset(
|
||||
input_shape, batch, in_y, in_x, in_channel)];
|
||||
float filter_value =
|
||||
filter_data[Offset(filter_shape, out_channel, filter_y,
|
||||
filter_x, in_channel)];
|
||||
output_data[Offset(output_shape, batch, out_y, out_x,
|
||||
out_channel)] +=
|
||||
input_value * filter_value;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (bias_data) {
|
||||
for (int batch = 0; batch < batches; ++batch) {
|
||||
for (int out_y = 0; out_y < output_height; ++out_y) {
|
||||
for (int out_x = 0; out_x < output_width; ++out_x) {
|
||||
for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
|
||||
output_data[Offset(output_shape, batch, out_y, out_x,
|
||||
out_channel)] += bias_data[out_channel];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void TransposeConv(
|
||||
const ConvParams& params, const RuntimeShape& input_shape,
|
||||
const uint8_t* input_data, const RuntimeShape& filter_shape,
|
||||
const uint8_t* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32_t* bias_data, const RuntimeShape& output_shape,
|
||||
uint8_t* output_data, const RuntimeShape& im2col_shape,
|
||||
uint8_t* im2col_data, int32_t* scratch_buffer) {
|
||||
const int stride_width = params.stride_width;
|
||||
const int stride_height = params.stride_height;
|
||||
const int pad_width = params.padding_values.width;
|
||||
const int pad_height = params.padding_values.height;
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
|
||||
(void)im2col_data; // only used in optimized code.
|
||||
(void)im2col_shape; // only used in optimized code.
|
||||
|
||||
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
|
||||
const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
|
||||
const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
|
||||
const int input_height = input_shape.Dims(1);
|
||||
const int input_width = input_shape.Dims(2);
|
||||
const int filter_height = filter_shape.Dims(1);
|
||||
const int filter_width = filter_shape.Dims(2);
|
||||
const int output_height = output_shape.Dims(1);
|
||||
const int output_width = output_shape.Dims(2);
|
||||
const int32_t input_offset = params.input_offset;
|
||||
const int32_t filter_offset = params.weights_offset;
|
||||
const int32_t output_offset = params.output_offset;
|
||||
const int32_t output_multiplier = params.output_multiplier;
|
||||
const int output_shift = params.output_shift;
|
||||
const int32_t output_activation_min = params.quantized_activation_min;
|
||||
const int32_t output_activation_max = params.quantized_activation_max;
|
||||
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
|
||||
if (bias_data) {
|
||||
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
|
||||
}
|
||||
|
||||
const int num_elements = output_shape.FlatSize();
|
||||
// We need to initialize scratch_buffer to all 0s, as we apply the same
|
||||
// 'scatter' based trick as in float version.
|
||||
memset(scratch_buffer, 0, num_elements * sizeof(int32_t));
|
||||
|
||||
// Loop through input elements one at a time.
|
||||
for (int batch = 0; batch < batches; ++batch) {
|
||||
for (int in_y = 0; in_y < input_height; ++in_y) {
|
||||
for (int in_x = 0; in_x < input_width; ++in_x) {
|
||||
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
|
||||
// Loop through the output elements it will influence.
|
||||
const int out_x_origin = (in_x * stride_width) - pad_width;
|
||||
const int out_y_origin = (in_y * stride_height) - pad_height;
|
||||
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
|
||||
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
|
||||
for (int out_channel = 0; out_channel < output_depth;
|
||||
++out_channel) {
|
||||
// Compute output element location.
|
||||
const int out_x = out_x_origin + filter_x;
|
||||
const int out_y = out_y_origin + filter_y;
|
||||
// We cannot accumulate out of bounds.
|
||||
if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) &&
|
||||
(out_y < output_height)) {
|
||||
uint8_t input_value = input_data[Offset(
|
||||
input_shape, batch, in_y, in_x, in_channel)];
|
||||
uint8_t filter_value =
|
||||
filter_data[Offset(filter_shape, out_channel, filter_y,
|
||||
filter_x, in_channel)];
|
||||
scratch_buffer[Offset(output_shape, batch, out_y, out_x,
|
||||
out_channel)] +=
|
||||
(input_value + input_offset) *
|
||||
(filter_value + filter_offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int batch = 0; batch < batches; ++batch) {
|
||||
for (int out_y = 0; out_y < output_height; ++out_y) {
|
||||
for (int out_x = 0; out_x < output_width; ++out_x) {
|
||||
for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
|
||||
int32_t acc = scratch_buffer[Offset(output_shape, batch, out_y, out_x,
|
||||
out_channel)];
|
||||
if (bias_data) {
|
||||
acc += bias_data[out_channel];
|
||||
}
|
||||
int32_t scaled_acc = MultiplyByQuantizedMultiplier(
|
||||
acc, output_multiplier, output_shift);
|
||||
scaled_acc += output_offset;
|
||||
scaled_acc = std::max(scaled_acc, output_activation_min);
|
||||
scaled_acc = std::min(scaled_acc, output_activation_max);
|
||||
output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
|
||||
static_cast<uint8_t>(scaled_acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TRANSPOSE_CONV_H_
|
||||
@@ -0,0 +1,155 @@
|
||||
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_RUNTIME_SHAPE_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_RUNTIME_SHAPE_H_
|
||||
|
||||
namespace tflite {
|
||||
|
||||
template <int N>
|
||||
struct Dims {
|
||||
int sizes[N];
|
||||
int strides[N];
|
||||
};
|
||||
|
||||
class RuntimeShape {
|
||||
public:
|
||||
RuntimeShape& operator=(RuntimeShape const&) = delete;
|
||||
|
||||
RuntimeShape() : size_(0) {}
|
||||
|
||||
explicit RuntimeShape(int dimensions_count) : size_(dimensions_count) {}
|
||||
|
||||
RuntimeShape(int shape_size, int32_t value) : size_(shape_size) {
|
||||
for (int i = 0; i < shape_size; ++i) {
|
||||
SetDim(i, value);
|
||||
}
|
||||
}
|
||||
|
||||
RuntimeShape(int dimensions_count, const int32_t* dims_data)
|
||||
: size_(dimensions_count) {
|
||||
ReplaceWith(dimensions_count, dims_data);
|
||||
}
|
||||
|
||||
bool operator==(const RuntimeShape& comp) const {
|
||||
return this->size_ == comp.size_ &&
|
||||
std::memcmp(DimsData(), comp.DimsData(), size_ * sizeof(int32_t)) ==
|
||||
0;
|
||||
}
|
||||
|
||||
~RuntimeShape() {}
|
||||
|
||||
int32_t DimensionsCount() const { return size_; }
|
||||
int32_t Dims(int i) const {
|
||||
TFLITE_DCHECK_GE(i, 0);
|
||||
TFLITE_DCHECK_LT(i, size_);
|
||||
return dims_[i];
|
||||
}
|
||||
void SetDim(int i, int32_t val) {
|
||||
TFLITE_DCHECK_GE(i, 0);
|
||||
TFLITE_DCHECK_LT(i, size_);
|
||||
dims_[i] = val;
|
||||
}
|
||||
|
||||
static RuntimeShape ExtendedShape(int new_shape_size,
|
||||
const RuntimeShape& shape) {
|
||||
return RuntimeShape(new_shape_size, shape, 1);
|
||||
}
|
||||
int32_t* DimsData() { return dims_; }
|
||||
const int32_t* DimsData() const { return dims_; }
|
||||
const int32_t* DimsDataUpTo5D() const { return dims_; }
|
||||
|
||||
void ReplaceWith(int dimensions_count, const int32_t* dims_data) {
|
||||
size_ = dimensions_count;
|
||||
int32_t* dst_dims = DimsData();
|
||||
std::memcpy(dst_dims, dims_data, dimensions_count * sizeof(int32_t));
|
||||
}
|
||||
|
||||
// Returns the total count of elements, that is the size when flattened into a
|
||||
// vector.
|
||||
int FlatSize() const {
|
||||
int buffer_size = 1;
|
||||
const int* dims_data = reinterpret_cast<const int*>(DimsData());
|
||||
for (int i = 0; i < size_; i++) {
|
||||
buffer_size *= dims_data[i];
|
||||
}
|
||||
return buffer_size;
|
||||
}
|
||||
|
||||
private:
|
||||
// For use only by ExtendedShape(), written to guarantee (return-value) copy
|
||||
// elision in C++17.
|
||||
// This creates a shape padded to the desired size with the specified value.
|
||||
RuntimeShape(int new_shape_size, const RuntimeShape& shape, int pad_value)
|
||||
: size_(new_shape_size) {
|
||||
// If the following check fails, it is likely because a 4D-only kernel is
|
||||
// being used with an array of larger dimension count.
|
||||
TFLITE_CHECK_GE(new_shape_size, shape.DimensionsCount());
|
||||
const int size_increase = new_shape_size - shape.DimensionsCount();
|
||||
for (int i = 0; i < size_increase; ++i) {
|
||||
SetDim(i, pad_value);
|
||||
}
|
||||
std::memcpy(DimsData() + size_increase, shape.DimsData(),
|
||||
sizeof(int32_t) * shape.DimensionsCount());
|
||||
}
|
||||
|
||||
// A maximum of 4 dimensions are supported on TFLM.
|
||||
static constexpr int kMaxSize = 5;
|
||||
int32_t size_;
|
||||
union {
|
||||
int32_t dims_[kMaxSize];
|
||||
};
|
||||
};
|
||||
|
||||
// Since tensors with '0' in their shape are valid in TF, these offset functions
|
||||
// allow that as long as the corresponding index is also 0. It is upto the
|
||||
// calling ops to ensure that they perform verification checks on tensor shapes
|
||||
// if they don't support a particular behavior.
|
||||
|
||||
inline int Offset(const RuntimeShape& shape, int i0, int i1, int i2, int i3) {
|
||||
TFLITE_DCHECK_EQ(shape.DimensionsCount(), 4);
|
||||
const int* dims_data = reinterpret_cast<const int*>(shape.DimsData());
|
||||
TFLITE_DCHECK((dims_data[0] == 0 && i0 == 0) ||
|
||||
(i0 >= 0 && i0 < dims_data[0]));
|
||||
TFLITE_DCHECK((dims_data[1] == 0 && i1 == 0) ||
|
||||
(i1 >= 0 && i1 < dims_data[1]));
|
||||
TFLITE_DCHECK((dims_data[2] == 0 && i2 == 0) ||
|
||||
(i2 >= 0 && i2 < dims_data[2]));
|
||||
TFLITE_DCHECK((dims_data[3] == 0 && i3 == 0) ||
|
||||
(i3 >= 0 && i3 < dims_data[3]));
|
||||
return ((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3;
|
||||
}
|
||||
|
||||
inline int Offset(const RuntimeShape& shape, int i0, int i1, int i2, int i3,
|
||||
int i4) {
|
||||
TFLITE_DCHECK_EQ(shape.DimensionsCount(), 5);
|
||||
const int* dims_data = reinterpret_cast<const int*>(shape.DimsData());
|
||||
TFLITE_DCHECK((dims_data[0] == 0 && i0 == 0) ||
|
||||
(i0 >= 0 && i0 < dims_data[0]));
|
||||
TFLITE_DCHECK((dims_data[1] == 0 && i1 == 0) ||
|
||||
(i1 >= 0 && i1 < dims_data[1]));
|
||||
TFLITE_DCHECK((dims_data[2] == 0 && i2 == 0) ||
|
||||
(i2 >= 0 && i2 < dims_data[2]));
|
||||
TFLITE_DCHECK((dims_data[3] == 0 && i3 == 0) ||
|
||||
(i3 >= 0 && i3 < dims_data[3]));
|
||||
TFLITE_DCHECK((dims_data[4] == 0 && i4 == 0) ||
|
||||
(i4 >= 0 && i4 < dims_data[4]));
|
||||
return (((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3) *
|
||||
dims_data[4] +
|
||||
i4;
|
||||
}
|
||||
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_RUNTIME_SHAPE_H_
|
||||
@@ -0,0 +1,211 @@
|
||||
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_STRIDED_SLICE_LOGIC_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_STRIDED_SLICE_LOGIC_H_
|
||||
|
||||
#include <limits>
|
||||
#include <vector>
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/compatibility.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace strided_slice {
|
||||
|
||||
// Use until std::clamp() is available from C++17.
|
||||
inline int Clamp(const int v, const int lo, const int hi) {
|
||||
TFLITE_DCHECK(!(hi < lo));
|
||||
if (hi < v) return hi;
|
||||
if (v < lo) return lo;
|
||||
return v;
|
||||
}
|
||||
|
||||
inline void StridedSlicePadIndices(tflite::StridedSliceParams* p,
|
||||
int dim_count) {
|
||||
// Add indices and mask bits to fully include extra dimensions
|
||||
TFLITE_CHECK_LE(dim_count, 5);
|
||||
TFLITE_CHECK_GE(dim_count, p->start_indices_count);
|
||||
TFLITE_CHECK_EQ(p->start_indices_count, p->stop_indices_count);
|
||||
TFLITE_CHECK_EQ(p->stop_indices_count, p->strides_count);
|
||||
|
||||
const int pad_count = dim_count - p->start_indices_count;
|
||||
|
||||
// Pad indices at start, so move arrays by pad_count.
|
||||
for (int i = p->start_indices_count - 1; i >= 0; --i) {
|
||||
p->strides[i + pad_count] = p->strides[i];
|
||||
p->start_indices[i + pad_count] = p->start_indices[i];
|
||||
p->stop_indices[i + pad_count] = p->stop_indices[i];
|
||||
}
|
||||
for (int i = 0; i < pad_count; ++i) {
|
||||
p->start_indices[i] = 0;
|
||||
p->stop_indices[i] = 1;
|
||||
p->strides[i] = 1;
|
||||
}
|
||||
|
||||
// Pad masks with 0s or 1s as required.
|
||||
p->shrink_axis_mask <<= pad_count;
|
||||
p->ellipsis_mask <<= pad_count;
|
||||
p->new_axis_mask <<= pad_count;
|
||||
p->begin_mask <<= pad_count;
|
||||
p->end_mask <<= pad_count;
|
||||
p->begin_mask |= (1 << pad_count) - 1;
|
||||
p->end_mask |= (1 << pad_count) - 1;
|
||||
|
||||
p->start_indices_count = dim_count;
|
||||
p->stop_indices_count = dim_count;
|
||||
p->strides_count = dim_count;
|
||||
}
|
||||
|
||||
// Return the index for the first element along that axis. This index will be a
|
||||
// positive integer between [0, axis_size] (or [-1, axis_size -1] if stride < 0)
|
||||
// that can be used to index directly into the data.
|
||||
inline int StartForAxis(const tflite::StridedSliceParams& params,
|
||||
const RuntimeShape& input_shape, int axis) {
|
||||
const auto begin_mask = params.begin_mask;
|
||||
const auto* start_indices = params.start_indices;
|
||||
const auto* strides = params.strides;
|
||||
const int axis_size = input_shape.Dims(axis);
|
||||
if (axis_size == 0) {
|
||||
return 0;
|
||||
}
|
||||
// Begin with the specified index.
|
||||
int start = start_indices[axis];
|
||||
|
||||
// begin_mask override
|
||||
if (begin_mask & 1 << axis) {
|
||||
if (strides[axis] > 0) {
|
||||
// Forward iteration - use the first element. These values will get
|
||||
// clamped below (Note: We could have set them to 0 and axis_size-1, but
|
||||
// use lowest() and max() to maintain symmetry with StopForAxis())
|
||||
start = std::numeric_limits<int>::lowest();
|
||||
} else {
|
||||
// Backward iteration - use the last element.
|
||||
start = std::numeric_limits<int>::max();
|
||||
}
|
||||
}
|
||||
|
||||
// Handle negative indices
|
||||
if (start < 0) {
|
||||
start += axis_size;
|
||||
}
|
||||
|
||||
// Clamping
|
||||
if (strides[axis] > 0) {
|
||||
// Forward iteration
|
||||
start = Clamp(start, 0, axis_size);
|
||||
} else {
|
||||
// Backward iteration
|
||||
start = Clamp(start, -1, axis_size - 1);
|
||||
}
|
||||
|
||||
return start;
|
||||
}
|
||||
|
||||
// Return the "real" index for the end of iteration along that axis. This is an
|
||||
// "end" in the traditional C sense, in that it points to one past the last
|
||||
// element. ie. So if you were iterating through all elements of a 1D array of
|
||||
// size 4, this function would return 4 as the stop, because it is one past the
|
||||
// "real" indices of 0, 1, 2 & 3.
|
||||
inline int StopForAxis(const tflite::StridedSliceParams& params,
|
||||
const RuntimeShape& input_shape, int axis,
|
||||
int start_for_axis) {
|
||||
const auto end_mask = params.end_mask;
|
||||
const auto shrink_axis_mask = params.shrink_axis_mask;
|
||||
const auto* stop_indices = params.stop_indices;
|
||||
const auto* strides = params.strides;
|
||||
const int axis_size = input_shape.Dims(axis);
|
||||
if (axis_size == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Begin with the specified index
|
||||
const bool shrink_axis = shrink_axis_mask & (1 << axis);
|
||||
int stop = stop_indices[axis];
|
||||
|
||||
// When shrinking an axis, the end position does not matter (and can be
|
||||
// incorrect when negative indexing is used, see Issue #19260). Always use
|
||||
// start_for_axis + 1 to generate a length 1 slice, since start_for_axis has
|
||||
// already been adjusted for negative indices.
|
||||
if (shrink_axis) {
|
||||
return start_for_axis + 1;
|
||||
}
|
||||
|
||||
// end_mask override
|
||||
if (end_mask & (1 << axis)) {
|
||||
if (strides[axis] > 0) {
|
||||
// Forward iteration - use the last element. These values will get
|
||||
// clamped below
|
||||
stop = std::numeric_limits<int>::max();
|
||||
} else {
|
||||
// Backward iteration - use the first element.
|
||||
stop = std::numeric_limits<int>::lowest();
|
||||
}
|
||||
}
|
||||
|
||||
// Handle negative indices
|
||||
if (stop < 0) {
|
||||
stop += axis_size;
|
||||
}
|
||||
|
||||
// Clamping
|
||||
// Because the end index points one past the last element, we need slightly
|
||||
// different clamping ranges depending on the direction.
|
||||
if (strides[axis] > 0) {
|
||||
// Forward iteration
|
||||
stop = Clamp(stop, 0, axis_size);
|
||||
} else {
|
||||
// Backward iteration
|
||||
stop = Clamp(stop, -1, axis_size - 1);
|
||||
}
|
||||
|
||||
return stop;
|
||||
}
|
||||
|
||||
inline bool LoopCondition(int index, int stop, int stride) {
|
||||
// True when we have reached the end of an axis and should loop.
|
||||
return stride > 0 ? index >= stop : index <= stop;
|
||||
}
|
||||
|
||||
inline tflite::StridedSliceParams BuildStridedSliceParams(
|
||||
int begin_mask, int end_mask, int shrink_axis_mask,
|
||||
const std::vector<int>& start_indices, const std::vector<int>& stop_indices,
|
||||
const std::vector<int>& strides) {
|
||||
tflite::StridedSliceParams op_params;
|
||||
const int dims_count = start_indices.size();
|
||||
|
||||
op_params.start_indices_count = dims_count;
|
||||
op_params.stop_indices_count = dims_count;
|
||||
op_params.strides_count = dims_count;
|
||||
for (int i = 0; i < dims_count; ++i) {
|
||||
op_params.start_indices[i] = start_indices[i];
|
||||
op_params.stop_indices[i] = stop_indices[i];
|
||||
op_params.strides[i] = strides[i];
|
||||
}
|
||||
|
||||
op_params.begin_mask = begin_mask;
|
||||
op_params.ellipsis_mask = 0;
|
||||
op_params.end_mask = end_mask;
|
||||
op_params.new_axis_mask = 0;
|
||||
op_params.shrink_axis_mask = shrink_axis_mask;
|
||||
|
||||
return op_params;
|
||||
}
|
||||
|
||||
} // namespace strided_slice
|
||||
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_STRIDED_SLICE_LOGIC_H_
|
||||
@@ -0,0 +1,47 @@
|
||||
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_CTYPES_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_CTYPES_H_
|
||||
|
||||
#include "tensorflow/lite/c/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
template <typename T>
|
||||
inline T* GetTensorData(TfLiteTensor* tensor) {
|
||||
return tensor != nullptr ? reinterpret_cast<T*>(tensor->data.raw) : nullptr;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline const T* GetTensorData(const TfLiteTensor* tensor) {
|
||||
return tensor != nullptr ? reinterpret_cast<const T*>(tensor->data.raw)
|
||||
: nullptr;
|
||||
}
|
||||
|
||||
inline RuntimeShape GetTensorShape(const TfLiteTensor* tensor) {
|
||||
if (tensor == nullptr) {
|
||||
return RuntimeShape();
|
||||
}
|
||||
|
||||
TfLiteIntArray* dims = tensor->dims;
|
||||
const int dims_size = dims->size;
|
||||
const int32_t* dims_data = reinterpret_cast<const int32_t*>(dims->data);
|
||||
return RuntimeShape(dims_size, dims_data);
|
||||
}
|
||||
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_CTYPES_H_
|
||||
1065
code/components/tflite-lib/tensorflow/lite/kernels/internal/types.h
Normal file
1065
code/components/tflite-lib/tensorflow/lite/kernels/internal/types.h
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user