Rolling 20210420

2026-01-29 05:40:40 +03:00 · 2021-04-20 19:44:16 +02:00
parent 520f818adc
commit ea2305de47
156 changed files with 11095 additions and 8601 deletions
--- a/code/components/tfmicro/tensorflow/lite/kernels/internal/common.h
+++ b/code/components/tfmicro/tensorflow/lite/kernels/internal/common.h
@@ -178,14 +178,54 @@ inline int32_t MultiplyByQuantizedMultiplier(int64_t x,
  // - input x is in the range -(1<<47) <= x < (1<<47)
  assert(quantized_multiplier >= 0);
  assert(shift >= -31 && shift < 8);
+  assert(x >= -(static_cast<int64_t>(1) << 47) &&
+         x < (static_cast<int64_t>(1) << 47));

-  int32_t reduced_multiplier = (quantized_multiplier + (1 << 15)) >> 16;
+  int32_t reduced_multiplier = (quantized_multiplier < 0x7FFF0000)
+                                   ? ((quantized_multiplier + (1 << 15)) >> 16)
+                                   : 0x7FFF;
  int total_shift = 15 - shift;
  x = (x * (int64_t)reduced_multiplier) + ((int64_t)1 << (total_shift - 1));
  int32_t result = x >> total_shift;
  return result;
 }

+#ifdef USE_NEON
+// Round uses ARM's rounding shift right.
+inline int32x4x4_t MultiplyByQuantizedMultiplier4Rows(
+    int32x4x4_t input_val, int32_t quantized_multiplier, int shift) {
+  const int left_shift = std::max(shift, 0);
+  const int right_shift = std::min(shift, 0);
+  int32x4x4_t result;
+
+  int32x4_t multiplier_dup = vdupq_n_s32(quantized_multiplier);
+  int32x4_t left_shift_dup = vdupq_n_s32(left_shift);
+  int32x4_t right_shift_dup = vdupq_n_s32(right_shift);
+
+  result.val[0] =
+      vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[0], left_shift_dup),
+                               multiplier_dup),
+                 right_shift_dup);
+
+  result.val[1] =
+      vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[1], left_shift_dup),
+                               multiplier_dup),
+                 right_shift_dup);
+
+  result.val[2] =
+      vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[2], left_shift_dup),
+                               multiplier_dup),
+                 right_shift_dup);
+
+  result.val[3] =
+      vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[3], left_shift_dup),
+                               multiplier_dup),
+                 right_shift_dup);
+
+  return result;
+}
+#endif
+
 template <typename T>
 int CountLeadingZeros(T integer_input) {
  static_assert(std::is_unsigned<T>::value,
@@ -261,10 +301,11 @@ inline void gen_lut(double (*func)(double), double min, double max,
        TfLiteRound(func(min + i * step + half_step) * 32768.0);
    double midpoint_err = midpoint_interp_val - midpoint_val;
    double bias = TfLiteRound(midpoint_err / 2.0);
-    table[i] = std::min(std::max(sample_val - bias, -32768.0), 32767.0);
+    table[i] = std::min<double>(std::max<double>(sample_val - bias, -32768.0),
+                                32767.0);
  }
-  table[num - 1] =
-      std::min(std::max(TfLiteRound(func(max) * 32768.0), -32768.0), 32767.0);
+  table[num - 1] = std::min<double>(
+      std::max<double>(TfLiteRound(func(max) * 32768.0), -32768.0), 32767.0);
 }

 // generate INT16 LUT for function(), e.g., table exp(x) and 1/(1+x) used in
@@ -289,10 +330,11 @@ inline void gen_lut(float (*func)(float), float min, float max, int16_t* table,
        TfLiteRound(func(min + i * step + half_step) * 32768.0f);
    float midpoint_err = midpoint_interp_val - midpoint_val;
    float bias = TfLiteRound(midpoint_err / 2.0f);
-    table[i] = std::min(std::max(sample_val - bias, -32768.0f), 32767.0f);
+    table[i] = std::min<float>(std::max<float>(sample_val - bias, -32768.0f),
+                               32767.0f);
  }
-  table[num - 1] = std::min(
-      std::max(TfLiteRound(func(max) * 32768.0f), -32768.0f), 32767.0f);
+  table[num - 1] = std::min<float>(
+      std::max<float>(TfLiteRound(func(max) * 32768.0f), -32768.0f), 32767.0f);
 }

 // int16_t func table lookup, e.g., lookup exp() and 1/(1+x) used in softmax
--- a/code/components/tfmicro/tensorflow/lite/kernels/internal/cppmath.h
+++ b/code/components/tfmicro/tensorflow/lite/kernels/internal/cppmath.h
@@ -34,6 +34,7 @@ namespace tflite {
  }

 DECLARE_STD_GLOBAL_SWITCH1(TfLiteRound, round);
+DECLARE_STD_GLOBAL_SWITCH1(TfLiteExpm1, expm1);

 }  // namespace tflite

--- a/code/components/tfmicro/tensorflow/lite/kernels/internal/portable_tensor.h
+++ b/code/components/tfmicro/tensorflow/lite/kernels/internal/portable_tensor.h
@@ -15,7 +15,6 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_PORTABLE_TENSOR_H_
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_PORTABLE_TENSOR_H_

-#include <complex>
 #include <vector>

 #include "tensorflow/lite/c/common.h"
--- a/code/components/tfmicro/tensorflow/lite/kernels/internal/quantization_util.cc
+++ b/code/components/tfmicro/tensorflow/lite/kernels/internal/quantization_util.cc
@@ -289,7 +289,7 @@ void PreprocessSoftmaxScaling(double beta, double input_scale,
    input_beta_real_multiplier = (1ll << 31) - 1.0;
  }
 #else   // TFLITE_EMULATE_FLOAT
-  const double input_beta_real_multiplier = std::min(
+  const double input_beta_real_multiplier = std::min<double>(
      beta * input_scale * (1 << (31 - input_integer_bits)), (1ll << 31) - 1.0);
 #endif  // TFLITE_EMULATE_FLOAT

--- a/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/add.h
+++ b/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/add.h
@@ -202,14 +202,6 @@ inline void Add(const ArithmeticParams& params,
  }
 }

-// TODO(jiawen): We can implement BroadcastAdd on buffers of arbitrary
-// dimensionality if the runtime code does a single loop over one dimension
-// that handles broadcasting as the base case. The code generator would then
-// generate max(D1, D2) nested for loops.
-// TODO(benoitjacob): BroadcastAdd is intentionally duplicated from
-// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
-// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
-// reference_ops.h.
 inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
                               const RuntimeShape& input1_shape,
                               const float* input1_data,
--- a/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/add_n.h
+++ b/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/add_n.h
@@ -0,0 +1,42 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_N_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_N_H_
+
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace reference_ops {
+
+// T is expected to be either float or int.
+template <typename T>
+inline void AddN(const RuntimeShape& input_shape, const size_t num_inputs,
+                 const T* const* input_data, T* output_data) {
+  // All inputs and output should have the same shape, this is checked during
+  // Prepare stage.
+  const size_t size = input_shape.FlatSize();
+  for (size_t i = 0; i < size; ++i) {
+    T x = 0;
+    for (size_t j = 0; j < num_inputs; ++j) {
+      x += input_data[j][i];
+    }
+    output_data[i] = x;
+  }
+}
+
+}  // namespace reference_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_N_H_
--- a/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/arg_min_max.h
+++ b/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/arg_min_max.h
@@ -15,12 +15,23 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ARG_MIN_MAX_H_
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ARG_MIN_MAX_H_

+#include <functional>
+
 #include "tensorflow/lite/kernels/internal/types.h"

 namespace tflite {

 namespace reference_ops {

+template <typename T>
+std::function<bool(T, T)> GetComparefunction(bool is_arg_max) {
+  if (is_arg_max) {
+    return std::greater<T>();
+  } else {
+    return std::less<T>();
+  }
+}
+
 template <typename T1, typename T2, typename T3, typename Cmp>
 void ArgMinMax(const RuntimeShape& input1_shape, const T1* input1_data,
               const T3* input2_data, const RuntimeShape& output_shape,
@@ -62,6 +73,15 @@ void ArgMinMax(const RuntimeShape& input1_shape, const T1* input1_data,
    }
  }
 }
+
+template <typename T1, typename T2, typename T3>
+void ArgMinMax(const RuntimeShape& input1_shape, const T1* input1_data,
+               const T3* input2_data, const RuntimeShape& output_shape,
+               T2* output_data, const bool is_arg_max) {
+  ArgMinMax(input1_shape, input1_data, input2_data, output_shape, output_data,
+            GetComparefunction<T1>(is_arg_max));
+}
+
 }  // namespace reference_ops
 }  // namespace tflite

--- a/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/batch_to_space_nd.h
+++ b/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/batch_to_space_nd.h
@@ -0,0 +1,101 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BATCH_TO_SPACE_ND_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BATCH_TO_SPACE_ND_H_
+
+#include <cmath>
+
+#include "ruy/profiler/instrumentation.h"  // from @ruy
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace reference_ops {
+
+// TODO(b/135760455): Move this method anonymous namespace in a cc file.
+inline RuntimeShape ExtendShapeBatchToSpace(const RuntimeShape& shape) {
+  if (shape.DimensionsCount() == 4) {
+    return shape;
+  }
+  RuntimeShape new_shape(4, 1);
+  new_shape.SetDim(0, shape.Dims(0));
+  new_shape.SetDim(1, shape.Dims(1));
+  new_shape.SetDim(3, shape.Dims(2));
+  return new_shape;
+}
+
+template <typename T>
+inline void BatchToSpaceND(const RuntimeShape& unextended_input1_shape,
+                           const T* input1_data,
+                           const RuntimeShape& unextended_input2_shape,
+                           const int32_t* block_shape_data,
+                           const RuntimeShape& unextended_input3_shape,
+                           const int32_t* crops_data,
+                           const RuntimeShape& unextended_output_shape,
+                           T* output_data) {
+  ruy::profiler::ScopeLabel label("BatchToSpaceND");
+  TFLITE_DCHECK_GE(unextended_input1_shape.DimensionsCount(), 3);
+  TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(unextended_input1_shape.DimensionsCount(),
+                   unextended_output_shape.DimensionsCount());
+
+  const RuntimeShape input1_shape =
+      ExtendShapeBatchToSpace(unextended_input1_shape);
+  const RuntimeShape output_shape =
+      ExtendShapeBatchToSpace(unextended_output_shape);
+
+  const int output_width = output_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_batch_size = output_shape.Dims(0);
+
+  const int depth = input1_shape.Dims(3);
+  const int input_width = input1_shape.Dims(2);
+  const int input_height = input1_shape.Dims(1);
+  const int input_batch_size = input1_shape.Dims(0);
+
+  const int block_shape_height = block_shape_data[0];
+  const int block_shape_width =
+      unextended_input1_shape.DimensionsCount() == 4 ? block_shape_data[1] : 1;
+  const int crops_top = crops_data[0];
+  const int crops_left =
+      unextended_input1_shape.DimensionsCount() == 4 ? crops_data[2] : 0;
+  for (int in_batch = 0; in_batch < input_batch_size; ++in_batch) {
+    const int out_batch = in_batch % output_batch_size;
+    const int spatial_offset = in_batch / output_batch_size;
+    for (int in_h = 0; in_h < input_height; ++in_h) {
+      const int out_h = in_h * block_shape_height +
+                        spatial_offset / block_shape_width - crops_top;
+      if (out_h < 0 || out_h >= output_height) {
+        continue;
+      }
+      for (int in_w = 0; in_w < input_width; ++in_w) {
+        const int out_w = in_w * block_shape_width +
+                          spatial_offset % block_shape_width - crops_left;
+
+        if (out_w < 0 || out_w >= output_width) {
+          continue;
+        }
+        T* out = output_data + Offset(output_shape, out_batch, out_h, out_w, 0);
+        const T* in =
+            input1_data + Offset(input1_shape, in_batch, in_h, in_w, 0);
+        memcpy(out, in, depth * sizeof(T));
+      }
+    }
+  }
+}
+
+}  // namespace reference_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BATCH_TO_SPACE_ND_H_
--- a/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/binary_function.h
+++ b/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/binary_function.h
@@ -23,9 +23,6 @@ namespace tflite {

 namespace reference_ops {

-// TODO(ycling): Refactoring. Remove BroadcastLogical and use the more
-// generalized and efficient BroadcastBinaryFunction.
-//
 // Also appears to duplicate MinimumMaximum.
 //
 // R: Result type. T1: Input 1 type. T2: Input 2 type.
@@ -63,7 +60,6 @@ inline void BroadcastBinaryFunction4DSlow(
 }

 // R: Result type. T1: Input 1 type. T2: Input 2 type.
-// TODO(renjieliu): Refactor other binary functions to use this one.
 template <typename R, typename T1, typename T2>
 inline void BinaryFunction(const RuntimeShape& input1_shape,
                           const T1* input1_data,
--- a/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/concatenation.h
+++ b/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/concatenation.h
@@ -68,8 +68,7 @@ inline void Concatenation(const ConcatenationParams& params,
  }
 }

-// TODO(prabhumk): This is the same as the optimized implementation.
-// TODO(prabhumk): The quantized implementation of concatentation isn't fully
+// TODO(b/174275780): The quantized implementation of concatentation isn't fully
 // quantized as it takes scale as a floating point value. This should be fixed
 // when optimizng this routine further.
 inline void ConcatenationWithScaling(const ConcatenationParams& params,
--- a/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/conv.h
+++ b/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/conv.h
@@ -15,16 +15,13 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CONV_H_
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CONV_H_

-#include "tensorflow/lite/kernels/internal/types.h"
 #include "tensorflow/lite/kernels/internal/common.h"
-
-
+#include "tensorflow/lite/kernels/internal/types.h"

 namespace tflite {

 namespace reference_ops {

-
 inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
                 const float* input_data, const RuntimeShape& filter_shape,
                 const float* filter_data, const RuntimeShape& bias_shape,
@@ -108,8 +105,8 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
                 uint8_t* output_data, const RuntimeShape& im2col_shape,
                 uint8_t* im2col_data, void* cpu_backend_context) {
  (void)cpu_backend_context;  // only used in optimized code.
-  (void)im2col_data;   // only used in optimized code.
-  (void)im2col_shape;  // only used in optimized code.
+  (void)im2col_data;          // only used in optimized code.
+  (void)im2col_shape;         // only used in optimized code.
  const int stride_width = params.stride_width;
  const int stride_height = params.stride_height;
  const int dilation_width_factor = params.dilation_width_factor;
--- a/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/div.h
+++ b/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/div.h
@@ -0,0 +1,239 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DIV_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DIV_H_
+
+#include <algorithm>
+
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+
+namespace reference_ops {
+
+template <typename T>
+inline void DivCheckArithmeticParams(const ArithmeticParams& params) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+  // Input offset is negative input zero point. Activation tensors are
+  // asymmetric quantized so they span the full int8 range.
+  constexpr int32_t max_value =
+      static_cast<int32_t>(std::numeric_limits<T>::max());
+  TFLITE_DCHECK_GE(params.input1_offset, -max_value);
+  TFLITE_DCHECK_LE(params.input1_offset, max_value);
+  TFLITE_DCHECK_GE(params.input2_offset, -max_value);
+  TFLITE_DCHECK_LE(params.input2_offset, max_value);
+  TFLITE_DCHECK_GE(params.output_offset, -max_value);
+  TFLITE_DCHECK_LE(params.output_offset, max_value);
+}
+
+// Element-wise div that can often be used for inner loop of broadcast Div as
+// well as the non-broadcast Div.
+template <typename T>
+inline void DivElementwise(int size, const ArithmeticParams& params,
+                           const T* input1_data, const T* input2_data,
+                           T* output_data) {
+  DivCheckArithmeticParams<T>(params);
+
+  for (int i = 0; i < size; ++i) {
+    const int32_t input1_val = params.input1_offset + input1_data[i];
+    const int32_t input2_val = params.input2_offset + input2_data[i];
+    TFLITE_DCHECK_NE(input2_val, 0);
+    int recip_shift;
+    const int32_t input2_inv =
+        (input2_val > 0) ? GetReciprocal(input2_val, 31, &recip_shift)
+                         : -GetReciprocal(-input2_val, 31, &recip_shift);
+    const int headroom = CountLeadingSignBits(input1_val);
+    const int32_t unscaled_quotient =
+        MultiplyByQuantizedMultiplierGreaterThanOne(input1_val, input2_inv,
+                                                    headroom);
+    const int total_shift = params.output_shift - recip_shift - headroom;
+    const int32_t unclamped_result =
+        params.output_offset +
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            unscaled_quotient, params.output_multiplier, total_shift);
+    const int32_t clamped_output =
+        std::min(params.quantized_activation_max,
+                 std::max(params.quantized_activation_min, unclamped_result));
+    output_data[i] = static_cast<T>(clamped_output);
+  }
+}
+
+inline void Div(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const uint8_t* input1_data,
+                const RuntimeShape& input2_shape, const uint8_t* input2_data,
+                const RuntimeShape& output_shape, uint8_t* output_data) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+  const int flat_size =
+      MatchingElementsSize(input1_shape, input2_shape, output_shape);
+
+  DivElementwise(flat_size, params, input1_data, input2_data, output_data);
+}
+
+inline void Div(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const int8_t* input1_data,
+                const RuntimeShape& input2_shape, const int8_t* input2_data,
+                const RuntimeShape& output_shape, int8_t* output_data) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+  const int flat_size =
+      MatchingElementsSize(input1_shape, input2_shape, output_shape);
+
+  DivElementwise(flat_size, params, input1_data, input2_data, output_data);
+}
+
+template <typename T, int N = 5>
+inline void BroadcastDivSlowQuantized(
+    const ArithmeticParams& params, const RuntimeShape& unextended_input1_shape,
+    const T* input1_data, const RuntimeShape& unextended_input2_shape,
+    const T* input2_data, const RuntimeShape& unextended_output_shape,
+    T* output_data) {
+  TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), N);
+  TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), N);
+  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), N);
+
+  NdArrayDesc<N> desc1;
+  NdArrayDesc<N> desc2;
+  NdArrayDesc<N> output_desc;
+  NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
+                                      unextended_input2_shape, &desc1, &desc2);
+  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape),
+                 &output_desc);
+
+  DivCheckArithmeticParams<T>(params);
+
+  auto div_func = [&](int indexes[N]) {
+    const int32_t input1_val =
+        params.input1_offset + input1_data[SubscriptToIndex(desc1, indexes)];
+    const int32_t input2_val =
+        params.input2_offset + input2_data[SubscriptToIndex(desc2, indexes)];
+    TFLITE_DCHECK_NE(input2_val, 0);
+    int recip_shift;
+    const int32_t input2_inv =
+        (input2_val > 0) ? GetReciprocal(input2_val, 31, &recip_shift)
+                         : -GetReciprocal(-input2_val, 31, &recip_shift);
+    const int headroom = CountLeadingSignBits(input1_val);
+    const int32_t unscaled_quotient =
+        MultiplyByQuantizedMultiplierGreaterThanOne(input1_val, input2_inv,
+                                                    headroom);
+    const int total_shift = params.output_shift - recip_shift - headroom;
+    const int32_t unclamped_result =
+        params.output_offset +
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            unscaled_quotient, params.output_multiplier, total_shift);
+    const int32_t clamped_output =
+        std::min(params.quantized_activation_max,
+                 std::max(params.quantized_activation_min, unclamped_result));
+    output_data[SubscriptToIndex(output_desc, indexes)] =
+        static_cast<T>(clamped_output);
+  };
+  NDOpsHelper<N>(output_desc, div_func);
+}
+
+template <int N = 5>
+inline void BroadcastDivSlow(const ArithmeticParams& params,
+                             const RuntimeShape& unextended_input1_shape,
+                             const uint8_t* input1_data,
+                             const RuntimeShape& unextended_input2_shape,
+                             const uint8_t* input2_data,
+                             const RuntimeShape& unextended_output_shape,
+                             uint8_t* output_data) {
+  BroadcastDivSlowQuantized<uint8_t, N>(
+      params, unextended_input1_shape, input1_data, unextended_input2_shape,
+      input2_data, unextended_output_shape, output_data);
+}
+
+template <int N = 5>
+inline void BroadcastDivSlow(const ArithmeticParams& params,
+                             const RuntimeShape& unextended_input1_shape,
+                             const int8_t* input1_data,
+                             const RuntimeShape& unextended_input2_shape,
+                             const int8_t* input2_data,
+                             const RuntimeShape& unextended_output_shape,
+                             int8_t* output_data) {
+  BroadcastDivSlowQuantized<int8_t, N>(
+      params, unextended_input1_shape, input1_data, unextended_input2_shape,
+      input2_data, unextended_output_shape, output_data);
+}
+
+// TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
+// dimensionality if the runtime code does a single loop over one dimension
+// that handles broadcasting as the base case. The code generator would then
+// generate max(D1, D2) nested for loops.
+template <typename T, int N = 5>
+void BroadcastDivSlow(const ArithmeticParams& params,
+                      const RuntimeShape& unextended_input1_shape,
+                      const T* input1_data,
+                      const RuntimeShape& unextended_input2_shape,
+                      const T* input2_data,
+                      const RuntimeShape& unextended_output_shape,
+                      T* output_data) {
+  T output_activation_min;
+  T output_activation_max;
+  GetActivationParams(params, &output_activation_min, &output_activation_max);
+
+  TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), N);
+  TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), N);
+  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), N);
+
+  NdArrayDesc<N> desc1;
+  NdArrayDesc<N> desc2;
+  NdArrayDesc<N> output_desc;
+  NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
+                                      unextended_input2_shape, &desc1, &desc2);
+  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape),
+                 &output_desc);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest
+  // stride, typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+
+  auto div_func = [&](int indexes[N]) {
+    output_data[SubscriptToIndex(output_desc, indexes)] =
+        ActivationFunctionWithMinMax(
+            input1_data[SubscriptToIndex(desc1, indexes)] /
+                input2_data[SubscriptToIndex(desc2, indexes)],
+            output_activation_min, output_activation_max);
+  };
+  NDOpsHelper<N>(output_desc, div_func);
+}
+
+template <typename T>
+inline void Div(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const T* input1_data,
+                const RuntimeShape& input2_shape, const T* input2_data,
+                const RuntimeShape& output_shape, T* output_data) {
+  T output_activation_min;
+  T output_activation_max;
+  GetActivationParams(params, &output_activation_min, &output_activation_max);
+
+  const int flat_size =
+      MatchingElementsSize(input1_shape, input2_shape, output_shape);
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = ActivationFunctionWithMinMax(
+        input1_data[i] / input2_data[i], output_activation_min,
+        output_activation_max);
+  }
+}
+
+}  // namespace reference_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DIV_H_
--- a/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/elu.h
+++ b/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/elu.h
@@ -0,0 +1,37 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ELU_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ELU_H_
+
+#include "tensorflow/lite/kernels/internal/cppmath.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+
+namespace reference_ops {
+
+inline void Elu(const RuntimeShape& input_shape, const float* input_data,
+                const RuntimeShape& output_shape, float* output_data) {
+  const int flat_size = MatchingFlatSize(input_shape, output_shape);
+  for (int i = 0; i < flat_size; ++i) {
+    const float val = input_data[i];
+    output_data[i] = val < 0.0f ? TfLiteExpm1(val) : val;
+  }
+}
+
+}  // namespace reference_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ELU_H_
--- a/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/exp.h
+++ b/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/exp.h
@@ -0,0 +1,38 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_EXP_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_EXP_H_
+
+#include <cmath>
+
+#include "ruy/profiler/instrumentation.h"  // from @ruy
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace reference_ops {
+
+template <typename T>
+inline void Exp(const T* input_data, const size_t num_elements,
+                T* output_data) {
+  ruy::profiler::ScopeLabel label("Exp");
+  for (size_t idx = 0; idx < num_elements; ++idx) {
+    output_data[idx] = std::exp(input_data[idx]);
+  }
+}
+
+}  // namespace reference_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_EXP_H_
--- a/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/fill.h
+++ b/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/fill.h
@@ -0,0 +1,38 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FILL_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FILL_H_
+
+#include <cmath>
+
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace reference_ops {
+
+template <typename T>
+void Fill(const RuntimeShape& value_shape, const T* value_data,
+          const RuntimeShape& output_shape, T* output_data) {
+  TFLITE_DCHECK_EQ(value_shape.DimensionsCount(), 0);
+  const int flat_size = output_shape.FlatSize();
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = *value_data;
+  }
+}
+
+}  // namespace reference_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FILL_H_
--- a/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/fully_connected.h
+++ b/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/fully_connected.h
@@ -31,7 +31,7 @@ inline void FullyConnected(
    float* output_data) {
  const float output_activation_min = params.float_activation_min;
  const float output_activation_max = params.float_activation_max;
-  // TODO(benoitjacob): This really should be:
+  // TODO(b/62193649): This really should be:
  //     const int batches = ArraySize(output_dims, 1);
  // but the current --variable_batch hack consists in overwriting the 3rd
  // dimension with the runtime batch size, as we don't keep track for each
@@ -76,7 +76,7 @@ inline void FullyConnected(
  TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);

  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
-  // TODO(benoitjacob): This really should be:
+  // TODO(b/62193649): This really should be:
  //     const int batches = ArraySize(output_dims, 1);
  // but the current --variable_batch hack consists in overwriting the 3rd
  // dimension with the runtime batch size, as we don't keep track for each
@@ -123,7 +123,7 @@ inline void FullyConnected(

  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  TFLITE_DCHECK_EQ(output_offset, 0);
-  // TODO(benoitjacob): This really should be:
+  // TODO(b/62193649): This really should be:
  //     const int batches = ArraySize(output_dims, 1);
  // but the current --variable_batch hack consists in overwriting the 3rd
  // dimension with the runtime batch size, as we don't keep track for each
@@ -176,7 +176,7 @@ inline void ShuffledFullyConnected(
  TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1);
  TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
  TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
-  // TODO(benoitjacob): This really should be:
+  // TODO(b/62193649): This really should be:
  //     const int batches = ArraySize(output_dims, 1);
  // but the current --variable_batch hack consists in overwriting the 3rd
  // dimension with the runtime batch size, as we don't keep track for each
--- a/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/integer_ops/add.h
+++ b/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/integer_ops/add.h
@@ -34,55 +34,24 @@ inline void CheckArithmeticParams(const ArithmeticParams& params) {
  TFLITE_DCHECK_LE(-params.input2_offset, std::numeric_limits<int8_t>::max());
 }

-// Element-wise add that can often be used for inner loop of broadcast add as
-// well as the non-broadcast add.
-inline void AddElementwise(int size, const ArithmeticParams& params,
-                           const int8_t* input1_data, const int8_t* input2_data,
-                           int8_t* output_data) {
+inline void ElementWise(
+    int size, const ArithmeticParams& params, const int8_t* input1_data,
+    const int8_t* input2_data, int8_t* output_data,
+    void (*check_arithmetic_params)(const ArithmeticParams&),
+    int8_t (*binary_func)(int8_t, int8_t, const ArithmeticParams&)) {
  CheckArithmeticParams(params);
-
  for (int i = 0; i < size; ++i) {
-    const int32_t input1_val = params.input1_offset + input1_data[i];
-    const int32_t input2_val = params.input2_offset + input2_data[i];
-    const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
-    const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
-    const int32_t scaled_input1_val =
-        MultiplyByQuantizedMultiplierSmallerThanOneExp(
-            shifted_input1_val, params.input1_multiplier, params.input1_shift);
-    const int32_t scaled_input2_val =
-        MultiplyByQuantizedMultiplierSmallerThanOneExp(
-            shifted_input2_val, params.input2_multiplier, params.input2_shift);
-    const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
-    const int32_t raw_output =
-        MultiplyByQuantizedMultiplierSmallerThanOneExp(
-            raw_sum, params.output_multiplier, params.output_shift) +
-        params.output_offset;
-    const int32_t clamped_output =
-        std::min(params.quantized_activation_max,
-                 std::max(params.quantized_activation_min, raw_output));
-    output_data[i] = static_cast<int8_t>(clamped_output);
+    output_data[i] = binary_func(input1_data[i], input2_data[i], params);
  }
 }

-inline void Add(const ArithmeticParams& params,
-                const RuntimeShape& input1_shape, const int8_t* input1_data,
-                const RuntimeShape& input2_shape, const int8_t* input2_data,
-                const RuntimeShape& output_shape, int8_t* output_data) {
-  CheckArithmeticParams(params);
-
-  const int flat_size =
-      MatchingElementsSize(input1_shape, input2_shape, output_shape);
-
-  AddElementwise(flat_size, params, input1_data, input2_data, output_data);
-}
-
-inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
-                               const RuntimeShape& input1_shape,
-                               const int8_t* input1_data,
-                               const RuntimeShape& input2_shape,
-                               const int8_t* input2_data,
-                               const RuntimeShape& output_shape,
-                               int8_t* output_data) {
+inline void BroadcastBinaryFunction4DSlow(
+    const ArithmeticParams& params, const RuntimeShape& input1_shape,
+    const int8_t* input1_data, const RuntimeShape& input2_shape,
+    const int8_t* input2_data, const RuntimeShape& output_shape,
+    int8_t* output_data,
+    void (*check_arithmetic_params)(const ArithmeticParams&),
+    int8_t (*binary_func)(int8_t, int8_t, const ArithmeticParams&)) {
  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
@@ -105,40 +74,70 @@ inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
-          const int32_t input1_val =
-              params.input1_offset +
-              input1_data[SubscriptToIndex(desc1, b, y, x, c)];
-          const int32_t input2_val =
-              params.input2_offset +
-              input2_data[SubscriptToIndex(desc2, b, y, x, c)];
-          const int32_t shifted_input1_val =
-              input1_val * (1 << params.left_shift);
-          const int32_t shifted_input2_val =
-              input2_val * (1 << params.left_shift);
-          const int32_t scaled_input1_val =
-              MultiplyByQuantizedMultiplierSmallerThanOneExp(
-                  shifted_input1_val, params.input1_multiplier,
-                  params.input1_shift);
-          const int32_t scaled_input2_val =
-              MultiplyByQuantizedMultiplierSmallerThanOneExp(
-                  shifted_input2_val, params.input2_multiplier,
-                  params.input2_shift);
-          const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
-          const int32_t raw_output =
-              MultiplyByQuantizedMultiplierSmallerThanOneExp(
-                  raw_sum, params.output_multiplier, params.output_shift) +
-              params.output_offset;
-          const int32_t clamped_output =
-              std::min(params.quantized_activation_max,
-                       std::max(params.quantized_activation_min, raw_output));
-          output_data[Offset(extended_output_shape, b, y, x, c)] =
-              static_cast<int8_t>(clamped_output);
+          output_data[Offset(extended_output_shape, b, y, x, c)] = binary_func(
+              input1_data[SubscriptToIndex(desc1, b, y, x, c)],
+              input2_data[SubscriptToIndex(desc2, b, y, x, c)], params);
        }
      }
    }
  }
 }

+inline int8_t AddFunc(int8_t x, int8_t y, const ArithmeticParams& params) {
+  const int32_t input1_val = params.input1_offset + x;
+  const int32_t input2_val = params.input2_offset + y;
+  const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
+  const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
+  const int32_t scaled_input1_val =
+      MultiplyByQuantizedMultiplierSmallerThanOneExp(
+          shifted_input1_val, params.input1_multiplier, params.input1_shift);
+  const int32_t scaled_input2_val =
+      MultiplyByQuantizedMultiplierSmallerThanOneExp(
+          shifted_input2_val, params.input2_multiplier, params.input2_shift);
+  const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
+  const int32_t raw_output =
+      MultiplyByQuantizedMultiplierSmallerThanOneExp(
+          raw_sum, params.output_multiplier, params.output_shift) +
+      params.output_offset;
+  const int32_t clamped_output =
+      std::min(params.quantized_activation_max,
+               std::max(params.quantized_activation_min, raw_output));
+  return static_cast<int8_t>(clamped_output);
+}
+
+// Element-wise add that can often be used for inner loop of broadcast add as
+// well as the non-broadcast add.
+inline void AddElementwise(int size, const ArithmeticParams& params,
+                           const int8_t* input1_data, const int8_t* input2_data,
+                           int8_t* output_data) {
+  ElementWise(size, params, input1_data, input2_data, output_data,
+              CheckArithmeticParams, AddFunc);
+}
+
+inline void Add(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const int8_t* input1_data,
+                const RuntimeShape& input2_shape, const int8_t* input2_data,
+                const RuntimeShape& output_shape, int8_t* output_data) {
+  CheckArithmeticParams(params);
+
+  const int flat_size =
+      MatchingElementsSize(input1_shape, input2_shape, output_shape);
+
+  AddElementwise(flat_size, params, input1_data, input2_data, output_data);
+}
+
+inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
+                               const RuntimeShape& input1_shape,
+                               const int8_t* input1_data,
+                               const RuntimeShape& input2_shape,
+                               const int8_t* input2_data,
+                               const RuntimeShape& output_shape,
+                               int8_t* output_data) {
+  BroadcastBinaryFunction4DSlow(params, input1_shape, input1_data, input2_shape,
+                                input2_data, output_shape, output_data,
+                                CheckArithmeticParams, AddFunc);
+}
+
 }  // namespace reference_integer_ops
 }  // namespace tflite

--- a/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h
+++ b/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h
@@ -101,7 +101,7 @@ inline void ConvPerChannel(
                // long as the filter size (filter_y * filter_x * in_channel)
                // does not exceed 2^16, which is the case in all the models
                // we have seen so far.
-                // TODO(jianlijianli): Add a check to make sure the
+                // TODO(b/174275578): Add a check to make sure the
                // accumulator depth is smaller than 2^16.
                acc += filter_val * (input_val + input_offset);
              }
--- a/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h
+++ b/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h
@@ -95,7 +95,7 @@ inline void DepthwiseConvPerChannel(
                  // long as the filter size (filter_y * filter_x * in_channel)
                  // does not exceed 2^16, which is the case in all the models
                  // we have seen so far.
-                  // TODO(jianlijianli): Add a check to make sure the
+                  // TODO(b/174275578): Add a check to make sure the
                  // accumulator depth is smaller than 2^16.
                  acc += filter_val * (input_val + input_offset);
                }
--- a/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h
+++ b/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h
@@ -58,23 +58,36 @@ inline void Logistic(int32_t input_zero_point, int32_t input_range_radius,
  }
 }

-inline void Logistic(int32_t input_multiplier, int32_t input_size,
-                     const int16_t* ptr_input_data, int16_t* ptr_output_data) {
+inline void Logistic(int32_t input_multiplier, int32_t input_left_shift,
+                     int32_t input_size, const int16_t* ptr_input_data,
+                     int16_t* ptr_output_data) {
  // We use the LUT for sigmoid and take into account, that
  // tanh(x) = 2*sigmoid(2*x) - 1

-  int32_t input_data_mul = (input_multiplier > 0) ? input_multiplier : 1;
+  // We scale by 3/4 to expand range [-8,8]->[-10.7,10.7].
+  // In case of general parameter scale, multiplier 3 is taken into account
+  // in TanhPrepare function and it is included in
+  // input_multiplier already.
+
+  TFLITE_DCHECK_GE(input_left_shift, 0);
+  if (input_multiplier == 0) {  // power of two case
+    input_multiplier = 3 << input_left_shift;
+    input_left_shift = 0;
+  }
+
+  int32_t round = (input_left_shift > 0) ? 1 << (input_left_shift - 1) : 0;

  for (int i = 0; i < input_size; ++i, ptr_input_data++, ptr_output_data++) {
-    int32_t input_data = (*ptr_input_data) * input_data_mul;
+    int32_t input_data =
+        ((*ptr_input_data) * input_multiplier + round) >> input_left_shift;

-    // Scale by 3/4 to expand range [-8,8]->[-10.7,10.7] and
-    // we do interpolation on unsigned values.
-    uint32_t abs_input_data = 3 * abs(input_data);
+    // We do interpolation on unsigned values.
+    uint32_t abs_input_data = abs(input_data);

    // We divide by 2 power of 9, because
    // we need to divide by 2 in power of 7 for
    // the input conversion + 1/4 from the scale above.
+
    // Define uh as uint32_t type not to make this function overflow.
    uint32_t uh = abs_input_data >> 9;
    uint32_t result;
--- a/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h
+++ b/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h
@@ -65,19 +65,25 @@ inline void Tanh(int32_t input_multiplier, int32_t input_left_shift,
  // We use the LUT for sigmoid and take into account, that
  // tanh(x) = 2*sigmoid(2*x) - 1

-  int32_t input_data_mul = (input_multiplier > 0) ? input_multiplier : 1;
+  // We scale by 3/4 to expand range [-8,8]->[-10.7,10.7].
+  // In case of general parameter scale, multiplier 3 is taken into account
+  // in TanhPrepare function and it is included in
+  // input_multiplier already.
+
+  if (input_multiplier == 0) {  // power of two case
+    input_multiplier = 3 << input_left_shift;
+    input_left_shift = 0;
+  }
+
+  int32_t round = (input_left_shift > 0) ? 1 << (input_left_shift - 1) : 0;

  int flat_size = MatchingFlatSize(input_shape, output_shape);

  for (int i = 0; i < flat_size; ++i, ptr_input_data++, ptr_output_data++) {
-    int32_t input_data = (*ptr_input_data) * input_data_mul;
+    int32_t input_data =
+        ((*ptr_input_data) * input_multiplier + round) >> input_left_shift;

-    if (input_left_shift == 1) {
-      input_data <<= 1;
-    }
-
-    // Scale by 3/4 to expand range [-8,8]->[-10.7,10.7].
-    uint32_t abs_input_data = 3 * abs(input_data);
+    uint32_t abs_input_data = abs(input_data);
    uint32_t uh = abs_input_data >> 8;
    int32_t result;

--- a/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/integer_ops/transpose_conv.h
+++ b/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/integer_ops/transpose_conv.h
@@ -0,0 +1,221 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TRANSPOSE_CONV_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TRANSPOSE_CONV_H_
+
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+// Fixed-point per-channel-quantization transpose convolution reference kernel.
+inline void TransposeConv(
+    const ConvParams& params, const int32_t* output_multiplier,
+    const int32_t* output_shift, const RuntimeShape& input_shape,
+    const int8_t* input_data, const RuntimeShape& filter_shape,
+    const int8_t* filter_data, const RuntimeShape& bias_shape,
+    const int32_t* bias_data, const RuntimeShape& output_shape,
+    int8_t* output_data, const RuntimeShape& im2col_shape, int8_t* im2col_data,
+    int32_t* scratch_buffer) {
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  (void)im2col_data;   // only used in optimized code.
+  (void)im2col_shape;  // only used in optimized code.
+
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
+  const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+  if (bias_data) {
+    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+  }
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int32_t input_offset = params.input_offset;
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_activation_min = std::numeric_limits<int8_t>::min();
+  const int32_t output_activation_max = std::numeric_limits<int8_t>::max();
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+
+  const int num_elements = output_shape.FlatSize();
+  // We need to initialize scratch_buffer to all 0s, as we apply the same
+  // 'scatter' based trick as in float version.
+  memset(scratch_buffer, 0, num_elements * sizeof(int32_t));
+
+  // Loop through input elements one at a time.
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int in_y = 0; in_y < input_height; ++in_y) {
+      for (int in_x = 0; in_x < input_width; ++in_x) {
+        for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
+          // Loop through the output elements it will influence.
+          const int out_x_origin = (in_x * stride_width) - pad_width;
+          const int out_y_origin = (in_y * stride_height) - pad_height;
+          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+              for (int out_channel = 0; out_channel < output_depth;
+                   ++out_channel) {
+                // Compute output element location.
+                const int out_x = out_x_origin + filter_x;
+                const int out_y = out_y_origin + filter_y;
+                // We cannot accumulate out of bounds.
+                if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) &&
+                    (out_y < output_height)) {
+                  const int8_t input_value = input_data[Offset(
+                      input_shape, batch, in_y, in_x, in_channel)];
+                  const int8_t filter_value =
+                      filter_data[Offset(filter_shape, out_channel, filter_y,
+                                         filter_x, in_channel)];
+                  scratch_buffer[Offset(output_shape, batch, out_y, out_x,
+                                        out_channel)] +=
+                      (input_value + input_offset) * filter_value;
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
+          int32_t acc = scratch_buffer[Offset(output_shape, batch, out_y, out_x,
+                                              out_channel)];
+          if (bias_data) {
+            acc += bias_data[out_channel];
+          }
+          acc = MultiplyByQuantizedMultiplier(
+              acc, output_multiplier[out_channel], output_shift[out_channel]);
+          acc += output_offset;
+          acc = std::max(acc, output_activation_min);
+          acc = std::min(acc, output_activation_max);
+          output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
+              static_cast<int8_t>(acc);
+        }
+      }
+    }
+  }
+}
+
+// int16_t input (zero_point=0), int8_t filter, int64 accumulator
+inline void TransposeConv(
+    const ConvParams& params, const int32_t* output_multiplier,
+    const int32_t* output_shift, const RuntimeShape& input_shape,
+    const int16_t* input_data, const RuntimeShape& filter_shape,
+    const int8_t* filter_data, const RuntimeShape& bias_shape,
+    const std::int64_t* bias_data, const RuntimeShape& output_shape,
+    int16_t* output_data, const RuntimeShape& im2col_shape, int8_t* im2col_data,
+    std::int64_t* scratch_buffer) {
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  (void)im2col_data;   // only used in optimized code.
+  (void)im2col_shape;  // only used in optimized code.
+
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
+  const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+  if (bias_data) {
+    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+  }
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int32_t output_activation_min = std::numeric_limits<int16_t>::min();
+  const int32_t output_activation_max = std::numeric_limits<int16_t>::max();
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+
+  const int num_elements = output_shape.FlatSize();
+  // We need to initialize scratch_buffer to all 0s, as we apply the same
+  // 'scatter' based trick as in float version.
+  memset(scratch_buffer, 0, num_elements * sizeof(std::int64_t));
+
+  // Loop through input elements one at a time.
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int in_y = 0; in_y < input_height; ++in_y) {
+      for (int in_x = 0; in_x < input_width; ++in_x) {
+        for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
+          // Loop through the output elements it will influence.
+          const int out_x_origin = (in_x * stride_width) - pad_width;
+          const int out_y_origin = (in_y * stride_height) - pad_height;
+          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+              for (int out_channel = 0; out_channel < output_depth;
+                   ++out_channel) {
+                // Compute output element location.
+                const int out_x = out_x_origin + filter_x;
+                const int out_y = out_y_origin + filter_y;
+                // We cannot accumulate out of bounds.
+                if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) &&
+                    (out_y < output_height)) {
+                  const int32_t input_value = input_data[Offset(
+                      input_shape, batch, in_y, in_x, in_channel)];
+                  const int32_t filter_value =
+                      filter_data[Offset(filter_shape, out_channel, filter_y,
+                                         filter_x, in_channel)];
+                  scratch_buffer[Offset(output_shape, batch, out_y, out_x,
+                                        out_channel)] +=
+                      input_value * filter_value;
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
+          std::int64_t acc = scratch_buffer[Offset(output_shape, batch, out_y,
+                                                   out_x, out_channel)];
+          if (bias_data) {
+            acc += bias_data[out_channel];
+          }
+          int32_t scaled_acc = MultiplyByQuantizedMultiplier(
+              acc, output_multiplier[out_channel], output_shift[out_channel]);
+          scaled_acc = std::max(scaled_acc, output_activation_min);
+          scaled_acc = std::min(scaled_acc, output_activation_max);
+          output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
+              static_cast<int16_t>(scaled_acc);
+        }
+      }
+    }
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TRANSPOSE_CONV_H_
--- a/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/leaky_relu.h
+++ b/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/leaky_relu.h
@@ -0,0 +1,69 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LEAKY_RELU_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LEAKY_RELU_H_
+
+#include <algorithm>
+#include <limits>
+
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+namespace reference_ops {
+
+inline void LeakyRelu(const tflite::LeakyReluParams& params,
+                      const RuntimeShape& input_shape, const float* input_data,
+                      const RuntimeShape& output_shape, float* output_data) {
+  const int flat_size = MatchingFlatSize(input_shape, output_shape);
+  for (int i = 0; i < flat_size; ++i) {
+    const float val = input_data[i];
+    // Note that alpha might be > 1 or < 0, so we don't use std::max here.
+    output_data[i] = val > 0 ? val : val * params.alpha;
+  }
+}
+
+template <typename T>
+inline void QuantizeLeakyRelu(const LeakyReluParams& params,
+                              const RuntimeShape& input_shape,
+                              const T* input_data,
+                              const RuntimeShape& output_shape,
+                              T* output_data) {
+  const int flat_size = MatchingFlatSize(input_shape, output_shape);
+  static const int32_t quantized_min = std::numeric_limits<T>::min();
+  static const int32_t quantized_max = std::numeric_limits<T>::max();
+  for (int i = 0; i < flat_size; ++i) {
+    const int32_t input_value = input_data[i] - params.input_offset;
+    int32_t unclamped_output;
+    if (input_value >= 0) {
+      unclamped_output = params.output_offset +
+                         MultiplyByQuantizedMultiplier(
+                             input_value, params.output_multiplier_identity,
+                             params.output_shift_identity);
+    } else {
+      unclamped_output = params.output_offset +
+                         MultiplyByQuantizedMultiplier(
+                             input_value, params.output_multiplier_alpha,
+                             params.output_shift_alpha);
+    }
+    const T clamped_output =
+        std::min(quantized_max, std::max(quantized_min, unclamped_output));
+    output_data[i] = static_cast<T>(clamped_output);
+  }
+}
+
+}  // namespace reference_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LEAKY_RELU_H_
--- a/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/requantize.h
+++ b/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/requantize.h
@@ -45,6 +45,7 @@ inline void Requantize(const input_type* input_data, int32_t size,
      for (int i = 0; i < size; ++i) {
        output_data[i] = input_data[i] ^ 0x80;
      }
+      return;
    }
  }
  static constexpr int32_t kMinOutput = std::numeric_limits<output_type>::min();
--- a/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/space_to_batch_nd.h
+++ b/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/space_to_batch_nd.h
@@ -0,0 +1,109 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SPACE_TO_BATCH_ND_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SPACE_TO_BATCH_ND_H_
+
+#include <cmath>
+
+#include "ruy/profiler/instrumentation.h"  // from @ruy
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace reference_ops {
+
+// TODO(b/135760455): Move this method anonymous namespace in a cc file.
+inline RuntimeShape ExtendShapeSpaceToBatch(const RuntimeShape& shape) {
+  if (shape.DimensionsCount() == 4) {
+    return shape;
+  }
+  RuntimeShape new_shape(4, 1);
+  new_shape.SetDim(0, shape.Dims(0));
+  new_shape.SetDim(1, shape.Dims(1));
+  new_shape.SetDim(3, shape.Dims(2));
+  return new_shape;
+}
+
+template <typename T>
+inline void SpaceToBatchND(const SpaceToBatchParams& params,
+                           const RuntimeShape& unextended_input1_shape,
+                           const T* input1_data,
+                           const RuntimeShape& unextended_input2_shape,
+                           const int32_t* block_shape_data,
+                           const RuntimeShape& unextended_input3_shape,
+                           const int32_t* paddings_data,
+                           const RuntimeShape& unextended_output_shape,
+                           T* output_data) {
+  ruy::profiler::ScopeLabel label("SpaceToBatchND");
+  TFLITE_DCHECK_GE(unextended_input1_shape.DimensionsCount(), 3);
+  TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(unextended_input1_shape.DimensionsCount(),
+                   unextended_output_shape.DimensionsCount());
+
+  // Extends the input/output shape from 3D to 4D if needed, NHC -> NH1C.
+  const RuntimeShape input1_shape =
+      ExtendShapeSpaceToBatch(unextended_input1_shape);
+  const RuntimeShape output_shape =
+      ExtendShapeSpaceToBatch(unextended_output_shape);
+
+  const int depth = input1_shape.Dims(3);
+  const int input_width = input1_shape.Dims(2);
+  const int input_height = input1_shape.Dims(1);
+  const int input_batch_size = input1_shape.Dims(0);
+
+  const int output_width = output_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_batch_size = output_shape.Dims(0);
+
+  const int block_shape_height = block_shape_data[0];
+  const int block_shape_width =
+      unextended_input1_shape.DimensionsCount() == 4 ? block_shape_data[1] : 1;
+  const int padding_top = paddings_data[0];
+  const int padding_left =
+      unextended_input1_shape.DimensionsCount() == 4 ? paddings_data[2] : 0;
+
+  // For uint8 quantized, the correct padding "zero value" is the output offset.
+  const int32_t pad_value = params.output_offset;
+  for (int out_b = 0; out_b < output_batch_size; ++out_b) {
+    int input_batch = out_b % input_batch_size;
+    int shift_w = (out_b / input_batch_size) % block_shape_width;
+    int shift_h = (out_b / input_batch_size) / block_shape_width;
+    for (int out_h = 0; out_h < output_height; ++out_h) {
+      for (int out_w = 0; out_w < output_width; ++out_w) {
+        T* out = output_data + Offset(output_shape, out_b, out_h, out_w, 0);
+        if (out_h * block_shape_height + shift_h < padding_top ||
+            out_h * block_shape_height + shift_h >=
+                padding_top + input_height ||
+            out_w * block_shape_width + shift_w < padding_left ||
+            out_w * block_shape_width + shift_w >= padding_left + input_width) {
+          // This may not execute correctly when pad_value != 0 and T != uint8.
+          memset(out, pad_value, depth * sizeof(T));
+        } else {
+          const T* in =
+              input1_data +
+              Offset(input1_shape, input_batch,
+                     (out_h * block_shape_height + shift_h) - padding_top,
+                     (out_w * block_shape_width + shift_w) - padding_left, 0);
+          memcpy(out, in, depth * sizeof(T));
+        }
+      }
+    }
+  }
+}
+
+}  // namespace reference_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SPACE_TO_BATCH_ND_H_
--- a/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/strided_slice.h
+++ b/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/strided_slice.h
@@ -15,23 +15,28 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_STRIDED_SLICE_H_
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_STRIDED_SLICE_H_

+#include "ruy/profiler/instrumentation.h"  // from @ruy
 #include "tensorflow/lite/kernels/internal/common.h"
 #include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/portable_tensor.h"
 #include "tensorflow/lite/kernels/internal/strided_slice_logic.h"
 #include "tensorflow/lite/kernels/internal/types.h"

 namespace tflite {

 namespace reference_ops {
+
 template <typename T>
 inline void StridedSlice(const tflite::StridedSliceParams& op_params,
                         const RuntimeShape& unextended_input_shape,
-                         const T* input_data,
                         const RuntimeShape& unextended_output_shape,
-                         T* output_data) {
+                         SequentialTensorWriter<T>* writer) {
  using strided_slice::LoopCondition;
  using strided_slice::StartForAxis;
  using strided_slice::StopForAxis;
+
+  ruy::profiler::ScopeLabel label("StridedSlice");
+
  // Note that the output_shape is not used herein.
  tflite::StridedSliceParams params_copy = op_params;

@@ -57,7 +62,6 @@ inline void StridedSlice(const tflite::StridedSliceParams& op_params,
  const int start_4 = StartForAxis(params_copy, input_shape, 4);
  const int stop_4 = StopForAxis(params_copy, input_shape, 4, start_4);

-  T* out_ptr = output_data;
  for (int offset_0 = start_0 * input_shape.Dims(1),
           end_0 = stop_0 * input_shape.Dims(1),
           step_0 = params_copy.strides[0] * input_shape.Dims(1);
@@ -81,13 +85,36 @@ inline void StridedSlice(const tflite::StridedSliceParams& op_params,
          for (int offset_4 = offset_3 + start_4, end_4 = offset_3 + stop_4;
               !LoopCondition(offset_4, end_4, params_copy.strides[4]);
               offset_4 += params_copy.strides[4]) {
-            *out_ptr++ = input_data[offset_4];
+            writer->Write(offset_4);
          }
        }
      }
    }
  }
 }
+
+template <typename T>
+inline void StridedSlice(const tflite::StridedSliceParams& op_params,
+                         const RuntimeShape& unextended_input_shape,
+                         const T* input_data,
+                         const RuntimeShape& unextended_output_shape,
+                         T* output_data) {
+  SequentialTensorWriter<T> writer(input_data, output_data);
+  StridedSlice<T>(op_params, unextended_input_shape, unextended_output_shape,
+                  &writer);
+}
+
+template <typename T>
+inline void StridedSlice(const tflite::StridedSliceParams& op_params,
+                         const RuntimeShape& unextended_input_shape,
+                         const TfLiteTensor* input,
+                         const RuntimeShape& unextended_output_shape,
+                         TfLiteTensor* output) {
+  SequentialTensorWriter<T> writer(input, output);
+  StridedSlice<T>(op_params, unextended_input_shape, unextended_output_shape,
+                  &writer);
+}
+
 }  // namespace reference_ops
 }  // namespace tflite

--- a/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/sub.h
+++ b/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/sub.h
@@ -65,10 +65,6 @@ inline void SubNonBroadcast(const ArithmeticParams& params,
 // dimensionality if the runtime code does a single loop over one dimension
 // that handles broadcasting as the base case. The code generator would then
 // generate max(D1, D2) nested for loops.
-// TODO(b/151345101): BroadcastSub is intentionally duplicated from
-// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
-// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
-// reference_ops.h.
 template <int N = 5>
 inline void BroadcastSubSlow(const ArithmeticParams& params,
                             const RuntimeShape& input1_shape,
@@ -336,6 +332,50 @@ void BroadcastSubSlow(const ArithmeticParams& params,
  NDOpsHelper<N>(output_desc, sub_func);
 }

+template <int N = 5>
+inline void BroadcastSub16POTSlow(const ArithmeticParams& params,
+                                  const RuntimeShape& input1_shape,
+                                  const int16_t* input1_data,
+                                  const RuntimeShape& input2_shape,
+                                  const int16_t* input2_data,
+                                  const RuntimeShape& output_shape,
+                                  int16_t* output_data) {
+  ruy::profiler::ScopeLabel label("BroadcastSub16POTSlow/int16_t");
+  NdArrayDesc<N> desc1;
+  NdArrayDesc<N> desc2;
+  NdArrayDesc<N> output_desc;
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+                                      &desc2);
+  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  auto sub_func = [&](int indexes[N]) {
+    const int32_t input1_val = input1_data[SubscriptToIndex(desc1, indexes)];
+    const int32_t input2_val = input2_data[SubscriptToIndex(desc2, indexes)];
+    const int32_t scaled_input1_val =
+        gemmlowp::RoundingDivideByPOT(input1_val, -params.input1_shift);
+    const int32_t scaled_input2_val =
+        gemmlowp::RoundingDivideByPOT(input2_val, -params.input2_shift);
+    const int32_t raw_output = scaled_input1_val - scaled_input2_val;
+    const int32_t clamped_output =
+        std::min(params.quantized_activation_max,
+                 std::max(params.quantized_activation_min, raw_output));
+    output_data[SubscriptToIndex(output_desc, indexes)] =
+        static_cast<int16_t>(clamped_output);
+  };
+  NDOpsHelper<N>(output_desc, sub_func);
+}
+
 // Element-wise Sub that can often be used for inner loop of broadcast sub as
 // well as the non-broadcast sub.
 inline void SubElementwise(int size, const ArithmeticParams& params,
--- a/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/transpose_conv.h
+++ b/code/components/tfmicro/tensorflow/lite/kernels/internal/reference/transpose_conv.h
@@ -0,0 +1,217 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TRANSPOSE_CONV_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TRANSPOSE_CONV_H_
+
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+
+namespace reference_ops {
+
+inline void TransposeConv(
+    const ConvParams& params, const RuntimeShape& input_shape,
+    const float* input_data, const RuntimeShape& filter_shape,
+    const float* filter_data, const RuntimeShape& bias_shape,
+    const float* bias_data, const RuntimeShape& output_shape,
+    float* output_data, const RuntimeShape& im2col_shape, float* im2col_data) {
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  (void)im2col_data;   // only used in optimized code.
+  (void)im2col_shape;  // only used in optimized code.
+
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
+  const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  if (bias_data) {
+    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+  }
+
+  // Although transpose convolution simplifies to convolution with transposed
+  // weights for strides of 1, non-unitary striding complicates matters. To
+  // keep this reference implementation as clear as possible, we use a
+  // "scatter" access pattern, where we loop through all the input elements,
+  // computing their influence on the output, rather than looping through the
+  // output elements in the typical "gather" access pattern of a conv. We
+  // therefore must initialize the output array to zero.
+  const int num_elements = output_shape.FlatSize();
+  for (int i = 0; i < num_elements; i++) {
+    output_data[i] = 0.0f;
+  }
+
+  // Loop through input elements one at a time.
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int in_y = 0; in_y < input_height; ++in_y) {
+      for (int in_x = 0; in_x < input_width; ++in_x) {
+        for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
+          // Loop through the output elements it will influence
+          const int out_x_origin = (in_x * stride_width) - pad_width;
+          const int out_y_origin = (in_y * stride_height) - pad_height;
+          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+              for (int out_channel = 0; out_channel < output_depth;
+                   ++out_channel) {
+                // Compute output element location
+                const int out_x = out_x_origin + filter_x;
+                const int out_y = out_y_origin + filter_y;
+                // We cannot accumulate out of bounds
+                if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) &&
+                    (out_y < output_height)) {
+                  float input_value = input_data[Offset(
+                      input_shape, batch, in_y, in_x, in_channel)];
+                  float filter_value =
+                      filter_data[Offset(filter_shape, out_channel, filter_y,
+                                         filter_x, in_channel)];
+                  output_data[Offset(output_shape, batch, out_y, out_x,
+                                     out_channel)] +=
+                      input_value * filter_value;
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  if (bias_data) {
+    for (int batch = 0; batch < batches; ++batch) {
+      for (int out_y = 0; out_y < output_height; ++out_y) {
+        for (int out_x = 0; out_x < output_width; ++out_x) {
+          for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
+            output_data[Offset(output_shape, batch, out_y, out_x,
+                               out_channel)] += bias_data[out_channel];
+          }
+        }
+      }
+    }
+  }
+}
+
+inline void TransposeConv(
+    const ConvParams& params, const RuntimeShape& input_shape,
+    const uint8_t* input_data, const RuntimeShape& filter_shape,
+    const uint8_t* filter_data, const RuntimeShape& bias_shape,
+    const int32_t* bias_data, const RuntimeShape& output_shape,
+    uint8_t* output_data, const RuntimeShape& im2col_shape,
+    uint8_t* im2col_data, int32_t* scratch_buffer) {
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  (void)im2col_data;   // only used in optimized code.
+  (void)im2col_shape;  // only used in optimized code.
+
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
+  const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int32_t input_offset = params.input_offset;
+  const int32_t filter_offset = params.weights_offset;
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_multiplier = params.output_multiplier;
+  const int output_shift = params.output_shift;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  if (bias_data) {
+    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+  }
+
+  const int num_elements = output_shape.FlatSize();
+  // We need to initialize scratch_buffer to all 0s, as we apply the same
+  // 'scatter' based trick as in float version.
+  memset(scratch_buffer, 0, num_elements * sizeof(int32_t));
+
+  // Loop through input elements one at a time.
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int in_y = 0; in_y < input_height; ++in_y) {
+      for (int in_x = 0; in_x < input_width; ++in_x) {
+        for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
+          // Loop through the output elements it will influence.
+          const int out_x_origin = (in_x * stride_width) - pad_width;
+          const int out_y_origin = (in_y * stride_height) - pad_height;
+          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+              for (int out_channel = 0; out_channel < output_depth;
+                   ++out_channel) {
+                // Compute output element location.
+                const int out_x = out_x_origin + filter_x;
+                const int out_y = out_y_origin + filter_y;
+                // We cannot accumulate out of bounds.
+                if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) &&
+                    (out_y < output_height)) {
+                  uint8_t input_value = input_data[Offset(
+                      input_shape, batch, in_y, in_x, in_channel)];
+                  uint8_t filter_value =
+                      filter_data[Offset(filter_shape, out_channel, filter_y,
+                                         filter_x, in_channel)];
+                  scratch_buffer[Offset(output_shape, batch, out_y, out_x,
+                                        out_channel)] +=
+                      (input_value + input_offset) *
+                      (filter_value + filter_offset);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
+          int32_t acc = scratch_buffer[Offset(output_shape, batch, out_y, out_x,
+                                              out_channel)];
+          if (bias_data) {
+            acc += bias_data[out_channel];
+          }
+          int32_t scaled_acc = MultiplyByQuantizedMultiplier(
+              acc, output_multiplier, output_shift);
+          scaled_acc += output_offset;
+          scaled_acc = std::max(scaled_acc, output_activation_min);
+          scaled_acc = std::min(scaled_acc, output_activation_max);
+          output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
+              static_cast<uint8_t>(scaled_acc);
+        }
+      }
+    }
+  }
+}
+
+}  // namespace reference_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TRANSPOSE_CONV_H_
--- a/code/components/tfmicro/tensorflow/lite/kernels/internal/strided_slice_logic.h
+++ b/code/components/tfmicro/tensorflow/lite/kernels/internal/strided_slice_logic.h
@@ -140,7 +140,7 @@ inline int StopForAxis(const tflite::StridedSliceParams& params,
  // start_for_axis + 1 to generate a length 1 slice, since start_for_axis has
  // already been adjusted for negative indices.
  if (shrink_axis) {
-    stop = start_for_axis + 1;
+    return start_for_axis + 1;
  }

  // end_mask override
--- a/code/components/tfmicro/tensorflow/lite/kernels/internal/types.h
+++ b/code/components/tfmicro/tensorflow/lite/kernels/internal/types.h
@@ -43,6 +43,20 @@ struct PaddingValues {
  int16_t height_offset;
 };

+struct Padding3DValues {
+  int16_t width;
+  int16_t height;
+  int16_t depth;
+  // offset is used for calculating "remaining" padding, for example, `width`
+  // is 1 and `width_offset` is 1, so padding_left is 1 while padding_right is
+  // 1 + 1 = 2.
+  int16_t width_offset;
+  // Same as width_offset except it's over the height dimension.
+  int16_t height_offset;
+  // Same as width_offset except it's over the depth dimension.
+  int16_t depth_offset;
+};
+
 // This enumeration allows for non-default formats for the weights array
 // of a fully-connected operator, allowing the use of special optimized
 // runtime paths.
@@ -170,7 +184,11 @@ class RuntimeShape {
  // rolls out.
  RuntimeShape(RuntimeShape const& other) : size_(other.DimensionsCount()) {
    if (size_ > kMaxSmallSize) {
+#ifdef TF_LITE_STATIC_MEMORY
+      TFLITE_CHECK(false && "No shape resizing supported on this platform");
+#else
      dims_pointer_ = new int32_t[size_];
+#endif
    }
    std::memcpy(DimsData(), other.DimsData(), sizeof(int32_t) * size_);
  }
@@ -392,6 +410,20 @@ inline int Offset(const RuntimeShape& shape, int i0, int i1, int i2, int i3) {
  return ((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3;
 }

+inline int Offset(const RuntimeShape& shape, int i0, int i1, int i2, int i3,
+                  int i4) {
+  TFLITE_DCHECK_EQ(shape.DimensionsCount(), 5);
+  const int* dims_data = reinterpret_cast<const int*>(shape.DimsDataUpTo5D());
+  TFLITE_DCHECK(i0 >= 0 && i0 < dims_data[0]);
+  TFLITE_DCHECK(i1 >= 0 && i1 < dims_data[1]);
+  TFLITE_DCHECK(i2 >= 0 && i2 < dims_data[2]);
+  TFLITE_DCHECK(i3 >= 0 && i3 < dims_data[3]);
+  TFLITE_DCHECK(i4 >= 0 && i4 < dims_data[4]);
+  return (((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3) *
+             dims_data[4] +
+         i4;
+}
+
 inline int Offset(const Dims<4>& dims, int i0, int i1, int i2, int i3) {
  TFLITE_DCHECK(i0 >= 0 && i0 < dims.sizes[0]);
  TFLITE_DCHECK(i1 >= 0 && i1 < dims.sizes[1]);
@@ -840,6 +872,19 @@ struct ConvParams {
  float float_activation_max;
 };

+struct Conv3DParams {
+  Padding3DValues padding_values;
+  int stride_width;
+  int stride_height;
+  int stride_depth;
+  int dilation_width;
+  int dilation_height;
+  int dilation_depth;
+  // float activation params.
+  float float_activation_min;
+  float float_activation_max;
+};
+
 struct DepthToSpaceParams {
  int32_t block_size;
 };
@@ -907,6 +952,7 @@ struct FullyConnectedParams {

 struct GatherParams {
  int16_t axis;
+  int16_t batch_dims;
 };

 struct L2NormalizationParams {
@@ -1025,9 +1071,9 @@ struct ResizeNearestNeighborParams {

 struct SliceParams {
  int8_t begin_count;
-  int32_t begin[4];
+  int32_t begin[5];
  int8_t size_count;
-  int32_t size[4];
+  int32_t size[5];
 };

 struct SoftmaxParams {