Update tflite

This commit is contained in:
jomjol
2020-11-08 03:27:52 +01:00
parent 05a0f6fa62
commit 84cea8e3d6
169 changed files with 16367 additions and 11456 deletions

View File

@@ -55,9 +55,12 @@ inline void GetActivationMinMax(FusedActivationFunctionType ac,
}
}
inline float ActivationFunctionWithMinMax(float x, float output_activation_min,
float output_activation_max) {
return std::min(std::max(x, output_activation_min), output_activation_max);
template <typename T>
inline T ActivationFunctionWithMinMax(T x, T output_activation_min,
T output_activation_max) {
using std::max;
using std::min;
return min(max(x, output_activation_min), output_activation_max);
}
// Legacy function, left for compatibility only.
@@ -135,23 +138,24 @@ inline void BiasAndClamp(float clamp_min, float clamp_max, int bias_size,
#endif
}
inline int32 MultiplyByQuantizedMultiplierSmallerThanOneExp(
int32 x, int32 quantized_multiplier, int left_shift) {
inline int32_t MultiplyByQuantizedMultiplierSmallerThanOneExp(
int32_t x, int32_t quantized_multiplier, int left_shift) {
using gemmlowp::RoundingDivideByPOT;
using gemmlowp::SaturatingRoundingDoublingHighMul;
return RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(x, quantized_multiplier), -left_shift);
}
inline int32 MultiplyByQuantizedMultiplierGreaterThanOne(
int32 x, int32 quantized_multiplier, int left_shift) {
inline int32_t MultiplyByQuantizedMultiplierGreaterThanOne(
int32_t x, int32_t quantized_multiplier, int left_shift) {
using gemmlowp::SaturatingRoundingDoublingHighMul;
return SaturatingRoundingDoublingHighMul(x * (1 << left_shift),
quantized_multiplier);
}
inline int32 MultiplyByQuantizedMultiplier(int32 x, int32 quantized_multiplier,
int shift) {
inline int32_t MultiplyByQuantizedMultiplier(int32_t x,
int32_t quantized_multiplier,
int shift) {
using gemmlowp::RoundingDivideByPOT;
using gemmlowp::SaturatingRoundingDoublingHighMul;
int left_shift = shift > 0 ? shift : 0;
@@ -161,16 +165,16 @@ inline int32 MultiplyByQuantizedMultiplier(int32 x, int32 quantized_multiplier,
right_shift);
}
inline int32 MultiplyByQuantizedMultiplier(int64_t x,
int32 quantized_multiplier,
int shift) {
inline int32_t MultiplyByQuantizedMultiplier(int64_t x,
int32_t quantized_multiplier,
int shift) {
// Inputs:
// - quantized_multiplier has fixed point at bit 31
// - shift is -31 to +7 (negative for right shift)
//
// Assumptions: The following input ranges are assumed
// - quantize_scale>=0 (the usual range is (1<<30) to (1>>31)-1)
// - scaling is chosen so final scaled result fits in int32
// - scaling is chosen so final scaled result fits in int32_t
// - input x is in the range -(1<<47) <= x < (1<<47)
assert(quantized_multiplier >= 0);
assert(shift >= -31 && shift < 8);
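
The contract spelled out in the comments above can be checked with a small standalone sketch. The helper below is my own plain-C++ approximation of the 32-bit-input overload (it ignores gemmlowp's exact saturation and tie-breaking), and the name MultiplyByQuantizedMultiplierRef is made up for illustration:

#include <cassert>
#include <cstdint>
#include <cstdio>

// Approximation for illustration: result ~= round(x * multiplier * 2^shift / 2^31).
int32_t MultiplyByQuantizedMultiplierRef(int32_t x, int32_t quantized_multiplier,
                                         int shift) {
  assert(quantized_multiplier >= 0);
  assert(shift >= -31 && shift < 8);
  const int64_t prod = static_cast<int64_t>(x) * quantized_multiplier;
  const int total_right_shift = 31 - shift;  // the multiplier's fixed point is at bit 31
  const int64_t rounding = int64_t{1} << (total_right_shift - 1);
  return static_cast<int32_t>((prod + rounding) >> total_right_shift);
}

int main() {
  // Scale 1000000 by 0.5: multiplier = 2^30 (0.5 in Q31), shift = 0 -> 500000.
  printf("%d\n", MultiplyByQuantizedMultiplierRef(1000000, 1 << 30, 0));
  return 0;
}
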
@@ -215,9 +219,9 @@ inline int CountLeadingSignBits(T integer_input) {
using U = typename std::make_unsigned<T>::type;
return integer_input >= 0
? CountLeadingZeros(static_cast<U>(integer_input)) - 1
: integer_input != std::numeric_limits<T>::min()
? CountLeadingZeros(2 * static_cast<U>(-integer_input) - 1)
: 0;
: integer_input != std::numeric_limits<T>::min()
? CountLeadingZeros(2 * static_cast<U>(-integer_input) - 1)
: 0;
#endif
}
@@ -237,8 +241,12 @@ inline Integer FloorLog2(Integer n) {
// generate INT16 LUT for function(), e.g., table exp(x) and 1/(1+x) used in
// softmax
inline void gen_lut(const std::function<double(double)>& func, double min,
double max, int16_t* table, const int num) {
// func - the function to build the LUT for (e.g. exp(x))
// min,max - table limits
// table - pointer to buffer
// num - number of elements in the LUT
inline void gen_lut(double (*func)(double), double min, double max,
int16_t* table, const int num) {
// size of table should equal to num + 1
// last element only for slope calculation
double step = (max - min) / (num - 1);
@@ -259,7 +267,35 @@ inline void gen_lut(const std::function<double(double)>& func, double min,
std::min(std::max(TfLiteRound(func(max) * 32768.0), -32768.0), 32767.0);
}
// int16 func table lookup, e.g., lookup exp() and 1/(1+x) used in softmax
// generate INT16 LUT for function(), e.g., table exp(x) and 1/(1+x) used in
// softmax
// func - the function to build the LUT for (e.g. exp(x))
// min,max - table limits
// table - pointer to buffer
// num - number of elements in the LUT
inline void gen_lut(float (*func)(float), float min, float max, int16_t* table,
const int num) {
// size of table should equal to num + 1
// last element only for slope calculation
float step = (max - min) / (num - 1);
float half_step = step / 2.0f;
for (int i = 0; i < num - 1; i++) {
float sample_val = TfLiteRound(func(min + i * step) * 32768.0f);
float midpoint_interp_val =
TfLiteRound((func(min + (i + 1) * step) * 32768.0f +
TfLiteRound(func(min + i * step) * 32768.0f)) /
2.0f);
float midpoint_val =
TfLiteRound(func(min + i * step + half_step) * 32768.0f);
float midpoint_err = midpoint_interp_val - midpoint_val;
float bias = TfLiteRound(midpoint_err / 2.0f);
table[i] = std::min(std::max(sample_val - bias, -32768.0f), 32767.0f);
}
table[num - 1] = std::min(
std::max(TfLiteRound(func(max) * 32768.0f), -32768.0f), 32767.0f);
}
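
As a rough illustration of how such a table is consumed, the standalone sketch below builds a (num + 1)-entry Q0.15 table for exp() and answers a query by linear interpolation between neighbouring samples. It is a simplification of gen_lut / generic_int16_table_lookup, not the library code: the helper names are made up and the midpoint-error bias correction applied above is skipped.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

static float ExpF(float x) { return std::exp(x); }

// Build a (num + 1)-entry Q0.15 table; the extra slot only feeds slope lookups.
std::vector<int16_t> BuildLut(float (*func)(float), float min, float max, int num) {
  std::vector<int16_t> table(num + 1);
  const float step = (max - min) / (num - 1);
  for (int i = 0; i < num; ++i) {
    const float v = std::round(func(min + i * step) * 32768.0f);
    table[i] = static_cast<int16_t>(std::min(std::max(v, -32768.0f), 32767.0f));
  }
  table[num] = table[num - 1];
  return table;
}

// Linear interpolation between the two samples that bracket x.
float LutLookup(const std::vector<int16_t>& table, float min, float max, float x) {
  const int num = static_cast<int>(table.size()) - 1;
  const float step = (max - min) / (num - 1);
  const float pos = (x - min) / step;
  const int i = std::min(std::max(static_cast<int>(pos), 0), num - 2);
  const float frac = pos - i;
  return (table[i] + frac * (table[i + 1] - table[i])) / 32768.0f;
}

int main() {
  const auto table = BuildLut(ExpF, -10.0f, 0.0f, 512);
  printf("exp(-1) = %f, lut = %f\n", std::exp(-1.0f),
         LutLookup(table, -10.0f, 0.0f, -1.0f));
  return 0;
}
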
// int16_t func table lookup, e.g., lookup exp() and 1/(1+x) used in softmax
inline int16_t generic_int16_table_lookup(int16_t value, const int16_t* lut) {
// 512 base value, lut[513] only for calculate slope
uint16_t index = static_cast<uint16_t>(256 + (value >> 7));
@@ -410,6 +446,23 @@ SaturatingRoundingMultiplyByPOTParam(
SaturatingRoundingMultiplyByPOTParam(a.raw(), exponent));
}
// Convert int32_t multiplier to int16_t with rounding.
inline void DownScaleInt32ToInt16Multiplier(int32_t multiplier_int32_t,
int16_t* multiplier_int16_t) {
TFLITE_DCHECK_GE(multiplier_int32_t, 0);
static constexpr int32_t kRoundingOffset = 1 << 15;
if (multiplier_int32_t >=
std::numeric_limits<int32_t>::max() - kRoundingOffset) {
*multiplier_int16_t = std::numeric_limits<int16_t>::max();
return;
}
const int32_t result = (multiplier_int32_t + kRoundingOffset) >> 16;
TFLITE_DCHECK_LE(result << 16, multiplier_int32_t + kRoundingOffset);
TFLITE_DCHECK_GT(result << 16, multiplier_int32_t - kRoundingOffset);
*multiplier_int16_t = result;
TFLITE_DCHECK_EQ(*multiplier_int16_t, result);
}
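
A quick numeric check of the rounding above, done standalone rather than by calling the library function: a Q31 multiplier of 2^30 (0.5) downscales to 16384, which is 0.5 in Q15.

#include <cstdint>
#include <cstdio>

int main() {
  const int32_t multiplier_int32 = 1 << 30;  // 0.5 with the fixed point at bit 31
  const int16_t multiplier_int16 =
      static_cast<int16_t>((multiplier_int32 + (1 << 15)) >> 16);
  printf("%d\n", multiplier_int16);  // 16384, i.e. 0.5 with the point at bit 15
  return 0;
}
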
// Minimum output bits to accommodate log of maximum input range. It actually
// does not matter if one considers, say, [-64,64] or [-64,64).
//
@@ -418,15 +471,13 @@ SaturatingRoundingMultiplyByPOTParam(
// ceil(log(abs( log(2.^(0:127))+1 ))/log(2)); ...
// ceil(log(abs( log(2.^(0:127))+1 ))/log(2))]
constexpr int min_log_x_output_bits(int input_bits) {
return input_bits > 90
? 7
: input_bits > 44
? 6
: input_bits > 21
? 5
: input_bits > 10
? 4
: input_bits > 4 ? 3 : input_bits > 1 ? 2 : 1;
return input_bits > 90 ? 7
: input_bits > 44 ? 6
: input_bits > 21 ? 5
: input_bits > 10 ? 4
: input_bits > 4 ? 3
: input_bits > 1 ? 2
: 1;
}
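
A minimal compile-time sanity check, assuming it is placed where the constexpr function above is in scope: with 31 input bits the largest |log x| is about 31 * ln 2 ~ 21.5, and the MATLAB-style expression in the comment gives ceil(log2(21.5 + 1)) = 5 output bits, matching the "input_bits > 21" branch.

static_assert(min_log_x_output_bits(31) == 5,
              "log of a 31-bit input needs 5 integer output bits");
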
// Although currently the name of this function says that it cannot handle
@@ -434,17 +485,17 @@ constexpr int min_log_x_output_bits(int input_bits) {
// x_max is the largest representable input. In other words, the output range
// is symmetric.
template <int OutputIntegerBits, int InputIntegerBits>
inline gemmlowp::FixedPoint<int32, OutputIntegerBits>
inline gemmlowp::FixedPoint<int32_t, OutputIntegerBits>
log_x_for_x_greater_than_or_equal_to_1_impl(
gemmlowp::FixedPoint<int32, InputIntegerBits> input_val) {
// assert(__builtin_clz(0u) >= std::numeric_limits<uint32>::digits - 1);
// assert(__builtin_clz(0u) <= std::numeric_limits<uint32>::digits);
using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
gemmlowp::FixedPoint<int32_t, InputIntegerBits> input_val) {
// assert(__builtin_clz(0u) >= std::numeric_limits<uint32_t>::digits - 1);
// assert(__builtin_clz(0u) <= std::numeric_limits<uint32_t>::digits);
using FixedPoint0 = gemmlowp::FixedPoint<int32_t, 0>;
// The reason for accumulating the result with an extra bit of headroom is
// that z_pow_2_adj * log_2 might be saturated, and adding num_scaled *
// recip_denom will otherwise introduce an error.
static constexpr int kAccumIntegerBits = OutputIntegerBits + 1;
using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumIntegerBits>;
using FixedPointAccum = gemmlowp::FixedPoint<int32_t, kAccumIntegerBits>;
const FixedPoint0 log_2 = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
FixedPoint0, 1488522236, std::log(2.0));
@@ -472,10 +523,10 @@ log_x_for_x_greater_than_or_equal_to_1_impl(
// required shift "ourselves" instead of using, say, Rescale.
FixedPoint0 z_a = FixedPoint0::FromRaw(input_val.raw());
// z_a_pow_2 = input_integer_bits - z_a_headroom;
int z_a_headroom_plus_1 = CountLeadingZeros(static_cast<uint32>(z_a.raw()));
int z_a_headroom_plus_1 = CountLeadingZeros(static_cast<uint32_t>(z_a.raw()));
FixedPoint0 r_a_tmp =
SaturatingRoundingMultiplyByPOTParam(z_a, (z_a_headroom_plus_1 - 1));
const int32 r_a_raw =
const int32_t r_a_raw =
SaturatingRoundingMultiplyByPOTParam((r_a_tmp * sqrt_half).raw(), 1);
// z_pow_2_adj = max(z_pow_2_a - 0.75, z_pow_2_b - 0.25);
// z_pow_2_adj = max(InputIntegerBits - z_a_headroom_plus_1 + 0.25,
@@ -487,8 +538,8 @@ log_x_for_x_greater_than_or_equal_to_1_impl(
// z_b is treated like z_a, but premultiplying by sqrt(0.5).
FixedPoint0 z_b = z_a * sqrt_half;
int z_b_headroom = CountLeadingZeros(static_cast<uint32>(z_b.raw())) - 1;
const int32 r_b_raw =
int z_b_headroom = CountLeadingZeros(static_cast<uint32_t>(z_b.raw())) - 1;
const int32_t r_b_raw =
SaturatingRoundingMultiplyByPOTParam(z_a.raw(), z_b_headroom);
const FixedPointAccum z_b_pow_2_adj = SaturatingSub(
FixedPointAccum::FromRaw(SaturatingRoundingMultiplyByPOTParam(
@@ -516,9 +567,9 @@ log_x_for_x_greater_than_or_equal_to_1_impl(
}
template <int OutputIntegerBits, int InputIntegerBits>
inline gemmlowp::FixedPoint<int32, OutputIntegerBits>
inline gemmlowp::FixedPoint<int32_t, OutputIntegerBits>
log_x_for_x_greater_than_or_equal_to_1(
gemmlowp::FixedPoint<int32, InputIntegerBits> input_val) {
gemmlowp::FixedPoint<int32_t, InputIntegerBits> input_val) {
static_assert(
OutputIntegerBits >= min_log_x_output_bits(InputIntegerBits),
"Output integer bits must be sufficient to accommodate logs of inputs.");
@@ -527,25 +578,25 @@ log_x_for_x_greater_than_or_equal_to_1(
input_val);
}
inline int32 GetReciprocal(int32 x, int x_integer_digits,
int* num_bits_over_unit) {
int headroom_plus_one = CountLeadingZeros(static_cast<uint32>(x));
inline int32_t GetReciprocal(int32_t x, int x_integer_digits,
int* num_bits_over_unit) {
int headroom_plus_one = CountLeadingZeros(static_cast<uint32_t>(x));
// This is the number of bits to the left of the binary point above 1.0.
// Consider x=1.25. In that case shifted_scale=0.8 and
// no later adjustment will be needed.
*num_bits_over_unit = x_integer_digits - headroom_plus_one;
const int32 shifted_sum_minus_one =
static_cast<int32>((static_cast<uint32>(x) << headroom_plus_one) -
(static_cast<uint32>(1) << 31));
const int32_t shifted_sum_minus_one =
static_cast<int32_t>((static_cast<uint32_t>(x) << headroom_plus_one) -
(static_cast<uint32_t>(1) << 31));
gemmlowp::FixedPoint<int32, 0> shifted_scale =
gemmlowp::FixedPoint<int32_t, 0> shifted_scale =
gemmlowp::one_over_one_plus_x_for_x_in_0_1(
gemmlowp::FixedPoint<int32, 0>::FromRaw(shifted_sum_minus_one));
gemmlowp::FixedPoint<int32_t, 0>::FromRaw(shifted_sum_minus_one));
return shifted_scale.raw();
}
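
The "x=1.25 gives shifted_scale=0.8" example in the comment generalizes as follows. This is my own float analogue of the normalization GetReciprocal performs, not the fixed-point code: write x = m * 2^e with m in [1, 2), return 1/m, and report e (the "bits over unit") so the caller can undo the scaling.

#include <cmath>
#include <cstdio>

int main() {
  const double x = 5.0;
  const int num_bits_over_unit = static_cast<int>(std::floor(std::log2(x)));  // 2
  const double mantissa = x / std::pow(2.0, num_bits_over_unit);              // 1.25
  const double shifted_scale = 1.0 / mantissa;                                // 0.8
  printf("1/x = %f\n", shifted_scale * std::pow(2.0, -num_bits_over_unit));   // 0.2
  return 0;
}
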
inline void GetInvSqrtQuantizedMultiplierExp(int32 input, int reverse_shift,
int32* output_inv_sqrt,
inline void GetInvSqrtQuantizedMultiplierExp(int32_t input, int reverse_shift,
int32_t* output_inv_sqrt,
int* output_shift) {
TFLITE_DCHECK_GE(input, 0);
if (input <= 1) {
@@ -565,7 +616,7 @@ inline void GetInvSqrtQuantizedMultiplierExp(int32 input, int reverse_shift,
++*output_shift;
}
const unsigned max_left_shift_bits =
CountLeadingZeros(static_cast<uint32>(input)) - 1;
CountLeadingZeros(static_cast<uint32_t>(input)) - 1;
const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2;
const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1;
*output_shift -= left_shift_bit_pairs;
@@ -577,8 +628,8 @@ inline void GetInvSqrtQuantizedMultiplierExp(int32 input, int reverse_shift,
using gemmlowp::SaturatingRoundingMultiplyByPOT;
// Using 3 integer bits gives us enough room for the internal arithmetic in
// this Newton-Raphson iteration.
using F3 = FixedPoint<int32, 3>;
using F0 = FixedPoint<int32, 0>;
using F3 = FixedPoint<int32_t, 3>;
using F0 = FixedPoint<int32_t, 0>;
const F3 fixedpoint_input = F3::FromRaw(input >> 1);
const F3 fixedpoint_half_input =
SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input);
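
For intuition about the Newton-Raphson iteration referenced above, here is a float sketch of the classical inverse-square-root update, presumably the kind of step the 3-integer-bit fixed-point arithmetic has to accommodate; it is an illustration, not the kernel's code.

#include <cstdio>

int main() {
  const float x = 0.64f;
  float y = 1.0f;  // initial guess for 1/sqrt(x)
  for (int i = 0; i < 4; ++i) {
    y = y * (1.5f - 0.5f * x * y * y);  // each step roughly doubles the accurate digits
  }
  printf("1/sqrt(%g) ~= %g (exact: 1.25)\n", x, y);
  return 0;
}
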
@@ -645,6 +696,13 @@ inline int SubscriptToIndex(const NdArrayDesc<5>& desc, int indexes[5]) {
indexes[4] * desc.strides[4];
}
inline int SubscriptToIndex(const NdArrayDesc<8>& desc, int indexes[8]) {
return indexes[0] * desc.strides[0] + indexes[1] * desc.strides[1] +
indexes[2] * desc.strides[2] + indexes[3] * desc.strides[3] +
indexes[4] * desc.strides[4] + indexes[5] * desc.strides[5] +
indexes[6] * desc.strides[6] + indexes[7] * desc.strides[7];
}
// Given the dimensions of the operands for an element-wise binary broadcast,
// adjusts them so that they can be directly iterated over with simple loops.
// Returns the adjusted dims as instances of NdArrayDesc in 'desc0_out' and

View File

@@ -76,13 +76,15 @@ limitations under the License.
#define TFLITE_CHECK_LT(x, y) ((x) < (y)) ? (void)0 : TFLITE_ABORT
#endif
// TODO(ahentz): Clean up.
#ifndef TF_LITE_STATIC_MEMORY
// TODO(b/162019032): Consider removing these type-aliases.
using int8 = std::int8_t;
using uint8 = std::uint8_t;
using int16 = std::int16_t;
using uint16 = std::uint16_t;
using int32 = std::int32_t;
using uint32 = std::uint32_t;
#endif // !defined(TF_LITE_STATIC_MEMORY)
// TFLITE_DEPRECATED()
//

View File

@@ -19,8 +19,9 @@ limitations under the License.
namespace tflite {
#if defined(TF_LITE_USE_GLOBAL_CMATH_FUNCTIONS) || \
(defined(__ANDROID__) && !defined(__NDK_MAJOR__)) || defined(ARDUINO)
#if defined(TF_LITE_USE_GLOBAL_CMATH_FUNCTIONS) || \
(defined(__ANDROID__) && !defined(__NDK_MAJOR__)) || defined(ARDUINO) || \
defined(__ZEPHYR__)
#define TF_LITE_GLOBAL_STD_PREFIX
#else
#define TF_LITE_GLOBAL_STD_PREFIX std

View File

@@ -0,0 +1,35 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_MAX_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_MAX_H_
#include <cmath>
namespace tflite {
#if defined(TF_LITE_USE_GLOBAL_MAX) || defined(__ZEPHYR__)
inline float TfLiteMax(const float& x, const float& y) {
return std::max(x, y);
}
#else
template <class T>
inline T TfLiteMax(const T& x, const T& y) {
return std::fmax(x, y);
}
#endif
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_MAX_H_

View File

@@ -0,0 +1,35 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_MIN_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_MIN_H_
#include <cmath>
namespace tflite {
#if defined(TF_LITE_USE_GLOBAL_MIN) || defined(__ZEPHYR__)
inline float TfLiteMin(const float& x, const float& y) {
return std::min(x, y);
}
#else
template <class T>
inline T TfLiteMin(const T& x, const T& y) {
return std::fmin(x, y);
}
#endif
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_MIN_H_

View File

@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_H_
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_PORTABLE_TENSOR_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_PORTABLE_TENSOR_H_
#include <complex>
#include <vector>
@@ -21,7 +21,6 @@ limitations under the License.
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/internal/types.h"
#include "tensorflow/lite/string_util.h"
namespace tflite {
@@ -76,12 +75,12 @@ class VectorOfTensors {
// A list of quantized tensors in a format that can be used by kernels like
// split and concatenation.
class VectorOfQuantizedTensors : public VectorOfTensors<uint8> {
class VectorOfQuantizedTensors : public VectorOfTensors<uint8_t> {
public:
// Build with the tensors in 'tensor_list'.
VectorOfQuantizedTensors(const TfLiteContext& context,
const TfLiteIntArray& tensor_list)
: VectorOfTensors<uint8>(context, tensor_list) {
: VectorOfTensors<uint8_t>(context, tensor_list) {
for (int i = 0; i < tensor_list.size; ++i) {
TfLiteTensor* t = &context.tensors[tensor_list.data[i]];
zero_point_.push_back(t->params.zero_point);
@@ -90,10 +89,10 @@ class VectorOfQuantizedTensors : public VectorOfTensors<uint8> {
}
const float* scale() const { return scale_.data(); }
const int32* zero_point() const { return zero_point_.data(); }
const int32_t* zero_point() const { return zero_point_.data(); }
private:
std::vector<int32> zero_point_;
std::vector<int32_t> zero_point_;
std::vector<float> scale_;
};
@@ -119,26 +118,6 @@ class SequentialTensorWriter {
T* output_ptr_;
};
template <>
class SequentialTensorWriter<string> {
public:
SequentialTensorWriter(const TfLiteTensor* input, TfLiteTensor* output)
: input_(input), output_(output) {}
~SequentialTensorWriter() { buffer_.WriteToTensor(output_, nullptr); }
void Write(int position) { this->WriteN(position, 1); }
void WriteN(int position, int len) {
for (int i = 0; i < len; i++) {
buffer_.AddString(GetString(input_, position + i));
}
}
private:
const TfLiteTensor* input_;
TfLiteTensor* output_;
DynamicBuffer buffer_;
};
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_H_
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_PORTABLE_TENSOR_H_

View File

@@ -342,13 +342,13 @@ void NudgeQuantizationRange(const float min, const float max,
const float quant_max_float = static_cast<float>(quant_max);
*nudged_scale = (max - min) / (quant_max_float - quant_min_float);
const float zero_point_from_min = quant_min_float - min / *nudged_scale;
uint16 nudged_zero_point;
uint16_t nudged_zero_point;
if (zero_point_from_min < quant_min_float) {
nudged_zero_point = static_cast<uint16>(quant_min);
nudged_zero_point = static_cast<uint16_t>(quant_min);
} else if (zero_point_from_min > quant_max_float) {
nudged_zero_point = static_cast<uint16>(quant_max);
nudged_zero_point = static_cast<uint16_t>(quant_max);
} else {
nudged_zero_point = static_cast<uint16>(TfLiteRound(zero_point_from_min));
nudged_zero_point = static_cast<uint16_t>(TfLiteRound(zero_point_from_min));
}
*nudged_min = (quant_min_float - nudged_zero_point) * (*nudged_scale);
*nudged_max = (quant_max_float - nudged_zero_point) * (*nudged_scale);
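
A worked example of the nudging above, re-derived standalone rather than by calling NudgeQuantizationRange: quantizing the real range [-1, 2] to uint8 [0, 255] gives scale = 3/255 and a zero point of exactly 85, so the nudged range reproduces [-1, 2].

#include <cmath>
#include <cstdio>

int main() {
  const float min = -1.0f, max = 2.0f;
  const float quant_min = 0.0f, quant_max = 255.0f;
  const float scale = (max - min) / (quant_max - quant_min);   // ~0.0117647
  const float zero_point_from_min = quant_min - min / scale;   // 85.0
  const int zero_point = static_cast<int>(std::round(zero_point_from_min));
  printf("scale=%f zero_point=%d nudged=[%f, %f]\n", scale, zero_point,
         (quant_min - zero_point) * scale, (quant_max - zero_point) * scale);
  return 0;
}
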

View File

@@ -51,34 +51,39 @@ inline void Add(const ArithmeticParams& params,
// Element-wise add that can often be used for inner loop of broadcast add as
// well as the non-broadcast add.
// This function is used for 8-bit as well as for 16-bit, but the accumulator
// is 32-bit for both cases. The overflow does not happen due to the
// choice of the shift (20 or 15, accordingly - see add.cc for more comments).
template <typename T>
inline void AddElementwise(int size, const ArithmeticParams& params,
const uint8* input1_data, const uint8* input2_data,
uint8* output_data) {
TFLITE_DCHECK_GT(params.input1_offset, -256);
TFLITE_DCHECK_GT(params.input2_offset, -256);
TFLITE_DCHECK_LT(params.input1_offset, 256);
TFLITE_DCHECK_LT(params.input2_offset, 256);
const T* input1_data, const T* input2_data,
T* output_data) {
TFLITE_DCHECK_GT(params.input1_offset, -std::numeric_limits<T>::max());
TFLITE_DCHECK_GT(params.input2_offset, -std::numeric_limits<T>::max());
TFLITE_DCHECK_LT(params.input1_offset, std::numeric_limits<T>::max());
TFLITE_DCHECK_LT(params.input2_offset, std::numeric_limits<T>::max());
for (int i = 0; i < size; ++i) {
const int32 input1_val = params.input1_offset + input1_data[i];
const int32 input2_val = params.input2_offset + input2_data[i];
const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
const int32 scaled_input1_val =
const int32_t input1_val = params.input1_offset + input1_data[i];
const int32_t input2_val = params.input2_offset + input2_data[i];
const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
const int32_t scaled_input1_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input1_val, params.input1_multiplier, params.input1_shift);
const int32 scaled_input2_val =
const int32_t scaled_input2_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input2_val, params.input2_multiplier, params.input2_shift);
const int32 raw_sum = scaled_input1_val + scaled_input2_val;
const int32 raw_output =
const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
const int32_t raw_output =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
raw_sum, params.output_multiplier, params.output_shift) +
params.output_offset;
const int32 clamped_output =
const int32_t clamped_output =
std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, raw_output));
output_data[i] = static_cast<uint8>(clamped_output);
output_data[i] = static_cast<T>(clamped_output);
}
}
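
For intuition, the integer-only AddElementwise above computes, up to rounding, the same thing as the float recipe sketched below: dequantize both inputs, add, and requantize to the output scale. The kernel folds the scales into fixed-point multipliers and applies the left_shift of 20 (8-bit) or 15 (16-bit) mentioned in the comment so the 32-bit accumulator cannot overflow. The scales and zero points here are arbitrary example values.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  const float scale1 = 0.05f, scale2 = 0.02f, scale_out = 0.06f;  // example scales
  const int zp1 = 10, zp2 = -3, zp_out = 0;                       // example zero points
  const uint8_t q1 = 130, q2 = 47;
  const float real_sum = scale1 * (q1 - zp1) + scale2 * (q2 - zp2);  // 7.0
  const int32_t raw_output =
      static_cast<int32_t>(std::round(real_sum / scale_out)) + zp_out;
  const int32_t clamped = std::min(255, std::max(0, raw_output));
  printf("real sum = %f -> quantized output = %d\n", real_sum, clamped);  // 117
  return 0;
}
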
@@ -86,40 +91,40 @@ inline void AddElementwise(int size, const ArithmeticParams& params,
// broadcast add, so that, for example, scalar-broadcast with batch will still
// be fast.
inline void AddScalarBroadcast(int size, const ArithmeticParams& params,
uint8 input1_data, const uint8* input2_data,
uint8* output_data) {
uint8_t input1_data, const uint8_t* input2_data,
uint8_t* output_data) {
TFLITE_DCHECK_GT(params.input1_offset, -256);
TFLITE_DCHECK_GT(params.input2_offset, -256);
TFLITE_DCHECK_LT(params.input1_offset, 256);
TFLITE_DCHECK_LT(params.input2_offset, 256);
const int32 input1_val = params.input1_offset + input1_data;
const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
const int32 scaled_input1_val =
const int32_t input1_val = params.input1_offset + input1_data;
const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
const int32_t scaled_input1_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input1_val, params.input1_multiplier, params.input1_shift);
for (int i = 0; i < size; ++i) {
const int32 input2_val = params.input2_offset + input2_data[i];
const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
const int32 scaled_input2_val =
const int32_t input2_val = params.input2_offset + input2_data[i];
const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
const int32_t scaled_input2_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input2_val, params.input2_multiplier, params.input2_shift);
const int32 raw_sum = scaled_input1_val + scaled_input2_val;
const int32 raw_output =
const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
const int32_t raw_output =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
raw_sum, params.output_multiplier, params.output_shift) +
params.output_offset;
const int32 clamped_output =
const int32_t clamped_output =
std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, raw_output));
output_data[i] = static_cast<uint8>(clamped_output);
output_data[i] = static_cast<uint8_t>(clamped_output);
}
}
inline void Add(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const uint8* input1_data,
const RuntimeShape& input2_shape, const uint8* input2_data,
const RuntimeShape& output_shape, uint8* output_data) {
const RuntimeShape& input1_shape, const uint8_t* input1_data,
const RuntimeShape& input2_shape, const uint8_t* input2_data,
const RuntimeShape& output_shape, uint8_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
const int flat_size =
@@ -132,24 +137,53 @@ inline void Add(const ArithmeticParams& params,
AddElementwise(flat_size, params, input1_data, input2_data, output_data);
}
inline void AddGeneralParamScale(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const int16_t* input1_data,
const RuntimeShape& input2_shape,
const int16_t* input2_data,
const RuntimeShape& output_shape,
int16_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
int max_value = std::numeric_limits<int16_t>::max();
TFLITE_DCHECK_GT(params.input1_offset, -max_value);
TFLITE_DCHECK_GT(params.input2_offset, -max_value);
TFLITE_DCHECK_LT(params.input1_offset, max_value);
TFLITE_DCHECK_LT(params.input2_offset, max_value);
AddElementwise(flat_size, params, input1_data, input2_data, output_data);
}
inline void Add(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const int16* input1_data,
const RuntimeShape& input2_shape, const int16* input2_data,
const RuntimeShape& output_shape, int16* output_data) {
const RuntimeShape& input1_shape, const int16_t* input1_data,
const RuntimeShape& input2_shape, const int16_t* input2_data,
const RuntimeShape& output_shape, int16_t* output_data,
bool pot_scale = true) {
if (!pot_scale) {
AddGeneralParamScale(params, input1_shape, input1_data, input2_shape,
input2_data, output_shape, output_data);
return;
}
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
const int input1_shift = params.input1_shift;
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
const int16 output_activation_min = params.quantized_activation_min;
const int16 output_activation_max = params.quantized_activation_max;
const int16_t output_activation_min = params.quantized_activation_min;
const int16_t output_activation_max = params.quantized_activation_max;
TFLITE_DCHECK(input1_shift == 0 || params.input2_shift == 0);
TFLITE_DCHECK_LE(input1_shift, 0);
TFLITE_DCHECK_LE(params.input2_shift, 0);
const int16* not_shift_input = input1_shift == 0 ? input1_data : input2_data;
const int16* shift_input = input1_shift == 0 ? input2_data : input1_data;
const int16_t* not_shift_input =
input1_shift == 0 ? input1_data : input2_data;
const int16_t* shift_input = input1_shift == 0 ? input2_data : input1_data;
const int input_right_shift =
input1_shift == 0 ? -params.input2_shift : -input1_shift;
@@ -161,8 +195,8 @@ inline void Add(const ArithmeticParams& params,
F0 scaled_input = F0::FromRaw(
gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift));
F0 result = gemmlowp::SaturatingAdd(scaled_input, input_ready_scaled);
const int16 raw_output = result.raw();
const int16 clamped_output = std::min(
const int16_t raw_output = result.raw();
const int16_t clamped_output = std::min(
output_activation_max, std::max(output_activation_min, raw_output));
output_data[i] = clamped_output;
}
@@ -218,11 +252,11 @@ inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const int32* input1_data,
const int32_t* input1_data,
const RuntimeShape& input2_shape,
const int32* input2_data,
const int32_t* input2_data,
const RuntimeShape& output_shape,
int32* output_data) {
int32_t* output_data) {
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
@@ -257,13 +291,14 @@ inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
}
}
inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const uint8* input1_data,
const RuntimeShape& input2_shape,
const uint8* input2_data,
const RuntimeShape& output_shape,
uint8* output_data) {
// This function is used for 8-bit as well as for 16-bit, but the accumulator
// is 32-bit for both cases. The overflow does not happen due to the
// choice of the shift (20 or 15, accordingly - see add.cc for more comments).
template <typename T>
inline void BroadcastAdd4DSlow(
const ArithmeticParams& params, const RuntimeShape& input1_shape,
const T* input1_data, const RuntimeShape& input2_shape,
const T* input2_data, const RuntimeShape& output_shape, T* output_data) {
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
@@ -286,34 +321,34 @@ inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
const int32 input1_val =
const int32_t input1_val =
params.input1_offset +
input1_data[SubscriptToIndex(desc1, b, y, x, c)];
const int32 input2_val =
const int32_t input2_val =
params.input2_offset +
input2_data[SubscriptToIndex(desc2, b, y, x, c)];
const int32 shifted_input1_val =
const int32_t shifted_input1_val =
input1_val * (1 << params.left_shift);
const int32 shifted_input2_val =
const int32_t shifted_input2_val =
input2_val * (1 << params.left_shift);
const int32 scaled_input1_val =
const int32_t scaled_input1_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input1_val, params.input1_multiplier,
params.input1_shift);
const int32 scaled_input2_val =
const int32_t scaled_input2_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input2_val, params.input2_multiplier,
params.input2_shift);
const int32 raw_sum = scaled_input1_val + scaled_input2_val;
const int32 raw_output =
const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
const int32_t raw_output =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
raw_sum, params.output_multiplier, params.output_shift) +
params.output_offset;
const int32 clamped_output =
const int32_t clamped_output =
std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, raw_output));
output_data[Offset(extended_output_shape, b, y, x, c)] =
static_cast<uint8>(clamped_output);
static_cast<T>(clamped_output);
}
}
}
@@ -322,11 +357,11 @@ inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
const RuntimeShape& unswitched_input1_shape,
const uint8* unswitched_input1_data,
const uint8_t* unswitched_input1_data,
const RuntimeShape& unswitched_input2_shape,
const uint8* unswitched_input2_data,
const uint8_t* unswitched_input2_data,
const RuntimeShape& output_shape,
uint8* output_data) {
uint8_t* output_data) {
ArithmeticParams switched_params = unswitched_params;
switched_params.input1_offset = unswitched_params.input2_offset;
switched_params.input1_multiplier = unswitched_params.input2_multiplier;
@@ -341,18 +376,18 @@ inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
const ArithmeticParams& params =
use_unswitched ? unswitched_params : switched_params;
const uint8* input1_data =
const uint8_t* input1_data =
use_unswitched ? unswitched_input1_data : unswitched_input2_data;
const uint8* input2_data =
const uint8_t* input2_data =
use_unswitched ? unswitched_input2_data : unswitched_input1_data;
// Fivefold nested loops. The second input resets its position for each
// iteration of the second loop. The first input resets its position at the
// beginning of the fourth loop. The innermost loop is an elementwise add of
// sections of the arrays.
uint8* output_data_ptr = output_data;
const uint8* input1_data_ptr = input1_data;
const uint8* input2_data_reset = input2_data;
uint8_t* output_data_ptr = output_data;
const uint8_t* input1_data_ptr = input1_data;
const uint8_t* input2_data_reset = input2_data;
// In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared
// between input shapes. y3 for input 1 is always broadcast, and so the
// dimension there is 1, whereas optionally y1 might be broadcast for input 2.
@@ -368,7 +403,7 @@ inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
// General fivefold pattern, with y4 > 1 so there is a non-broadcast inner
// dimension.
for (int i0 = 0; i0 < y0; ++i0) {
const uint8* input2_data_ptr;
const uint8_t* input2_data_ptr;
for (int i1 = 0; i1 < y1; ++i1) {
input2_data_ptr = input2_data_reset;
for (int i2 = 0; i2 < y2; ++i2) {
@@ -397,7 +432,7 @@ inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
// for y4 == 1 and the loop over y3 is contained within the
// AddScalarBroadcast function.
for (int i0 = 0; i0 < y0; ++i0) {
const uint8* input2_data_ptr;
const uint8_t* input2_data_ptr;
for (int i1 = 0; i1 < y1; ++i1) {
input2_data_ptr = input2_data_reset;
for (int i2 = 0; i2 < y2; ++i2) {

View File

@@ -18,7 +18,6 @@ limitations under the License.
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/types.h"
#include "tensorflow/lite/string_util.h"
namespace tflite {
@@ -51,18 +50,6 @@ inline bool LessEqualFn(T lhs, T rhs) {
return lhs <= rhs;
}
inline bool StringRefEqualFn(const StringRef& lhs, const StringRef& rhs) {
if (lhs.len != rhs.len) return false;
for (int i = 0; i < lhs.len; ++i) {
if (lhs.str[i] != rhs.str[i]) return false;
}
return true;
}
inline bool StringRefNotEqualFn(const StringRef& lhs, const StringRef& rhs) {
return !StringRefEqualFn(lhs, rhs);
}
template <typename T>
using ComparisonFn = bool (*)(T, T);
@@ -78,22 +65,6 @@ inline void ComparisonImpl(
}
}
template <bool (*F)(const StringRef&, const StringRef&)>
inline void ComparisonStringImpl(const RuntimeShape& input1_shape,
const TfLiteTensor* input1,
const RuntimeShape& input2_shape,
const TfLiteTensor* input2,
const RuntimeShape& output_shape,
bool* output_data) {
const int64_t flatsize =
MatchingFlatSize(input1_shape, input2_shape, output_shape);
for (int64_t i = 0; i < flatsize; ++i) {
const auto lhs = GetString(input1, i);
const auto rhs = GetString(input2, i);
output_data[i] = F(lhs, rhs);
}
}
template <ComparisonFn<float> F>
inline void Comparison(const ComparisonParams& op_params,
const RuntimeShape& input1_shape,
@@ -105,30 +76,30 @@ inline void Comparison(const ComparisonParams& op_params,
input2_data, output_shape, output_data);
}
template <typename T, ComparisonFn<int32> F>
template <typename T, ComparisonFn<int32_t> F>
inline void ComparisonWithScaling(
const ComparisonParams& op_params, const RuntimeShape& input1_shape,
const T* input1_data, const RuntimeShape& input2_shape,
const T* input2_data, const RuntimeShape& output_shape, bool* output_data) {
int left_shift = op_params.left_shift;
int32 input1_offset = op_params.input1_offset;
int32 input1_multiplier = op_params.input1_multiplier;
int32_t input1_offset = op_params.input1_offset;
int32_t input1_multiplier = op_params.input1_multiplier;
int input1_shift = op_params.input1_shift;
int32 input2_offset = op_params.input2_offset;
int32 input2_multiplier = op_params.input2_multiplier;
int32_t input2_offset = op_params.input2_offset;
int32_t input2_multiplier = op_params.input2_multiplier;
int input2_shift = op_params.input2_shift;
const int64_t flatsize =
MatchingFlatSize(input1_shape, input2_shape, output_shape);
for (int64_t i = 0; i < flatsize; ++i) {
const int32 input1_val = input1_offset + input1_data[i];
const int32 input2_val = input2_offset + input2_data[i];
const int32 shifted_input1_val = input1_val * (1 << left_shift);
const int32 shifted_input2_val = input2_val * (1 << left_shift);
const int32 scaled_input1_val =
const int32_t input1_val = input1_offset + input1_data[i];
const int32_t input2_val = input2_offset + input2_data[i];
const int32_t shifted_input1_val = input1_val * (1 << left_shift);
const int32_t shifted_input2_val = input2_val * (1 << left_shift);
const int32_t scaled_input1_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input1_val, input1_multiplier, input1_shift);
const int32 scaled_input2_val =
const int32_t scaled_input2_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input2_val, input2_multiplier, input2_shift);
output_data[i] = F(scaled_input1_val, scaled_input2_val);
@@ -180,31 +151,6 @@ inline void BroadcastComparison4DSlowImpl(
}
}
template <bool (*F)(const StringRef&, const StringRef&)>
inline void BroadcastComparison4DSlowStringImpl(
const RuntimeShape& unextended_input1_shape, const TfLiteTensor* input1,
const RuntimeShape& unextended_input2_shape, const TfLiteTensor* input2,
const RuntimeShape& unextended_output_shape, bool* output_data) {
const BroadcastComparison4DSlowCommon dims =
BroadcastComparison4DSlowPreprocess(unextended_input1_shape,
unextended_input2_shape,
unextended_output_shape);
for (int b = 0; b < dims.output_shape.Dims(0); ++b) {
for (int y = 0; y < dims.output_shape.Dims(1); ++y) {
for (int x = 0; x < dims.output_shape.Dims(2); ++x) {
for (int c = 0; c < dims.output_shape.Dims(3); ++c) {
const auto lhs =
GetString(input1, SubscriptToIndex(dims.desc1, b, y, x, c));
const auto rhs =
GetString(input2, SubscriptToIndex(dims.desc2, b, y, x, c));
output_data[Offset(dims.output_shape, b, y, x, c)] = F(lhs, rhs);
}
}
}
}
}
template <ComparisonFn<float> F>
inline void BroadcastComparison4DSlow(const ComparisonParams& op_params,
const RuntimeShape& input1_shape,
@@ -218,7 +164,7 @@ inline void BroadcastComparison4DSlow(const ComparisonParams& op_params,
output_shape, output_data);
}
template <typename T, ComparisonFn<int32> F>
template <typename T, ComparisonFn<int32_t> F>
inline void BroadcastComparison4DSlowWithScaling(
const ComparisonParams& op_params,
const RuntimeShape& unextended_input1_shape, const T* input1_data,
@@ -230,29 +176,29 @@ inline void BroadcastComparison4DSlowWithScaling(
unextended_output_shape);
int left_shift = op_params.left_shift;
int32 input1_offset = op_params.input1_offset;
int32 input1_multiplier = op_params.input1_multiplier;
int32_t input1_offset = op_params.input1_offset;
int32_t input1_multiplier = op_params.input1_multiplier;
int input1_shift = op_params.input1_shift;
int32 input2_offset = op_params.input2_offset;
int32 input2_multiplier = op_params.input2_multiplier;
int32_t input2_offset = op_params.input2_offset;
int32_t input2_multiplier = op_params.input2_multiplier;
int input2_shift = op_params.input2_shift;
for (int b = 0; b < dims.output_shape.Dims(0); ++b) {
for (int y = 0; y < dims.output_shape.Dims(1); ++y) {
for (int x = 0; x < dims.output_shape.Dims(2); ++x) {
for (int c = 0; c < dims.output_shape.Dims(3); ++c) {
const int32 input1_val =
const int32_t input1_val =
input1_offset +
input1_data[SubscriptToIndex(dims.desc1, b, y, x, c)];
const int32 input2_val =
const int32_t input2_val =
input2_offset +
input2_data[SubscriptToIndex(dims.desc2, b, y, x, c)];
const int32 shifted_input1_val = input1_val * (1 << left_shift);
const int32 shifted_input2_val = input2_val * (1 << left_shift);
const int32 scaled_input1_val =
const int32_t shifted_input1_val = input1_val * (1 << left_shift);
const int32_t shifted_input2_val = input2_val * (1 << left_shift);
const int32_t scaled_input1_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input1_val, input1_multiplier, input1_shift);
const int32 scaled_input2_val =
const int32_t scaled_input2_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input2_val, input2_multiplier, input2_shift);
output_data[Offset(dims.output_shape, b, y, x, c)] =

View File

@@ -74,14 +74,14 @@ inline void Concatenation(const ConcatenationParams& params,
// when optimizing this routine further.
inline void ConcatenationWithScaling(const ConcatenationParams& params,
const RuntimeShape* const* input_shapes,
const uint8* const* input_data,
const uint8_t* const* input_data,
const RuntimeShape& output_shape,
uint8* output_data) {
uint8_t* output_data) {
int axis = params.axis;
const int32* input_zeropoint = params.input_zeropoint;
const int32_t* input_zeropoint = params.input_zeropoint;
const float* input_scale = params.input_scale;
int inputs_count = params.inputs_count;
const int32 output_zeropoint = params.output_zeropoint;
const int32_t output_zeropoint = params.output_zeropoint;
const float output_scale = params.output_scale;
const int concat_dimensions = output_shape.DimensionsCount();
@@ -110,11 +110,11 @@ inline void ConcatenationWithScaling(const ConcatenationParams& params,
}
const float inverse_output_scale = 1.f / output_scale;
uint8* output_ptr = output_data;
uint8_t* output_ptr = output_data;
for (int k = 0; k < outer_size; k++) {
for (int i = 0; i < inputs_count; ++i) {
const int copy_size = input_shapes[i]->Dims(axis) * base_inner_size;
const uint8* input_ptr = input_data[i] + k * copy_size;
const uint8_t* input_ptr = input_data[i] + k * copy_size;
if (input_zeropoint[i] == output_zeropoint &&
input_scale[i] == output_scale) {
memcpy(output_ptr, input_ptr, copy_size);

View File

@@ -59,28 +59,31 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
const int output_width = output_shape.Dims(2);
for (int batch = 0; batch < batches; ++batch) {
for (int out_y = 0; out_y < output_height; ++out_y) {
const int in_y_origin = (out_y * stride_height) - pad_height;
for (int out_x = 0; out_x < output_width; ++out_x) {
const int in_x_origin = (out_x * stride_width) - pad_width;
for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
const int in_x_origin = (out_x * stride_width) - pad_width;
const int in_y_origin = (out_y * stride_height) - pad_height;
float total = 0.f;
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
const int in_y = in_y_origin + dilation_height_factor * filter_y;
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
const int in_x = in_x_origin + dilation_width_factor * filter_x;
// Zero padding by omitting the areas outside the image.
const bool is_point_inside_image =
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
(in_y < input_height);
if (!is_point_inside_image) {
continue;
}
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
const int in_x = in_x_origin + dilation_width_factor * filter_x;
const int in_y =
in_y_origin + dilation_height_factor * filter_y;
// If the location is outside the bounds of the input image,
// use zero as a default value.
if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
(in_y < input_height)) {
float input_value = input_data[Offset(
input_shape, batch, in_y, in_x, in_channel)];
float filter_value =
filter_data[Offset(filter_shape, out_channel, filter_y,
filter_x, in_channel)];
total += (input_value * filter_value);
}
float input_value = input_data[Offset(input_shape, batch, in_y,
in_x, in_channel)];
float filter_value = filter_data[Offset(
filter_shape, out_channel, filter_y, filter_x, in_channel)];
total += (input_value * filter_value);
}
}
}
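
The padding handling above ("use zero as a default value" by skipping out-of-range taps) is easiest to see in one dimension. The standalone sketch below applies a 3-tap filter with a padding of 1; skipping out-of-bounds positions is arithmetically the same as padding the input with zeros.

#include <cstdio>

int main() {
  const float input[5] = {1, 2, 3, 4, 5};
  const float filter[3] = {0.25f, 0.5f, 0.25f};
  const int pad = 1;
  for (int out_x = 0; out_x < 5; ++out_x) {
    float total = 0.f;
    for (int k = 0; k < 3; ++k) {
      const int in_x = out_x - pad + k;
      if (in_x >= 0 && in_x < 5) total += input[in_x] * filter[k];  // skip = zero pad
    }
    printf("%g ", total);
  }
  printf("\n");
  return 0;
}
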
@@ -99,11 +102,11 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
}
inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
const uint8* input_data, const RuntimeShape& filter_shape,
const uint8* filter_data, const RuntimeShape& bias_shape,
const int32* bias_data, const RuntimeShape& output_shape,
uint8* output_data, const RuntimeShape& im2col_shape,
uint8* im2col_data, void* cpu_backend_context) {
const uint8_t* input_data, const RuntimeShape& filter_shape,
const uint8_t* filter_data, const RuntimeShape& bias_shape,
const int32_t* bias_data, const RuntimeShape& output_shape,
uint8_t* output_data, const RuntimeShape& im2col_shape,
uint8_t* im2col_data, void* cpu_backend_context) {
(void)cpu_backend_context; // only used in optimized code.
(void)im2col_data; // only used in optimized code.
(void)im2col_shape; // only used in optimized code.
@@ -113,13 +116,13 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
const int dilation_height_factor = params.dilation_height_factor;
const int pad_width = params.padding_values.width;
const int pad_height = params.padding_values.height;
const int32 input_offset = params.input_offset;
const int32 filter_offset = params.weights_offset;
const int32 output_offset = params.output_offset;
const int32 output_multiplier = params.output_multiplier;
const int32_t input_offset = params.input_offset;
const int32_t filter_offset = params.weights_offset;
const int32_t output_offset = params.output_offset;
const int32_t output_multiplier = params.output_multiplier;
const int output_shift = params.output_shift;
const int32 output_activation_min = params.quantized_activation_min;
const int32 output_activation_max = params.quantized_activation_max;
const int32_t output_activation_min = params.quantized_activation_min;
const int32_t output_activation_max = params.quantized_activation_max;
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
@@ -139,29 +142,32 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
const int output_width = output_shape.Dims(2);
for (int batch = 0; batch < batches; ++batch) {
for (int out_y = 0; out_y < output_height; ++out_y) {
const int in_y_origin = (out_y * stride_height) - pad_height;
for (int out_x = 0; out_x < output_width; ++out_x) {
const int in_x_origin = (out_x * stride_width) - pad_width;
for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
const int in_x_origin = (out_x * stride_width) - pad_width;
const int in_y_origin = (out_y * stride_height) - pad_height;
int32 acc = 0;
int32_t acc = 0;
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
const int in_y = in_y_origin + dilation_height_factor * filter_y;
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
const int in_x = in_x_origin + dilation_width_factor * filter_x;
// Zero padding by omitting the areas outside the image.
const bool is_point_inside_image =
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
(in_y < input_height);
if (!is_point_inside_image) {
continue;
}
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
const int in_x = in_x_origin + dilation_width_factor * filter_x;
const int in_y =
in_y_origin + dilation_height_factor * filter_y;
// If the location is outside the bounds of the input image,
// use zero as a default value.
if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
(in_y < input_height)) {
int32 input_val = input_data[Offset(input_shape, batch, in_y,
int32_t input_val = input_data[Offset(input_shape, batch, in_y,
in_x, in_channel)];
int32 filter_val =
filter_data[Offset(filter_shape, out_channel, filter_y,
filter_x, in_channel)];
acc +=
(filter_val + filter_offset) * (input_val + input_offset);
}
int32_t filter_val = filter_data[Offset(
filter_shape, out_channel, filter_y, filter_x, in_channel)];
acc +=
(filter_val + filter_offset) * (input_val + input_offset);
}
}
}
@@ -174,7 +180,7 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
acc = std::max(acc, output_activation_min);
acc = std::min(acc, output_activation_max);
output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
static_cast<uint8>(acc);
static_cast<uint8_t>(acc);
}
}
}
@@ -220,7 +226,7 @@ inline void HybridConvPerChannel(
for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
const int in_x_origin = (out_x * stride_width) - pad_width;
const int in_y_origin = (out_y * stride_height) - pad_height;
int32 acc = 0;
int32_t acc = 0;
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
@@ -231,9 +237,9 @@ inline void HybridConvPerChannel(
// use zero as a default value.
if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
(in_y < input_height)) {
int32 input_val = input_data[Offset(input_shape, batch, in_y,
in_x, in_channel)];
int32 filter_val =
int32_t input_val = input_data[Offset(
input_shape, batch, in_y, in_x, in_channel)];
int32_t filter_val =
filter_data[Offset(filter_shape, out_channel, filter_y,
filter_x, in_channel)];
acc += filter_val * (input_val - input_offset[batch]);
@@ -258,5 +264,4 @@ inline void HybridConvPerChannel(
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CONV_H_

View File

@@ -62,21 +62,21 @@ namespace reference_ops {
namespace depthwise_conv {
template <DepthwiseConvOutputRounding output_rounding>
inline int32 DepthwiseConvRound(int32 x, int32 quantized_multiplier,
int shift) {
inline int32_t DepthwiseConvRound(int32_t x, int32_t quantized_multiplier,
int shift) {
TFLITE_DCHECK_NE(output_rounding, DepthwiseConvOutputRounding::kNone);
return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
}
template <>
inline int32 DepthwiseConvRound<DepthwiseConvOutputRounding::kAwayFromZero>(
int32 x, int32 quantized_multiplier, int shift) {
inline int32_t DepthwiseConvRound<DepthwiseConvOutputRounding::kAwayFromZero>(
int32_t x, int32_t quantized_multiplier, int shift) {
return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
}
template <>
inline int32 DepthwiseConvRound<DepthwiseConvOutputRounding::kUpward>(
int32 x, int32 quantized_multiplier, int shift) {
inline int32_t DepthwiseConvRound<DepthwiseConvOutputRounding::kUpward>(
int32_t x, int32_t quantized_multiplier, int shift) {
using gemmlowp::SaturatingRoundingDoublingHighMul;
const int left_shift = shift > 0 ? shift : 0;
const int right_shift = shift > 0 ? 0 : -shift;
@@ -89,13 +89,12 @@ inline int32 DepthwiseConvRound<DepthwiseConvOutputRounding::kUpward>(
template <DepthwiseConvOutputRounding output_rounding>
struct DepthwiseConvBasicKernel {
static inline void Run(const DepthwiseParams& params,
const RuntimeShape& input_shape,
const uint8* input_data,
const RuntimeShape& filter_shape,
const uint8* filter_data,
const RuntimeShape& bias_shape, const int32* bias_data,
const RuntimeShape& output_shape, uint8* output_data) {
static inline void Run(
const DepthwiseParams& params, const RuntimeShape& input_shape,
const uint8_t* input_data, const RuntimeShape& filter_shape,
const uint8_t* filter_data, const RuntimeShape& bias_shape,
const int32_t* bias_data, const RuntimeShape& output_shape,
uint8_t* output_data) {
const int stride_width = params.stride_width;
const int stride_height = params.stride_height;
const int dilation_width_factor = params.dilation_width_factor;
@@ -103,12 +102,12 @@ struct DepthwiseConvBasicKernel {
const int pad_width = params.padding_values.width;
const int pad_height = params.padding_values.height;
const int depth_multiplier = params.depth_multiplier;
const int32 output_activation_min = params.quantized_activation_min;
const int32 output_activation_max = params.quantized_activation_max;
const int32 input_offset = params.input_offset;
const int32 filter_offset = params.weights_offset;
const int32 output_offset = params.output_offset;
const int32 output_multiplier = params.output_multiplier;
const int32_t output_activation_min = params.quantized_activation_min;
const int32_t output_activation_max = params.quantized_activation_max;
const int32_t input_offset = params.input_offset;
const int32_t filter_offset = params.weights_offset;
const int32_t output_offset = params.output_offset;
const int32_t output_multiplier = params.output_multiplier;
const int output_shift = params.output_shift;
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
@@ -135,7 +134,7 @@ struct DepthwiseConvBasicKernel {
const int oc = m + ic * depth_multiplier;
const int in_x_origin = (out_x * stride_width) - pad_width;
const int in_y_origin = (out_y * stride_height) - pad_height;
int32 acc = 0;
int32_t acc = 0;
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
const int in_x =
@@ -146,9 +145,9 @@ struct DepthwiseConvBasicKernel {
// use zero as a default value.
if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
(in_y < input_height)) {
int32 input_val =
int32_t input_val =
input_data[Offset(input_shape, b, in_y, in_x, ic)];
int32 filter_val = filter_data[Offset(
int32_t filter_val = filter_data[Offset(
filter_shape, 0, filter_y, filter_x, oc)];
acc += (filter_val + filter_offset) *
(input_val + input_offset);
@@ -164,7 +163,7 @@ struct DepthwiseConvBasicKernel {
acc = std::max(acc, output_activation_min);
acc = std::min(acc, output_activation_max);
output_data[Offset(output_shape, b, out_y, out_x, oc)] =
static_cast<uint8>(acc);
static_cast<uint8_t>(acc);
}
}
}
@@ -176,10 +175,10 @@ struct DepthwiseConvBasicKernel {
// MultiplyByQuantizedMultiplier or DepthwiseConvRound function.
static inline void RunPerChannel(
const DepthwiseParams& params, const RuntimeShape& input_shape,
const int8* input_data, const RuntimeShape& filter_shape,
const int8* filter_data, const RuntimeShape& bias_shape,
const int32* bias_data, const RuntimeShape& output_shape,
int8* output_data) {
const int8_t* input_data, const RuntimeShape& filter_shape,
const int8_t* filter_data, const RuntimeShape& bias_shape,
const int32_t* bias_data, const RuntimeShape& output_shape,
int8_t* output_data) {
// Get parameters.
// TODO(b/141565753): Re-introduce ScopedProfilingLabel on Micro.
const int stride_width = params.stride_width;
@@ -189,12 +188,12 @@ struct DepthwiseConvBasicKernel {
const int pad_width = params.padding_values.width;
const int pad_height = params.padding_values.height;
const int depth_multiplier = params.depth_multiplier;
const int32 input_offset = params.input_offset;
const int32 output_offset = params.output_offset;
const int32 output_activation_min = params.quantized_activation_min;
const int32 output_activation_max = params.quantized_activation_max;
const int32* output_multiplier = params.output_multiplier_per_channel;
const int32* output_shift = params.output_shift_per_channel;
const int32_t input_offset = params.input_offset;
const int32_t output_offset = params.output_offset;
const int32_t output_activation_min = params.quantized_activation_min;
const int32_t output_activation_max = params.quantized_activation_max;
const int32_t* output_multiplier = params.output_multiplier_per_channel;
const int32_t* output_shift = params.output_shift_per_channel;
// Check dimensions of the tensors.
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
@@ -222,7 +221,7 @@ struct DepthwiseConvBasicKernel {
const int output_channel = m + in_channel * depth_multiplier;
const int in_x_origin = (out_x * stride_width) - pad_width;
const int in_y_origin = (out_y * stride_height) - pad_height;
int32 acc = 0;
int32_t acc = 0;
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
const int in_x =
@@ -234,17 +233,18 @@ struct DepthwiseConvBasicKernel {
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
(in_y < input_height);
if (is_point_inside_image) {
int32 input_val = input_data[Offset(
int32_t input_val = input_data[Offset(
input_shape, batch, in_y, in_x, in_channel)];
int32 filter_val = filter_data[Offset(
int32_t filter_val = filter_data[Offset(
filter_shape, 0, filter_y, filter_x, output_channel)];
// Accumulate with 32 bits accumulator.
// In the nudging process during model quantization, we
// force real value of 0.0 be represented by a quantized
// value. This guarantees that the input_offset is a int8,
// even though it is represented using int32. int32 += int8
// * (int8 - int8) so the highest value we can get from each
// accumulation is [-127, 127] * ([-128, 127] -
// value. This guarantees that the input_offset is a int8_t,
// even though it is represented using int32_t. int32_t +=
// int8_t
// * (int8_t - int8_t) so the highest value we can get from
// each accumulation is [-127, 127] * ([-128, 127] -
// [-128, 127]), which is [-32512, 32512]. log2(32512)
// = 14.98, which means we can accumulate at least 2^16
// multiplications without overflow. The accumulator is
@@ -279,10 +279,10 @@ struct DepthwiseConvBasicKernel {
inline void DepthwiseConv(
const DepthwiseParams& params, const RuntimeShape& input_shape,
const uint8* input_data, const RuntimeShape& filter_shape,
const uint8* filter_data, const RuntimeShape& bias_shape,
const int32* bias_data, const RuntimeShape& output_shape,
uint8* output_data) {
const uint8_t* input_data, const RuntimeShape& filter_shape,
const uint8_t* filter_data, const RuntimeShape& bias_shape,
const int32_t* bias_data, const RuntimeShape& output_shape,
uint8_t* output_data) {
return depthwise_conv::DepthwiseConvBasicKernel<
DepthwiseConvOutputRounding::kAwayFromZero>::Run(params, input_shape,
input_data, filter_shape,

View File

@@ -32,12 +32,12 @@ inline void Dequantize(const tflite::DequantizationParams& op_params,
const RuntimeShape& input_shape,
const InputT* input_data,
const RuntimeShape& output_shape, OutputT* output_data) {
int32 zero_point = op_params.zero_point;
int32_t zero_point = op_params.zero_point;
const double scale = op_params.scale;
const int flat_size = MatchingFlatSize(input_shape, output_shape);
for (int i = 0; i < flat_size; i++) {
const int32 val = input_data[i];
const int32_t val = input_data[i];
const OutputT result = static_cast<OutputT>(scale * (val - zero_point));
output_data[i] = result;
}
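Dequantization is just the affine map real = scale * (quantized − zero_point). A tiny self-contained example with made-up parameters (scale 0.5, zero point −1):

#include <cstdint>
#include <cstdio>

int main() {
  const double scale = 0.5;        // illustrative values, not from a real model
  const int32_t zero_point = -1;
  const int8_t quantized[4] = {-128, -1, 0, 127};
  for (int8_t q : quantized) {
    const double real = scale * (static_cast<int32_t>(q) - zero_point);
    std::printf("q = %4d  ->  %.1f\n", q, real);   // -63.5, 0.0, 0.5, 64.0
  }
}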
@@ -52,11 +52,11 @@ inline void PerChannelDequantize(
// Ensure flat size is same.
MatchingFlatSize(input_shape, output_shape);
const int32* zero_point = op_params.zero_point;
const int32_t* zero_point = op_params.zero_point;
const float* scale = op_params.scale;
const int32 quantized_dimension = op_params.quantized_dimension;
const int32 num_dims = input_shape.DimensionsCount();
const int32* dims_data = input_shape.DimsData();
const int32_t quantized_dimension = op_params.quantized_dimension;
const int32_t num_dims = input_shape.DimensionsCount();
const int32_t* dims_data = input_shape.DimsData();
std::vector<int> current_dim(num_dims, 0);
do {
@@ -64,7 +64,7 @@ inline void PerChannelDequantize(
ReducedOutputOffset(num_dims, reinterpret_cast<const int*>(dims_data),
current_dim.data(), 0, nullptr);
const int channel = current_dim[quantized_dimension];
const int32 val = input_data[offset];
const int32_t val = input_data[offset];
const float result =
static_cast<float>(scale[channel] * (val - zero_point[channel]));
output_data[offset] = result;

View File

@@ -61,17 +61,17 @@ inline void FullyConnected(
inline void FullyConnected(
const FullyConnectedParams& params, const RuntimeShape& input_shape,
const uint8* input_data, const RuntimeShape& filter_shape,
const uint8* filter_data, const RuntimeShape& bias_shape,
const int32* bias_data, const RuntimeShape& output_shape,
uint8* output_data) {
const int32 input_offset = params.input_offset;
const int32 filter_offset = params.weights_offset;
const int32 output_offset = params.output_offset;
const int32 output_multiplier = params.output_multiplier;
const uint8_t* input_data, const RuntimeShape& filter_shape,
const uint8_t* filter_data, const RuntimeShape& bias_shape,
const int32_t* bias_data, const RuntimeShape& output_shape,
uint8_t* output_data) {
const int32_t input_offset = params.input_offset;
const int32_t filter_offset = params.weights_offset;
const int32_t output_offset = params.output_offset;
const int32_t output_multiplier = params.output_multiplier;
const int output_shift = params.output_shift;
const int32 output_activation_min = params.quantized_activation_min;
const int32 output_activation_max = params.quantized_activation_max;
const int32_t output_activation_min = params.quantized_activation_min;
const int32_t output_activation_max = params.quantized_activation_max;
TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
@@ -89,10 +89,10 @@ inline void FullyConnected(
const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
for (int b = 0; b < batches; ++b) {
for (int out_c = 0; out_c < output_depth; ++out_c) {
int32 acc = 0;
int32_t acc = 0;
for (int d = 0; d < accum_depth; ++d) {
int32 input_val = input_data[b * accum_depth + d];
int32 filter_val = filter_data[out_c * accum_depth + d];
int32_t input_val = input_data[b * accum_depth + d];
int32_t filter_val = filter_data[out_c * accum_depth + d];
acc += (filter_val + filter_offset) * (input_val + input_offset);
}
if (bias_data) {
@@ -102,24 +102,24 @@ inline void FullyConnected(
acc += output_offset;
acc = std::max(acc, output_activation_min);
acc = std::min(acc, output_activation_max);
output_data[out_c + output_depth * b] = static_cast<uint8>(acc);
output_data[out_c + output_depth * b] = static_cast<uint8_t>(acc);
}
}
}
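The inner loop above shifts both operands by their offsets (the negated zero points) before multiplying, so the int32_t accumulator ends up equal to the dot product of the dequantized values divided by the product of the two scales. A toy 3-element version with illustrative offsets:

#include <cstdint>
#include <cstdio>

int main() {
  const uint8_t input[3]  = {130, 125, 128};
  const uint8_t filter[3] = {120, 140, 128};
  const int32_t input_offset  = -128;   // zero point 128, illustrative
  const int32_t filter_offset = -128;
  int32_t acc = 0;
  for (int d = 0; d < 3; ++d) {
    acc += (filter[d] + filter_offset) * (input[d] + input_offset);
  }
  std::printf("acc = %d\n", acc);       // (-8)*2 + 12*(-3) + 0*0 = -52
}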
inline void FullyConnected(
const FullyConnectedParams& params, const RuntimeShape& input_shape,
const uint8* input_data, const RuntimeShape& filter_shape,
const uint8* filter_data, const RuntimeShape& bias_shape,
const int32* bias_data, const RuntimeShape& output_shape,
int16* output_data) {
const int32 input_offset = params.input_offset;
const int32 filter_offset = params.weights_offset;
const int32 output_offset = params.output_offset;
const int32 output_multiplier = params.output_multiplier;
const uint8_t* input_data, const RuntimeShape& filter_shape,
const uint8_t* filter_data, const RuntimeShape& bias_shape,
const int32_t* bias_data, const RuntimeShape& output_shape,
int16_t* output_data) {
const int32_t input_offset = params.input_offset;
const int32_t filter_offset = params.weights_offset;
const int32_t output_offset = params.output_offset;
const int32_t output_multiplier = params.output_multiplier;
const int output_shift = params.output_shift;
const int32 output_activation_min = params.quantized_activation_min;
const int32 output_activation_max = params.quantized_activation_max;
const int32_t output_activation_min = params.quantized_activation_min;
const int32_t output_activation_max = params.quantized_activation_max;
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
TFLITE_DCHECK_EQ(output_offset, 0);
@@ -138,20 +138,21 @@ inline void FullyConnected(
for (int out_c = 0; out_c < output_depth; ++out_c) {
// Internal accumulation.
// Initialize accumulator with the bias-value.
int32 accum = bias_data[out_c];
int32_t accum = bias_data[out_c];
// Accumulation loop.
for (int d = 0; d < accum_depth; ++d) {
int16 input_val = input_data[b * accum_depth + d] + input_offset;
int16 filter_val = filter_data[out_c * accum_depth + d] + filter_offset;
int16_t input_val = input_data[b * accum_depth + d] + input_offset;
int16_t filter_val =
filter_data[out_c * accum_depth + d] + filter_offset;
accum += filter_val * input_val;
}
// Down-scale the final int32 accumulator to the scale used by our
// Down-scale the final int32_t accumulator to the scale used by our
// (16-bit, typically 3 integer bits) fixed-point format. The quantized
// multiplier and shift here have been pre-computed offline
// (e.g. by toco).
accum =
MultiplyByQuantizedMultiplier(accum, output_multiplier, output_shift);
// Saturate, cast to int16, and store to output array.
// Saturate, cast to int16_t, and store to output array.
accum = std::max(accum, output_activation_min - output_offset);
accum = std::min(accum, output_activation_max - output_offset);
accum += output_offset;
@@ -162,14 +163,14 @@ inline void FullyConnected(
inline void ShuffledFullyConnected(
const FullyConnectedParams& params, const RuntimeShape& input_shape,
const uint8* input_data, const RuntimeShape& weights_shape,
const uint8* shuffled_weights_data, const RuntimeShape& bias_shape,
const int32* bias_data, const RuntimeShape& output_shape,
int16* output_data, uint8* shuffled_input_workspace_data) {
const int32 output_multiplier = params.output_multiplier;
const uint8_t* input_data, const RuntimeShape& weights_shape,
const uint8_t* shuffled_weights_data, const RuntimeShape& bias_shape,
const int32_t* bias_data, const RuntimeShape& output_shape,
int16_t* output_data, uint8_t* shuffled_input_workspace_data) {
const int32_t output_multiplier = params.output_multiplier;
const int output_shift = params.output_shift;
const int32 output_activation_min = params.quantized_activation_min;
const int32 output_activation_max = params.quantized_activation_max;
const int32_t output_activation_min = params.quantized_activation_min;
const int32_t output_activation_max = params.quantized_activation_max;
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1);
@@ -190,7 +191,7 @@ inline void ShuffledFullyConnected(
TFLITE_DCHECK((output_depth % 4) == 0);
// Shuffling and xoring of input activations into the workspace buffer
uint8* shuffled_input_workspace_ptr = shuffled_input_workspace_data;
uint8_t* shuffled_input_workspace_ptr = shuffled_input_workspace_data;
if (batches == 1) {
for (int i = 0; i < accum_depth; i++) {
shuffled_input_workspace_data[i] = input_data[i] ^ 0x80;
@@ -198,13 +199,13 @@ inline void ShuffledFullyConnected(
} else if (batches == 4) {
for (int c = 0; c < accum_depth; c += 16) {
for (int b = 0; b < 4; b++) {
const uint8* src_data_ptr = input_data + b * accum_depth + c;
const uint8_t* src_data_ptr = input_data + b * accum_depth + c;
for (int j = 0; j < 16; j++) {
uint8 src_val = *src_data_ptr++;
uint8_t src_val = *src_data_ptr++;
// Flip the sign bit, so that the kernel will only need to
// reinterpret these uint8 values as int8, getting for free the
// reinterpret these uint8_t values as int8_t, getting for free the
// subtraction of the zero_point value 128.
uint8 dst_val = src_val ^ 0x80;
uint8_t dst_val = src_val ^ 0x80;
*shuffled_input_workspace_ptr++ = dst_val;
}
}
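The xor trick above relies on the identity that flipping the sign bit of a uint8_t and reinterpreting the result as int8_t subtracts 128 (two's complement assumed, as in the kernel itself). A small self-contained check:

#include <cstdint>
#include <cstdio>

int main() {
  for (int v = 0; v < 256; ++v) {
    const uint8_t u = static_cast<uint8_t>(v);
    const int8_t s = static_cast<int8_t>(u ^ 0x80);  // flip sign bit, reinterpret
    if (s != v - 128) std::printf("mismatch at %d\n", v);
  }
  std::printf("u ^ 0x80 reinterpreted as int8_t equals u - 128 for all 256 values\n");
}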
@@ -216,62 +217,62 @@ inline void ShuffledFullyConnected(
// Actual computation
if (batches == 1) {
int16* output_ptr = output_data;
int16_t* output_ptr = output_data;
// Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
// so that just reinterpreting them as int8 values is equivalent to
// so that just reinterpreting them as int8_t values is equivalent to
// subtracting 128 from them, thus implementing for free the subtraction of
// the zero_point value 128.
const int8* shuffled_weights_ptr =
reinterpret_cast<const int8*>(shuffled_weights_data);
const int8_t* shuffled_weights_ptr =
reinterpret_cast<const int8_t*>(shuffled_weights_data);
// Likewise, we preshuffled and pre-xored the input data above.
const int8* shuffled_input_data =
reinterpret_cast<const int8*>(shuffled_input_workspace_data);
const int8_t* shuffled_input_data =
reinterpret_cast<const int8_t*>(shuffled_input_workspace_data);
for (int c = 0; c < output_depth; c += 4) {
// Internal accumulation.
// Initialize accumulator with the bias-value.
int32 accum[4] = {0};
int32_t accum[4] = {0};
// Accumulation loop.
for (int d = 0; d < accum_depth; d += 16) {
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 16; j++) {
int8 input_val = shuffled_input_data[d + j];
int8 weights_val = *shuffled_weights_ptr++;
int8_t input_val = shuffled_input_data[d + j];
int8_t weights_val = *shuffled_weights_ptr++;
accum[i] += weights_val * input_val;
}
}
}
for (int i = 0; i < 4; i++) {
// Add bias value
int32 acc = accum[i] + bias_data[c + i];
// Down-scale the final int32 accumulator to the scale used by our
int32_t acc = accum[i] + bias_data[c + i];
// Down-scale the final int32_t accumulator to the scale used by our
// (16-bit, typically 3 integer bits) fixed-point format. The quantized
// multiplier and shift here have been pre-computed offline
// (e.g. by toco).
acc =
MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
// Saturate, cast to int16, and store to output array.
// Saturate, cast to int16_t, and store to output array.
acc = std::max(acc, output_activation_min);
acc = std::min(acc, output_activation_max);
output_ptr[c + i] = acc;
}
}
} else if (batches == 4) {
int16* output_ptr = output_data;
int16_t* output_ptr = output_data;
// Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
// so that just reinterpreting them as int8 values is equivalent to
// so that just reinterpreting them as int8_t values is equivalent to
// subtracting 128 from them, thus implementing for free the subtraction of
// the zero_point value 128.
const int8* shuffled_weights_ptr =
reinterpret_cast<const int8*>(shuffled_weights_data);
const int8_t* shuffled_weights_ptr =
reinterpret_cast<const int8_t*>(shuffled_weights_data);
// Likewise, we preshuffled and pre-xored the input data above.
const int8* shuffled_input_data =
reinterpret_cast<const int8*>(shuffled_input_workspace_data);
const int8_t* shuffled_input_data =
reinterpret_cast<const int8_t*>(shuffled_input_workspace_data);
for (int c = 0; c < output_depth; c += 4) {
const int8* shuffled_input_ptr = shuffled_input_data;
const int8_t* shuffled_input_ptr = shuffled_input_data;
// Accumulation loop.
// Internal accumulation.
// Initialize accumulator with the bias-value.
int32 accum[4][4];
int32_t accum[4][4];
for (int i = 0; i < 4; i++) {
for (int b = 0; b < 4; b++) {
accum[i][b] = 0;
@@ -281,8 +282,8 @@ inline void ShuffledFullyConnected(
for (int i = 0; i < 4; i++) {
for (int b = 0; b < 4; b++) {
for (int j = 0; j < 16; j++) {
int8 input_val = shuffled_input_ptr[16 * b + j];
int8 weights_val = shuffled_weights_ptr[16 * i + j];
int8_t input_val = shuffled_input_ptr[16 * b + j];
int8_t weights_val = shuffled_weights_ptr[16 * i + j];
accum[i][b] += weights_val * input_val;
}
}
@@ -293,14 +294,14 @@ inline void ShuffledFullyConnected(
for (int i = 0; i < 4; i++) {
for (int b = 0; b < 4; b++) {
// Add bias value
int32 acc = accum[i][b] + bias_data[c + i];
// Down-scale the final int32 accumulator to the scale used by our
int32_t acc = accum[i][b] + bias_data[c + i];
// Down-scale the final int32_t accumulator to the scale used by our
// (16-bit, typically 3 integer bits) fixed-point format. The
// quantized multiplier and shift here have been pre-computed offline
// (e.g. by toco).
acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
output_shift);
// Saturate, cast to int16, and store to output array.
// Saturate, cast to int16_t, and store to output array.
acc = std::max(acc, output_activation_min);
acc = std::min(acc, output_activation_max);
output_ptr[b * output_depth + c + i] = acc;
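The down-scaling in both branches goes through MultiplyByQuantizedMultiplier, which applies a Q31 fixed-point multiplier followed by a power-of-two shift; together the pair encodes an arbitrary positive real rescale factor. A sketch of that decomposition, roughly what TFLite's offline quantization step produces (values illustrative, no edge-case handling):

#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  const double real_multiplier = 0.0008;  // e.g. input_scale * filter_scale / output_scale
  int shift;
  const double m = std::frexp(real_multiplier, &shift);  // real = m * 2^shift, m in [0.5, 1)
  const int32_t quantized_multiplier =
      static_cast<int32_t>(std::round(m * (1ll << 31)));
  std::printf("multiplier = %d, shift = %d\n", quantized_multiplier, shift);
  // What MultiplyByQuantizedMultiplier effectively multiplies by:
  std::printf("reconstructed = %.9f\n",
              quantized_multiplier / std::pow(2.0, 31 - shift));
}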

View File

@@ -0,0 +1,166 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ACTIVATIONS_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ACTIVATIONS_H_
#include "ruy/profiler/instrumentation.h" // from @ruy
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
inline int16_t SaturatingLeftShift(int16_t value, int amount) {
int32_t result = static_cast<int32_t>(value) * (1 << amount);
result = std::min<int32_t>(result, std::numeric_limits<int16_t>::max());
result = std::max<int32_t>(result, std::numeric_limits<int16_t>::min());
return result;
}
// Similar to ARM instruction SQDMULH.
// Similar to gemmlowp::SaturatingRoundingDoublingHighMul except
// rounding to zero instead of to nearest (SQRDMULH).
inline std::int16_t SaturatingDoublingHighMul(std::int16_t a, std::int16_t b) {
bool overflow = a == b && a == std::numeric_limits<std::int16_t>::min();
std::int32_t a_32(a);
std::int32_t b_32(b);
std::int32_t ab_32 = a_32 * b_32;
std::int16_t ab_x2_high16 = static_cast<std::int16_t>((ab_32) / (1 << 15));
return overflow ? std::numeric_limits<std::int16_t>::max() : ab_x2_high16;
}
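The only difference from SaturatingRoundingDoublingHighMul is the rounding of the discarded low 15 bits: this variant truncates toward zero. A Q15 spot check (illustrative values):

#include <cstdint>
#include <cstdio>

int main() {
  auto truncating_dhm = [](int16_t a, int16_t b) {
    return static_cast<int16_t>((static_cast<int32_t>(a) * b) / (1 << 15));
  };
  std::printf("%d\n", truncating_dhm(16384, 16384));  // 0.5 * 0.5 -> 8192 (0.25 in Q15)
  std::printf("%d\n", truncating_dhm(3, 16384));      // exact result 1.5 truncates to 1;
                                                      // the rounding variant gives 2
}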
template <typename T>
inline void HardSwish(const RuntimeShape& input_shape, const T* input_data,
const RuntimeShape& output_shape, T* output_data) {
ruy::profiler::ScopeLabel label("ReferenceHardSwish/Float");
auto matching_size = MatchingFlatSize(input_shape, output_shape);
const T* in_end = input_data + matching_size;
for (; input_data < in_end; input_data++, output_data++) {
const float in = *input_data;
*output_data =
in * std::min(static_cast<T>(6), std::max(static_cast<T>(0), in + 3)) /
6;
}
}
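The float reference above is hard_swish(x) = x * relu6(x + 3) / 6: zero for x <= -3, the identity for x >= +3, and a ramp in between. A few spot values under that formula:

#include <algorithm>
#include <cstdio>
#include <initializer_list>

int main() {
  auto hard_swish = [](float x) {
    return x * std::min(6.0f, std::max(0.0f, x + 3.0f)) / 6.0f;
  };
  for (float x : {-4.0f, -3.0f, -1.0f, 0.0f, 1.0f, 3.0f, 4.0f}) {
    std::printf("hard_swish(%+.1f) = %+.4f\n", x, hard_swish(x));
  }
  // -4 -> 0, -3 -> 0, -1 -> -0.3333, 0 -> 0, 1 -> 0.6667, 3 -> 3, 4 -> 4
}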
template <typename T>
inline void HardSwish(const HardSwishParams& params,
const RuntimeShape& input_shape, const T* input_data,
const RuntimeShape& output_shape, T* output_data) {
ruy::profiler::ScopeLabel label("ReferenceHardSwish/Quantized");
const int flat_size = MatchingFlatSize(input_shape, output_shape);
for (int i = 0; i < flat_size; i++) {
const int16_t input_value = input_data[i] - params.input_zero_point;
// Left-shift as much as we can without overflow/saturation to put
// significant bits in the high bits of our 16-bit fixedpoint values, so
// that fixed-point approximate computations below are as accurate as
// possible.
const int16_t input_value_on_hires_input_scale = input_value * (1 << 7);
// Compute the input value on essentially the output scale, just not
// right-shifted yet. This is the value that we'll use in the (x >= +3)
// case, and that in the general case we'll multiply against the "relu-ish"
// fixed-point multiplier in [0, 1].
const int16_t input_value_on_preshift_output_scale =
gemmlowp::SaturatingRoundingDoublingHighMul(
input_value_on_hires_input_scale,
params.output_multiplier_fixedpoint_int16);
// Now compute the "relu-ish multiplier". In the (-3 <= x <= +3) case, that
// is just an affine rescaling of x from [-3, 3] to [0, 1]. In the general
// case, it is just that plus saturation at the boundaries of [-3, 3].
// First, we rescale from [-3, 3] to [-1, 1], saturating.
// That is done by rescaling the input value with a fixed-point multiplier
// (reluish_multiplier_fixedpoint) and bit-shift such that we represent
// that input value on the scale where the real value 3.0f is represented
// by the quantized value 32768. (+32768 is actually not representable as
// int16_t, so this saturates at +32767, and that is seen empirically to be
// a negligible contribution to numerical error/bias).
//
// This code is careful to correctly implement any magnitude of multiplier,
// involving either a right shift or a left shift, with correct saturation
// behavior in the left-shift case. This forces this code to be more
// complicated, but is necessary for real applications: a partially
// trained quantized MobileNet v3-small model that motivated this code
// exhibits some large [min, max] range boundaries, of the order of
// magnitude of 10 or 100 depending on layers.
//
// The next few lines are basically just an ordinary
// MultiplyByQuantizedMultiplier, except that we are more careful here
// about the fine details of saturation when left-shifting, because here
// overflow in left-shift is a common case, not an anomaly as
// MultiplyByQuantizedMultiplier assumes.
int16_t reluish_value = input_value_on_hires_input_scale;
// Shift left, saturating, as much as we can while ensuring that this
// saturation will not contribute to the result. That is, left shift amount
// reduced by 1.
if (params.reluish_multiplier_exponent > 0) {
reluish_value = SaturatingLeftShift(
reluish_value, params.reluish_multiplier_exponent - 1);
}
// Apply the fixed-point multiplier, dividing the value by a divisor
// ranging in [1, 2].
reluish_value = gemmlowp::SaturatingRoundingDoublingHighMul(
reluish_value, params.reluish_multiplier_fixedpoint_int16);
// Apply the last bit of left-shift. Thus, in the left-shifting case, if
// any saturation affects the result, it is happening here --- any
// saturation having occurred above is overwritten here, not affecting the
// result.
if (params.reluish_multiplier_exponent > 0) {
reluish_value = SaturatingLeftShift(reluish_value, 1);
}
// Shift right, in the right-shifting case.
if (params.reluish_multiplier_exponent < 0) {
reluish_value = gemmlowp::RoundingDivideByPOT(
reluish_value, -params.reluish_multiplier_exponent);
}
// At this point we have rescaled the value into a 16bit fixedpoint
// reluish_value in [-1, 1].
// We now convert that to a 16bit fixedpoint value in [0, 1].
reluish_value = (reluish_value + (1 << 15)) >> 1;
// Use of SaturatingDoublingHighMul here is important to cancel the biases
// from the above SaturatingRoundingDoublingHighMul.
//
// On a partially trained MobileNet-v3-small,
//
// | bias on | ImageNet
// | quantized | Top-1
// Operation used here | values | accuracy (50k)
// --------------------------------------+------------+-----------
// SaturatingDoublingHighMul | -0.0024 | 58.920
// SaturatingRoundingDoublingHighMul | -0.0067 | 58.064
//
// In activations_test, this is covered by this testcase:
// QuantizedActivationsOpTest.HardSwishBias
//
const int16_t preshift_output_value = SaturatingDoublingHighMul(
reluish_value, input_value_on_preshift_output_scale);
// We were so far operating on the pre-shift output scale. Now we finally
// apply that output shift, arriving at the final output scale.
int16_t output_value = gemmlowp::RoundingDivideByPOT(
preshift_output_value, -params.output_multiplier_exponent);
output_value += params.output_zero_point;
output_value =
std::min<int16_t>(output_value, std::numeric_limits<T>::max());
output_value =
std::max<int16_t>(output_value, std::numeric_limits<T>::min());
output_data[i] = output_value;
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ACTIVATIONS_H_

View File

@@ -23,34 +23,41 @@ limitations under the License.
namespace tflite {
namespace reference_integer_ops {
inline void CheckArithmeticParams(const ArithmeticParams& params) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
// Input offset is negative input zero point. Activation tensors are
// asymmetric quantized so they span the full int8 range.
TFLITE_DCHECK_GE(-params.input1_offset, std::numeric_limits<int8_t>::min());
TFLITE_DCHECK_GE(-params.input2_offset, std::numeric_limits<int8_t>::min());
TFLITE_DCHECK_LE(-params.input1_offset, std::numeric_limits<int8_t>::max());
TFLITE_DCHECK_LE(-params.input2_offset, std::numeric_limits<int8_t>::max());
}
// Element-wise add that can often be used for inner loop of broadcast add as
// well as the non-broadcast add.
inline void AddElementwise(int size, const ArithmeticParams& params,
const int8_t* input1_data, const int8_t* input2_data,
int8_t* output_data) {
const int32_t int8_max_value = std::numeric_limits<int8_t>::max();
TFLITE_DCHECK_GE(params.input1_offset, -1 * int8_max_value);
TFLITE_DCHECK_GE(params.input2_offset, -1 * int8_max_value);
TFLITE_DCHECK_LE(params.input1_offset, int8_max_value);
TFLITE_DCHECK_LE(params.input2_offset, int8_max_value);
CheckArithmeticParams(params);
for (int i = 0; i < size; ++i) {
const int32 input1_val = params.input1_offset + input1_data[i];
const int32 input2_val = params.input2_offset + input2_data[i];
const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
const int32 scaled_input1_val =
const int32_t input1_val = params.input1_offset + input1_data[i];
const int32_t input2_val = params.input2_offset + input2_data[i];
const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
const int32_t scaled_input1_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input1_val, params.input1_multiplier, params.input1_shift);
const int32 scaled_input2_val =
const int32_t scaled_input2_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input2_val, params.input2_multiplier, params.input2_shift);
const int32 raw_sum = scaled_input1_val + scaled_input2_val;
const int32 raw_output =
const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
const int32_t raw_output =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
raw_sum, params.output_multiplier, params.output_shift) +
params.output_offset;
const int32 clamped_output =
const int32_t clamped_output =
std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, raw_output));
output_data[i] = static_cast<int8_t>(clamped_output);
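The add above brings both inputs onto a shared, left-shifted intermediate scale, sums them there, and then requantizes the sum to the output scale. What that fixed-point path approximates, written directly with doubles (scales and zero points are illustrative):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  const double s1 = 0.05, s2 = 0.1, s_out = 0.1;
  const int32_t z1 = 0, z2 = 0, z_out = 0;
  const int8_t q1 = 20, q2 = 25;
  const double real_sum = s1 * (q1 - z1) + s2 * (q2 - z2);            // 1.0 + 2.5 = 3.5
  int32_t q = static_cast<int32_t>(std::lround(real_sum / s_out)) + z_out;
  q = std::min(127, std::max(-128, q));                               // activation clamp
  std::printf("q_out = %d\n", q);                                     // 35
}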
@@ -61,16 +68,11 @@ inline void Add(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const int8_t* input1_data,
const RuntimeShape& input2_shape, const int8_t* input2_data,
const RuntimeShape& output_shape, int8_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
CheckArithmeticParams(params);
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
const int32_t int8_max_value = std::numeric_limits<int8_t>::max();
TFLITE_DCHECK_GE(params.input1_offset, -1 * int8_max_value);
TFLITE_DCHECK_GE(params.input2_offset, -1 * int8_max_value);
TFLITE_DCHECK_LE(params.input1_offset, int8_max_value);
TFLITE_DCHECK_LE(params.input2_offset, int8_max_value);
AddElementwise(flat_size, params, input1_data, input2_data, output_data);
}

View File

@@ -22,27 +22,27 @@ namespace reference_integer_ops {
// Fixed-point per-channel-quantization convolution reference kernel.
inline void ConvPerChannel(
const ConvParams& params, const int32* output_multiplier,
const int32* output_shift, const RuntimeShape& input_shape,
const int8* input_data, const RuntimeShape& filter_shape,
const int8* filter_data, const RuntimeShape& bias_shape,
const int32* bias_data, const RuntimeShape& output_shape,
int8* output_data) {
const ConvParams& params, const int32_t* output_multiplier,
const int32_t* output_shift, const RuntimeShape& input_shape,
const int8_t* input_data, const RuntimeShape& filter_shape,
const int8_t* filter_data, const RuntimeShape& bias_shape,
const int32_t* bias_data, const RuntimeShape& output_shape,
int8_t* output_data) {
// Get parameters.
const int32 input_offset = params.input_offset; // r = s(q - Z)
const int32_t input_offset = params.input_offset; // r = s(q - Z)
const int stride_width = params.stride_width;
const int stride_height = params.stride_height;
const int dilation_width_factor = params.dilation_width_factor;
const int dilation_height_factor = params.dilation_height_factor;
const int pad_width = params.padding_values.width;
const int pad_height = params.padding_values.height;
const int32 output_offset = params.output_offset;
const int32_t output_offset = params.output_offset;
// Set min and max value of the output.
const int32 output_activation_min = params.quantized_activation_min;
const int32 output_activation_max = params.quantized_activation_max;
const int32_t output_activation_min = params.quantized_activation_min;
const int32_t output_activation_max = params.quantized_activation_max;
// Sanity check.
// Consistency check.
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
@@ -63,45 +63,47 @@ inline void ConvPerChannel(
const int output_width = output_shape.Dims(2);
for (int batch = 0; batch < batches; ++batch) {
for (int out_y = 0; out_y < output_height; ++out_y) {
const int in_y_origin = (out_y * stride_height) - pad_height;
for (int out_x = 0; out_x < output_width; ++out_x) {
const int in_x_origin = (out_x * stride_width) - pad_width;
for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
const int in_x_origin = (out_x * stride_width) - pad_width;
const int in_y_origin = (out_y * stride_height) - pad_height;
int32 acc = 0;
int32_t acc = 0;
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
const int in_y = in_y_origin + dilation_height_factor * filter_y;
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
const int in_x = in_x_origin + dilation_width_factor * filter_x;
// Zero padding by omitting the areas outside the image.
const bool is_point_inside_image =
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
(in_y < input_height);
if (!is_point_inside_image) {
continue;
}
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
const int in_x = in_x_origin + dilation_width_factor * filter_x;
const int in_y =
in_y_origin + dilation_height_factor * filter_y;
// Zero padding by omitting the areas outside the image.
const bool is_point_inside_image =
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
(in_y < input_height);
if (is_point_inside_image) {
int32 input_val = input_data[Offset(input_shape, batch, in_y,
int32_t input_val = input_data[Offset(input_shape, batch, in_y,
in_x, in_channel)];
int32 filter_val =
filter_data[Offset(filter_shape, out_channel, filter_y,
filter_x, in_channel)];
// Accumulate with 32 bits accumulator.
// In the nudging process during model quantization, we force
// real value of 0.0 be represented by a quantized value. This
// guarantees that the input_offset is a int8, even though it
// is represented using int32.
// int32 += int8 * (int8 - int8) so the highest value we can
// get from each accumulation is [-127, 127] * ([-128, 127] -
// [-128, 127]), which is [-32512, 32512]. log2(32512)
// = 14.98, which means we can accumulate at least 2^16
// multiplications without overflow. The accumulator is
// applied to a filter so the accumulation logic will hold as
// long as the filter size (filter_y * filter_x * in_channel)
// does not exceed 2^16, which is the case in all the models
// we have seen so far.
// TODO(jianlijianli): Add a check to make sure the
// accumulator depth is smaller than 2^16.
acc += filter_val * (input_val + input_offset);
}
int32_t filter_val = filter_data[Offset(
filter_shape, out_channel, filter_y, filter_x, in_channel)];
// Accumulate with 32 bits accumulator.
// In the nudging process during model quantization, we force
// real value of 0.0 be represented by a quantized value. This
// guarantees that the input_offset is an int8_t, even though
// it is represented using int32_t. int32_t += int8_t *
// (int8_t - int8_t) so the highest value we can get from each
// accumulation is [-127, 127] * ([-128, 127] -
// [-128, 127]), which is [-32512, 32512]. log2(32512)
// = 14.98, which means we can accumulate at least 2^16
// multiplications without overflow. The accumulator is
// applied to a filter so the accumulation logic will hold as
// long as the filter size (filter_y * filter_x * in_channel)
// does not exceed 2^16, which is the case in all the models
// we have seen so far.
// TODO(jianlijianli): Add a check to make sure the
// accumulator depth is smaller than 2^16.
acc += filter_val * (input_val + input_offset);
}
}
}
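The headroom argument in the comment above can be checked mechanically: each product is at most 127 * 256 = 32512 in magnitude, so 2^16 of them still fit in an int32_t. As compile-time arithmetic (illustrative):

#include <cstdint>

constexpr int64_t kMaxPerProduct = 127 * 256;      // 32512, per the comment above
constexpr int64_t kInt32Max = (1LL << 31) - 1;     // 2147483647
static_assert(kMaxPerProduct * (1LL << 16) <= kInt32Max,
              "2^16 accumulations fit in an int32_t accumulator");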
@@ -125,12 +127,12 @@ inline void ConvPerChannel(
// Fixed-point per-channel-quantization convolution reference kernel.
// 16-bit data and 8-bit filter
inline void ConvPerChannel(
const ConvParams& params, const int32* output_multiplier,
const int32* output_shift, const RuntimeShape& input_shape,
const int16* input_data, const RuntimeShape& filter_shape,
const int8* filter_data, const RuntimeShape& bias_shape,
const ConvParams& params, const int32_t* output_multiplier,
const int32_t* output_shift, const RuntimeShape& input_shape,
const int16_t* input_data, const RuntimeShape& filter_shape,
const int8_t* filter_data, const RuntimeShape& bias_shape,
const std::int64_t* bias_data, const RuntimeShape& output_shape,
int16* output_data) {
int16_t* output_data) {
// Get parameters.
const int stride_width = params.stride_width;
const int stride_height = params.stride_height;
@@ -140,10 +142,10 @@ inline void ConvPerChannel(
const int pad_height = params.padding_values.height;
// Set min and max value of the output.
const int32 output_activation_min = params.quantized_activation_min;
const int32 output_activation_max = params.quantized_activation_max;
const int32_t output_activation_min = params.quantized_activation_min;
const int32_t output_activation_max = params.quantized_activation_max;
// Sanity check.
// Consistency check.
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
@@ -164,35 +166,37 @@ inline void ConvPerChannel(
const int output_width = output_shape.Dims(2);
for (int batch = 0; batch < batches; ++batch) {
for (int out_y = 0; out_y < output_height; ++out_y) {
const int in_y_origin = (out_y * stride_height) - pad_height;
for (int out_x = 0; out_x < output_width; ++out_x) {
const int in_x_origin = (out_x * stride_width) - pad_width;
for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
const int in_x_origin = (out_x * stride_width) - pad_width;
const int in_y_origin = (out_y * stride_height) - pad_height;
std::int64_t acc = 0;
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
const int in_y = in_y_origin + dilation_height_factor * filter_y;
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
const int in_x = in_x_origin + dilation_width_factor * filter_x;
// Zero padding by omitting the areas outside the image.
const bool is_point_inside_image =
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
(in_y < input_height);
if (!is_point_inside_image) {
continue;
}
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
const int in_x = in_x_origin + dilation_width_factor * filter_x;
const int in_y =
in_y_origin + dilation_height_factor * filter_y;
// Zero padding by omitting the areas outside the image.
const bool is_point_inside_image =
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
(in_y < input_height);
if (is_point_inside_image) {
int32 input_val = input_data[Offset(input_shape, batch, in_y,
int32_t input_val = input_data[Offset(input_shape, batch, in_y,
in_x, in_channel)];
int32 filter_val =
filter_data[Offset(filter_shape, out_channel, filter_y,
filter_x, in_channel)];
// Accumulate with 64 bits accumulator.
// int64 += int8 * int16 so the highest value we can
// get from each accumulation is [-127, 127] * ([-32768,
// 32767] -
// [-32768, 32767]), which is [-8322945, 8322945].
// log2(8322945) = 22.99.
acc += filter_val * input_val;
}
int32_t filter_val = filter_data[Offset(
filter_shape, out_channel, filter_y, filter_x, in_channel)];
// Accumulate with 64 bits accumulator.
// int64_t += int8_t * int16_t so the highest value we can
// get from each accumulation is [-127, 127] * ([-32768,
// 32767] -
// [-32768, 32767]), which is [-8322945, 8322945].
// log2(8322945) = 22.99.
acc += filter_val * input_val;
}
}
}
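For the 16-bit input variant each product can reach 8322945 (about 2^23) per the comment above, which is why the accumulator is 64-bit: even billions of accumulations stay far below the int64_t limit. As compile-time arithmetic (illustrative):

#include <cstdint>

constexpr int64_t kMaxPerProduct = 127LL * 65535;  // 8322945, per the comment above
static_assert(kMaxPerProduct * (1LL << 32) < (1LL << 62),
              "ample headroom in an int64_t accumulator");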

View File

@@ -20,12 +20,12 @@ limitations under the License.
namespace tflite {
namespace reference_integer_ops {
inline void DepthwiseConvPerChannel(
const DepthwiseParams& params, const int32* output_multiplier,
const int32* output_shift, const RuntimeShape& input_shape,
const int8* input_data, const RuntimeShape& filter_shape,
const int8* filter_data, const RuntimeShape& bias_shape,
const int32* bias_data, const RuntimeShape& output_shape,
int8* output_data) {
const DepthwiseParams& params, const int32_t* output_multiplier,
const int32_t* output_shift, const RuntimeShape& input_shape,
const int8_t* input_data, const RuntimeShape& filter_shape,
const int8_t* filter_data, const RuntimeShape& bias_shape,
const int32_t* bias_data, const RuntimeShape& output_shape,
int8_t* output_data) {
// Get parameters.
// TODO(b/141565753): Re-introduce ScopedProfilingLabel on Micro.
const int stride_width = params.stride_width;
@@ -35,10 +35,10 @@ inline void DepthwiseConvPerChannel(
const int pad_width = params.padding_values.width;
const int pad_height = params.padding_values.height;
const int depth_multiplier = params.depth_multiplier;
const int32 input_offset = params.input_offset;
const int32 output_offset = params.output_offset;
const int32 output_activation_min = params.quantized_activation_min;
const int32 output_activation_max = params.quantized_activation_max;
const int32_t input_offset = params.input_offset;
const int32_t output_offset = params.output_offset;
const int32_t output_activation_min = params.quantized_activation_min;
const int32_t output_activation_max = params.quantized_activation_max;
// Check dimensions of the tensors.
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
@@ -66,7 +66,7 @@ inline void DepthwiseConvPerChannel(
const int output_channel = m + in_channel * depth_multiplier;
const int in_x_origin = (out_x * stride_width) - pad_width;
const int in_y_origin = (out_y * stride_height) - pad_height;
int32 acc = 0;
int32_t acc = 0;
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
const int in_x = in_x_origin + dilation_width_factor * filter_x;
@@ -77,17 +77,17 @@ inline void DepthwiseConvPerChannel(
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
(in_y < input_height);
if (is_point_inside_image) {
int32 input_val = input_data[Offset(input_shape, batch, in_y,
in_x, in_channel)];
int32 filter_val = filter_data[Offset(
int32_t input_val = input_data[Offset(
input_shape, batch, in_y, in_x, in_channel)];
int32_t filter_val = filter_data[Offset(
filter_shape, 0, filter_y, filter_x, output_channel)];
// Accumulate with 32 bits accumulator.
// In the nudging process during model quantization, we force
// real value of 0.0 be represented by a quantized value. This
// guarantees that the input_offset is a int8, even though it
// is represented using int32.
// int32 += int8 * (int8 - int8) so the highest value we can
// get from each accumulation is [-127, 127] * ([-128, 127] -
// guarantees that the input_offset is an int8_t, even though
// it is represented using int32_t. int32_t += int8_t *
// (int8_t - int8_t) so the highest value we can get from each
// accumulation is [-127, 127] * ([-128, 127] -
// [-128, 127]), which is [-32512, 32512]. log2(32512)
// = 14.98, which means we can accumulate at least 2^16
// multiplications without overflow. The accumulator is
@@ -120,12 +120,12 @@ inline void DepthwiseConvPerChannel(
}
inline void DepthwiseConvPerChannel(
const DepthwiseParams& params, const int32* output_multiplier,
const int32* output_shift, const RuntimeShape& input_shape,
const int16* input_data, const RuntimeShape& filter_shape,
const int8* filter_data, const RuntimeShape& bias_shape,
const DepthwiseParams& params, const int32_t* output_multiplier,
const int32_t* output_shift, const RuntimeShape& input_shape,
const int16_t* input_data, const RuntimeShape& filter_shape,
const int8_t* filter_data, const RuntimeShape& bias_shape,
const std::int64_t* bias_data, const RuntimeShape& output_shape,
int16* output_data) {
int16_t* output_data) {
// Get parameters.
const int stride_width = params.stride_width;
const int stride_height = params.stride_height;
@@ -134,8 +134,8 @@ inline void DepthwiseConvPerChannel(
const int pad_width = params.padding_values.width;
const int pad_height = params.padding_values.height;
const int depth_multiplier = params.depth_multiplier;
const int32 output_activation_min = params.quantized_activation_min;
const int32 output_activation_max = params.quantized_activation_max;
const int32_t output_activation_min = params.quantized_activation_min;
const int32_t output_activation_max = params.quantized_activation_max;
// Check dimensions of the tensors.
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
@@ -174,9 +174,9 @@ inline void DepthwiseConvPerChannel(
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
(in_y < input_height);
if (is_point_inside_image) {
int32 input_val = input_data[Offset(input_shape, batch, in_y,
in_x, in_channel)];
int32 filter_val = filter_data[Offset(
int32_t input_val = input_data[Offset(
input_shape, batch, in_y, in_x, in_channel)];
int32_t filter_val = filter_data[Offset(
filter_shape, 0, filter_y, filter_x, output_channel)];
// Accumulate with 64 bits accumulator.
// We assume maximum of 2^16 accumulations as with the 8-bit
@@ -190,7 +190,7 @@ inline void DepthwiseConvPerChannel(
if (bias_data) {
acc += bias_data[output_channel];
}
int32 scaled_acc = MultiplyByQuantizedMultiplier(
int32_t scaled_acc = MultiplyByQuantizedMultiplier(
acc, output_multiplier[output_channel],
output_shift[output_channel]);
scaled_acc = std::max(scaled_acc, output_activation_min);
@@ -207,8 +207,8 @@ inline void DepthwiseConvPerChannel(
inline void DepthwiseConvHybridPerChannel(
const DepthwiseParams& params, float* scaling_factors_ptr,
const RuntimeShape& input_shape, const int8* input_data,
const RuntimeShape& filter_shape, const int8* filter_data,
const RuntimeShape& input_shape, const int8_t* input_data,
const RuntimeShape& filter_shape, const int8_t* filter_data,
const RuntimeShape& bias_shape, const float* bias_data,
const RuntimeShape& output_shape, float* output_data,
const float* per_channel_scale, int32_t* input_offset) {
@@ -247,7 +247,7 @@ inline void DepthwiseConvHybridPerChannel(
const int output_channel = m + in_channel * depth_multiplier;
const int in_x_origin = (out_x * stride_width) - pad_width;
const int in_y_origin = (out_y * stride_height) - pad_height;
int32 acc = 0;
int32_t acc = 0;
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
const int in_x = in_x_origin + dilation_width_factor * filter_x;
@@ -258,9 +258,9 @@ inline void DepthwiseConvHybridPerChannel(
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
(in_y < input_height);
if (is_point_inside_image) {
int32 input_val = input_data[Offset(input_shape, batch, in_y,
in_x, in_channel)];
int32 filter_val = filter_data[Offset(
int32_t input_val = input_data[Offset(
input_shape, batch, in_y, in_x, in_channel)];
int32_t filter_val = filter_data[Offset(
filter_shape, 0, filter_y, filter_x, output_channel)];
acc += filter_val * (input_val - input_offset[batch]);
}

View File

@@ -24,15 +24,15 @@ inline void FullyConnected(
const FullyConnectedParams& params, const RuntimeShape& input_shape,
const int8_t* input_data, const RuntimeShape& filter_shape,
const int8_t* filter_data, const RuntimeShape& bias_shape,
const int32* bias_data, const RuntimeShape& output_shape,
const int32_t* bias_data, const RuntimeShape& output_shape,
int8_t* output_data) {
const int32 input_offset = params.input_offset;
const int32 filter_offset = params.weights_offset;
const int32 output_offset = params.output_offset;
const int32 output_multiplier = params.output_multiplier;
const int32_t input_offset = params.input_offset;
const int32_t filter_offset = params.weights_offset;
const int32_t output_offset = params.output_offset;
const int32_t output_multiplier = params.output_multiplier;
const int output_shift = params.output_shift;
const int32 output_activation_min = params.quantized_activation_min;
const int32 output_activation_max = params.quantized_activation_max;
const int32_t output_activation_min = params.quantized_activation_min;
const int32_t output_activation_max = params.quantized_activation_max;
TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 2);
@@ -44,10 +44,10 @@ inline void FullyConnected(
const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
for (int b = 0; b < batches; ++b) {
for (int out_c = 0; out_c < output_depth; ++out_c) {
int32 acc = 0;
int32_t acc = 0;
for (int d = 0; d < accum_depth; ++d) {
int32 input_val = input_data[b * accum_depth + d];
int32 filter_val = filter_data[out_c * accum_depth + d];
int32_t input_val = input_data[b * accum_depth + d];
int32_t filter_val = filter_data[out_c * accum_depth + d];
acc += (filter_val + filter_offset) * (input_val + input_offset);
}
if (bias_data) {
@@ -68,11 +68,11 @@ inline void FullyConnected(
const int8_t* filter_data, const RuntimeShape& bias_shape,
const int64_t* bias_data, const RuntimeShape& output_shape,
int16_t* output_data) {
const int32 filter_offset = params.weights_offset;
const int32 output_multiplier = params.output_multiplier;
const int32_t filter_offset = params.weights_offset;
const int32_t output_multiplier = params.output_multiplier;
const int output_shift = params.output_shift;
const int32 output_activation_min = params.quantized_activation_min;
const int32 output_activation_max = params.quantized_activation_max;
const int32_t output_activation_min = params.quantized_activation_min;
const int32_t output_activation_max = params.quantized_activation_max;
TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 2);
@@ -86,8 +86,8 @@ inline void FullyConnected(
for (int out_c = 0; out_c < output_depth; ++out_c) {
int64_t acc = 0;
for (int d = 0; d < accum_depth; ++d) {
int32 input_val = input_data[b * accum_depth + d];
int32 filter_val = filter_data[out_c * accum_depth + d];
int32_t input_val = input_data[b * accum_depth + d];
int32_t filter_val = filter_data[out_c * accum_depth + d];
acc += (filter_val + filter_offset) * input_val;
}
if (bias_data) {

View File

@@ -21,8 +21,8 @@ namespace tflite {
namespace reference_integer_ops {
inline void L2Normalization(int32_t input_zero_point, int32_t outer_size,
int32_t depth, const int8* input_data,
int8* output_data) {
int32_t depth, const int8_t* input_data,
int8_t* output_data) {
static constexpr int8_t kMinInt8 = std::numeric_limits<int8_t>::min();
static constexpr int8_t kMaxInt8 = std::numeric_limits<int8_t>::max();
// The output scale must be in sync with Prepare().
@@ -30,7 +30,7 @@ inline void L2Normalization(int32_t input_zero_point, int32_t outer_size,
// to [-1, 127/128].
static constexpr int32_t kOutputScale = 7;
for (int outer_index = 0; outer_index < outer_size; ++outer_index) {
// int32 = (int8 - int8) ^ 2.
// int32_t = (int8_t - int8_t) ^ 2.
// ([-128, 127] - [-128, 127]) ^ 2 = [0, (2^8 - 1)^2] so the accumulator is
// safe from overflowing in at least 2^16 steps.
int32_t acc = 0;
@@ -55,7 +55,7 @@ inline void L2Normalization(int32_t input_zero_point, int32_t outer_size,
std::min(static_cast<int32_t>(kMaxInt8),
std::max(static_cast<int32_t>(kMinInt8), output_in_q24));
output_data[depth * outer_index + inner_index] =
static_cast<int8>(output_in_q24);
static_cast<int8_t>(output_in_q24);
}
}
}
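With the fixed output scale of 1/128 (kOutputScale = 7) and no output zero point, the kernel above is computing round(128 * x_i / ||x||_2) per element, clamped to int8_t. A float sketch of that interpretation (input values and zero point are illustrative):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  const int8_t input[4] = {10, -20, 30, 40};
  const int32_t input_zero_point = 0;
  double sum_sq = 0.0;
  for (int8_t v : input) {
    const double d = v - input_zero_point;
    sum_sq += d * d;
  }
  const double inv_l2 = 1.0 / std::sqrt(sum_sq);
  for (int8_t v : input) {
    int32_t q = static_cast<int32_t>(
        std::lround(128.0 * (v - input_zero_point) * inv_l2));
    q = std::min(127, std::max(-128, q));
    std::printf("%d ", q);   // 23 -47 70 93
  }
  std::printf("\n");
}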

View File

@@ -58,12 +58,15 @@ inline void Logistic(int32_t input_zero_point, int32_t input_range_radius,
}
}
inline void Logistic(int32_t input_size, const int16_t* ptr_input_data,
int16_t* ptr_output_data) {
inline void Logistic(int32_t input_multiplier, int32_t input_size,
const int16_t* ptr_input_data, int16_t* ptr_output_data) {
// We use the LUT for sigmoid and take into account, that
// tanh(x) = 2*sigmoid(2*x) - 1
int32_t input_data_mul = (input_multiplier > 0) ? input_multiplier : 1;
for (int i = 0; i < input_size; ++i, ptr_input_data++, ptr_output_data++) {
int32_t input_data = *ptr_input_data;
int32_t input_data = (*ptr_input_data) * input_data_mul;
// Scale by 3/4 to expand range [-8,8]->[-10.7,10.7] and
// we do interpolation on unsigned values.
@@ -72,13 +75,20 @@ inline void Logistic(int32_t input_size, const int16_t* ptr_input_data,
// We divide by 2 power of 9, because
// we need to divide by 2 in power of 7 for
// the input conversion + 1/4 from the scale above.
uint8_t uh = abs_input_data >> 9;
uint32_t ua = sigmoid_table_uint16[uh];
uint32_t ub = sigmoid_table_uint16[uh + 1];
uint32_t ut = abs_input_data & 0x1ff;
// Define uh as uint32_t type not to make this function overflow.
uint32_t uh = abs_input_data >> 9;
uint32_t result;
// Interpolation is done using the fractional bit.
uint32_t result = (ua << 9) + ut * (ub - ua);
if (uh >= 255) {
// Saturate to maximum.
result = 0x7FFF << 10;
} else {
uint32_t ua = sigmoid_table_uint16[uh];
uint32_t ub = sigmoid_table_uint16[uh + 1];
uint32_t ut = abs_input_data & 0x1ff;
// Interpolation is done using the fractional bit.
result = (ua << 9) + ut * (ub - ua);
}
result = (input_data >= 0) ? (result + (1 << 9))
: ((1 << (16 + 9)) - result + (1 << 9) - 1);
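The lookup above is plain linear interpolation in fixed point: uh selects two adjacent sigmoid_table_uint16 entries and ut (the low 9 bits) blends between them, so (ua << 9) + ut * (ub - ua) carries 9 extra fractional bits. The same idea on a hypothetical table:

#include <cstdint>
#include <cstdio>

// Interpolate between adjacent uint16_t table entries, keeping 9 fractional
// bits, as in the Logistic kernel above. The ramp table is hypothetical; only
// the indexing/blending arithmetic is the point.
uint32_t InterpolateQ9(const uint16_t* table, uint32_t x) {
  const uint32_t idx = x >> 9;       // integer part: table index
  const uint32_t frac = x & 0x1ff;   // fractional part: 9 bits
  const uint32_t a = table[idx];
  const uint32_t b = table[idx + 1];
  return (a << 9) + frac * (b - a);
}

int main() {
  const uint16_t ramp[3] = {0, 512, 1024};
  // Halfway between entries 0 and 1: (0 << 9) + 256 * (512 - 0) = 131072,
  // i.e. 256 once the 9 fractional bits are dropped.
  std::printf("%u\n", InterpolateQ9(ramp, 256) >> 9);
}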

View File

@@ -0,0 +1,77 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MEAN_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MEAN_H_
#include "tensorflow/lite/kernels/internal/common.h"
namespace tflite {
namespace reference_integer_ops {
template <typename integer_type>
inline void Mean(const tflite::MeanParams& op_params, int32_t multiplier,
int32_t shift, const RuntimeShape& unextended_input_shape,
const integer_type* input_data, int32_t input_zero_point,
const RuntimeShape& unextended_output_shape,
integer_type* output_data, int32_t output_zero_point) {
// Current implementation only supports dimension equals 4 and simultaneous
// reduction over width and height.
TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4);
TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4);
const RuntimeShape input_shape =
RuntimeShape::ExtendedShape(4, unextended_input_shape);
const RuntimeShape output_shape =
RuntimeShape::ExtendedShape(4, unextended_output_shape);
const int output_batch = output_shape.Dims(0);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
const int output_depth = output_shape.Dims(3);
const int input_height = input_shape.Dims(1);
const int input_width = input_shape.Dims(2);
const int num_elements_in_axis = input_width * input_height;
TFLITE_CHECK_EQ(op_params.axis_count, 2);
TFLITE_CHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
(op_params.axis[0] == 2 && op_params.axis[1] == 1));
TFLITE_CHECK_EQ(output_height, 1);
TFLITE_CHECK_EQ(output_width, 1);
static constexpr int32_t kMinInt = std::numeric_limits<integer_type>::min();
static constexpr int32_t kMaxInt = std::numeric_limits<integer_type>::max();
for (int out_b = 0; out_b < output_batch; ++out_b) {
for (int out_d = 0; out_d < output_depth; ++out_d) {
int32_t acc = 0;
for (int in_h = 0; in_h < input_height; ++in_h) {
for (int in_w = 0; in_w < input_width; ++in_w) {
acc += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)] -
input_zero_point;
}
}
acc = MultiplyByQuantizedMultiplier(acc, multiplier, shift);
acc = acc > 0 ? (acc + num_elements_in_axis / 2) / num_elements_in_axis
: (acc - num_elements_in_axis / 2) / num_elements_in_axis;
acc += output_zero_point;
acc = std::min(std::max(acc, kMinInt), kMaxInt);
output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
static_cast<integer_type>(acc);
}
}
}
} // namespace reference_integer_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MEAN_H_
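A minimal call sketch for the Mean kernel above (not from the TFLite sources): it assumes equal input and output scales, so (1 << 30, 1) behaves as an identity-like multiplier/shift pair; in real use the pair comes from the op's Prepare step, and the zero points here are illustrative.

#include "tensorflow/lite/kernels/internal/reference/integer_ops/mean.h"

void MeanExample() {
  const int8_t input[4] = {8, 10, 12, 14};              // NHWC shape 1x2x2x1
  int8_t output[1];                                     // NHWC shape 1x1x1x1
  const tflite::RuntimeShape input_shape({1, 2, 2, 1});
  const tflite::RuntimeShape output_shape({1, 1, 1, 1});
  tflite::MeanParams op_params;
  op_params.axis_count = 2;
  op_params.axis[0] = 1;                                // reduce over height and width
  op_params.axis[1] = 2;
  tflite::reference_integer_ops::Mean(
      op_params, /*multiplier=*/1 << 30, /*shift=*/1, input_shape, input,
      /*input_zero_point=*/0, output_shape, output, /*output_zero_point=*/0);
  // output[0] == 11, the rounded mean of {8, 10, 12, 14}.
}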

View File

@@ -27,14 +27,14 @@ inline void MulElementwise(int size, const ArithmeticParams& params,
const T* input1_data, const T* input2_data,
T* output_data) {
for (int i = 0; i < size; ++i) {
const int32 input1_val = params.input1_offset + input1_data[i];
const int32 input2_val = params.input2_offset + input2_data[i];
const int32 unclamped_result =
const int32_t input1_val = params.input1_offset + input1_data[i];
const int32_t input2_val = params.input2_offset + input2_data[i];
const int32_t unclamped_result =
params.output_offset +
MultiplyByQuantizedMultiplier(input1_val * input2_val,
params.output_multiplier,
params.output_shift);
const int32 clamped_output =
const int32_t clamped_output =
std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, unclamped_result));
output_data[i] = static_cast<T>(clamped_output);
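For multiplication the rescale collapses into a single factor: since real = scale * (q − zero_point), the product of the two dequantized inputs divided by the output scale is (s1 * s2 / s_out) * (q1 − z1) * (q2 − z2), which is exactly what output_multiplier/output_shift encode above. A float sketch with illustrative parameters:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  const double s1 = 0.02, s2 = 0.03, s_out = 0.01;
  const int32_t z1 = 5, z2 = -3, z_out = 0;
  const int8_t q1 = 45, q2 = 17;
  const double real_multiplier = s1 * s2 / s_out;                     // 0.06
  int32_t q = static_cast<int32_t>(
                  std::lround(real_multiplier * (q1 - z1) * (q2 - z2))) + z_out;
  q = std::min(127, std::max(-128, q));                               // activation clamp
  std::printf("q_out = %d\n", q);                                     // 0.06 * 40 * 20 = 48
}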
@@ -57,13 +57,13 @@ inline void Mul(const ArithmeticParams& params,
// Mul with 16 bit inputs and int8_t outputs.
inline void Mul(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const int16* input1_data,
const RuntimeShape& input2_shape, const int16* input2_data,
const RuntimeShape& input1_shape, const int16_t* input1_data,
const RuntimeShape& input2_shape, const int16_t* input2_data,
const RuntimeShape& output_shape, int8_t* output_data) {
ruy::profiler::ScopeLabel label("Mul/Int16Int8");
int32 output_offset = params.output_offset;
int32 output_activation_min = params.quantized_activation_min;
int32 output_activation_max = params.quantized_activation_max;
int32_t output_offset = params.output_offset;
int32_t output_activation_min = params.quantized_activation_min;
int32_t output_activation_max = params.quantized_activation_max;
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
const int flat_size =
@@ -75,12 +75,12 @@ inline void Mul(const ArithmeticParams& params,
F0 unclamped_result =
F0::FromRaw(input1_data[i]) * F0::FromRaw(input2_data[i]);
int16 rescaled_result =
int16_t rescaled_result =
gemmlowp::RoundingDivideByPOT(unclamped_result.raw(), 8);
int16 clamped_result =
std::min<int16>(output_activation_max - output_offset, rescaled_result);
clamped_result =
std::max<int16>(output_activation_min - output_offset, clamped_result);
int16_t clamped_result = std::min<int16_t>(
output_activation_max - output_offset, rescaled_result);
clamped_result = std::max<int16_t>(output_activation_min - output_offset,
clamped_result);
output_data[i] = output_offset + clamped_result;
}
}
@@ -104,18 +104,18 @@ inline void BroadcastMul4DSlow(
for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
const int32 input1_val =
const int32_t input1_val =
params.input1_offset +
input1_data[SubscriptToIndex(desc1, b, y, x, c)];
const int32 input2_val =
const int32_t input2_val =
params.input2_offset +
input2_data[SubscriptToIndex(desc2, b, y, x, c)];
const int32 unclamped_result =
const int32_t unclamped_result =
params.output_offset +
MultiplyByQuantizedMultiplier(input1_val * input2_val,
params.output_multiplier,
params.output_shift);
const int32 clamped_output = std::min(
const int32_t clamped_output = std::min(
params.quantized_activation_max,
std::max(params.quantized_activation_min, unclamped_result));
output_data[Offset(extended_output_shape, b, y, x, c)] =

View File

@@ -22,8 +22,9 @@ namespace tflite {
namespace reference_integer_ops {
inline void AveragePool(const PoolParams& params,
const RuntimeShape& input_shape, const int8* input_data,
const RuntimeShape& output_shape, int8* output_data) {
const RuntimeShape& input_shape,
const int8_t* input_data,
const RuntimeShape& output_shape, int8_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
@@ -52,7 +53,7 @@ inline void AveragePool(const PoolParams& params,
const int filter_y_start = std::max(0, -in_y_origin);
const int filter_y_end =
std::min(params.filter_height, input_height - in_y_origin);
int32 acc = 0;
int32_t acc = 0;
int filter_count = 0;
for (int filter_y = filter_y_start; filter_y < filter_y_end;
++filter_y) {
@@ -71,7 +72,7 @@ inline void AveragePool(const PoolParams& params,
acc = std::max(acc, params.quantized_activation_min);
acc = std::min(acc, params.quantized_activation_max);
output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
static_cast<int8>(acc);
static_cast<int8_t>(acc);
}
}
}
@@ -79,8 +80,8 @@ inline void AveragePool(const PoolParams& params,
}
inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
const int8* input_data, const RuntimeShape& output_shape,
int8* output_data) {
const int8_t* input_data, const RuntimeShape& output_shape,
int8_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
TFLITE_DCHECK_GE(params.quantized_activation_min,
@@ -137,8 +138,9 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
inline void AveragePool(const PoolParams& params,
const RuntimeShape& input_shape,
const int16* input_data,
const RuntimeShape& output_shape, int16* output_data) {
const int16_t* input_data,
const RuntimeShape& output_shape,
int16_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
@@ -167,7 +169,7 @@ inline void AveragePool(const PoolParams& params,
const int filter_y_start = std::max(0, -in_y_origin);
const int filter_y_end =
std::min(params.filter_height, input_height - in_y_origin);
int32 acc = 0;
int32_t acc = 0;
int filter_count = 0;
for (int filter_y = filter_y_start; filter_y < filter_y_end;
++filter_y) {
@@ -186,7 +188,7 @@ inline void AveragePool(const PoolParams& params,
acc = std::max(acc, params.quantized_activation_min);
acc = std::min(acc, params.quantized_activation_max);
output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
static_cast<int16>(acc);
static_cast<int16_t>(acc);
}
}
}
@@ -194,8 +196,8 @@ inline void AveragePool(const PoolParams& params,
}
inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
const int16* input_data, const RuntimeShape& output_shape,
int16* output_data) {
const int16_t* input_data, const RuntimeShape& output_shape,
int16_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
TFLITE_DCHECK_GE(params.quantized_activation_min,


@@ -0,0 +1,110 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TANH_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TANH_H_
#include <limits>
#include "fixedpoint/fixedpoint.h"
#include "tensorflow/lite/kernels/internal/common.h"
namespace tflite {
namespace reference_integer_ops {
inline void Tanh(int32_t input_zero_point, int32_t input_range_radius,
int32_t input_multiplier, int32_t input_shift,
const RuntimeShape& input_shape, const int8_t* input_data,
const RuntimeShape& output_shape, int8_t* output_data) {
// Integer bits must be in sync with Prepare() function.
static constexpr int32_t kInputIntegerBits = 4;
static constexpr int32_t kOutputScale = 7;
static constexpr int32_t kMinInt8 = std::numeric_limits<int8_t>::min();
static constexpr int32_t kMaxInt8 = std::numeric_limits<int8_t>::max();
using F4 = gemmlowp::FixedPoint<int32_t, kInputIntegerBits>;
const int flat_size = MatchingFlatSize(input_shape, output_shape);
for (int i = 0; i < flat_size; ++i) {
const int32_t input =
static_cast<int32_t>(input_data[i]) - input_zero_point;
if (input <= -input_range_radius) {
output_data[i] = kMinInt8;
} else if (input >= input_range_radius) {
output_data[i] = kMaxInt8;
} else {
const int32_t input_in_q4 =
MultiplyByQuantizedMultiplier(input, input_multiplier, input_shift);
const int32_t output_in_q0 =
gemmlowp::tanh(F4::FromRaw(input_in_q4)).raw();
// Rescale and downcast.
using gemmlowp::RoundingDivideByPOT;
int32_t output_in_q24 =
RoundingDivideByPOT(output_in_q0, 31 - kOutputScale);
output_in_q24 = std::min(std::max(output_in_q24, kMinInt8), kMaxInt8);
output_data[i] = static_cast<int8_t>(output_in_q24);
}
}
}
inline void Tanh(int32_t input_multiplier, int32_t input_left_shift,
const RuntimeShape& input_shape, const int16_t* ptr_input_data,
const RuntimeShape& output_shape, int16_t* ptr_output_data) {
// We use the LUT for sigmoid and take into account that
// tanh(x) = 2*sigmoid(2*x) - 1
int32_t input_data_mul = (input_multiplier > 0) ? input_multiplier : 1;
int flat_size = MatchingFlatSize(input_shape, output_shape);
for (int i = 0; i < flat_size; ++i, ptr_input_data++, ptr_output_data++) {
int32_t input_data = (*ptr_input_data) * input_data_mul;
if (input_left_shift == 1) {
input_data <<= 1;
}
// Scale by 3/4 to expand range [-8,8]->[-10.7,10.7].
uint32_t abs_input_data = 3 * abs(input_data);
uint32_t uh = abs_input_data >> 8;
int32_t result;
if (uh >= 255) {
// Saturate to maximum.
result = 0xFFFF << 8;
} else {
uint32_t ua = sigmoid_table_uint16[uh];
uint32_t ub = sigmoid_table_uint16[uh + 1];
uint8_t ut = abs_input_data & 0xFF;
result = (ua << 8) + ut * (ub - ua);
}
result = (input_data >= 0)
? (result - (1 << (14 + 9)) + (1 << (9 - 2)))
: (-result + (1 << (14 + 9)) + (1 << (9 - 2)) - 1);
// Convert back to 16-bit.
result >>= (9 - 1);
*ptr_output_data = result;
}
}
} // namespace reference_integer_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TANH_H_
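As a quick sanity check of the identity the int16_t kernel above relies on, the following self-contained program (illustration only, not part of this commit) prints tanh(x) next to 2*sigmoid(2*x) - 1 across the input range; the two columns agree to within float rounding error:

#include <cmath>
#include <cstdio>

int main() {
  for (float x = -8.0f; x <= 8.0f; x += 1.0f) {
    // sigmoid(2x) = 1 / (1 + exp(-2x))
    const float sigmoid_2x = 1.0f / (1.0f + std::exp(-2.0f * x));
    const float via_identity = 2.0f * sigmoid_2x - 1.0f;
    std::printf("x=%+.1f  tanh=%+.6f  2*sigmoid(2x)-1=%+.6f\n", x,
                std::tanh(x), via_identity);
  }
  return 0;
}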


@@ -52,40 +52,39 @@ inline void L2Normalization(const tflite::L2NormalizationParams& op_params,
inline void L2Normalization(const tflite::L2NormalizationParams& op_params,
const RuntimeShape& input_shape,
const uint8* input_data,
const uint8_t* input_data,
const RuntimeShape& output_shape,
uint8* output_data) {
uint8_t* output_data) {
const int trailing_dim = input_shape.DimensionsCount() - 1;
const int depth =
MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
const int outer_size =
MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
const int32 input_zero_point = op_params.input_zero_point;
const int32_t input_zero_point = op_params.input_zero_point;
for (int i = 0; i < outer_size; ++i) {
int32 square_l2_norm = 0;
int32_t square_l2_norm = 0;
for (int c = 0; c < depth; c++) {
int32 diff = input_data[depth * i + c] - input_zero_point;
int32_t diff = input_data[depth * i + c] - input_zero_point;
square_l2_norm += diff * diff;
}
int32 inv_l2norm_multiplier;
int32_t inv_l2norm_multiplier;
int inv_l2norm_shift;
GetInvSqrtQuantizedMultiplierExp(square_l2_norm, kReverseShift,
&inv_l2norm_multiplier, &inv_l2norm_shift);
for (int c = 0; c < depth; c++) {
int32 diff = input_data[depth * i + c] - input_zero_point;
int32 rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp(
int32_t diff = input_data[depth * i + c] - input_zero_point;
int32_t rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp(
128 * diff, inv_l2norm_multiplier, inv_l2norm_shift);
int32 unclamped_output_val = 128 + rescaled_diff;
int32 output_val =
std::min(static_cast<int32>(255),
std::max(static_cast<int32>(0), unclamped_output_val));
output_data[depth * i + c] = static_cast<uint8>(output_val);
int32_t unclamped_output_val = 128 + rescaled_diff;
int32_t output_val =
std::min(static_cast<int32_t>(255),
std::max(static_cast<int32_t>(0), unclamped_output_val));
output_data[depth * i + c] = static_cast<uint8_t>(output_val);
}
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_L2NORMALIZATION_H_
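For orientation, this is the float computation that the quantized kernel above approximates over one depth slice; the helper name L2NormalizeReference is invented for this sketch, and the uint8_t kernel additionally recenters the result around 128 and clamps it to [0, 255]:

#include <cmath>
#include <cstddef>
#include <vector>

std::vector<float> L2NormalizeReference(const std::vector<float>& input) {
  // square_l2_norm = sum(v^2) over the depth dimension. Assumes a non-zero
  // input vector.
  float square_l2_norm = 0.0f;
  for (float v : input) square_l2_norm += v * v;
  const float inv_l2norm = 1.0f / std::sqrt(square_l2_norm);
  std::vector<float> output(input.size());
  for (std::size_t i = 0; i < input.size(); ++i) {
    output[i] = input[i] * inv_l2norm;
  }
  return output;
}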


@@ -66,8 +66,8 @@ inline void Logistic(const LogisticParams&, const RuntimeShape& input_shape,
}
inline void Logistic(const LogisticParams& params,
const RuntimeShape& input_shape, const int16* input_data,
const RuntimeShape& output_shape, int16* output_data) {
const RuntimeShape& input_shape, const int16_t* input_data,
const RuntimeShape& output_shape, int16_t* output_data) {
const int flat_size = MatchingFlatSize(input_shape, output_shape);
for (int i = 0; i < flat_size; i++) {
@@ -84,12 +84,12 @@ inline void Logistic(const LogisticParams& params,
}
}
// Quantized int8 logistic activation. Cheats by dequantizing and requantizing
// around the floating point logistic method. This implementation is slow on
// platforms without a floating point unit.
// Quantized int8_t logistic activation. Cheats by dequantizing and
// requantizing around the floating point logistic method. This implementation
// is slow on platforms without a floating point unit.
// TODO(b/141211002): Delete this int8 implementation once we can reuse the
// approach used in TFLite for int8 Logistic.
// TODO(b/141211002): Delete this int8_t implementation once we can reuse the
// approach used in TFLite for int8_t Logistic.
inline void Logistic(const RuntimeShape& input_shape, const int8_t* input_data,
float input_scale, int input_zero_point,
const RuntimeShape& output_shape, int8_t* output_data,


@@ -24,20 +24,20 @@ namespace reference_ops {
// Element-wise mul that can often be used for inner loop of broadcast Mul as
// well as the non-broadcast Mul.
inline void MulElementwise(int size, const ArithmeticParams& params,
const uint8* input1_data, const uint8* input2_data,
uint8* output_data) {
const uint8_t* input1_data,
const uint8_t* input2_data, uint8_t* output_data) {
for (int i = 0; i < size; ++i) {
const int32 input1_val = params.input1_offset + input1_data[i];
const int32 input2_val = params.input2_offset + input2_data[i];
const int32 unclamped_result =
const int32_t input1_val = params.input1_offset + input1_data[i];
const int32_t input2_val = params.input2_offset + input2_data[i];
const int32_t unclamped_result =
params.output_offset +
MultiplyByQuantizedMultiplier(input1_val * input2_val,
params.output_multiplier,
params.output_shift);
const int32 clamped_output =
const int32_t clamped_output =
std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, unclamped_result));
output_data[i] = static_cast<uint8>(clamped_output);
output_data[i] = static_cast<uint8_t>(clamped_output);
}
}
@@ -60,9 +60,9 @@ inline void Mul(const ArithmeticParams& params,
}
inline void Mul(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const uint8* input1_data,
const RuntimeShape& input2_shape, const uint8* input2_data,
const RuntimeShape& output_shape, uint8* output_data) {
const RuntimeShape& input1_shape, const uint8_t* input1_data,
const RuntimeShape& input2_shape, const uint8_t* input2_data,
const RuntimeShape& output_shape, uint8_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
const int flat_size =
@@ -73,11 +73,11 @@ inline void Mul(const ArithmeticParams& params,
inline void BroadcastMul4DSlow(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const uint8* input1_data,
const uint8_t* input1_data,
const RuntimeShape& input2_shape,
const uint8* input2_data,
const uint8_t* input2_data,
const RuntimeShape& output_shape,
uint8* output_data) {
uint8_t* output_data) {
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
@@ -89,22 +89,22 @@ inline void BroadcastMul4DSlow(const ArithmeticParams& params,
for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
const int32 input1_val =
const int32_t input1_val =
params.input1_offset +
input1_data[SubscriptToIndex(desc1, b, y, x, c)];
const int32 input2_val =
const int32_t input2_val =
params.input2_offset +
input2_data[SubscriptToIndex(desc2, b, y, x, c)];
const int32 unclamped_result =
const int32_t unclamped_result =
params.output_offset +
MultiplyByQuantizedMultiplier(input1_val * input2_val,
params.output_multiplier,
params.output_shift);
const int32 clamped_output = std::min(
const int32_t clamped_output = std::min(
params.quantized_activation_max,
std::max(params.quantized_activation_min, unclamped_result));
output_data[Offset(extended_output_shape, b, y, x, c)] =
static_cast<uint8>(clamped_output);
static_cast<uint8_t>(clamped_output);
}
}
}
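A rough per-element picture of the quantized Mul above, with a single float multiplier standing in for the output_multiplier/output_shift pair; the helper name and the fixed [0, 255] clamp are assumptions of this sketch:

#include <algorithm>
#include <cmath>
#include <cstdint>

inline uint8_t QuantizedMulOneElement(uint8_t a, uint8_t b,
                                      int32_t input1_offset,
                                      int32_t input2_offset,
                                      int32_t output_offset,
                                      float real_output_multiplier) {
  // Shift both inputs by their (negated) zero points, multiply, rescale.
  const int32_t a_val = input1_offset + a;
  const int32_t b_val = input2_offset + b;
  const int32_t unclamped =
      output_offset +
      static_cast<int32_t>(std::lround(a_val * b_val * real_output_multiplier));
  // The kernel clamps to the quantized activation range; [0, 255] shown here.
  return static_cast<uint8_t>(
      std::min<int32_t>(255, std::max<int32_t>(0, unclamped)));
}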


@@ -32,8 +32,8 @@ constexpr int PadKernelMaxDimensionCount() { return 4; }
// equivalent to a simple input1_data. For Pad, it should point to a zero
// value.
//
// Note that two typenames are required, so that T=P=int32 is considered a
// specialization distinct from P=int32.
// Note that two typenames are required, so that T=P=int32_t is considered a
// specialization distinct from P=int32_t.
template <typename T, typename P>
inline void PadImpl(const tflite::PadParams& op_params,
const RuntimeShape& input_shape, const T* input_data,
@@ -116,11 +116,11 @@ inline void Pad(const tflite::PadParams& op_params,
output_data);
}
// The second (pad-value) input can be int32 when, say, the first is uint8.
// The second (pad-value) input can be int32_t when, say, the first is uint8_t.
template <typename T>
inline void Pad(const tflite::PadParams& op_params,
const RuntimeShape& input_shape, const T* input_data,
const int32* pad_value_ptr, const RuntimeShape& output_shape,
const int32_t* pad_value_ptr, const RuntimeShape& output_shape,
T* output_data) {
const T converted_pad_value = static_cast<T>(*pad_value_ptr);
PadImpl(op_params, input_shape, input_data, &converted_pad_value,
@@ -130,40 +130,18 @@ inline void Pad(const tflite::PadParams& op_params,
// This version avoids conflicting template matching.
template <>
inline void Pad(const tflite::PadParams& op_params,
const RuntimeShape& input_shape, const int32* input_data,
const int32* pad_value_ptr, const RuntimeShape& output_shape,
int32* output_data) {
const RuntimeShape& input_shape, const int32_t* input_data,
const int32_t* pad_value_ptr, const RuntimeShape& output_shape,
int32_t* output_data) {
PadImpl(op_params, input_shape, input_data, pad_value_ptr, output_shape,
output_data);
}
// One could make all PadImageStyle calls simply delegate the work to the
// ordinary Pad. However, it is better that the reference code asserts false in
// similar cases.
template <typename T, typename P>
inline void PadImageStyle(const tflite::PadParams& op_params,
const RuntimeShape& input_shape, const T* input_data,
const P* pad_value_ptr,
const RuntimeShape& output_shape, T* output_data) {
TFLITE_ASSERT_FALSE;
}
template <typename P>
inline void PadImageStyle(const tflite::PadParams& op_params,
const RuntimeShape& input_shape,
const uint8* input_data, const P* pad_value_ptr,
const RuntimeShape& output_shape,
uint8* output_data) {
Pad(op_params, input_shape, input_data, pad_value_ptr, output_shape,
output_data);
}
template <typename P>
inline void PadImageStyle(const tflite::PadParams& op_params,
const RuntimeShape& input_shape,
const int8_t* input_data, const P* pad_value_ptr,
const RuntimeShape& output_shape,
int8_t* output_data) {
Pad(op_params, input_shape, input_data, pad_value_ptr, output_shape,
output_data);
}


@@ -78,8 +78,9 @@ inline void AveragePool(const PoolParams& params,
inline void AveragePool(const PoolParams& params,
const RuntimeShape& input_shape,
const uint8* input_data,
const RuntimeShape& output_shape, uint8* output_data) {
const uint8_t* input_data,
const RuntimeShape& output_shape,
uint8_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
@@ -108,7 +109,7 @@ inline void AveragePool(const PoolParams& params,
const int filter_y_start = std::max(0, -in_y_origin);
const int filter_y_end =
std::min(params.filter_height, input_height - in_y_origin);
int32 acc = 0;
int32_t acc = 0;
int filter_count = 0;
for (int filter_y = filter_y_start; filter_y < filter_y_end;
++filter_y) {
@@ -125,7 +126,7 @@ inline void AveragePool(const PoolParams& params,
acc = std::max(acc, params.quantized_activation_min);
acc = std::min(acc, params.quantized_activation_max);
output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
static_cast<uint8>(acc);
static_cast<uint8_t>(acc);
}
}
}
@@ -237,8 +238,8 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
}
inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
const uint8* input_data, const RuntimeShape& output_shape,
uint8* output_data) {
const uint8_t* input_data, const RuntimeShape& output_shape,
uint8_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
TFLITE_DCHECK_GE(params.quantized_activation_min, 0);
@@ -269,7 +270,7 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
const int filter_y_start = std::max(0, -in_y_origin);
const int filter_y_end =
std::min(params.filter_height, input_height - in_y_origin);
uint8 max = 0;
uint8_t max = 0;
for (int filter_y = filter_y_start; filter_y < filter_y_end;
++filter_y) {
for (int filter_x = filter_x_start; filter_x < filter_x_end;
@@ -281,10 +282,10 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
input_data[Offset(input_shape, batch, in_y, in_x, channel)]);
}
}
max = std::max<uint8>(max, params.quantized_activation_min);
max = std::min<uint8>(max, params.quantized_activation_max);
max = std::max<uint8_t>(max, params.quantized_activation_min);
max = std::min<uint8_t>(max, params.quantized_activation_max);
output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
static_cast<uint8>(max);
static_cast<uint8_t>(max);
}
}
}
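The inner loops of the uint8_t AveragePool above reduce one filter window to a single output value: accumulate the taps that fall inside the input, divide by their count with rounding, then clamp. A minimal sketch of that pattern, with the window already gathered into a flat array (helper name invented here):

#include <algorithm>
#include <cstdint>

inline uint8_t AverageOfWindow(const uint8_t* values, int filter_count,
                               int32_t activation_min,
                               int32_t activation_max) {
  int32_t acc = 0;
  for (int i = 0; i < filter_count; ++i) acc += values[i];
  // Round-to-nearest integer division by the number of contributing taps.
  acc = (acc + filter_count / 2) / filter_count;
  acc = std::max(acc, activation_min);
  acc = std::min(acc, activation_max);
  return static_cast<uint8_t>(acc);
}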


@@ -23,7 +23,7 @@ namespace tflite {
namespace reference_ops {
// Broadcast prelu to output_shape for quantized uint8/int8 data.
// Broadcast prelu to output_shape for quantized uint8_t/int8_t data.
template <typename T>
inline void BroadcastPrelu4DSlow(
const PreluParams& params, const RuntimeShape& input_shape,
@@ -44,24 +44,26 @@ inline void BroadcastPrelu4DSlow(
for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
int output_index = Offset(extended_output_shape, b, y, x, c);
int input_index = SubscriptToIndex(desc1, b, y, x, c);
const int32 input_value =
const int32_t input_value =
params.input_offset + input_data[input_index];
int32 output_value;
int32_t output_value;
if (input_value >= 0) {
output_value = input_value;
output_value = MultiplyByQuantizedMultiplier(
input_value, params.output_multiplier_1, params.output_shift_1);
} else {
auto alpha_index = SubscriptToIndex(desc2, b, y, x, c);
const int32 alpha_value =
const int32_t alpha_value =
params.alpha_offset + alpha_data[alpha_index];
output_value = MultiplyByQuantizedMultiplier(
input_value * alpha_value, params.output_multiplier,
params.output_shift);
input_value * alpha_value, params.output_multiplier_2,
params.output_shift_2);
}
output_value += params.output_offset;
const int32 quantized_min = std::numeric_limits<T>::min();
const int32 quantized_max = std::numeric_limits<T>::max();
const int32 clamped_output =
const int32_t quantized_min = std::numeric_limits<T>::min();
const int32_t quantized_max = std::numeric_limits<T>::max();
const int32_t clamped_output =
std::min(quantized_max, std::max(quantized_min, output_value));
output_data[output_index] = static_cast<T>(clamped_output);
}
@@ -70,6 +72,37 @@ inline void BroadcastPrelu4DSlow(
}
}
template <typename T>
inline void Prelu(const PreluParams& params, const RuntimeShape& input_shape,
const T* input_data, const RuntimeShape& alpha_shape,
const T* alpha_data, const RuntimeShape& output_shape,
T* output_data) {
const int32_t quantized_min = std::numeric_limits<T>::min();
const int32_t quantized_max = std::numeric_limits<T>::max();
const int flat_size =
MatchingElementsSize(input_shape, alpha_shape, output_shape);
for (int i = 0; i < flat_size; ++i) {
const int32_t input_value = params.input_offset + input_data[i];
int32_t output_value;
if (input_value >= 0) {
output_value = MultiplyByQuantizedMultiplier(
input_value, params.output_multiplier_1, params.output_shift_1);
} else {
const int32_t alpha_value = params.alpha_offset + alpha_data[i];
output_value = MultiplyByQuantizedMultiplier(input_value * alpha_value,
params.output_multiplier_2,
params.output_shift_2);
}
output_value += params.output_offset;
const int32_t clamped_output =
std::min(quantized_max, std::max(quantized_min, output_value));
output_data[i] = static_cast<T>(clamped_output);
}
}
} // namespace reference_ops
} // namespace tflite
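In float terms, the element-wise Prelu added above computes y = x for non-negative x and y = alpha * x otherwise; the two quantized multiplier/shift pairs encode the positive and negative output scales. A reference sketch (names invented here):

#include <cstddef>
#include <vector>

std::vector<float> PreluReference(const std::vector<float>& input,
                                  const std::vector<float>& alpha) {
  // Assumes alpha has already been broadcast to input's size.
  std::vector<float> output(input.size());
  for (std::size_t i = 0; i < input.size(); ++i) {
    const float x = input[i];
    output[i] = x >= 0.0f ? x : alpha[i] * x;
  }
  return output;
}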


@@ -76,6 +76,10 @@ inline bool ProcessBroadcastShapes(const RuntimeShape& shape0,
BroadcastableOpCategory::kFirstInputBroadcastsFast &&
params->broadcast_category !=
BroadcastableOpCategory::kSecondInputBroadcastsFast) {
// This is unreachable because at least one else clause in the above loop
// must be reached.
TFLITE_DCHECK(false);
params->broadcast_category = BroadcastableOpCategory::kNonBroadcast;
return false;
}


@@ -15,7 +15,11 @@ limitations under the License.
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_QUANTIZE_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_QUANTIZE_H_
#include <algorithm>
#include <limits>
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/types.h"
@@ -29,18 +33,18 @@ inline void AffineQuantize(const tflite::QuantizationParams& op_params,
const InputT* input_data,
const RuntimeShape& output_shape,
OutputT* output_data) {
const int32 zero_point = op_params.zero_point;
const int32_t zero_point = op_params.zero_point;
const double scale = op_params.scale;
const int flat_size = MatchingFlatSize(input_shape, output_shape);
static constexpr int32 min_val = std::numeric_limits<OutputT>::min();
static constexpr int32 max_val = std::numeric_limits<OutputT>::max();
static constexpr int32_t min_val = std::numeric_limits<OutputT>::min();
static constexpr int32_t max_val = std::numeric_limits<OutputT>::max();
for (int i = 0; i < flat_size; i++) {
const InputT val = input_data[i];
int32 unclamped =
static_cast<int32>(TfLiteRound(val / static_cast<float>(scale))) +
int32_t unclamped =
static_cast<int32_t>(TfLiteRound(val / static_cast<float>(scale))) +
zero_point;
int32 clamped = std::min(std::max(unclamped, min_val), max_val);
int32_t clamped = std::min(std::max(unclamped, min_val), max_val);
output_data[i] = clamped;
}
}
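A worked instance of the AffineQuantize formula above for a single value, assuming an int8_t output with the usual [-128, 127] limits (the helper name QuantizeOne is made up for this sketch):

#include <algorithm>
#include <cmath>
#include <cstdint>

inline int8_t QuantizeOne(float val, float scale, int32_t zero_point) {
  // q = round(val / scale) + zero_point, clamped to the output type's range.
  const int32_t unclamped =
      static_cast<int32_t>(std::round(val / scale)) + zero_point;
  const int32_t clamped =
      std::min<int32_t>(std::max<int32_t>(unclamped, -128), 127);
  return static_cast<int8_t>(clamped);
}

// Example: QuantizeOne(3.2f, 0.5f, -1) returns 5, since round(3.2 / 0.5) = 6
// and 6 + (-1) = 5.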


@@ -18,6 +18,8 @@ limitations under the License.
#include "ruy/profiler/instrumentation.h" // from @ruy
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/max.h"
#include "tensorflow/lite/kernels/internal/min.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/types.h"
@@ -68,6 +70,9 @@ inline bool ResolveAxis(const int num_dims, const int* axis,
// eg: For num_dims=3, [0, 1, 2] is the same as [-3, -2, -1] */
int current = axis[idx] < 0 ? (axis[idx] + num_dims) : axis[idx];
TFLITE_DCHECK(current >= 0 && current < num_dims);
if (current < 0 || current >= num_dims) {
return false;
}
bool is_dup = false;
for (int j = 0; j < *out_num_axis; ++j) {
if (out_axis[j] == current) {
@@ -127,6 +132,11 @@ inline bool ReduceGeneric(const T* input_data, const int* input_dims,
bool keep_dims, int* temp_index, int* resolved_axis,
T init_value,
T reducer(const T current, const T in)) {
// Return early when input shape has zero dim.
for (int i = 0; i < input_num_dims; ++i) {
if (input_dims[i] == 0) return true;
}
// Reset output data.
if (!InitTensorDataForReduce(output_dims, output_num_dims, init_value,
output_data)) {
@@ -184,11 +194,11 @@ inline bool Mean(const T* input_data, const int* input_dims,
}
// Calculate mean by dividing output_data by num of aggregated element.
U num_elements_in_axis = 1;
size_t num_elements_in_axis = 1;
for (int idx = 0; idx < num_resolved_axis; ++idx) {
size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]);
// Overflow prevention.
if (current > (std::numeric_limits<U>::max() / num_elements_in_axis)) {
if (current > (std::numeric_limits<size_t>::max() / num_elements_in_axis)) {
return false;
}
num_elements_in_axis *= current;
@@ -249,9 +259,9 @@ inline void Mean(const tflite::MeanParams& op_params,
inline void Mean(const tflite::MeanParams& op_params,
const RuntimeShape& unextended_input_shape,
const uint8_t* input_data, int32 input_zero_point,
const uint8_t* input_data, int32_t input_zero_point,
float input_scale, const RuntimeShape& unextended_output_shape,
uint8_t* output_data, int32 output_zero_point,
uint8_t* output_data, int32_t output_zero_point,
float output_scale) {
ruy::profiler::ScopeLabel label("Mean4D/Uint8");
@@ -280,9 +290,9 @@ inline void Mean(const tflite::MeanParams& op_params,
constexpr int32_t kMinValue = std::numeric_limits<uint8_t>::min();
constexpr int32_t kMaxValue = std::numeric_limits<uint8_t>::max();
int32 bias =
int32_t bias =
output_zero_point -
static_cast<int32>(input_zero_point * input_scale / output_scale);
static_cast<int32_t>(input_zero_point * input_scale / output_scale);
double real_scale =
static_cast<double>(input_scale / (num_elements_in_axis * output_scale));
@@ -291,7 +301,7 @@ inline void Mean(const tflite::MeanParams& op_params,
QuantizeMultiplier(real_scale, &multiplier, &shift);
for (int out_b = 0; out_b < output_batch; ++out_b) {
for (int out_d = 0; out_d < output_depth; ++out_d) {
int32 acc = 0;
int32_t acc = 0;
for (int in_h = 0; in_h < input_height; ++in_h) {
for (int in_w = 0; in_w < input_width; ++in_w) {
acc += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)];
@@ -310,18 +320,21 @@ inline void Mean(const tflite::MeanParams& op_params,
// It does so in two stages, first calculates the sum of elements along the axis
// then divides it by the number of element in axis for quantized values.
template <typename T, typename U>
inline bool QuantizedMeanOrSum(const T* input_data, int32 input_zero_point,
inline bool QuantizedMeanOrSum(const T* input_data, int32_t input_zero_point,
float input_scale, const int* input_dims,
const int input_num_dims, T* output_data,
int32 output_zero_point, float output_scale,
int32_t output_zero_point, float output_scale,
const int* output_dims,
const int output_num_dims, const int* axis,
const int num_axis_dimensions, bool keep_dims,
int* temp_index, int* resolved_axis, U* temp_sum,
bool compute_sum) {
const bool uint8_case = std::is_same<T, int8_t>::value;
const bool uint8_case = std::is_same<T, uint8_t>::value;
const bool int16_case = std::is_same<T, int16_t>::value;
if (uint8_case) {
ruy::profiler::ScopeLabel label(compute_sum ? "Sum/Uint8" : "Mean/Uint8");
} else if (int16_case) {
ruy::profiler::ScopeLabel label(compute_sum ? "Sum/Int16" : "Mean/Int16");
} else {
ruy::profiler::ScopeLabel label(compute_sum ? "Sum/Int8" : "Mean/Int8");
}
@@ -354,11 +367,11 @@ inline bool QuantizedMeanOrSum(const T* input_data, int32 input_zero_point,
}
// Calculate mean by dividing output_data by num of aggregated element.
U num_elements_in_axis = 1;
size_t num_elements_in_axis = 1;
for (int idx = 0; idx < num_resolved_axis; ++idx) {
size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]);
// Overflow prevention.
if (current > (std::numeric_limits<U>::max() / num_elements_in_axis)) {
if (current > (std::numeric_limits<size_t>::max() / num_elements_in_axis)) {
return false;
}
num_elements_in_axis *= current;
@@ -368,8 +381,7 @@ inline bool QuantizedMeanOrSum(const T* input_data, int32 input_zero_point,
const float scale = input_scale / output_scale;
if (compute_sum) {
// TODO(b/116341117): Eliminate float and do this completely in 8bit.
const float bias =
-input_zero_point * scale * num_elements_in_axis + 0.5f;
const float bias = -input_zero_point * scale * num_elements_in_axis;
for (size_t idx = 0; idx < num_outputs; ++idx) {
const U value =
static_cast<U>(TfLiteRound(temp_sum[idx] * scale + bias)) +
@@ -377,15 +389,15 @@ inline bool QuantizedMeanOrSum(const T* input_data, int32 input_zero_point,
output_data[idx] = static_cast<T>(value);
}
} else {
const float bias = -input_zero_point * scale + 0.5f;
const float bias = -input_zero_point * scale;
for (size_t idx = 0; idx < num_outputs; ++idx) {
float float_mean = static_cast<float>(temp_sum[idx]) /
static_cast<float>(num_elements_in_axis);
float result =
std::min(TfLiteRound(float_mean * scale + bias) + output_zero_point,
static_cast<float>(std::numeric_limits<T>::max()));
result =
std::max(result, static_cast<float>(std::numeric_limits<T>::min()));
float result = TfLiteMin(
TfLiteRound(float_mean * scale + bias) + output_zero_point,
static_cast<float>(std::numeric_limits<T>::max()));
result = TfLiteMax(result,
static_cast<float>(std::numeric_limits<T>::min()));
output_data[idx] = static_cast<T>(result);
}
}
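The quantized mean path above sums in a wider accumulator and then requantizes with TfLiteRound and TfLiteMin/TfLiteMax. Its per-output computation corresponds roughly to the following float form (helper name assumed, uint8_t limits shown for concreteness):

#include <algorithm>
#include <cmath>
#include <cstdint>

inline uint8_t RequantizedMean(float mean_of_quantized_inputs,
                               float input_scale, int32_t input_zero_point,
                               float output_scale, int32_t output_zero_point) {
  const float scale = input_scale / output_scale;
  const float bias = -input_zero_point * scale;
  float result = std::round(mean_of_quantized_inputs * scale + bias) +
                 output_zero_point;
  // Clamp to the output type's representable range.
  result = std::min(result, 255.0f);
  result = std::max(result, 0.0f);
  return static_cast<uint8_t>(result);
}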


@@ -17,28 +17,30 @@ limitations under the License.
#include <cmath>
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
inline int32 GetNearestNeighbor(const int input_value, const int32 input_size,
const int32 output_size,
const bool align_corners,
const bool half_pixel_centers) {
inline int32_t GetNearestNeighbor(const int input_value,
const int32_t input_size,
const int32_t output_size,
const bool align_corners,
const bool half_pixel_centers) {
const float scale =
(align_corners && output_size > 1)
? (input_size - 1) / static_cast<float>(output_size - 1)
: input_size / static_cast<float>(output_size);
const float offset = half_pixel_centers ? 0.5f : 0.0f;
int32 output_value = std::min(
int32_t output_value = std::min(
align_corners
? static_cast<int32>(std::round((input_value + offset) * scale))
: static_cast<int32>(std::floor((input_value + offset) * scale)),
? static_cast<int32_t>(TfLiteRound((input_value + offset) * scale))
: static_cast<int32_t>(std::floor((input_value + offset) * scale)),
input_size - 1);
if (half_pixel_centers) {
output_value = std::max(static_cast<int32>(0), output_value);
output_value = std::max(static_cast<int32_t>(0), output_value);
}
return output_value;
}
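A worked special case of GetNearestNeighbor above, with align_corners and half_pixel_centers both false: upscaling from input_size = 2 to output_size = 4 gives scale = 2 / 4 = 0.5, so output indices {0, 1, 2, 3} floor to input indices {0, 0, 1, 1}. A simplified sketch of just that path (function name invented here):

#include <algorithm>
#include <cmath>
#include <cstdint>

inline int32_t NearestNeighborIndex(int output_index, int32_t input_size,
                                    int32_t output_size) {
  const float scale = input_size / static_cast<float>(output_size);
  return std::min(static_cast<int32_t>(std::floor(output_index * scale)),
                  input_size - 1);
}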
@@ -47,7 +49,7 @@ template <typename T>
inline void ResizeNearestNeighbor(
const tflite::ResizeNearestNeighborParams& op_params,
const RuntimeShape& unextended_input_shape, const T* input_data,
const RuntimeShape& output_size_shape, const int32* output_size_data,
const RuntimeShape& output_size_shape, const int32_t* output_size_data,
const RuntimeShape& unextended_output_shape, T* output_data) {
TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
@@ -57,16 +59,16 @@ inline void ResizeNearestNeighbor(
const RuntimeShape output_shape =
RuntimeShape::ExtendedShape(4, unextended_output_shape);
int32 batches = MatchingDim(input_shape, 0, output_shape, 0);
int32 input_height = input_shape.Dims(1);
int32 input_width = input_shape.Dims(2);
int32 depth = MatchingDim(input_shape, 3, output_shape, 3);
int32_t batches = MatchingDim(input_shape, 0, output_shape, 0);
int32_t input_height = input_shape.Dims(1);
int32_t input_width = input_shape.Dims(2);
int32_t depth = MatchingDim(input_shape, 3, output_shape, 3);
// The Tensorflow version of this op allows resize on the width and height
// axis only.
TFLITE_DCHECK_EQ(output_size_shape.FlatSize(), 2);
int32 output_height = output_size_data[0];
int32 output_width = output_size_data[1];
int32_t output_height = output_size_data[0];
int32_t output_width = output_size_data[1];
const int col_offset = input_shape.Dims(3);
const int row_offset = input_shape.Dims(2) * col_offset;
@@ -76,14 +78,14 @@ inline void ResizeNearestNeighbor(
T* output_ptr = output_data;
for (int b = 0; b < batches; ++b) {
for (int y = 0; y < output_height; ++y) {
int32 in_y = GetNearestNeighbor(y, input_height, output_height,
op_params.align_corners,
op_params.half_pixel_centers);
const T* y_input_ptr = input_ptr + in_y * row_offset;
for (int x = 0; x < output_width; ++x) {
int32 in_x = GetNearestNeighbor(x, input_width, output_width,
int32_t in_y = GetNearestNeighbor(y, input_height, output_height,
op_params.align_corners,
op_params.half_pixel_centers);
const T* y_input_ptr = input_ptr + in_y * row_offset;
for (int x = 0; x < output_width; ++x) {
int32_t in_x = GetNearestNeighbor(x, input_width, output_width,
op_params.align_corners,
op_params.half_pixel_centers);
const T* x_input_ptr = y_input_ptr + in_x * col_offset;
memcpy(output_ptr, x_input_ptr, depth * sizeof(T));
output_ptr += depth;


@@ -16,7 +16,6 @@ limitations under the License.
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SOFTMAX_H_
#include <limits>
#include <vector>
#include "fixedpoint/fixedpoint.h"
#include "tensorflow/lite/kernels/internal/common.h"
@@ -49,26 +48,27 @@ inline void Softmax(const SoftmaxParams& params,
// Compute sum.
float sum = 0.f;
for (int c = 0; c < depth; ++c) {
sum += std::exp((input_data[i * depth + c] - max) *
static_cast<float>(params.beta));
const float exp_c = std::exp((input_data[i * depth + c] - max) *
static_cast<float>(params.beta));
output_data[i * depth + c] = exp_c;
sum += exp_c;
}
// Compute result.
for (int c = 0; c < depth; ++c) {
output_data[i * depth + c] = std::exp((input_data[i * depth + c] - max) *
static_cast<float>(params.beta)) /
sum;
output_data[i * depth + c] = output_data[i * depth + c] / sum;
}
}
}
// Quantized softmax with int8/uint8 input and int8/uint8/int16 output.
// Quantized softmax with int8_t/uint8_t input and int8_t/uint8_t/int16_t
// output.
template <typename InputT, typename OutputT>
inline void Softmax(const SoftmaxParams& params,
const RuntimeShape& input_shape, const InputT* input_data,
const RuntimeShape& output_shape, OutputT* output_data) {
const int32 input_beta_multiplier = params.input_multiplier;
const int32 input_beta_left_shift = params.input_left_shift;
const int32_t input_beta_multiplier = params.input_multiplier;
const int32_t input_beta_left_shift = params.input_left_shift;
const int diff_min = params.diff_min;
// The representation chosen for the input to the exp() function is Q5.26.
// We need to leave extra space since values that we skip might be as large as
@@ -78,9 +78,10 @@ inline void Softmax(const SoftmaxParams& params,
static const int kScaledDiffIntegerBits = 5;
static const int kAccumulationIntegerBits = 12;
using FixedPointScaledDiff =
gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>;
using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
gemmlowp::FixedPoint<int32_t, kScaledDiffIntegerBits>;
using FixedPointAccum =
gemmlowp::FixedPoint<int32_t, kAccumulationIntegerBits>;
using FixedPoint0 = gemmlowp::FixedPoint<int32_t, 0>;
const int trailing_dim = input_shape.DimensionsCount() - 1;
const int outer_size =
@@ -96,10 +97,10 @@ inline void Softmax(const SoftmaxParams& params,
FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
for (int c = 0; c < depth; ++c) {
int32 input_diff =
static_cast<int32>(input_data[i * depth + c]) - max_in_row;
int32_t input_diff =
static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
if (input_diff >= diff_min) {
const int32 input_diff_rescaled =
const int32_t input_diff_rescaled =
MultiplyByQuantizedMultiplierGreaterThanOne(
input_diff, input_beta_multiplier, input_beta_left_shift);
const FixedPointScaledDiff scaled_diff_f8 =
@@ -114,28 +115,28 @@ inline void Softmax(const SoftmaxParams& params,
sum_of_exps.raw(), kAccumulationIntegerBits, &num_bits_over_unit));
for (int c = 0; c < depth; ++c) {
int32 input_diff =
static_cast<int32>(input_data[i * depth + c]) - max_in_row;
int32_t input_diff =
static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
if (input_diff >= diff_min) {
const int32 input_diff_rescaled =
const int32_t input_diff_rescaled =
MultiplyByQuantizedMultiplierGreaterThanOne(
input_diff, input_beta_multiplier, input_beta_left_shift);
const FixedPointScaledDiff scaled_diff_f8 =
FixedPointScaledDiff::FromRaw(input_diff_rescaled);
FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
int32 unsat_output = gemmlowp::RoundingDivideByPOT(
int32_t unsat_output = gemmlowp::RoundingDivideByPOT(
(shifted_scale * exp_in_0).raw(),
num_bits_over_unit + 31 - (sizeof(OutputT) * 8));
const int32 shifted_output =
const int32_t shifted_output =
unsat_output +
static_cast<int32>(std::numeric_limits<OutputT>::min());
static_cast<int32_t>(std::numeric_limits<OutputT>::min());
output_data[i * depth + c] = static_cast<OutputT>(std::max(
std::min(shifted_output,
static_cast<int32>(std::numeric_limits<OutputT>::max())),
static_cast<int32>(std::numeric_limits<OutputT>::min())));
static_cast<int32_t>(std::numeric_limits<OutputT>::max())),
static_cast<int32_t>(std::numeric_limits<OutputT>::min())));
} else {
output_data[i * depth + c] = std::numeric_limits<OutputT>::min();
}
@@ -143,7 +144,24 @@ inline void Softmax(const SoftmaxParams& params,
}
}
// Quantized softmax with int16 input and int16 output.
// Computes exp(input - max_input)
inline int16_t SoftMaxCalculateExp(const SoftmaxParams& params,
const int16_t* input_data, const int depth,
int16_t max_in_row, int i, int c) {
int32_t input_diff = input_data[i * depth + c] - max_in_row;
// scale the input_diff such that [-65535, 0] correspond to [-10.0, 0.0]
// exp lut generated with range [-10, 0], as exp(-10) is negligible.
int32_t scaled_diff = MultiplyByQuantizedMultiplier(
input_diff, params.input_multiplier, params.input_left_shift);
// recenter to [-32768, 32767]
int32_t sym_scaled_diff = scaled_diff + 32767;
int16_t sat_sym_scaled_diff =
std::min(std::max(sym_scaled_diff, static_cast<int32_t>(-32768)),
static_cast<int32_t>(32767));
// apply the exp() LUT activation function
return generic_int16_table_lookup(sat_sym_scaled_diff, params.exp_lut);
}
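A float mirror of SoftMaxCalculateExp above, under the assumption that params.exp_lut tabulates exp(t) for t in [-10, 0] in Q0.15 (as the comments suggest); the recentered index in [-32768, 32767] is mapped back to t and exp is evaluated directly instead of through the table:

#include <algorithm>
#include <cmath>
#include <cstdint>

inline int16_t ExpQ015Reference(int32_t scaled_diff /* in [-65535, 0] */) {
  // Recenter to the symmetric int16_t range, as the kernel does.
  const int32_t sym =
      std::min<int32_t>(std::max<int32_t>(scaled_diff + 32767, -32768), 32767);
  // Map the index back to t in [-10, 0] and evaluate exp(t) in Q0.15.
  const float t = (sym - 32767) * (10.0f / 65535.0f);
  return static_cast<int16_t>(
      std::min(std::round(std::exp(t) * 32768.0f), 32767.0f));
}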
// Quantized softmax with int16_t input and int16_t output.
inline void SoftmaxInt16(const SoftmaxParams& params,
const RuntimeShape& input_shape,
const int16_t* input_data,
@@ -162,28 +180,16 @@ inline void SoftmaxInt16(const SoftmaxParams& params,
max_in_row = std::max(max_in_row, input_data[i * depth + c]);
}
// Compute exp(input - max_input)
std::vector<int16_t> exp_result_Q015(depth);
// This loops computes the exp values and their sum. We will need the exp
// values later on in the function so we cache them in the output_data
// buffer. This is an optimization done to avoid calculating the exp values
// twice making use of the output_data buffer as scratch memory.
int32_t sum_of_exps = 0; // Q16.15 fixed point format.
int16_t* exp_results_Q015 = output_data + i * depth;
for (int c = 0; c < depth; ++c) {
int32_t input_diff = input_data[i * depth + c] - max_in_row;
// scale the input_diff such that [-65535, 0] correspond to [-10.0, 0.0]
int32_t scaled_diff = MultiplyByQuantizedMultiplier(
input_diff, params.input_multiplier, params.input_left_shift);
// recenter to [-32768, 32767]
int32_t sym_scaled_diff = scaled_diff + 32767;
int16_t sat_sym_scaled_diff =
std::min(std::max(sym_scaled_diff, static_cast<int32_t>(-32768)),
static_cast<int32_t>(32767));
// apply the exp() LUT activation function
exp_result_Q015[c] =
generic_int16_table_lookup(sat_sym_scaled_diff, params.exp_lut);
}
// sum_of_exps is a Q16.15 fixed point format.
int32_t sum_of_exps = 0;
for (int c = 0; c < depth; ++c) {
// Q16.15 + Q0.15
sum_of_exps += exp_result_Q015[c];
exp_results_Q015[c] =
SoftMaxCalculateExp(params, input_data, depth, max_in_row, i, c);
sum_of_exps += exp_results_Q015[c];
}
// Compute the reciprocal 1/sum_of_exps
@@ -209,7 +215,7 @@ inline void SoftmaxInt16(const SoftmaxParams& params,
for (int c = 0; c < depth; ++c) {
uint8_t right_shift = 31 - headroom_plus_one;
int64_t round = 1 << (right_shift - 1);
int32_t result = (static_cast<int64_t>(exp_result_Q015[c]) *
int32_t result = (static_cast<int64_t>(exp_results_Q015[c]) *
static_cast<int64_t>(reciprocal_scale_Q015) +
round) >>
right_shift;


@@ -16,8 +16,10 @@ limitations under the License.
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_STRIDED_SLICE_H_
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/strided_slice_logic.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {


@@ -15,9 +15,15 @@ limitations under the License.
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SUB_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SUB_H_
#include "fixedpoint/fixedpoint.h"
#include <stdint.h>
#include <algorithm>
#include <limits>
#include "ruy/profiler/instrumentation.h" // from @ruy
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
@@ -41,11 +47,11 @@ inline void SubNonBroadcast(const ArithmeticParams& params,
inline void SubNonBroadcast(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const int32* input1_data,
const int32_t* input1_data,
const RuntimeShape& input2_shape,
const int32* input2_data,
const int32_t* input2_data,
const RuntimeShape& output_shape,
int32* output_data) {
int32_t* output_data) {
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
for (int i = 0; i < flat_size; ++i) {
@@ -106,12 +112,12 @@ inline void BroadcastSubSlow(const ArithmeticParams& params,
template <int N = 5>
inline void BroadcastSubSlow(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const uint8* input1_data,
const uint8_t* input1_data,
const RuntimeShape& input2_shape,
const uint8* input2_data,
const uint8_t* input2_data,
const RuntimeShape& output_shape,
uint8* output_data) {
ruy::profiler::ScopeLabel label("BroadcastSubSlow/uint8");
uint8_t* output_data) {
ruy::profiler::ScopeLabel label("BroadcastSubSlow/uint8_t");
TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
@@ -134,28 +140,28 @@ inline void BroadcastSubSlow(const ArithmeticParams& params,
// nesting loops such that the innermost loop has the smallest stride for the
// best cache behavior.
auto sub_func = [&](int indexes[N]) {
const int32 input1_val =
const int32_t input1_val =
params.input1_offset + input1_data[SubscriptToIndex(desc1, indexes)];
const int32 input2_val =
const int32_t input2_val =
params.input2_offset + input2_data[SubscriptToIndex(desc2, indexes)];
const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
const int32 scaled_input1_val =
const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
const int32_t scaled_input1_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input1_val, params.input1_multiplier, params.input1_shift);
const int32 scaled_input2_val =
const int32_t scaled_input2_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input2_val, params.input2_multiplier, params.input2_shift);
const int32 raw_sub = scaled_input1_val - scaled_input2_val;
const int32 raw_output =
const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
const int32_t raw_output =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
raw_sub, params.output_multiplier, params.output_shift) +
params.output_offset;
const int32 clamped_output =
const int32_t clamped_output =
std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, raw_output));
output_data[SubscriptToIndex(output_desc, indexes)] =
static_cast<uint8>(clamped_output);
static_cast<uint8_t>(clamped_output);
};
NDOpsHelper<N>(output_desc, sub_func);
}
@@ -163,12 +169,12 @@ inline void BroadcastSubSlow(const ArithmeticParams& params,
template <int N = 5>
inline void BroadcastSubSlow(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const int32* input1_data,
const int32_t* input1_data,
const RuntimeShape& input2_shape,
const int32* input2_data,
const int32_t* input2_data,
const RuntimeShape& output_shape,
int32* output_data) {
ruy::profiler::ScopeLabel label("BroadcastSubSlow/int32");
int32_t* output_data) {
ruy::profiler::ScopeLabel label("BroadcastSubSlow/int32_t");
TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
@@ -208,7 +214,7 @@ inline void BroadcastSubSlow(const ArithmeticParams& params,
const int8_t* input2_data,
const RuntimeShape& output_shape,
int8_t* output_data) {
ruy::profiler::ScopeLabel label("BroadcastSubSlow/int8");
ruy::profiler::ScopeLabel label("BroadcastSubSlow/int8_t");
NdArrayDesc<N> desc1;
NdArrayDesc<N> desc2;
NdArrayDesc<N> output_desc;
@@ -254,6 +260,45 @@ inline void BroadcastSubSlow(const ArithmeticParams& params,
NDOpsHelper<N>(output_desc, sub_func);
}
template <int N = 5>
void BroadcastSubSlow(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const int64_t* input1_data,
const RuntimeShape& input2_shape,
const int64_t* input2_data,
const RuntimeShape& output_shape, int64_t* output_data) {
ruy::profiler::ScopeLabel label("BroadcastSubSlow/int64_t");
TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
NdArrayDesc<N> desc1;
NdArrayDesc<N> desc2;
NdArrayDesc<N> output_desc;
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
&desc2);
CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
// In Tensorflow, the dimensions are canonically named (batch_number, row,
// col, channel), with extents (batches, height, width, depth), with the
// trailing dimension changing most rapidly (channels has the smallest stride,
// typically 1 element).
//
// In generated C code, we store arrays with the dimensions reversed. The
// first dimension has smallest stride.
//
// We name our variables by their Tensorflow convention, but generate C code
// nesting loops such that the innermost loop has the smallest stride for the
// best cache behavior.
auto sub_func = [&](int indexes[N]) {
output_data[SubscriptToIndex(output_desc, indexes)] =
ActivationFunctionWithMinMax(
input1_data[SubscriptToIndex(desc1, indexes)] -
input2_data[SubscriptToIndex(desc2, indexes)],
params.int64_activation_min, params.int64_activation_max);
};
NDOpsHelper<N>(output_desc, sub_func);
}
template <typename T, int N = 5>
void BroadcastSubSlow(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const T* input1_data,
@@ -294,33 +339,33 @@ void BroadcastSubSlow(const ArithmeticParams& params,
// Element-wise Sub that can often be used for inner loop of broadcast sub as
// well as the non-broadcast sub.
inline void SubElementwise(int size, const ArithmeticParams& params,
const uint8* input1_data, const uint8* input2_data,
uint8* output_data) {
const uint8_t* input1_data,
const uint8_t* input2_data, uint8_t* output_data) {
TFLITE_DCHECK_GT(params.input1_offset, -256);
TFLITE_DCHECK_GT(params.input2_offset, -256);
TFLITE_DCHECK_LT(params.input1_offset, 256);
TFLITE_DCHECK_LT(params.input2_offset, 256);
for (int i = 0; i < size; ++i) {
const int32 input1_val = params.input1_offset + input1_data[i];
const int32 input2_val = params.input2_offset + input2_data[i];
const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
const int32 scaled_input1_val =
const int32_t input1_val = params.input1_offset + input1_data[i];
const int32_t input2_val = params.input2_offset + input2_data[i];
const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
const int32_t scaled_input1_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input1_val, params.input1_multiplier, params.input1_shift);
const int32 scaled_input2_val =
const int32_t scaled_input2_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input2_val, params.input2_multiplier, params.input2_shift);
const int32 raw_sub = scaled_input1_val - scaled_input2_val;
const int32 raw_output =
const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
const int32_t raw_output =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
raw_sub, params.output_multiplier, params.output_shift) +
params.output_offset;
const int32 clamped_output =
const int32_t clamped_output =
std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, raw_output));
output_data[i] = static_cast<uint8>(clamped_output);
output_data[i] = static_cast<uint8_t>(clamped_output);
}
}
@@ -336,22 +381,22 @@ inline void SubElementwise(int size, const ArithmeticParams& params,
TFLITE_DCHECK_LE(params.input2_offset, int8_max_value);
for (int i = 0; i < size; ++i) {
const int32 input1_val = params.input1_offset + input1_data[i];
const int32 input2_val = params.input2_offset + input2_data[i];
const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
const int32 scaled_input1_val =
const int32_t input1_val = params.input1_offset + input1_data[i];
const int32_t input2_val = params.input2_offset + input2_data[i];
const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
const int32_t scaled_input1_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input1_val, params.input1_multiplier, params.input1_shift);
const int32 scaled_input2_val =
const int32_t scaled_input2_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input2_val, params.input2_multiplier, params.input2_shift);
const int32 raw_sub = scaled_input1_val - scaled_input2_val;
const int32 raw_output =
const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
const int32_t raw_output =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
raw_sub, params.output_multiplier, params.output_shift) +
params.output_offset;
const int32 clamped_output =
const int32_t clamped_output =
std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, raw_output));
output_data[i] = static_cast<int8_t>(clamped_output);
@@ -359,9 +404,9 @@ inline void SubElementwise(int size, const ArithmeticParams& params,
}
inline void Sub(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const uint8* input1_data,
const RuntimeShape& input2_shape, const uint8* input2_data,
const RuntimeShape& output_shape, uint8* output_data) {
const RuntimeShape& input1_shape, const uint8_t* input1_data,
const RuntimeShape& input2_shape, const uint8_t* input2_data,
const RuntimeShape& output_shape, uint8_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
const int flat_size =
@@ -428,40 +473,43 @@ void Sub(const ArithmeticParams& params, const RuntimeShape& input1_shape,
}
}
inline void SubWithActivation(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const int32* input1_data,
const RuntimeShape& input2_shape,
const int32* input2_data,
const RuntimeShape& output_shape,
int32* output_data) {
inline void SetActivationMinMax(const ArithmeticParams& params,
int32_t* activation_min,
int32_t* activation_max) {
*activation_min = params.quantized_activation_min;
*activation_max = params.quantized_activation_max;
}
inline void SetActivationMinMax(const ArithmeticParams& params,
float* activation_min, float* activation_max) {
*activation_min = params.float_activation_min;
*activation_max = params.float_activation_max;
}
inline void SetActivationMinMax(const ArithmeticParams& params,
int64_t* activation_min,
int64_t* activation_max) {
*activation_min = params.int64_activation_min;
*activation_max = params.int64_activation_max;
}
template <typename T>
inline void SubWithActivation(
const ArithmeticParams& params, const RuntimeShape& input1_shape,
const T* input1_data, const RuntimeShape& input2_shape,
const T* input2_data, const RuntimeShape& output_shape, T* output_data) {
ruy::profiler::ScopeLabel label("SubWithActivation");
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
T activation_min, activation_max;
SetActivationMinMax(params, &activation_min, &activation_max);
for (int i = 0; i < flat_size; ++i) {
output_data[i] = ActivationFunctionWithMinMax(
input1_data[i] - input2_data[i], params.quantized_activation_min,
params.quantized_activation_max);
input1_data[i] - input2_data[i], activation_min, activation_max);
}
}
inline void SubWithActivation(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const float* input1_data,
const RuntimeShape& input2_shape,
const float* input2_data,
const RuntimeShape& output_shape,
float* output_data) {
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
for (int i = 0; i < flat_size; ++i) {
output_data[i] = ActivationFunctionWithMinMax(
input1_data[i] - input2_data[i], params.float_activation_min,
params.float_activation_max);
}
}
} // namespace reference_ops
} // namespace tflite
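For intuition, the quantized SubElementwise above is the fixed-point counterpart of the following float-domain computation, in which input1_offset/input2_offset are the negated zero points and the multiplier/shift pairs play the role of the scales (helper name and the uint8_t clamp are assumptions of this sketch):

#include <algorithm>
#include <cmath>
#include <cstdint>

inline uint8_t QuantizedSubOneElement(uint8_t a, uint8_t b,
                                      int32_t input1_offset,
                                      int32_t input2_offset,
                                      float input1_scale, float input2_scale,
                                      float output_scale,
                                      int32_t output_offset) {
  // Dequantize both operands, subtract in real-number space, requantize.
  const float real_a = (input1_offset + a) * input1_scale;
  const float real_b = (input2_offset + b) * input2_scale;
  const int32_t unclamped =
      output_offset +
      static_cast<int32_t>(std::lround((real_a - real_b) / output_scale));
  return static_cast<uint8_t>(
      std::min<int32_t>(255, std::max<int32_t>(0, unclamped)));
}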


@@ -0,0 +1,129 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TANH_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TANH_H_
#include <cmath>
#include "fixedpoint/fixedpoint.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/types.h"
#include "tensorflow/lite/kernels/op_macros.h"
namespace tflite {
namespace reference_ops {
inline void Tanh(const RuntimeShape& input_shape, const float* input_data,
const RuntimeShape& output_shape, float* output_data) {
const int flat_size = MatchingFlatSize(input_shape, output_shape);
for (int i = 0; i < flat_size; i++) {
float val = input_data[i];
float result = std::tanh(val);
output_data[i] = result;
}
}
// Convenience version that allows, for example, generated-code calls to be
// uniform between data types.
inline void Tanh(const TanhParams&, const RuntimeShape& input_shape,
const float* input_data, const RuntimeShape& output_shape,
float* output_data) {
// Drop params: not needed.
Tanh(input_shape, input_data, output_shape, output_data);
}
inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape,
const int16_t* input_data, const RuntimeShape& output_shape,
int16_t* output_data) {
const int input_left_shift = params.input_left_shift;
// Support for shifts is limited until we have a parameterized version of
// SaturatingRoundingMultiplyByPOT().
TFLITE_DCHECK_GE(input_left_shift, 0);
TFLITE_DCHECK_LE(input_left_shift, 1);
const int flat_size = MatchingFlatSize(input_shape, output_shape);
// F0 uses 0 integer bits, range [-1, 1].
// This is the return type of math functions such as tanh, logistic,
// whose range is in [-1, 1].
using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
// F3 uses 3 integer bits, range [-8, 8], the input range expected here.
using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
if (input_left_shift == 0) {
for (int i = 0; i < flat_size; i++) {
F3 input = F3::FromRaw(input_data[i]);
F0 output = gemmlowp::tanh(input);
output_data[i] = output.raw();
}
} else {
for (int i = 0; i < flat_size; i++) {
F3 input = F3::FromRaw(
gemmlowp::SaturatingRoundingMultiplyByPOT<1>(input_data[i]));
F0 output = gemmlowp::tanh(input);
output_data[i] = output.raw();
}
}
}
inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape,
const uint8_t* input_data, const RuntimeShape& output_shape,
uint8_t* output_data) {
const int32_t input_zero_point = params.input_zero_point;
const int32_t input_range_radius = params.input_range_radius;
const int32_t input_multiplier = params.input_multiplier;
const int input_left_shift = params.input_left_shift;
const int32_t output_zero_point = 128;
const int flat_size = MatchingFlatSize(input_shape, output_shape);
for (int i = 0; i < flat_size; i++) {
const uint8_t input_val_u8 = input_data[i];
const int32_t input_val_centered =
static_cast<int32_t>(input_val_u8) - input_zero_point;
uint8_t output_val;
if (input_val_centered <= -input_range_radius) {
output_val = 0;
} else if (input_val_centered >= input_range_radius) {
output_val = 255;
} else {
const int32_t input_val_rescaled =
MultiplyByQuantizedMultiplierGreaterThanOne(
input_val_centered, input_multiplier, input_left_shift);
using FixedPoint4 = gemmlowp::FixedPoint<int32_t, 4>;
using FixedPoint0 = gemmlowp::FixedPoint<int32_t, 0>;
const FixedPoint4 input_val_f4 = FixedPoint4::FromRaw(input_val_rescaled);
const FixedPoint0 output_val_f0 = gemmlowp::tanh(input_val_f4);
// Convert from Q0.31 to Q24.7.
using gemmlowp::RoundingDivideByPOT;
int32_t output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 24);
output_val_s32 += output_zero_point;
if (output_val_s32 == 256) {
output_val_s32 = 255;
}
// Reinterpret as Q0.7, encoded in uint8_t.
TFLITE_DCHECK_GE(output_val_s32, 0);
TFLITE_DCHECK_LE(output_val_s32, 255);
output_val = static_cast<uint8_t>(output_val_s32);
}
output_data[i] = output_val;
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TANH_H_
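Illustrative only (not part of the upstream sources): a minimal sketch of driving the float reference Tanh above over a flat buffer. The function name RunTanhExample and the literal values are placeholders.

#include <cstdint>
#include "tensorflow/lite/kernels/internal/reference/tanh.h"

void RunTanhExample() {
  // One dimension of size 4; RuntimeShape(dimensions_count, dims_data).
  const int32_t dims[1] = {4};
  const tflite::RuntimeShape shape(1, dims);
  const float input[4] = {-2.0f, -0.5f, 0.5f, 2.0f};
  float output[4];
  // Element-wise tanh: output[i] == std::tanh(input[i]).
  tflite::reference_ops::Tanh(shape, input, shape, output);
}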

View File

@@ -18,6 +18,7 @@ limitations under the License.
#include <limits>
#include <vector>
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/types.h"
@@ -69,8 +70,8 @@ inline void StridedSlicePadIndices(tflite::StridedSliceParams* p,
}
// Return the index for the first element along that axis. This index will be a
// positive integer between [0, axis_size - 1] that can be used to index
// directly into the data.
// positive integer between [0, axis_size] (or [-1, axis_size - 1] if stride < 0)
// that can be used to index directly into the data.
inline int StartForAxis(const tflite::StridedSliceParams& params,
const RuntimeShape& input_shape, int axis) {
const auto begin_mask = params.begin_mask;
@@ -102,7 +103,13 @@ inline int StartForAxis(const tflite::StridedSliceParams& params,
}
// Clamping
start = Clamp(start, 0, axis_size - 1);
if (strides[axis] > 0) {
// Forward iteration
start = Clamp(start, 0, axis_size);
} else {
// Backward iteration
start = Clamp(start, -1, axis_size - 1);
}
return start;
}
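A standalone sketch of the clamp rule this hunk introduces (not the TFLite helper itself; ClampStart is a hypothetical name): forward strides may now start at axis_size, which yields an empty slice, while backward strides may start at -1, the sentinel one position before index 0.

#include <algorithm>

int ClampStart(int start, int stride, int axis_size) {
  if (stride > 0) {
    // Forward iteration: clamp into [0, axis_size].
    return std::min(std::max(start, 0), axis_size);
  }
  // Backward iteration: clamp into [-1, axis_size - 1].
  return std::min(std::max(start, -1), axis_size - 1);
}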

View File

@@ -24,24 +24,29 @@ limitations under the License.
namespace tflite {
enum class FusedActivationFunctionType : uint8 { kNone, kRelu6, kRelu1, kRelu };
enum class PaddingType : uint8 { kNone, kSame, kValid };
enum class FusedActivationFunctionType : uint8_t {
kNone,
kRelu6,
kRelu1,
kRelu
};
enum class PaddingType : uint8_t { kNone, kSame, kValid };
struct PaddingValues {
int16 width;
int16 height;
int16_t width;
int16_t height;
// offset is used for calculating "remaining" padding, for example, `width`
// is 1 and `width_offset` is 1, so padding_left is 1 while padding_right is
// 1 + 1 = 2.
int16 width_offset;
int16_t width_offset;
// Same as width_offset except it's over the height dimension.
int16 height_offset;
int16_t height_offset;
};
// This enumeration allows for non-default formats for the weights array
// of a fully-connected operator, allowing the use of special optimized
// runtime paths.
enum class FullyConnectedWeightsFormat : uint8 {
enum class FullyConnectedWeightsFormat : uint8_t {
// Default format (flat 2D layout, the inner contiguous dimension
// is input_depth, the outer non-contiguous dimension is output_depth)
kDefault,
@@ -88,11 +93,11 @@ enum class FullyConnectedWeightsFormat : uint8 {
// maximize arithmetic throughput.
//
// Finally, the 'Int8' part in the name refers to the fact that this
// weights format has each weights value encoded as a signed int8 value,
// even if the data type of the weights buffer is uint8. This is intended
// weights format has each weights value encoded as a signed int8_t value,
// even if the data type of the weights buffer is uint8_t. This is intended
// to save runtime kernels the effort to have to XOR the top bit of these
// bytes before using them in signed arithmetic, see this file for more
// explanations on the 'signed int8 trick' in matrix multiplication kernels:
// explanations on the 'signed int8_t trick' in matrix multiplication kernels:
//
// tensorflow/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc
//
@@ -111,7 +116,7 @@ enum class FullyConnectedWeightsFormat : uint8 {
// the real 0 value, and scale designates the difference between the real values
// corresponding to consecutive quantized values differing by 1.
struct QuantizationParams {
int32 zero_point = 0;
int32_t zero_point = 0;
double scale = 0.0;
};
@@ -140,20 +145,20 @@ class RuntimeShape {
if (dimensions_count > kMaxSmallSize) {
#ifdef TF_LITE_STATIC_MEMORY
TFLITE_CHECK(false && "No shape resizing supported on this platform");
#else // TF_LITE_STATIC_MEMORY
dims_pointer_ = new int32[dimensions_count];
#else // TF_LITE_STATIC_MEMORY
dims_pointer_ = new int32_t[dimensions_count];
#endif // TF_LITE_STATIC_MEMORY
}
}
RuntimeShape(int shape_size, int32 value) : size_(0) {
RuntimeShape(int shape_size, int32_t value) : size_(0) {
Resize(shape_size);
for (int i = 0; i < shape_size; ++i) {
SetDim(i, value);
}
}
RuntimeShape(int dimensions_count, const int32* dims_data) : size_(0) {
RuntimeShape(int dimensions_count, const int32_t* dims_data) : size_(0) {
ReplaceWith(dimensions_count, dims_data);
}
@@ -165,33 +170,34 @@ class RuntimeShape {
// rolls out.
RuntimeShape(RuntimeShape const& other) : size_(other.DimensionsCount()) {
if (size_ > kMaxSmallSize) {
dims_pointer_ = new int32[size_];
dims_pointer_ = new int32_t[size_];
}
std::memcpy(DimsData(), other.DimsData(), sizeof(int32) * size_);
std::memcpy(DimsData(), other.DimsData(), sizeof(int32_t) * size_);
}
bool operator==(const RuntimeShape& comp) const {
return this->size_ == comp.size_ &&
std::memcmp(DimsData(), comp.DimsData(), size_ * sizeof(int32)) == 0;
std::memcmp(DimsData(), comp.DimsData(), size_ * sizeof(int32_t)) ==
0;
}
~RuntimeShape() {
if (size_ > kMaxSmallSize) {
#ifdef TF_LITE_STATIC_MEMORY
TFLITE_CHECK(false && "No shape resizing supported on this platform");
#else // TF_LITE_STATIC_MEMORY
#else // TF_LITE_STATIC_MEMORY
delete[] dims_pointer_;
#endif // TF_LITE_STATIC_MEMORY
}
}
inline int32 DimensionsCount() const { return size_; }
inline int32 Dims(int i) const {
inline int32_t DimensionsCount() const { return size_; }
inline int32_t Dims(int i) const {
TFLITE_DCHECK_GE(i, 0);
TFLITE_DCHECK_LT(i, size_);
return size_ > kMaxSmallSize ? dims_pointer_[i] : dims_[i];
}
inline void SetDim(int i, int32 val) {
inline void SetDim(int i, int32_t val) {
TFLITE_DCHECK_GE(i, 0);
TFLITE_DCHECK_LT(i, size_);
if (size_ > kMaxSmallSize) {
@@ -201,20 +207,20 @@ class RuntimeShape {
}
}
inline int32* DimsData() {
inline int32_t* DimsData() {
return size_ > kMaxSmallSize ? dims_pointer_ : dims_;
}
inline const int32* DimsData() const {
inline const int32_t* DimsData() const {
return size_ > kMaxSmallSize ? dims_pointer_ : dims_;
}
// The caller must ensure that the shape is no bigger than 5-D.
inline const int32* DimsDataUpTo5D() const { return dims_; }
inline const int32_t* DimsDataUpTo5D() const { return dims_; }
inline void Resize(int dimensions_count) {
if (size_ > kMaxSmallSize) {
#ifdef TF_LITE_STATIC_MEMORY
TFLITE_CHECK(false && "No shape resizing supported on this platform");
#else // TF_LITE_STATIC_MEMORY
#else // TF_LITE_STATIC_MEMORY
delete[] dims_pointer_;
#endif // TF_LITE_STATIC_MEMORY
}
@@ -222,16 +228,16 @@ class RuntimeShape {
if (dimensions_count > kMaxSmallSize) {
#ifdef TF_LITE_STATIC_MEMORY
TFLITE_CHECK(false && "No shape resizing supported on this platform");
#else // TF_LITE_STATIC_MEMORY
dims_pointer_ = new int32[dimensions_count];
#else // TF_LITE_STATIC_MEMORY
dims_pointer_ = new int32_t[dimensions_count];
#endif // TF_LITE_STATIC_MEMORY
}
}
inline void ReplaceWith(int dimensions_count, const int32* dims_data) {
inline void ReplaceWith(int dimensions_count, const int32_t* dims_data) {
Resize(dimensions_count);
int32* dst_dims = DimsData();
std::memcpy(dst_dims, dims_data, dimensions_count * sizeof(int32));
int32_t* dst_dims = DimsData();
std::memcpy(dst_dims, dims_data, dimensions_count * sizeof(int32_t));
}
template <typename T>
@@ -239,7 +245,7 @@ class RuntimeShape {
const int dimensions_count =
std::distance(src_iterable.begin(), src_iterable.end());
Resize(dimensions_count);
int32* data = DimsData();
int32_t* data = DimsData();
for (auto it : src_iterable) {
*data = it;
++data;
@@ -288,13 +294,13 @@ class RuntimeShape {
SetDim(i, pad_value);
}
std::memcpy(DimsData() + size_increase, shape.DimsData(),
sizeof(int32) * shape.DimensionsCount());
sizeof(int32_t) * shape.DimensionsCount());
}
int32 size_;
int32_t size_;
union {
int32 dims_[kMaxSmallSize];
int32* dims_pointer_;
int32_t dims_[kMaxSmallSize];
int32_t* dims_pointer_;
};
};
@@ -432,7 +438,7 @@ int MatchingArraySize(const ArrayType1& array1, int index1,
inline int MatchingDim(const RuntimeShape& shape1, int index1,
const RuntimeShape& shape2, int index2) {
TFLITE_DCHECK_EQ(shape1.Dims(index1), shape2.Dims(index2));
return shape1.Dims(index1);
return std::min(shape1.Dims(index1), shape2.Dims(index2));
}
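Illustrative only: a typical call pattern for MatchingDim, which after this change returns the smaller of the two (DCHECK-equal) dimensions instead of always the first. MatchingDepthExample is a hypothetical name and assumes 4-D shapes.

#include "tensorflow/lite/kernels/internal/types.h"

int MatchingDepthExample(const tflite::RuntimeShape& input_shape,
                         const tflite::RuntimeShape& filter_shape) {
  // DCHECKs that dim 3 of both shapes agree, then returns the smaller value.
  return tflite::MatchingDim(input_shape, 3, filter_shape, 3);
}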
template <typename... Args>
@@ -713,7 +719,7 @@ void ComputeStrides(Dims<N>* dims) {
}
}
enum class BroadcastableOpCategory : uint8 {
enum class BroadcastableOpCategory : uint8_t {
kNone,
kNonBroadcast, // Matching input shapes.
kFirstInputBroadcastsFast, // Fivefold nested loops.
@@ -729,21 +735,21 @@ static_assert(sizeof(MinMax) == 8, "");
struct ActivationParams {
FusedActivationFunctionType activation_type;
// uint8, etc, activation params.
int32 quantized_activation_min;
int32 quantized_activation_max;
// uint8_t, etc, activation params.
int32_t quantized_activation_min;
int32_t quantized_activation_max;
};
struct ReluParams : public ActivationParams {
int32 input_offset;
int32 output_offset;
int32 output_multiplier;
int32 output_shift;
int32_t input_offset;
int32_t output_offset;
int32_t output_multiplier;
int output_shift;
};
// Styles of resizing op usages. For example, kImageStyle can be used with a Pad
// op for pattern-specific optimization.
enum class ResizingCategory : uint8 {
enum class ResizingCategory : uint8_t {
kNone,
kImageStyle, // 4D, operating on inner dimensions, say {0, a, b, 0}.
kGenericResize,
@@ -753,24 +759,29 @@ enum class ResizingCategory : uint8 {
struct ArithmeticParams {
// Shape dependent / common to data / op types.
BroadcastableOpCategory broadcast_category;
// uint8 inference params.
int32 input1_offset;
int32 input2_offset;
int32 output_offset;
int32 output_multiplier;
// uint8_t inference params.
int32_t input1_offset;
int32_t input2_offset;
int32_t output_offset;
int32_t output_multiplier;
int output_shift;
// Add / Sub, not Mul, uint8 inference params.
// Add / Sub, not Mul, uint8_t inference params.
int left_shift;
int32 input1_multiplier;
int32_t input1_multiplier;
int input1_shift;
int32 input2_multiplier;
int32_t input2_multiplier;
int input2_shift;
// uint8, etc, activation params.
int32 quantized_activation_min;
int32 quantized_activation_max;
// TODO(b/158622529): Union the following activation params.
// uint8_t, etc, activation params.
int32_t quantized_activation_min;
int32_t quantized_activation_max;
// float activation params.
float float_activation_min;
float float_activation_max;
// int64_t activation params.
int64_t int64_activation_min;
int64_t int64_activation_max;
// Processed output dimensions.
// Let input "a" be the one that broadcasts in the faster-changing dimension.
@@ -785,22 +796,22 @@ struct ArithmeticParams {
};
struct ConcatenationParams {
int8 axis;
const int32* input_zeropoint;
int8_t axis;
const int32_t* input_zeropoint;
const float* input_scale;
uint16 inputs_count;
int32 output_zeropoint;
uint16_t inputs_count;
int32_t output_zeropoint;
float output_scale;
};
struct ComparisonParams {
// uint8 inference params.
// uint8_t inference params.
int left_shift;
int32 input1_offset;
int32 input1_multiplier;
int32_t input1_offset;
int32_t input1_multiplier;
int input1_shift;
int32 input2_offset;
int32 input2_multiplier;
int32_t input2_offset;
int32_t input2_multiplier;
int input2_shift;
// Shape dependent / common to inference types.
bool is_broadcast;
@@ -810,81 +821,81 @@ struct ConvParams {
PaddingType padding_type;
PaddingValues padding_values;
// TODO(starka): This was just "stride", so check that width+height is OK.
int16 stride_width;
int16 stride_height;
int16 dilation_width_factor;
int16 dilation_height_factor;
// uint8 inference params.
int16_t stride_width;
int16_t stride_height;
int16_t dilation_width_factor;
int16_t dilation_height_factor;
// uint8_t inference params.
// TODO(b/65838351): Use smaller types if appropriate.
int32 input_offset;
int32 weights_offset;
int32 output_offset;
int32 output_multiplier;
int32_t input_offset;
int32_t weights_offset;
int32_t output_offset;
int32_t output_multiplier;
int output_shift;
// uint8, etc, activation params.
int32 quantized_activation_min;
int32 quantized_activation_max;
// uint8_t, etc, activation params.
int32_t quantized_activation_min;
int32_t quantized_activation_max;
// float activation params.
float float_activation_min;
float float_activation_max;
};
struct DepthToSpaceParams {
int32 block_size;
int32_t block_size;
};
struct DepthwiseParams {
PaddingType padding_type;
PaddingValues padding_values;
int16 stride_width;
int16 stride_height;
int16 dilation_width_factor;
int16 dilation_height_factor;
int16 depth_multiplier;
// uint8 inference params.
int16_t stride_width;
int16_t stride_height;
int16_t dilation_width_factor;
int16_t dilation_height_factor;
int16_t depth_multiplier;
// uint8_t inference params.
// TODO(b/65838351): Use smaller types if appropriate.
int32 input_offset;
int32 weights_offset;
int32 output_offset;
int32 output_multiplier;
int32_t input_offset;
int32_t weights_offset;
int32_t output_offset;
int32_t output_multiplier;
int output_shift;
// uint8, etc, activation params.
int32 quantized_activation_min;
int32 quantized_activation_max;
// uint8_t, etc, activation params.
int32_t quantized_activation_min;
int32_t quantized_activation_max;
// float activation params.
float float_activation_min;
float float_activation_max;
const int32* output_multiplier_per_channel;
const int32* output_shift_per_channel;
const int32_t* output_multiplier_per_channel;
const int32_t* output_shift_per_channel;
};
struct DequantizationParams {
double scale;
int32 zero_point;
int32_t zero_point;
};
struct PerChannelDequantizationParams {
const float* scale;
const int32* zero_point;
int32 quantized_dimension;
const int32_t* zero_point;
int32_t quantized_dimension;
};
struct FakeQuantParams {
MinMax minmax;
int32 num_bits;
int32_t num_bits;
};
struct FullyConnectedParams {
// uint8 inference params.
// uint8_t inference params.
// TODO(b/65838351): Use smaller types if appropriate.
int32 input_offset;
int32 weights_offset;
int32 output_offset;
int32 output_multiplier;
int32_t input_offset;
int32_t weights_offset;
int32_t output_offset;
int32_t output_multiplier;
int output_shift;
// uint8, etc, activation params.
int32 quantized_activation_min;
int32 quantized_activation_max;
// uint8_t, etc, activation params.
int32_t quantized_activation_min;
int32_t quantized_activation_max;
// float activation params.
float float_activation_min;
float float_activation_max;
@@ -895,16 +906,16 @@ struct FullyConnectedParams {
};
struct GatherParams {
int16 axis;
int16_t axis;
};
struct L2NormalizationParams {
// uint8 inference params.
int32 input_zero_point;
// uint8_t inference params.
int32_t input_zero_point;
};
struct LocalResponseNormalizationParams {
int32 range;
int32_t range;
double bias;
double alpha;
double beta;
@@ -932,48 +943,50 @@ struct HardSwishParams {
};
struct LogisticParams {
// uint8 inference params.
int32 input_zero_point;
int32 input_range_radius;
int32 input_multiplier;
// uint8_t inference params.
int32_t input_zero_point;
int32_t input_range_radius;
int32_t input_multiplier;
int input_left_shift;
};
struct LstmCellParams {
int32 weights_zero_point;
int32 accum_multiplier;
int32_t weights_zero_point;
int32_t accum_multiplier;
int accum_shift;
int state_integer_bits;
};
struct MeanParams {
int8 axis_count;
int16 axis[4];
int8_t axis_count;
int16_t axis[4];
};
struct PackParams {
int8 axis;
const int32* input_zeropoint;
int8_t axis;
const int32_t* input_zeropoint;
const float* input_scale;
uint16 inputs_count;
int32 output_zeropoint;
uint16_t inputs_count;
int32_t output_zeropoint;
float output_scale;
};
struct PadParams {
int8 left_padding_count;
int32 left_padding[4];
int8 right_padding_count;
int32 right_padding[4];
int8_t left_padding_count;
int32_t left_padding[4];
int8_t right_padding_count;
int32_t right_padding[4];
ResizingCategory resizing_category;
};
struct PreluParams {
int32 input_offset;
int32 alpha_offset;
int32 output_offset;
int32 output_multiplier;
int output_shift;
int32_t input_offset;
int32_t alpha_offset;
int32_t output_offset;
int32_t output_multiplier_1;
int output_shift_1;
int32_t output_multiplier_2;
int output_shift_2;
};
struct PoolParams {
@@ -984,17 +997,17 @@ struct PoolParams {
int stride_width;
int filter_height;
int filter_width;
// uint8, etc, activation params.
int32 quantized_activation_min;
int32 quantized_activation_max;
// uint8_t, etc, activation params.
int32_t quantized_activation_min;
int32_t quantized_activation_max;
// float activation params.
float float_activation_min;
float float_activation_max;
};
struct ReshapeParams {
int8 shape_count;
int32 shape[4];
int8_t shape_count;
int32_t shape[4];
};
struct ResizeBilinearParams {
@@ -1011,91 +1024,95 @@ struct ResizeNearestNeighborParams {
};
struct SliceParams {
int8 begin_count;
int32 begin[4];
int8 size_count;
int32 size[4];
int8_t begin_count;
int32_t begin[4];
int8_t size_count;
int32_t size[4];
};
struct SoftmaxParams {
// beta is not really used (not a Tensorflow parameter) and not implemented
// for LogSoftmax.
double beta;
// uint8 inference params. Used even when beta defaults to 1.0.
int32 input_multiplier;
int32 input_left_shift;
// uint8_t inference params. Used even when beta defaults to 1.0.
int32_t input_multiplier;
int32_t input_left_shift;
// Reverse scaling is only used by LogSoftmax.
int32 reverse_scaling_divisor;
int32 reverse_scaling_right_shift;
int32_t reverse_scaling_divisor;
int32_t reverse_scaling_right_shift;
int diff_min;
int32_t zero_point;
float scale;
float* table;
// int16_t LUT for exp(x), where x is uniformly distributed in [-10.0, 0.0].
int16_t* exp_lut;
// int16_t LUT for 1 / (1 + x), where x is uniformly distributed in [0.0, 1.0].
int16_t* one_over_one_plus_x_lut;
uint8_t* uint8_table1;
uint8_t* uint8_table2;
};
struct SpaceToBatchParams {
// "Zero" padding for uint8 means padding with the output offset.
int32 output_offset;
// "Zero" padding for uint8_t means padding with the output offset.
int32_t output_offset;
};
struct SpaceToDepthParams {
int32 block_size;
int32_t block_size;
};
struct SplitParams {
// Graphs that split into, say, 2000 nodes are encountered. The indices in
// OperatorEdges are of type uint16.
uint16 num_split;
int16 axis;
// OperatorEdges are of type uint16_t.
uint16_t num_split;
int16_t axis;
};
struct SqueezeParams {
int8 squeeze_dims_count;
int32 squeeze_dims[4];
int8_t squeeze_dims_count;
int32_t squeeze_dims[4];
};
struct StridedSliceParams {
int8 start_indices_count;
int32 start_indices[5];
int8 stop_indices_count;
int32 stop_indices[5];
int8 strides_count;
int32 strides[5];
int8_t start_indices_count;
int32_t start_indices[5];
int8_t stop_indices_count;
int32_t stop_indices[5];
int8_t strides_count;
int32_t strides[5];
int16 begin_mask;
int16 ellipsis_mask;
int16 end_mask;
int16 new_axis_mask;
int16 shrink_axis_mask;
int16_t begin_mask;
int16_t ellipsis_mask;
int16_t end_mask;
int16_t new_axis_mask;
int16_t shrink_axis_mask;
};
struct TanhParams {
int32 input_zero_point;
int32 input_range_radius;
int32 input_multiplier;
int32_t input_zero_point;
int32_t input_range_radius;
int32_t input_multiplier;
int input_left_shift;
};
struct TransposeParams {
int8 perm_count;
int32 perm[5];
int8_t perm_count;
int32_t perm[5];
};
struct UnpackParams {
uint16 num_split;
int16 axis;
uint16_t num_split;
int16_t axis;
};
struct LeakyReluParams {
float alpha;
int32 input_offset;
int32 output_offset;
int32 output_multiplier_alpha;
int32 output_shift_alpha;
int32 output_multiplier_identity;
int32 output_shift_identity;
int32_t input_offset;
int32_t output_offset;
int32_t output_multiplier_alpha;
int32_t output_shift_alpha;
int32_t output_multiplier_identity;
int32_t output_shift_identity;
};
template <typename P>
@@ -1105,13 +1122,19 @@ inline void SetActivationParams(float min, float max, P* params) {
}
template <typename P>
inline void SetActivationParams(int32 min, int32 max, P* params) {
inline void SetActivationParams(int32_t min, int32_t max, P* params) {
params->quantized_activation_min = min;
params->quantized_activation_max = max;
}
template <typename P>
inline void GetActivationParams(const P& params, int32* min, int32* max) {
inline void SetActivationParams(int64_t min, int64_t max, P* params) {
params->int64_activation_min = min;
params->int64_activation_max = max;
}
template <typename P>
inline void GetActivationParams(const P& params, int32_t* min, int32_t* max) {
*min = params.quantized_activation_min;
*max = params.quantized_activation_max;
}
@@ -1122,6 +1145,11 @@ inline void GetActivationParams(const P& params, float* min, float* max) {
*max = params.float_activation_max;
}
template <typename P>
inline void GetActivationParams(const P& params, int64_t* min, int64_t* max) {
*min = params.int64_activation_min;
*max = params.int64_activation_max;
}
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_TYPES_H_
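Illustrative only: a minimal sketch exercising the new int64_t activation-parameter overloads added above through ArithmeticParams. The wide-open limits and the function name are placeholders.

#include <cstdint>
#include <limits>
#include "tensorflow/lite/kernels/internal/types.h"

void SetInt64ActivationExample(tflite::ArithmeticParams* params) {
  // Stores into the new int64_activation_min / int64_activation_max fields.
  tflite::SetActivationParams(std::numeric_limits<int64_t>::min(),
                              std::numeric_limits<int64_t>::max(), params);
  int64_t act_min, act_max;
  tflite::GetActivationParams(*params, &act_min, &act_max);
}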

View File

@@ -14,15 +14,176 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/kernel_util.h"
#include <stdint.h>
#include <stdlib.h>
#include <algorithm>
#include <cmath>
#include <complex>
#include <limits>
#include <memory>
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
namespace tflite {
namespace {
// Assumes tensor_index is a valid index (in bounds)
inline TfLiteTensor* GetTensorAtIndex(const TfLiteContext* context,
int tensor_index) {
if (context->tensors != nullptr) {
return &context->tensors[tensor_index];
} else {
return context->GetTensor(context, tensor_index);
}
}
// Validate in a single place to reduce binary size
inline TfLiteStatus ValidateTensorIndexingSafe(const TfLiteContext* context,
int index, int max_size,
const int* tensor_indices,
int* tensor_index) {
if (index < 0 || index >= max_size) {
TF_LITE_KERNEL_LOG(const_cast<TfLiteContext*>(context),
"Invalid tensor index %d (not in [0, %d))\n", index,
max_size);
return kTfLiteError;
}
if (tensor_indices[index] == kTfLiteOptionalTensor) {
TF_LITE_KERNEL_LOG(const_cast<TfLiteContext*>(context),
"Tensor at index %d was optional but was expected\n",
index);
return kTfLiteError;
}
*tensor_index = tensor_indices[index];
return kTfLiteOk;
}
// Same as above but returns -1 for invalid inputs instead of status + logging
// error.
inline int ValidateTensorIndexing(const TfLiteContext* context, int index,
int max_size, const int* tensor_indices) {
if (index >= 0 && index < max_size) {
const int tensor_index = tensor_indices[index];
if (tensor_index != kTfLiteOptionalTensor) {
return tensor_index;
}
}
return -1;
}
inline TfLiteTensor* GetMutableInput(const TfLiteContext* context,
const TfLiteNode* node, int index) {
const int tensor_index = ValidateTensorIndexing(
context, index, node->inputs->size, node->inputs->data);
if (tensor_index < 0) {
return nullptr;
}
return GetTensorAtIndex(context, tensor_index);
}
inline TfLiteStatus GetMutableInputSafe(const TfLiteContext* context,
const TfLiteNode* node, int index,
const TfLiteTensor** tensor) {
int tensor_index;
TF_LITE_ENSURE_OK(
context, ValidateTensorIndexingSafe(context, index, node->inputs->size,
node->inputs->data, &tensor_index));
*tensor = GetTensorAtIndex(context, tensor_index);
return kTfLiteOk;
}
} // anonymous namespace.
const TfLiteTensor* GetInput(const TfLiteContext* context,
const TfLiteNode* node, int index) {
return GetMutableInput(context, node, index);
}
TfLiteStatus GetInputSafe(const TfLiteContext* context, const TfLiteNode* node,
int index, const TfLiteTensor** tensor) {
return GetMutableInputSafe(context, node, index, tensor);
}
TfLiteTensor* GetVariableInput(TfLiteContext* context, const TfLiteNode* node,
int index) {
TfLiteTensor* tensor = GetMutableInput(context, node, index);
return tensor->is_variable ? tensor : nullptr;
}
TfLiteTensor* GetOutput(TfLiteContext* context, const TfLiteNode* node,
int index) {
const int tensor_index = ValidateTensorIndexing(
context, index, node->outputs->size, node->outputs->data);
if (tensor_index < 0) {
return nullptr;
}
return GetTensorAtIndex(context, tensor_index);
}
TfLiteStatus GetOutputSafe(const TfLiteContext* context, const TfLiteNode* node,
int index, TfLiteTensor** tensor) {
int tensor_index;
TF_LITE_ENSURE_OK(
context, ValidateTensorIndexingSafe(context, index, node->outputs->size,
node->outputs->data, &tensor_index));
*tensor = GetTensorAtIndex(context, tensor_index);
return kTfLiteOk;
}
const TfLiteTensor* GetOptionalInputTensor(const TfLiteContext* context,
const TfLiteNode* node, int index) {
return GetInput(context, node, index);
}
#ifndef TF_LITE_STATIC_MEMORY
TfLiteTensor* GetTemporary(TfLiteContext* context, const TfLiteNode* node,
int index) {
const int tensor_index = ValidateTensorIndexing(
context, index, node->temporaries->size, node->temporaries->data);
if (tensor_index < 0) {
return nullptr;
}
return GetTensorAtIndex(context, tensor_index);
}
TfLiteStatus GetTemporarySafe(const TfLiteContext* context,
const TfLiteNode* node, int index,
TfLiteTensor** tensor) {
int tensor_index;
TF_LITE_ENSURE_OK(context, ValidateTensorIndexingSafe(
context, index, node->temporaries->size,
node->temporaries->data, &tensor_index));
*tensor = GetTensorAtIndex(context, tensor_index);
return kTfLiteOk;
}
const TfLiteTensor* GetIntermediates(TfLiteContext* context,
const TfLiteNode* node, int index) {
const int tensor_index = ValidateTensorIndexing(
context, index, node->intermediates->size, node->intermediates->data);
if (tensor_index < 0) {
return nullptr;
}
return GetTensorAtIndex(context, tensor_index);
}
TfLiteStatus GetIntermediatesSafe(const TfLiteContext* context,
const TfLiteNode* node, int index,
TfLiteTensor** tensor) {
int tensor_index;
TF_LITE_ENSURE_OK(context, ValidateTensorIndexingSafe(
context, index, node->intermediates->size,
node->intermediates->data, &tensor_index));
*tensor = GetTensorAtIndex(context, tensor_index);
return kTfLiteOk;
}
#endif // TF_LITE_STATIC_MEMORY
// Per-axis
TfLiteStatus PopulateConvolutionQuantizationParams(
TfLiteContext* context, const TfLiteTensor* input,
@@ -126,11 +287,27 @@ TfLiteStatus GetQuantizedConvolutionMultipler(TfLiteContext* context,
// pipeline.
if (bias) {
const double bias_scale = static_cast<double>(bias->params.scale);
// Here we're making sure the input_product_scale & bias_scale the same.
// Normally this should be guaranteed by the training pipeline, we are
// setting the threshold to be 2e-6 to allow some numeric stability
// difference.
TF_LITE_ENSURE(context, std::abs(input_product_scale - bias_scale) <= 2e-6);
// Here we're making sure the input_product_scale & bias_scale are about the
// same. Since we have:
// (output - output_zp) * output_scale =
// input_product_scale * input_product + bias * bias_scale ---- (0)
//
// (0) equals:
// (input_product + bias) * input_product_scale ----- (1)
// +
// bias * (bias_scale - input_product_scale) ------ (2)
//
// For the real kernel computation, we're doing (1), so we really need to
// make sure (2) has minimum impact on the output, so:
// bias * (bias_scale - input_product_scale) / output_scale should be
// a small number for an integer.
// Since normally bias should be within a small range.
// We should expect (bias_scale - input_product_scale) / output_scale to
// be a small number like 0.02.
const double scale_diff = std::abs(input_product_scale - bias_scale);
const double output_scale = static_cast<double>(output->params.scale);
TF_LITE_ENSURE(context, scale_diff / output_scale <= 0.02);
}
return GetQuantizedConvolutionMultipler(context, input, filter, output,
multiplier);
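A worked illustration of the relaxed bias-scale check above, with numbers chosen purely for illustration (not taken from any TFLite test): the absolute difference between input_product_scale and bias_scale is now compared against 2% of the output scale instead of a fixed 2e-6.

#include <cmath>

bool BiasScaleCloseEnough() {
  const double input_scale = 0.5, filter_scale = 0.02;   // product = 0.01
  const double bias_scale = 0.0101;                      // slightly off
  const double output_scale = 0.1;
  const double scale_diff = std::abs(input_scale * filter_scale - bias_scale);
  // 0.0001 / 0.1 = 0.001 <= 0.02, so this pair of scales is accepted.
  return scale_diff / output_scale <= 0.02;
}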
@@ -167,7 +344,7 @@ void CalculateActivationRangeQuantizedImpl(TfLiteFusedActivation activation,
} else if (activation == kTfLiteActRelu6) {
*act_min = std::max(qmin, quantize(0.0));
*act_max = std::min(qmax, quantize(6.0));
} else if (activation == kTfLiteActRelu1) {
} else if (activation == kTfLiteActReluN1To1) {
*act_min = std::max(qmin, quantize(-1.0));
*act_max = std::min(qmax, quantize(1.0));
} else {
@@ -258,4 +435,44 @@ TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
}
#endif // TF_LITE_STATIC_MEMORY
// The size of a string is not constant, so return 0 in that case.
int TfLiteTypeGetSize(TfLiteType type) {
switch (type) {
case kTfLiteUInt8:
TF_LITE_ASSERT_EQ(sizeof(uint8_t), 1);
return 1;
case kTfLiteInt8:
TF_LITE_ASSERT_EQ(sizeof(int8_t), 1);
return 1;
case kTfLiteBool:
return sizeof(bool);
case kTfLiteInt16:
TF_LITE_ASSERT_EQ(sizeof(int16_t), 2);
return 2;
case kTfLiteFloat16:
TF_LITE_ASSERT_EQ(sizeof(int16_t), 2);
return 2;
case kTfLiteFloat32:
TF_LITE_ASSERT_EQ(sizeof(float), 4);
return 4;
case kTfLiteInt32:
TF_LITE_ASSERT_EQ(sizeof(int32_t), 4);
return 4;
case kTfLiteInt64:
TF_LITE_ASSERT_EQ(sizeof(int64_t), 8);
return 8;
case kTfLiteFloat64:
TF_LITE_ASSERT_EQ(sizeof(double), 8);
return 8;
case kTfLiteComplex64:
TF_LITE_ASSERT_EQ(sizeof(std::complex<float>), 8);
return 8;
case kTfLiteComplex128:
TF_LITE_ASSERT_EQ(sizeof(std::complex<double>), 16);
return 16;
default:
return 0;
}
}
} // namespace tflite
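Illustrative only: a small sketch of how the new TfLiteTypeGetSize helper above might be used to compute a buffer size; TensorBytes is a hypothetical name.

#include <cstddef>
#include <cstdint>
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/kernel_util.h"

size_t TensorBytes(TfLiteType type, int64_t num_elements) {
  // TfLiteTypeGetSize returns 0 for string tensors, giving 0 bytes here.
  const int element_size = tflite::TfLiteTypeGetSize(type);
  return static_cast<size_t>(element_size) * static_cast<size_t>(num_elements);
}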

View File

@@ -15,52 +15,148 @@ limitations under the License.
#ifndef TENSORFLOW_LITE_KERNELS_KERNEL_UTIL_H_
#define TENSORFLOW_LITE_KERNELS_KERNEL_UTIL_H_
#include <algorithm>
#include <stdint.h>
#include <limits>
#include "flatbuffers/flatbuffers.h"
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
namespace tflite {
// A fair number of functions in this header have historically been inline.
// It is ok to change functions to not be inline if the latency with
// benchmark_model for MobileNet + MobileBERT is unaffected. If such a change is
// made, move the newly non-inlined function declarations to the top of this
// header file.
// Note: You must check if result is not null:
//
// TfLiteTensor* my_tensor = GetInput(context, node, kMyTensorIdx);
// TF_LITE_ENSURE(context, my_tensor != nullptr);
//
// This is because the index might point to the optional tensor constant
// (kTfLiteOptionalTensor) in which case there is no tensor to return.
const TfLiteTensor* GetInput(const TfLiteContext* context,
const TfLiteNode* node, int index);
// Same as `GetInput` but returns boolean and uses output argument for tensor.
//
// TfLiteTensor* my_tensor;
// TF_LITE_ENSURE_OK(context,
// GetInputSafe(context, node, kMyTensorIdx, &my_tensor));
// // can use my_tensor directly from here onwards, it is not nullptr
//
// Should be used in cases where the binary size is too large.
TfLiteStatus GetInputSafe(const TfLiteContext* context, const TfLiteNode* node,
int index, const TfLiteTensor** tensor);
// Note: You must check if result is not null:
//
// TfLiteTensor* my_tensor = GetVariableInput(context, node, kMyTensorIdx);
// TF_LITE_ENSURE(context, my_tensor != nullptr);
//
// This is because the index might point to the optional tensor constant
// (kTfLiteOptionalTensor) in which case there is no tensor to return.
TfLiteTensor* GetVariableInput(TfLiteContext* context, const TfLiteNode* node,
int index);
// Note: You must check if result is not null:
//
// TfLiteTensor* my_tensor = GetOutput(context, node, kMyTensorIdx);
// TF_LITE_ENSURE(context, my_tensor != nullptr);
//
// This is because the index might point to the optional tensor constant
// (kTfLiteOptionalTensor) in which case there is no tensor to return.
TfLiteTensor* GetOutput(TfLiteContext* context, const TfLiteNode* node,
int index);
// Same as `GetOutput` but returns boolean and uses output argument for tensor.
//
// TfLiteTensor* my_tensor;
// TF_LITE_ENSURE_OK(context,
// GetOutputSafe(context, node, kMyTensorIdx, &my_tensor));
// // can use my_tensor directly from here onwards, it is not nullptr
//
// Should be used in cases where the binary size is too large.
TfLiteStatus GetOutputSafe(const TfLiteContext* context, const TfLiteNode* node,
int index, TfLiteTensor** tensor);
// Note: You must check if result is not null:
//
// TfLiteTensor* my_tensor = GetOptionalInputTensor(context, node, kIdx);
// TF_LITE_ENSURE(context, my_tensor != nullptr);
//
// This is because the index might point to the optional tensor constant
// (kTfLiteOptionalTensor) in which case there is no tensor to return.
//
// Deprecated. GetInput has the same functionality.
const TfLiteTensor* GetOptionalInputTensor(const TfLiteContext* context,
const TfLiteNode* node, int index);
#ifndef TF_LITE_STATIC_MEMORY
// Note: You must check if result is not null:
//
// TfLiteTensor* my_tensor = GetTemporary(context, node, kMyTensorIdx);
// TF_LITE_ENSURE(context, my_tensor != nullptr);
//
// This is because the index might point to the optional tensor constant
// (kTfLiteOptionalTensor) in which case there is no tensor to return.
TfLiteTensor* GetTemporary(TfLiteContext* context, const TfLiteNode* node,
int index);
// Same as `GetTemporary` but returns boolean and uses output argument for
// tensor.
//
// TfLiteTensor* my_tensor;
// TF_LITE_ENSURE_OK(context,
// GetTemporarySafe(context, node, kMyTensorIdx,
// &my_tensor));
// // can use my_tensor directly from here onwards, it is not nullptr
//
// Should be used in cases where the binary size is too large.
TfLiteStatus GetTemporarySafe(const TfLiteContext* context,
const TfLiteNode* node, int index,
TfLiteTensor** tensor);
// Note: You must check if result is not null:
//
// TfLiteTensor* my_tensor = GetIntermediates(context, node, kMyTensorIdx);
// TF_LITE_ENSURE(context, my_tensor != nullptr);
//
// This is because the index might point to the optional tensor constant
// (kTfLiteOptionalTensor) in which case there is no tensor to return.
const TfLiteTensor* GetIntermediates(TfLiteContext* context,
const TfLiteNode* node, int index);
// Same as `GetIntermediates` but returns boolean and uses output argument for
// tensor.
//
// TfLiteTensor* my_tensor;
// TF_LITE_ENSURE_OK(context,
// GetIntermediatesSafe(context, node, kMyTensorIdx,
// &my_tensor));
// // can use my_tensor directly from here onwards, it is not nullptr
//
// Should be used in cases where the binary size is too large.
TfLiteStatus GetIntermediatesSafe(const TfLiteContext* context,
const TfLiteNode* node, int index,
TfLiteTensor** tensor);
#endif // TF_LITE_STATIC_MEMORY
inline int NumDimensions(const TfLiteTensor* t) { return t->dims->size; }
inline int SizeOfDimension(const TfLiteTensor* t, int dim) {
return t->dims->data[dim];
}
inline const TfLiteTensor* GetInput(TfLiteContext* context,
const TfLiteNode* node, int index) {
return &context
->tensors[flatbuffers::EndianScalar(node->inputs->data[index])];
}
// Note: You must check if result is not null:
// TfLiteTensor* my_tensor = GetVariableInput(context, node, kMyTensorIdx);
// TF_LITE_ENSURE(context, my_tensor != nullptr);
inline TfLiteTensor* GetVariableInput(TfLiteContext* context,
const TfLiteNode* node, int index) {
TfLiteTensor* tensor =
&context->tensors[flatbuffers::EndianScalar(node->inputs->data[index])];
return (tensor->is_variable) ? tensor : nullptr;
}
inline TfLiteTensor* GetOutput(TfLiteContext* context, const TfLiteNode* node,
int index) {
return &context
->tensors[flatbuffers::EndianScalar(node->outputs->data[index])];
}
inline TfLiteTensor* GetTemporary(TfLiteContext* context,
const TfLiteNode* node, int index) {
return &context->tensors[flatbuffers::EndianScalar(
node->temporaries->data[index])];
}
inline const TfLiteTensor* GetIntermediates(TfLiteContext* context,
const TfLiteNode* node, int index) {
return &context->tensors[node->intermediates->data[index]];
}
inline int NumInputs(const TfLiteNode* node) { return node->inputs->size; }
inline int NumOutputs(const TfLiteNode* node) { return node->outputs->size; }
#ifndef TF_LITE_STATIC_MEMORY
inline int NumIntermediates(const TfLiteNode* node) {
return node->intermediates->size;
}
#endif // TF_LITE_STATIC_MEMORY
inline int64_t NumElements(const TfLiteIntArray* dims) {
int64_t count = 1;
@@ -74,19 +170,11 @@ inline int64_t NumElements(const TfLiteTensor* t) {
return NumElements(t->dims);
}
inline const TfLiteTensor* GetOptionalInputTensor(TfLiteContext* context,
const TfLiteNode* node,
int index) {
const bool use_tensor = index < node->inputs->size &&
node->inputs->data[index] != kTfLiteOptionalTensor;
if (use_tensor) {
return &context
->tensors[flatbuffers::EndianScalar(node->inputs->data[index])];
}
return nullptr;
}
// Determines whether tensor is constant.
// TODO(b/138199592): Introduce new query which checks for constant OR
// persistent-read-only, which would be useful for most tensor kernels that
// are potentially dynamic based on the input tensor value availability at the
// time of prepare.
inline bool IsConstantTensor(const TfLiteTensor* tensor) {
return tensor->allocation_type == kTfLiteMmapRo;
}
@@ -105,6 +193,14 @@ inline void SetTensorToDynamic(TfLiteTensor* tensor) {
}
}
// Sets tensor to persistent and read-only.
inline void SetTensorToPersistentRo(TfLiteTensor* tensor) {
if (tensor->allocation_type != kTfLitePersistentRo) {
tensor->allocation_type = kTfLitePersistentRo;
tensor->data.raw = nullptr;
}
}
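Illustrative only: a sketch of marking an output tensor persistent read-only during Prepare, using the new helper above; MarkPersistentExample is a hypothetical name.

#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/kernel_util.h"

TfLiteStatus MarkPersistentExample(TfLiteContext* context, TfLiteNode* node) {
  TfLiteTensor* output = tflite::GetOutput(context, node, 0);
  TF_LITE_ENSURE(context, output != nullptr);
  // Allocation type becomes kTfLitePersistentRo; the data pointer is reset.
  tflite::SetTensorToPersistentRo(output);
  return kTfLiteOk;
}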
// Determines whether it is a hybrid op - one that has float inputs and
// quantized weights.
inline bool IsHybridOp(const TfLiteTensor* input, const TfLiteTensor* weight) {
@@ -162,7 +258,7 @@ void CalculateActivationRange(TfLiteFusedActivation activation,
} else if (activation == kTfLiteActRelu6) {
*activation_min = 0;
*activation_max = 6;
} else if (activation == kTfLiteActRelu1) {
} else if (activation == kTfLiteActReluN1To1) {
*activation_min = -1;
*activation_max = 1;
} else {
@@ -188,6 +284,10 @@ TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
const TfLiteTensor* input2,
const TfLiteTensor* input3,
TfLiteIntArray** output_shape);
// Returns the size of the given type in bytes. Returns 0 in case of string.
int TfLiteTypeGetSize(TfLiteType type);
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_KERNEL_UTIL_H_

View File

@@ -19,7 +19,7 @@ limitations under the License.
// non-portable function.
#ifdef TF_LITE_MCU_DEBUG_LOG
#include "tensorflow/lite/micro/micro_error_reporter.h"
#include "tensorflow/lite/micro/debug_log.h"
#define DEBUG_LOG(x) \
do { \
@@ -36,7 +36,6 @@ inline void InfiniteLoop() {
#else // TF_LITE_MCU_DEBUG_LOG
#include <cassert>
#include <cstdio>
#include <cstdlib>
@@ -45,6 +44,15 @@ inline void InfiniteLoop() {
fprintf(stderr, "%s", (x)); \
} while (0)
// Reports an error for a type unsupported by op 'op_name' and returns kTfLiteError.
#define TF_LITE_UNSUPPORTED_TYPE(context, type, op_name) \
do { \
TF_LITE_KERNEL_LOG((context), "%s:%d Type %s is unsupported by op %s.", \
__FILE__, __LINE__, TfLiteTypeGetName(type), \
(op_name)); \
return kTfLiteError; \
} while (0)
#define TFLITE_ABORT abort()
#endif // TF_LITE_MCU_DEBUG_LOG
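Illustrative only: a hypothetical Prepare()-style fragment showing the intended use of the new TF_LITE_UNSUPPORTED_TYPE macro (available in the non-MCU branch above); the op name string, the supported-type list, and the function name are placeholders.

#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/op_macros.h"

TfLiteStatus CheckInputType(TfLiteContext* context, const TfLiteTensor* input) {
  if (input->type != kTfLiteFloat32 && input->type != kTfLiteUInt8) {
    // Logs file, line, type name, and op name, then returns kTfLiteError.
    TF_LITE_UNSUPPORTED_TYPE(context, input->type, "EXAMPLE_OP");
  }
  return kTfLiteOk;
}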