Mirror of https://github.com/jomjol/AI-on-the-edge-device.git (synced 2025-12-08 20:46:52 +03:00)

Commit: Merge branch 'rolling' into master
@@ -25,7 +25,11 @@ A 3d-printable housing can be found here: https://www.thingiverse.com/thing:4571
 **General remark:** Beside the `firmware.bin`, typically also the content of `/html` needs to be updated!
 
-##### Rolling - (2020-11-03)
+##### Rolling - (2020-11-08)
 
+* Updated Tensorflow tflite Kernel to master@20201108 (R2.4?)
+
+2020-11-03
 
 * Bug-Fix in time sync on warm reboot
 
@@ -152,7 +152,8 @@ bool CTfLiteClass::LoadInputImage(std::string _fn)
 
 void CTfLiteClass::MakeAllocate()
 {
-    static tflite::ops::micro::AllOpsResolver resolver;
+    // static tflite::ops::micro::AllOpsResolver resolver;
+    static tflite::AllOpsResolver resolver;
     this->interpreter = new tflite::MicroInterpreter(this->model, resolver, this->tensor_arena, this->kTensorArenaSize, this->error_reporter);
 
     TfLiteStatus allocate_status = this->interpreter->AllocateTensors();
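Editor's note: this hunk and the two that follow track TensorFlow Lite Micro's move of `AllOpsResolver` out of the `tflite::ops::micro` namespace, together with the matching header relocation. A minimal, self-contained sketch of the updated setup path is shown below; it is illustrative only, and `g_model_data` plus the arena size are placeholder names, not identifiers from this repository.

#include <cstdint>

#include "tensorflow/lite/micro/all_ops_resolver.h"      // new location
#include "tensorflow/lite/micro/micro_error_reporter.h"
#include "tensorflow/lite/micro/micro_interpreter.h"
#include "tensorflow/lite/schema/schema_generated.h"

constexpr int kTensorArenaSize = 600 * 1024;  // placeholder size
static uint8_t tensor_arena[kTensorArenaSize];

tflite::MicroInterpreter* MakeInterpreter(const void* g_model_data) {
  static tflite::MicroErrorReporter error_reporter;
  const tflite::Model* model = tflite::GetModel(g_model_data);
  // Post-merge spelling; before this commit it was
  // tflite::ops::micro::AllOpsResolver.
  static tflite::AllOpsResolver resolver;
  static tflite::MicroInterpreter interpreter(
      model, resolver, tensor_arena, kTensorArenaSize, &error_reporter);
  if (interpreter.AllocateTensors() != kTfLiteOk) {
    return nullptr;  // arena too small, or a kernel is missing
  }
  return &interpreter;
}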
@@ -5,7 +5,7 @@
     exit(1); \
   }
 
-#include "tensorflow/lite/micro/kernels/all_ops_resolver.h"
+#include "tensorflow/lite/micro/all_ops_resolver.h"
 #include "tensorflow/lite/micro/micro_error_reporter.h"
 #include "tensorflow/lite/micro/micro_interpreter.h"
 #include "tensorflow/lite/schema/schema_generated.h"
@@ -39,7 +39,8 @@ class CTfLiteClass
     const tflite::Model* model;
     tflite::MicroInterpreter* interpreter;
     TfLiteTensor* output = nullptr;
-    static tflite::ops::micro::AllOpsResolver *resolver;
+    // static tflite::ops::micro::AllOpsResolver *resolver;
+    static tflite::AllOpsResolver resolver;
 
     int kTensorArenaSize;
     uint8_t *tensor_arena;
@@ -23,8 +23,8 @@ if(NOT DEFINED ENV{IDF_PATH})
 endif()
 
 idf_component_register(
-    SRCS tensorflow/lite/micro/simple_memory_allocator.cc tensorflow/lite/micro/micro_error_reporter.cc tensorflow/lite/micro/memory_helpers.cc tensorflow/lite/micro/test_helpers.cc tensorflow/lite/micro/micro_utils.cc tensorflow/lite/micro/micro_time.cc tensorflow/lite/micro/debug_log.cc tensorflow/lite/micro/micro_string.cc tensorflow/lite/micro/micro_optional_debug_tools.cc tensorflow/lite/micro/micro_interpreter.cc tensorflow/lite/micro/micro_allocator.cc tensorflow/lite/micro/kernels/comparisons.cc tensorflow/lite/micro/kernels/fully_connected.cc tensorflow/lite/micro/kernels/depthwise_conv.cc tensorflow/lite/micro/kernels/logistic.cc tensorflow/lite/micro/kernels/pooling.cc tensorflow/lite/micro/kernels/prelu.cc tensorflow/lite/micro/kernels/concatenation.cc tensorflow/lite/micro/kernels/dequantize.cc tensorflow/lite/micro/kernels/pad.cc tensorflow/lite/micro/kernels/l2norm.cc tensorflow/lite/micro/kernels/resize_nearest_neighbor.cc tensorflow/lite/micro/kernels/activations.cc tensorflow/lite/micro/kernels/ceil.cc tensorflow/lite/micro/kernels/arg_min_max.cc tensorflow/lite/micro/kernels/reduce.cc tensorflow/lite/micro/kernels/unpack.cc tensorflow/lite/micro/kernels/add.cc tensorflow/lite/micro/kernels/split.cc tensorflow/lite/micro/kernels/circular_buffer.cc tensorflow/lite/micro/kernels/softmax.cc tensorflow/lite/micro/kernels/floor.cc tensorflow/lite/micro/kernels/sub.cc tensorflow/lite/micro/kernels/mul.cc tensorflow/lite/micro/kernels/conv.cc tensorflow/lite/micro/kernels/neg.cc tensorflow/lite/micro/kernels/quantize.cc tensorflow/lite/micro/kernels/elementwise.cc tensorflow/lite/micro/kernels/all_ops_resolver.cc tensorflow/lite/micro/kernels/svdf.cc tensorflow/lite/micro/kernels/maximum_minimum.cc tensorflow/lite/micro/kernels/reshape.cc tensorflow/lite/micro/kernels/strided_slice.cc tensorflow/lite/micro/kernels/round.cc tensorflow/lite/micro/kernels/pack.cc tensorflow/lite/micro/kernels/logical.cc tensorflow/lite/micro/memory_planner/linear_memory_planner.cc tensorflow/lite/micro/memory_planner/greedy_memory_planner.cc tensorflow/lite/c/common.c tensorflow/lite/core/api/error_reporter.cc tensorflow/lite/core/api/flatbuffer_conversions.cc tensorflow/lite/core/api/op_resolver.cc tensorflow/lite/core/api/tensor_utils.cc tensorflow/lite/kernels/internal/quantization_util.cc tensorflow/lite/kernels/kernel_util.cc tensorflow/lite/micro/testing/test_utils.cc
+    SRCS tensorflow/lite/micro/micro_error_reporter.cc tensorflow/lite/micro/simple_memory_allocator.cc tensorflow/lite/micro/memory_helpers.cc tensorflow/lite/micro/test_helpers.cc tensorflow/lite/micro/recording_micro_allocator.cc tensorflow/lite/micro/micro_time.cc tensorflow/lite/micro/recording_simple_memory_allocator.cc tensorflow/lite/micro/micro_string.cc tensorflow/lite/micro/micro_profiler.cc tensorflow/lite/micro/debug_log.cc tensorflow/lite/micro/all_ops_resolver.cc tensorflow/lite/micro/micro_utils.cc tensorflow/lite/micro/micro_interpreter.cc tensorflow/lite/micro/micro_allocator.cc tensorflow/lite/micro/benchmarks/keyword_scrambled_model_data.cc tensorflow/lite/micro/memory_planner/linear_memory_planner.cc tensorflow/lite/micro/memory_planner/greedy_memory_planner.cc tensorflow/lite/micro/testing/test_conv_model.cc tensorflow/lite/c/common.c tensorflow/lite/core/api/error_reporter.cc tensorflow/lite/core/api/flatbuffer_conversions.cc tensorflow/lite/core/api/op_resolver.cc tensorflow/lite/core/api/tensor_utils.cc tensorflow/lite/kernels/internal/quantization_util.cc tensorflow/lite/kernels/kernel_util.cc tensorflow/lite/schema/schema_utils.cc tensorflow/lite/micro/kernels/prelu.cc tensorflow/lite/micro/kernels/dequantize.cc tensorflow/lite/micro/kernels/pad.cc tensorflow/lite/micro/kernels/shape.cc tensorflow/lite/micro/kernels/l2norm.cc tensorflow/lite/micro/kernels/tanh.cc tensorflow/lite/micro/kernels/resize_nearest_neighbor.cc tensorflow/lite/micro/kernels/logical.cc tensorflow/lite/micro/kernels/kernel_util.cc tensorflow/lite/micro/kernels/ceil.cc tensorflow/lite/micro/kernels/arg_min_max.cc tensorflow/lite/micro/kernels/softmax.cc tensorflow/lite/micro/kernels/sub.cc tensorflow/lite/micro/kernels/add.cc tensorflow/lite/micro/kernels/floor.cc tensorflow/lite/micro/kernels/kernel_runner.cc tensorflow/lite/micro/kernels/split_v.cc tensorflow/lite/micro/kernels/hard_swish.cc tensorflow/lite/micro/kernels/pooling.cc tensorflow/lite/micro/kernels/concatenation.cc tensorflow/lite/micro/kernels/mul.cc tensorflow/lite/micro/kernels/unpack.cc tensorflow/lite/micro/kernels/round.cc tensorflow/lite/micro/kernels/quantize.cc tensorflow/lite/micro/kernels/ethosu.cc tensorflow/lite/micro/kernels/svdf.cc tensorflow/lite/micro/kernels/maximum_minimum.cc tensorflow/lite/micro/kernels/reshape.cc tensorflow/lite/micro/kernels/reduce.cc tensorflow/lite/micro/kernels/strided_slice.cc tensorflow/lite/micro/kernels/neg.cc tensorflow/lite/micro/kernels/pack.cc tensorflow/lite/micro/kernels/elementwise.cc tensorflow/lite/micro/kernels/comparisons.cc tensorflow/lite/micro/kernels/fully_connected.cc tensorflow/lite/micro/kernels/depthwise_conv.cc tensorflow/lite/micro/kernels/split.cc tensorflow/lite/micro/kernels/logistic.cc tensorflow/lite/micro/kernels/circular_buffer.cc tensorflow/lite/micro/kernels/conv.cc tensorflow/lite/micro/kernels/activations.cc
-    INCLUDE_DIRS . third_party/gemmlowp third_party/flatbuffers/include third_party/ruy third_party/kissfft)
+    INCLUDE_DIRS . third_party/gemmlowp third_party/flatbuffers/include third_party/ruy)
 
 # Reduce the level of paranoia to be able to compile TF sources
 target_compile_options(${COMPONENT_LIB} PRIVATE
@@ -32,6 +32,7 @@ target_compile_options(${COMPONENT_LIB} PRIVATE
     -Wno-missing-field-initializers
     -Wno-type-limits)
 
-target_compile_options(${COMPONENT_LIB} PRIVATE -std=c11 -DTF_LITE_STATIC_MEMORY -O3 -Wno-nonnull -Wno-nonnull -Wno-nonnull -Wno-nonnull)
+target_compile_options(${COMPONENT_LIB} PRIVATE -fno-unwind-tables -ffunction-sections -fdata-sections -fmessage-length=0 -DTF_LITE_STATIC_MEMORY -DTF_LITE_DISABLE_X86_NEON -O3 -Werror -Wsign-compare -Wdouble-promotion -Wshadow -Wunused-variable -Wmissing-field-initializers -Wunused-function -Wswitch -Wvla -Wall -Wextra -Wstrict-aliasing -Wno-unused-parameter)
-target_compile_options(${COMPONENT_LIB} PRIVATE $<$<COMPILE_LANGUAGE:CXX>: -std=c++11 -DTF_LITE_STATIC_MEMORY -O3 -Wno-return-type -Wno-strict-aliasing -Wno-ignored-qualifiers -Wno-return-type -Wno-strict-aliasing -Wno-ignored-qualifiers -Wno-return-type -Wno-strict-aliasing -Wno-return-type -Wno-strict-aliasing >)
+target_compile_options(${COMPONENT_LIB} PRIVATE $<$<COMPILE_LANGUAGE:CXX>: -std=c++11 -fno-rtti -fno-exceptions -fno-threadsafe-statics -fno-unwind-tables -ffunction-sections -fdata-sections -fmessage-length=0 -DTF_LITE_STATIC_MEMORY -DTF_LITE_DISABLE_X86_NEON -O3 -Werror -Wsign-compare -Wdouble-promotion -Wshadow -Wunused-variable -Wmissing-field-initializers -Wunused-function -Wswitch -Wvla -Wall -Wextra -Wstrict-aliasing -Wno-unused-parameter >)
+target_compile_options(${COMPONENT_LIB} INTERFACE $<$<IN_LIST:-DTF_LITE_STATIC_MEMORY,$<TARGET_PROPERTY:${COMPONENT_LIB},COMPILE_OPTIONS>>:-DTF_LITE_STATIC_MEMORY>)
 target_link_libraries(${COMPONENT_LIB} PRIVATE -lm)
code/lib/tfmicro/fixedpoint/fixedpoint_neon.h (new file, 331 lines)
@@ -0,0 +1,331 @@
// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// fixedpoint_neon.h: optimized NEON specializations of the templates
// in fixedpoint.h.

#ifndef GEMMLOWP_INTERNAL_FIXEDPOINT_NEON_H_
#define GEMMLOWP_INTERNAL_FIXEDPOINT_NEON_H_

#include <arm_neon.h>

namespace gemmlowp {

template <>
struct FixedPointRawTypeTraits<int32x4_t> {
  typedef std::int32_t ScalarRawType;
  static constexpr int kLanes = 4;
};

template <>
struct FixedPointRawTypeTraits<int16x8_t> {
  typedef std::int16_t ScalarRawType;
  static constexpr int kLanes = 8;
};

template <>
inline int32x4_t BitAnd(int32x4_t a, int32x4_t b) {
  return vandq_s32(a, b);
}

template <>
inline int16x8_t BitAnd(int16x8_t a, int16x8_t b) {
  return vandq_s16(a, b);
}

template <>
inline int32x4_t BitOr(int32x4_t a, int32x4_t b) {
  return vorrq_s32(a, b);
}

template <>
inline int16x8_t BitOr(int16x8_t a, int16x8_t b) {
  return vorrq_s16(a, b);
}

template <>
inline int32x4_t BitXor(int32x4_t a, int32x4_t b) {
  return veorq_s32(a, b);
}

template <>
inline int16x8_t BitXor(int16x8_t a, int16x8_t b) {
  return veorq_s16(a, b);
}

template <>
inline int32x4_t BitNot(int32x4_t a) {
  return veorq_s32(a, vdupq_n_s32(-1));
}

template <>
inline int16x8_t BitNot(int16x8_t a) {
  return veorq_s16(a, vdupq_n_s16(-1));
}

template <>
inline int32x4_t Add(int32x4_t a, int32x4_t b) {
  return vaddq_s32(a, b);
}

template <>
inline int16x8_t Add(int16x8_t a, int16x8_t b) {
  return vaddq_s16(a, b);
}

template <>
inline int32x4_t Sub(int32x4_t a, int32x4_t b) {
  return vsubq_s32(a, b);
}

template <>
inline int16x8_t Sub(int16x8_t a, int16x8_t b) {
  return vsubq_s16(a, b);
}

template <>
inline int32x4_t Neg(int32x4_t a) {
  return vnegq_s32(a);
}

template <>
inline int16x8_t Neg(int16x8_t a) {
  return vnegq_s16(a);
}

template <>
inline int32x4_t ShiftLeft(int32x4_t a, int offset) {
  return vshlq_s32(a, vdupq_n_s32(offset));
}

template <>
inline int16x8_t ShiftLeft(int16x8_t a, int offset) {
  return vshlq_s16(a, vdupq_n_s16(offset));
}

template <>
inline int32x4_t ShiftRight(int32x4_t a, int offset) {
  return vshlq_s32(a, vdupq_n_s32(-offset));
}

template <>
inline int16x8_t ShiftRight(int16x8_t a, int offset) {
  return vshlq_s16(a, vdupq_n_s16(-offset));
}

template <>
inline int32x4_t SelectUsingMask(int32x4_t if_mask, int32x4_t then_val,
                                 int32x4_t else_val) {
  return vbslq_s32(vreinterpretq_u32_s32(if_mask), then_val, else_val);
}

template <>
inline int16x8_t SelectUsingMask(int16x8_t if_mask, int16x8_t then_val,
                                 int16x8_t else_val) {
  return vbslq_s16(vreinterpretq_u16_s16(if_mask), then_val, else_val);
}

template <>
inline int32x4_t MaskIfEqual(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vceqq_s32(a, b));
}

template <>
inline int16x8_t MaskIfEqual(int16x8_t a, int16x8_t b) {
  return vreinterpretq_s16_u16(vceqq_s16(a, b));
}

template <>
inline int32x4_t MaskIfNotEqual(int32x4_t a, int32x4_t b) {
  return BitNot(MaskIfEqual(a, b));
}

template <>
inline int16x8_t MaskIfNotEqual(int16x8_t a, int16x8_t b) {
  return BitNot(MaskIfEqual(a, b));
}

template <>
inline int32x4_t MaskIfZero(int32x4_t a) {
  return MaskIfEqual(a, vdupq_n_s32(0));
}

template <>
inline int16x8_t MaskIfZero(int16x8_t a) {
  return MaskIfEqual(a, vdupq_n_s16(0));
}

template <>
inline int32x4_t MaskIfNonZero(int32x4_t a) {
  return vreinterpretq_s32_u32(vtstq_s32(a, a));
}

template <>
inline int16x8_t MaskIfNonZero(int16x8_t a) {
  return vreinterpretq_s16_u16(vtstq_s16(a, a));
}

template <>
inline int32x4_t MaskIfGreaterThan(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vcgtq_s32(a, b));
}

template <>
inline int16x8_t MaskIfGreaterThan(int16x8_t a, int16x8_t b) {
  return vreinterpretq_s16_u16(vcgtq_s16(a, b));
}

template <>
inline int32x4_t MaskIfGreaterThanOrEqual(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vcgeq_s32(a, b));
}

template <>
inline int16x8_t MaskIfGreaterThanOrEqual(int16x8_t a, int16x8_t b) {
  return vreinterpretq_s16_u16(vcgeq_s16(a, b));
}

template <>
inline int32x4_t MaskIfLessThan(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vcltq_s32(a, b));
}

template <>
inline int16x8_t MaskIfLessThan(int16x8_t a, int16x8_t b) {
  return vreinterpretq_s16_u16(vcltq_s16(a, b));
}

template <>
inline int32x4_t MaskIfLessThanOrEqual(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vcleq_s32(a, b));
}

template <>
inline int16x8_t MaskIfLessThanOrEqual(int16x8_t a, int16x8_t b) {
  return vreinterpretq_s16_u16(vcleq_s16(a, b));
}

template <>
inline bool All(int32x4_t a) {
  a = vandq_s32(a, vextq_s32(a, a, 1));
  a = vandq_s32(a, vextq_s32(a, a, 2));
  return vgetq_lane_s32(a, 0);
}

template <>
inline bool All(int16x8_t a) {
  a = vandq_s16(a, vextq_s16(a, a, 1));
  a = vandq_s16(a, vextq_s16(a, a, 2));
  a = vandq_s16(a, vextq_s16(a, a, 4));
  return vgetq_lane_s16(a, 0);
}

template <>
inline bool Any(int32x4_t a) {
  a = vorrq_s32(a, vextq_s32(a, a, 1));
  a = vorrq_s32(a, vextq_s32(a, a, 2));
  return vgetq_lane_s32(a, 0);
}

template <>
inline bool Any(int16x8_t a) {
  a = vorrq_s16(a, vextq_s16(a, a, 1));
  a = vorrq_s16(a, vextq_s16(a, a, 2));
  a = vorrq_s16(a, vextq_s16(a, a, 4));
  return vgetq_lane_s16(a, 0);
}

template <>
inline int32x4_t RoundingHalfSum(int32x4_t a, int32x4_t b) {
  return vrhaddq_s32(a, b);
}

template <>
inline int16x8_t RoundingHalfSum(int16x8_t a, int16x8_t b) {
  return vrhaddq_s16(a, b);
}

template <>
inline int32x4_t SaturatingRoundingDoublingHighMul(int32x4_t a, int32x4_t b) {
  return vqrdmulhq_s32(a, b);
}

template <>
inline int16x8_t SaturatingRoundingDoublingHighMul(int16x8_t a, int16x8_t b) {
  return vqrdmulhq_s16(a, b);
}

template <>
inline int32x4_t RoundingDivideByPOT(int32x4_t x, int exponent) {
  const int32x4_t shift_vec = vdupq_n_s32(-exponent);
  const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift_vec), 31);
  const int32x4_t fixed_up_x = vqaddq_s32(x, fixup);
  return vrshlq_s32(fixed_up_x, shift_vec);
}

template <>
inline int16x8_t RoundingDivideByPOT(int16x8_t x, int exponent) {
  const int16x8_t shift_vec = vdupq_n_s16(-exponent);
  const int16x8_t fixup = vshrq_n_s16(vandq_s16(x, shift_vec), 15);
  const int16x8_t fixed_up_x = vqaddq_s16(x, fixup);
  return vrshlq_s16(fixed_up_x, shift_vec);
}

template <int Exponent>
struct ImplSaturatingRoundingMultiplyByPOT<Exponent, int32x4_t, 1> {
  static int32x4_t eval(int32x4_t x) { return vqshlq_n_s32(x, Exponent); }
};

template <int Exponent>
struct ImplSaturatingRoundingMultiplyByPOT<Exponent, int32x4_t, -1> {
  static int32x4_t eval(int32x4_t x) {
    const int32x4_t fixup = vshrq_n_s32(x, 31);
    const int32x4_t fixed_up_x = vqaddq_s32(x, fixup);
    return vrshrq_n_s32(fixed_up_x, -Exponent);
  }
};

template <int Exponent>
struct ImplSaturatingRoundingMultiplyByPOT<Exponent, int16x8_t, 1> {
  static int16x8_t eval(int16x8_t x) { return vqshlq_n_s16(x, Exponent); }
};

template <int Exponent>
struct ImplSaturatingRoundingMultiplyByPOT<Exponent, int16x8_t, -1> {
  static int16x8_t eval(int16x8_t x) {
    const int16x8_t fixup = vshrq_n_s16(x, 15);
    const int16x8_t fixed_up_x = vqaddq_s16(x, fixup);
    return vrshrq_n_s16(fixed_up_x, -Exponent);
  }
};

template <>
inline int32x4_t Dup<int32x4_t>(std::int32_t x) {
  return vdupq_n_s32(x);
}

template <>
inline int16x8_t Dup<int16x8_t>(std::int16_t x) {
  return vdupq_n_s16(x);
}

// So far this is only needed for int16.
template <>
inline int16x8_t SaturatingAdd(int16x8_t a, int16x8_t b) {
  return vqaddq_s16(a, b);
}

}  // end namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_FIXEDPOINT_NEON_H_
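Editor's note: the workhorse of this new NEON header is RoundingDivideByPOT, which divides by 2^exponent with round-to-nearest; the `fixup` term adjusts negative inputs before the rounding shift `vrshlq`. As a sketch (not part of the commit), the scalar arithmetic the vector code parallels looks like this:

#include <cstdint>

// Scalar reference for the int32 RoundingDivideByPOT above: divide x by
// 2^exponent, rounding to nearest; the sign-dependent threshold mirrors
// the vector code's fixup for negative inputs. Illustrative only.
std::int32_t RoundingDivideByPOTScalar(std::int32_t x, int exponent) {
  const std::int32_t mask = (std::int32_t(1) << exponent) - 1;  // bits dropped
  const std::int32_t remainder = x & mask;
  const std::int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
  return (x >> exponent) + ((remainder > threshold) ? 1 : 0);
}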
@@ -46,14 +46,17 @@
 #include <iterator>
 #include <memory>
 
+#if defined(__unix__) && !defined(FLATBUFFERS_LOCALE_INDEPENDENT)
+  #include <unistd.h>
+#endif
+
 #ifdef _STLPORT_VERSION
   #define FLATBUFFERS_CPP98_STL
 #endif
-#ifndef FLATBUFFERS_CPP98_STL
-  #include <functional>
-#endif
-
-#include "flatbuffers/stl_emulation.h"
+
+#ifdef __ANDROID__
+  #include <android/api-level.h>
+#endif
 
 #if defined(__ICCARM__)
 #include <intrinsics.h>
@@ -154,10 +157,12 @@ namespace flatbuffers {
     defined(__clang__)
   #define FLATBUFFERS_FINAL_CLASS final
   #define FLATBUFFERS_OVERRIDE override
+  #define FLATBUFFERS_EXPLICIT_CPP11 explicit
   #define FLATBUFFERS_VTABLE_UNDERLYING_TYPE : flatbuffers::voffset_t
 #else
   #define FLATBUFFERS_FINAL_CLASS
   #define FLATBUFFERS_OVERRIDE
+  #define FLATBUFFERS_EXPLICIT_CPP11
   #define FLATBUFFERS_VTABLE_UNDERLYING_TYPE
 #endif
 
@@ -165,10 +170,14 @@ namespace flatbuffers {
     (!defined(__GNUC__) || (__GNUC__ * 100 + __GNUC_MINOR__ >= 406)) || \
     (defined(__cpp_constexpr) && __cpp_constexpr >= 200704)
   #define FLATBUFFERS_CONSTEXPR constexpr
+  #define FLATBUFFERS_CONSTEXPR_CPP11 constexpr
+  #define FLATBUFFERS_CONSTEXPR_DEFINED
 #else
   #define FLATBUFFERS_CONSTEXPR const
+  #define FLATBUFFERS_CONSTEXPR_CPP11
 #endif
 
+// This macro is never used in code!
 #if (defined(__cplusplus) && __cplusplus >= 201402L) || \
     (defined(__cpp_constexpr) && __cpp_constexpr >= 201304)
   #define FLATBUFFERS_CONSTEXPR_CPP14 FLATBUFFERS_CONSTEXPR
@@ -194,6 +203,16 @@ namespace flatbuffers {
   #define FLATBUFFERS_DELETE_FUNC(func) private: func;
 #endif
 
+// Check if we can use template aliases
+// Not possible if Microsoft Compiler before 2012
+// Possible is the language feature __cpp_alias_templates is defined well
+// Or possible if the C++ std is C+11 or newer
+#if (defined(_MSC_VER) && _MSC_VER > 1700 /* MSVC2012 */) \
+    || (defined(__cpp_alias_templates) && __cpp_alias_templates >= 200704) \
+    || (defined(__cplusplus) && __cplusplus >= 201103L)
+  #define FLATBUFFERS_TEMPLATES_ALIASES
+#endif
+
 #ifndef FLATBUFFERS_HAS_STRING_VIEW
   // Only provide flatbuffers::string_view if __has_include can be used
   // to detect a header that provides an implementation
@@ -236,10 +255,8 @@ namespace flatbuffers {
 
 #ifndef FLATBUFFERS_LOCALE_INDEPENDENT
   // Enable locale independent functions {strtof_l, strtod_l,strtoll_l, strtoull_l}.
-  // They are part of the POSIX-2008 but not part of the C/C++ standard.
-  // GCC/Clang have definition (_XOPEN_SOURCE>=700) if POSIX-2008.
   #if ((defined(_MSC_VER) && _MSC_VER >= 1800) || \
-       (defined(_XOPEN_SOURCE) && (_XOPEN_SOURCE>=700)))
+       (defined(_XOPEN_VERSION) && (_XOPEN_VERSION>=700)) && (!defined(__ANDROID_API__) || (defined(__ANDROID_API__) && (__ANDROID_API__>=21))))
     #define FLATBUFFERS_LOCALE_INDEPENDENT 1
   #else
     #define FLATBUFFERS_LOCALE_INDEPENDENT 0
@@ -309,6 +326,7 @@ typedef uintmax_t largest_scalar_t;
 #define FLATBUFFERS_MAX_ALIGNMENT 16
 
 #if defined(_MSC_VER)
+  #pragma warning(disable: 4351) // C4351: new behavior: elements of array ... will be default initialized
   #pragma warning(push)
   #pragma warning(disable: 4127) // C4127: conditional expression is constant
 #endif
@@ -374,6 +392,13 @@ T ReadScalar(const void *p) {
   return EndianScalar(*reinterpret_cast<const T *>(p));
 }
 
+// See https://github.com/google/flatbuffers/issues/5950
+
+#if (FLATBUFFERS_GCC >= 100000) && (FLATBUFFERS_GCC < 110000)
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wstringop-overflow"
+#endif
+
 template<typename T>
 // UBSAN: C++ aliasing type rules, see std::bit_cast<> for details.
 __supress_ubsan__("alignment")
@@ -386,6 +411,10 @@ template<typename T> __supress_ubsan__("alignment") void WriteScalar(void *p, Of
   *reinterpret_cast<uoffset_t *>(p) = EndianScalar(t.o);
 }
 
+#if (FLATBUFFERS_GCC >= 100000) && (FLATBUFFERS_GCC < 110000)
+  #pragma GCC diagnostic pop
+#endif
+
 // Computes how many bytes you'd have to pad to be able to write an
 // "scalar_size" scalar if the buffer had grown to "buf_size" (downwards in
 // memory).
@@ -18,6 +18,11 @@
 #define FLATBUFFERS_H_
 
 #include "flatbuffers/base.h"
+#include "flatbuffers/stl_emulation.h"
+
+#ifndef FLATBUFFERS_CPP98_STL
+  #include <functional>
+#endif
 
 #if defined(FLATBUFFERS_NAN_DEFAULTS)
 #  include <cmath>
@@ -581,6 +586,14 @@ static inline const char *GetCstring(const String *str) {
   return str ? str->c_str() : "";
 }
 
+#ifdef FLATBUFFERS_HAS_STRING_VIEW
+// Convenience function to get string_view from a String returning an empty
+// string_view on null pointer.
+static inline flatbuffers::string_view GetStringView(const String *str) {
+  return str ? str->string_view() : flatbuffers::string_view();
+}
+#endif // FLATBUFFERS_HAS_STRING_VIEW
+
 // Allocator interface. This is flatbuffers-specific and meant only for
 // `vector_downward` usage.
 class Allocator {
@@ -1211,7 +1224,7 @@ class FlatBufferBuilder {
   /// you call Finish()). You can use this information if you need to embed
   /// a FlatBuffer in some other buffer, such that you can later read it
   /// without first having to copy it into its own buffer.
-  size_t GetBufferMinAlignment() {
+  size_t GetBufferMinAlignment() const {
     Finished();
     return minalign_;
   }
@@ -1295,6 +1308,11 @@ class FlatBufferBuilder {
     TrackField(field, off);
   }
 
+  template<typename T> void AddElement(voffset_t field, T e) {
+    auto off = PushElement(e);
+    TrackField(field, off);
+  }
+
   template<typename T> void AddOffset(voffset_t field, Offset<T> off) {
     if (off.IsNull()) return;  // Don't store.
     AddElement(field, ReferTo(off.o), static_cast<uoffset_t>(0));
@@ -1599,6 +1617,9 @@ class FlatBufferBuilder {
     // causing the wrong overload to be selected, remove it.
     AssertScalarT<T>();
     StartVector(len, sizeof(T));
+    if (len == 0) {
+      return Offset<Vector<T>>(EndVector(len));
+    }
     // clang-format off
     #if FLATBUFFERS_LITTLEENDIAN
       PushBytes(reinterpret_cast<const uint8_t *>(v), len * sizeof(T));
@@ -1795,8 +1816,8 @@ class FlatBufferBuilder {
       return a.KeyCompareLessThan(&b);
     }
 
-   private:
-    StructKeyComparator &operator=(const StructKeyComparator &);
+    FLATBUFFERS_DELETE_FUNC(
+        StructKeyComparator &operator=(const StructKeyComparator &))
   };
   /// @endcond
 
@@ -1871,10 +1892,7 @@ class FlatBufferBuilder {
     vector_downward &buf_;
 
    private:
-    TableKeyComparator &operator=(const TableKeyComparator &other) {
-      buf_ = other.buf_;
-      return *this;
-    }
+    FLATBUFFERS_DELETE_FUNC(TableKeyComparator &operator=(const TableKeyComparator &other))
   };
   /// @endcond
 
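Editor's note: both comparator hunks above replace a hand-written "private, unimplemented operator=" with FLATBUFFERS_DELETE_FUNC. The macro's definition is only partly visible in this diff (an earlier hunk shows the C++98 fallback `private: func;`); the split it presumably abstracts looks like the following sketch, where the `= delete` branch is an assumption, not something shown in this commit:

// Sketch of the pattern FLATBUFFERS_DELETE_FUNC abstracts; the actual macro
// lives in flatbuffers/base.h and is not fully shown in this diff.
#if defined(FLATBUFFERS_CPP98_STL)
  // C++98: declare private and never define, so any use fails to link.
  #define FLATBUFFERS_DELETE_FUNC(func) private: func;
#else
  // C++11 and newer (assumed branch): fails at compile time instead.
  #define FLATBUFFERS_DELETE_FUNC(func) func = delete;
#endif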
@@ -2269,8 +2287,8 @@ class Verifier FLATBUFFERS_FINAL_CLASS {
 
   template<typename T>
   bool VerifyBufferFromStart(const char *identifier, size_t start) {
-    if (identifier && (size_ < 2 * sizeof(flatbuffers::uoffset_t) ||
-                       !BufferHasIdentifier(buf_ + start, identifier))) {
+    if (identifier && !Check((size_ >= 2 * sizeof(flatbuffers::uoffset_t) &&
+                              BufferHasIdentifier(buf_ + start, identifier)))) {
       return false;
     }
 
@@ -2452,12 +2470,26 @@ class Table {
     return field_offset ? reinterpret_cast<P>(p) : nullptr;
   }
 
+  template<typename Raw, typename Face>
+  flatbuffers::Optional<Face> GetOptional(voffset_t field) const {
+    auto field_offset = GetOptionalFieldOffset(field);
+    auto p = data_ + field_offset;
+    return field_offset ? Optional<Face>(static_cast<Face>(ReadScalar<Raw>(p)))
+                        : Optional<Face>();
+  }
+
   template<typename T> bool SetField(voffset_t field, T val, T def) {
     auto field_offset = GetOptionalFieldOffset(field);
     if (!field_offset) return IsTheSameAs(val, def);
     WriteScalar(data_ + field_offset, val);
     return true;
   }
+  template<typename T> bool SetField(voffset_t field, T val) {
+    auto field_offset = GetOptionalFieldOffset(field);
+    if (!field_offset) return false;
+    WriteScalar(data_ + field_offset, val);
+    return true;
+  }
 
   bool SetPointer(voffset_t field, const uint8_t *val) {
     auto field_offset = GetOptionalFieldOffset(field);
@@ -2525,6 +2557,17 @@ class Table {
   uint8_t data_[1];
 };
 
+// This specialization allows avoiding warnings like:
+// MSVC C4800: type: forcing value to bool 'true' or 'false'.
+template<>
+inline flatbuffers::Optional<bool> Table::GetOptional<uint8_t, bool>(
+    voffset_t field) const {
+  auto field_offset = GetOptionalFieldOffset(field);
+  auto p = data_ + field_offset;
+  return field_offset ? Optional<bool>(ReadScalar<uint8_t>(p) != 0)
+                      : Optional<bool>();
+}
+
 template<typename T>
 void FlatBufferBuilder::Required(Offset<T> table, voffset_t field) {
   auto table_ptr = reinterpret_cast<const Table *>(buf_.data_at(table.o));
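Editor's note: Table::GetOptional<Raw, Face> and its bool specialization above are the low-level readers behind scalar fields declared optional (`= null`) in a schema. A usage sketch follows; `Monster`, `mana`, and the vtable offset are hypothetical stand-ins for generated code, not names from this diff:

#include "flatbuffers/flatbuffers.h"

// Hypothetical generated accessor for a schema field `mana: int16 = null;`.
struct Monster : private flatbuffers::Table {
  flatbuffers::Optional<int16_t> mana() const {
    return GetOptional<int16_t, int16_t>(4 /* illustrative vtable offset */);
  }
};

int16_t EffectiveMana(const Monster& m) {
  // An absent field yields an empty Optional; value_or() supplies a fallback.
  return m.mana().value_or(150);
}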
@@ -2704,7 +2747,7 @@ inline const char * const *ElementaryTypeNames() {
 // Basic type info cost just 16bits per field!
 struct TypeCode {
   uint16_t base_type : 4;  // ElementaryType
-  uint16_t is_vector : 1;
+  uint16_t is_repeating : 1;  // Either vector (in table) or array (in struct)
   int16_t sequence_ref : 11;  // Index into type_refs below, or -1 for none.
 };
 
@@ -2720,6 +2763,7 @@ struct TypeTable {
   size_t num_elems;  // of type_codes, values, names (but not type_refs).
   const TypeCode *type_codes;  // num_elems count
   const TypeFunction *type_refs;  // less than num_elems entries (see TypeCode).
+  const int16_t *array_sizes;  // less than num_elems entries (see TypeCode).
   const int64_t *values;  // Only set for non-consecutive enum/union or structs.
   const char *const *names;  // Only set if compiled with --reflect-names.
 };
@@ -18,6 +18,7 @@
 #define FLATBUFFERS_STL_EMULATION_H_
 
 // clang-format off
+#include "flatbuffers/base.h"
 
 #include <string>
 #include <type_traits>
@@ -25,6 +26,17 @@
 #include <memory>
 #include <limits>
 
+// Detect C++17 compatible compiler.
+// __cplusplus >= 201703L - a compiler has support of 'static inline' variables.
+#if defined(FLATBUFFERS_USE_STD_OPTIONAL) \
+    || (defined(__cplusplus) && __cplusplus >= 201703L) \
+    || (defined(_MSVC_LANG) && (_MSVC_LANG >= 201703L))
+  #include <optional>
+  #ifndef FLATBUFFERS_USE_STD_OPTIONAL
+    #define FLATBUFFERS_USE_STD_OPTIONAL
+  #endif
+#endif
+
 #if defined(_STLPORT_VERSION) && !defined(FLATBUFFERS_CPP98_STL)
   #define FLATBUFFERS_CPP98_STL
 #endif  // defined(_STLPORT_VERSION) && !defined(FLATBUFFERS_CPP98_STL)
@@ -33,16 +45,6 @@
   #include <cctype>
 #endif  // defined(FLATBUFFERS_CPP98_STL)
 
-// Check if we can use template aliases
-// Not possible if Microsoft Compiler before 2012
-// Possible is the language feature __cpp_alias_templates is defined well
-// Or possible if the C++ std is C+11 or newer
-#if (defined(_MSC_VER) && _MSC_VER > 1700 /* MSVC2012 */) \
-    || (defined(__cpp_alias_templates) && __cpp_alias_templates >= 200704) \
-    || (defined(__cplusplus) && __cplusplus >= 201103L)
-  #define FLATBUFFERS_TEMPLATES_ALIASES
-#endif
-
 // This header provides backwards compatibility for C++98 STLs like stlport.
 namespace flatbuffers {
 
@@ -190,7 +192,7 @@ inline void vector_emplace_back(std::vector<T> *vector, V &&data) {
 // MSVC 2010 doesn't support C++11 aliases.
 // We're manually "aliasing" the class here as we want to bring unique_ptr
 // into the flatbuffers namespace.  We have unique_ptr in the flatbuffers
-// namespace we have a completely independent implemenation (see below)
+// namespace we have a completely independent implementation (see below)
 // for C++98 STL implementations.
 template <class T> class unique_ptr : public std::unique_ptr<T> {
  public:
@@ -302,6 +304,146 @@ inline void vector_emplace_back(std::vector<T> *vector, V &&data) {
 
 #endif  // !FLATBUFFERS_CPP98_STL
 
+#ifdef FLATBUFFERS_USE_STD_OPTIONAL
+template<class T>
+using Optional = std::optional<T>;
+using nullopt_t = std::nullopt_t;
+inline constexpr nullopt_t nullopt = std::nullopt;
+
+#else
+// Limited implementation of Optional<T> type for a scalar T.
+// This implementation limited by trivial types compatible with
+// std::is_arithmetic<T> or std::is_enum<T> type traits.
+
+// A tag to indicate an empty flatbuffers::optional<T>.
+struct nullopt_t {
+  explicit FLATBUFFERS_CONSTEXPR_CPP11 nullopt_t(int) {}
+};
+
+#if defined(FLATBUFFERS_CONSTEXPR_DEFINED)
+namespace internal {
+  template <class> struct nullopt_holder {
+    static constexpr nullopt_t instance_ = nullopt_t(0);
+  };
+  template<class Dummy>
+  constexpr nullopt_t nullopt_holder<Dummy>::instance_;
+}
+static constexpr const nullopt_t &nullopt = internal::nullopt_holder<void>::instance_;
+
+#else
+namespace internal {
+  template <class> struct nullopt_holder {
+    static const nullopt_t instance_;
+  };
+  template<class Dummy>
+  const nullopt_t nullopt_holder<Dummy>::instance_ = nullopt_t(0);
+}
+static const nullopt_t &nullopt = internal::nullopt_holder<void>::instance_;
+
+#endif
+
+template<class T>
+class Optional FLATBUFFERS_FINAL_CLASS {
+  // Non-scalar 'T' would extremely complicated Optional<T>.
+  // Use is_scalar<T> checking because flatbuffers flatbuffers::is_arithmetic<T>
+  // isn't implemented.
+  static_assert(flatbuffers::is_scalar<T>::value, "unexpected type T");
+
+ public:
+  ~Optional() {}
+
+  FLATBUFFERS_CONSTEXPR_CPP11 Optional() FLATBUFFERS_NOEXCEPT
+    : value_(), has_value_(false) {}
+
+  FLATBUFFERS_CONSTEXPR_CPP11 Optional(nullopt_t) FLATBUFFERS_NOEXCEPT
+    : value_(), has_value_(false) {}
+
+  FLATBUFFERS_CONSTEXPR_CPP11 Optional(T val) FLATBUFFERS_NOEXCEPT
+    : value_(val), has_value_(true) {}
+
+  FLATBUFFERS_CONSTEXPR_CPP11 Optional(const Optional &other) FLATBUFFERS_NOEXCEPT
+    : value_(other.value_), has_value_(other.has_value_) {}
+
+  FLATBUFFERS_CONSTEXPR_CPP14 Optional &operator=(const Optional &other) FLATBUFFERS_NOEXCEPT {
+    value_ = other.value_;
+    has_value_ = other.has_value_;
+    return *this;
+  }
+
+  FLATBUFFERS_CONSTEXPR_CPP14 Optional &operator=(nullopt_t) FLATBUFFERS_NOEXCEPT {
+    value_ = T();
+    has_value_ = false;
+    return *this;
+  }
+
+  FLATBUFFERS_CONSTEXPR_CPP14 Optional &operator=(T val) FLATBUFFERS_NOEXCEPT {
+    value_ = val;
+    has_value_ = true;
+    return *this;
+  }
+
+  void reset() FLATBUFFERS_NOEXCEPT {
+    *this = nullopt;
+  }
+
+  void swap(Optional &other) FLATBUFFERS_NOEXCEPT {
+    std::swap(value_, other.value_);
+    std::swap(has_value_, other.has_value_);
+  }
+
+  FLATBUFFERS_CONSTEXPR_CPP11 FLATBUFFERS_EXPLICIT_CPP11 operator bool() const FLATBUFFERS_NOEXCEPT {
+    return has_value_;
+  }
+
+  FLATBUFFERS_CONSTEXPR_CPP11 bool has_value() const FLATBUFFERS_NOEXCEPT {
+    return has_value_;
+  }
+
+  FLATBUFFERS_CONSTEXPR_CPP11 const T& operator*() const FLATBUFFERS_NOEXCEPT {
+    return value_;
+  }
+
+  const T& value() const {
+    FLATBUFFERS_ASSERT(has_value());
+    return value_;
+  }
+
+  T value_or(T default_value) const FLATBUFFERS_NOEXCEPT {
+    return has_value() ? value_ : default_value;
+  }
+
+ private:
+  T value_;
+  bool has_value_;
+};
+
+template<class T>
+FLATBUFFERS_CONSTEXPR_CPP11 bool operator==(const Optional<T>& opt, nullopt_t) FLATBUFFERS_NOEXCEPT {
+  return !opt;
+}
+template<class T>
+FLATBUFFERS_CONSTEXPR_CPP11 bool operator==(nullopt_t, const Optional<T>& opt) FLATBUFFERS_NOEXCEPT {
+  return !opt;
+}
+
+template<class T, class U>
+FLATBUFFERS_CONSTEXPR_CPP11 bool operator==(const Optional<T>& lhs, const U& rhs) FLATBUFFERS_NOEXCEPT {
+  return static_cast<bool>(lhs) && (*lhs == rhs);
+}
+
+template<class T, class U>
+FLATBUFFERS_CONSTEXPR_CPP11 bool operator==(const T& lhs, const Optional<U>& rhs) FLATBUFFERS_NOEXCEPT {
+  return static_cast<bool>(rhs) && (lhs == *rhs);
+}
+
+template<class T, class U>
+FLATBUFFERS_CONSTEXPR_CPP11 bool operator==(const Optional<T>& lhs, const Optional<U>& rhs) FLATBUFFERS_NOEXCEPT {
+  return static_cast<bool>(lhs) != static_cast<bool>(rhs)
+    ? false
+    : !static_cast<bool>(lhs) ? false : (*lhs == *rhs);
+}
+#endif // FLATBUFFERS_USE_STD_OPTIONAL
+
 }  // namespace flatbuffers
 
 #endif  // FLATBUFFERS_STL_EMULATION_H_
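Editor's note: the preceding hunk gives flatbuffers::Optional two faces: an alias of std::optional under C++17, and a limited scalar-only shim otherwise. The shim implements just the slice generated code needs, so user code like the following sketch (illustrative, not from the commit; it uses only the API added above) compiles the same way under either branch:

#include <cassert>

#include "flatbuffers/stl_emulation.h"

void OptionalSemantics() {
  flatbuffers::Optional<int> a;              // default-constructed: empty
  assert(a == flatbuffers::nullopt);
  assert(a.value_or(42) == 42);

  a = 7;                                     // assignment engages the value
  assert(a.has_value() && *a == 7);
  assert(a == 7);                            // mixed comparison from above

  a.reset();                                 // back to empty
  assert(!a);
}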
Deleted file (kissfft license text, 11 lines):
@@ -1,11 +0,0 @@
Copyright (c) 2003-2010 Mark Borgerding

All rights reserved.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
 * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Deleted file (kissfft internal header, 164 lines):
@@ -1,164 +0,0 @@
/*
Copyright (c) 2003-2010, Mark Borgerding

All rights reserved.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
    * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* kiss_fft.h
   defines kiss_fft_scalar as either short or a float type
   and defines
   typedef struct { kiss_fft_scalar r; kiss_fft_scalar i; }kiss_fft_cpx; */
#include "kiss_fft.h"
#include <limits.h>

#define MAXFACTORS 32
/* e.g. an fft of length 128 has 4 factors
   as far as kissfft is concerned
   4*4*4*2
*/

struct kiss_fft_state{
    int nfft;
    int inverse;
    int factors[2*MAXFACTORS];
    kiss_fft_cpx twiddles[1];
};

/*
  Explanation of macros dealing with complex math:

   C_MUL(m,a,b)         : m = a*b
   C_FIXDIV( c , div )  : if a fixed point impl., c /= div. noop otherwise
   C_SUB( res, a,b)     : res = a - b
   C_SUBFROM( res , a)  : res -= a
   C_ADDTO( res , a)    : res += a
 * */
#ifdef FIXED_POINT
#if (FIXED_POINT==32)
# define FRACBITS 31
# define SAMPPROD int64_t
#define SAMP_MAX 2147483647
#else
# define FRACBITS 15
# define SAMPPROD int32_t
#define SAMP_MAX 32767
#endif

#define SAMP_MIN -SAMP_MAX

#if defined(CHECK_OVERFLOW)
#  define CHECK_OVERFLOW_OP(a,op,b)  \
    if ( (SAMPPROD)(a) op (SAMPPROD)(b) > SAMP_MAX || (SAMPPROD)(a) op (SAMPPROD)(b) < SAMP_MIN ) { \
        fprintf(stderr,"WARNING:overflow @ " __FILE__ "(%d): (%d " #op" %d) = %ld\n",__LINE__,(a),(b),(SAMPPROD)(a) op (SAMPPROD)(b) ); }
#endif


#   define smul(a,b) ( (SAMPPROD)(a)*(b) )
#   define sround( x )  (kiss_fft_scalar)( ( (x) + (1<<(FRACBITS-1)) ) >> FRACBITS )

#   define S_MUL(a,b) sround( smul(a,b) )

#   define C_MUL(m,a,b) \
      do{ (m).r = sround( smul((a).r,(b).r) - smul((a).i,(b).i) ); \
          (m).i = sround( smul((a).r,(b).i) + smul((a).i,(b).r) ); }while(0)

#   define DIVSCALAR(x,k) \
      (x) = sround( smul( x, SAMP_MAX/k ) )

#   define C_FIXDIV(c,div) \
      do { DIVSCALAR( (c).r , div); \
           DIVSCALAR( (c).i , div); }while (0)

#   define C_MULBYSCALAR( c, s ) \
      do{ (c).r = sround( smul( (c).r , s ) ) ;\
          (c).i = sround( smul( (c).i , s ) ) ; }while(0)

#else  /* not FIXED_POINT*/

#   define S_MUL(a,b) ( (a)*(b) )
#define C_MUL(m,a,b) \
    do{ (m).r = (a).r*(b).r - (a).i*(b).i;\
        (m).i = (a).r*(b).i + (a).i*(b).r; }while(0)
#   define C_FIXDIV(c,div) /* NOOP */
#   define C_MULBYSCALAR( c, s ) \
    do{ (c).r *= (s);\
        (c).i *= (s); }while(0)
#endif

#ifndef CHECK_OVERFLOW_OP
#  define CHECK_OVERFLOW_OP(a,op,b) /* noop */
#endif

#define C_ADD( res, a,b)\
    do { \
        CHECK_OVERFLOW_OP((a).r,+,(b).r)\
        CHECK_OVERFLOW_OP((a).i,+,(b).i)\
        (res).r=(a).r+(b).r; (res).i=(a).i+(b).i; \
    }while(0)
#define C_SUB( res, a,b)\
    do { \
        CHECK_OVERFLOW_OP((a).r,-,(b).r)\
        CHECK_OVERFLOW_OP((a).i,-,(b).i)\
        (res).r=(a).r-(b).r; (res).i=(a).i-(b).i; \
    }while(0)
#define C_ADDTO( res , a)\
    do { \
        CHECK_OVERFLOW_OP((res).r,+,(a).r)\
        CHECK_OVERFLOW_OP((res).i,+,(a).i)\
        (res).r += (a).r; (res).i += (a).i;\
    }while(0)

#define C_SUBFROM( res , a)\
    do {\
        CHECK_OVERFLOW_OP((res).r,-,(a).r)\
        CHECK_OVERFLOW_OP((res).i,-,(a).i)\
        (res).r -= (a).r; (res).i -= (a).i; \
    }while(0)


#ifdef FIXED_POINT
#  define KISS_FFT_COS(phase)  floor(.5+SAMP_MAX * cos (phase))
#  define KISS_FFT_SIN(phase)  floor(.5+SAMP_MAX * sin (phase))
#  define HALF_OF(x) ((x)>>1)
#elif defined(USE_SIMD)
#  define KISS_FFT_COS(phase) _mm_set1_ps( cos(phase) )
#  define KISS_FFT_SIN(phase) _mm_set1_ps( sin(phase) )
#  define HALF_OF(x) ((x)*_mm_set1_ps(.5))
#else
#  define KISS_FFT_COS(phase) (kiss_fft_scalar) cos(phase)
#  define KISS_FFT_SIN(phase) (kiss_fft_scalar) sin(phase)
#  define HALF_OF(x) ((x)*.5)
#endif

#define kf_cexp(x,phase) \
    do{ \
        (x)->r = KISS_FFT_COS(phase);\
        (x)->i = KISS_FFT_SIN(phase);\
    }while(0)


/* a debugging function */
#define pcpx(c)\
    fprintf(stderr,"%g + %gi\n",(double)((c)->r),(double)((c)->i) )


#ifdef KISS_FFT_USE_ALLOCA
// define this to allow use of alloca instead of malloc for temporary buffers
// Temporary buffers are used in two case:
// 1. FFT sizes that have "bad" factors. i.e. not 2,3 and 5
// 2. "in-place" FFTs. Notice the quotes, since kissfft does not really do an in-place transform.
#include <alloca.h>
#define KISS_FFT_TMP_ALLOC(nbytes) alloca(nbytes)
#define KISS_FFT_TMP_FREE(ptr)
#else
#define KISS_FFT_TMP_ALLOC(nbytes) KISS_FFT_MALLOC(nbytes)
#define KISS_FFT_TMP_FREE(ptr) KISS_FFT_FREE(ptr)
#endif
@@ -1,131 +0,0 @@
#ifndef KISS_FFT_H
#define KISS_FFT_H

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <string.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 ATTENTION!
 If you would like a :
 -- a utility that will handle the caching of fft objects
 -- real-only (no imaginary time component ) FFT
 -- a multi-dimensional FFT
 -- a command-line utility to perform ffts
 -- a command-line utility to perform fast-convolution filtering

 Then see kfc.h kiss_fftr.h kiss_fftnd.h fftutil.c kiss_fastfir.c
  in the tools/ directory.
*/

#ifdef USE_SIMD
# include <xmmintrin.h>
# define kiss_fft_scalar __m128
#define KISS_FFT_MALLOC(nbytes) _mm_malloc(nbytes,16)
#define KISS_FFT_FREE _mm_free
#else
#define KISS_FFT_MALLOC(X) (void*)(0) /* Patched. */
#define KISS_FFT_FREE(X)              /* Patched. */
#endif

// Patched automatically by download_dependencies.sh so default is 16 bit.
#ifndef FIXED_POINT
#define FIXED_POINT (16)
#endif
// End patch.

#ifdef FIXED_POINT
#include <stdint.h>    /* Patched. */
#include <sys/types.h>
# if (FIXED_POINT == 32)
#  define kiss_fft_scalar int32_t
# else
#  define kiss_fft_scalar int16_t
# endif
#else
# ifndef kiss_fft_scalar
/*  default is float */
#  define kiss_fft_scalar float
# endif
#endif

typedef struct {
    kiss_fft_scalar r;
    kiss_fft_scalar i;
}kiss_fft_cpx;

typedef struct kiss_fft_state* kiss_fft_cfg;

/*
 *  kiss_fft_alloc
 *
 *  Initialize a FFT (or IFFT) algorithm's cfg/state buffer.
 *
 *  typical usage:      kiss_fft_cfg mycfg=kiss_fft_alloc(1024,0,NULL,NULL);
 *
 *  The return value from fft_alloc is a cfg buffer used internally
 *  by the fft routine or NULL.
 *
 *  If lenmem is NULL, then kiss_fft_alloc will allocate a cfg buffer using malloc.
 *  The returned value should be free()d when done to avoid memory leaks.
 *
 *  The state can be placed in a user supplied buffer 'mem':
 *  If lenmem is not NULL and mem is not NULL and *lenmem is large enough,
 *      then the function places the cfg in mem and the size used in *lenmem
 *      and returns mem.
 *
 *  If lenmem is not NULL and ( mem is NULL or *lenmem is not large enough),
 *      then the function returns NULL and places the minimum cfg
 *      buffer size in *lenmem.
 * */

kiss_fft_cfg kiss_fft_alloc(int nfft,int inverse_fft,void * mem,size_t * lenmem);

/*
 * kiss_fft(cfg,in_out_buf)
 *
 * Perform an FFT on a complex input buffer.
 * for a forward FFT,
 * fin should be  f[0] , f[1] , ... ,f[nfft-1]
 * fout will be   F[0] , F[1] , ... ,F[nfft-1]
 * Note that each element is complex and can be accessed like
    f[k].r and f[k].i
 * */
void kiss_fft(kiss_fft_cfg cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout);

/*
 A more generic version of the above function. It reads its input from every Nth sample.
 * */
void kiss_fft_stride(kiss_fft_cfg cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout,int fin_stride);

/* If kiss_fft_alloc allocated a buffer, it is one contiguous
   buffer and can be simply free()d when no longer needed*/
#define kiss_fft_free free

/*
 Cleans up some memory that gets managed internally. Not necessary to call, but it might clean up
 your compiler output to call this before you exit.
*/
void kiss_fft_cleanup(void);

/*
 * Returns the smallest integer k, such that k>=n and k has only "fast" factors (2,3,5)
 */
int kiss_fft_next_fast_size(int n);

/* for real ffts, we need an even size */
#define kiss_fftr_next_fast_size_real(n) \
        (kiss_fft_next_fast_size( ((n)+1)>>1)<<1)

#ifdef __cplusplus
}
#endif

#endif
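Editor's note: because the patched non-SIMD branch above stubs out KISS_FFT_MALLOC, the practical way to use this vendored copy is the caller-supplied-buffer path of kiss_fft_alloc. A hedged sketch of the two-call pattern (illustration only; any static arena can stand in for malloc):

    /* Editor's illustration: query the state size, then allocate it yourself. */
    #include <stdlib.h>

    void demo_fft(const kiss_fft_cpx* fin, kiss_fft_cpx* fout, int nfft) {
        size_t lenmem = 0;
        kiss_fft_alloc(nfft, 0, NULL, &lenmem);   /* 1st call: returns NULL, sets lenmem */
        void* mem = malloc(lenmem);
        kiss_fft_cfg cfg = kiss_fft_alloc(nfft, 0, mem, &lenmem);
        kiss_fft(cfg, fin, fout);                 /* fout[k] = F[k], k = 0..nfft-1 */
        free(mem);                                /* the cfg lives inside mem */
    }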
@@ -1,46 +0,0 @@
#ifndef KISS_FTR_H
#define KISS_FTR_H

#include "kiss_fft.h"
#ifdef __cplusplus
extern "C" {
#endif

/*
 Real optimized version can save about 45% cpu time vs. complex fft of a real seq.
*/

typedef struct kiss_fftr_state *kiss_fftr_cfg;

kiss_fftr_cfg kiss_fftr_alloc(int nfft,int inverse_fft,void * mem, size_t * lenmem);
/*
 nfft must be even

 If you don't care to allocate space, use mem = lenmem = NULL
*/

void kiss_fftr(kiss_fftr_cfg cfg,const kiss_fft_scalar *timedata,kiss_fft_cpx *freqdata);
/*
 input timedata has nfft scalar points
 output freqdata has nfft/2+1 complex points
*/

void kiss_fftri(kiss_fftr_cfg cfg,const kiss_fft_cpx *freqdata,kiss_fft_scalar *timedata);
/*
 input freqdata has  nfft/2+1 complex points
 output timedata has nfft scalar points
*/

#define kiss_fftr_free free

#ifdef __cplusplus
}
#endif
#endif
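Editor's note: the real-input variant follows the same caller-supplied-buffer pattern; a sketch (illustration only), with nfft even and nfft/2+1 complex output bins:

    /* Editor's illustration: real FFT via a caller-supplied state buffer. */
    #include <stdlib.h>

    void demo_rfft(const kiss_fft_scalar* timedata, kiss_fft_cpx* freqdata, int nfft) {
        size_t lenmem = 0;
        kiss_fftr_alloc(nfft, 0, NULL, &lenmem);   /* query state size; nfft must be even */
        void* mem = malloc(lenmem);
        kiss_fftr_cfg cfg = kiss_fftr_alloc(nfft, 0, mem, &lenmem);
        kiss_fftr(cfg, timedata, freqdata);        /* fills freqdata[0..nfft/2] */
        free(mem);
    }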
@@ -1,203 +0,0 @@
/* Copyright 2020 Google LLC. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef RUY_RUY_PROFILER_INSTRUMENTATION_H_
#define RUY_RUY_PROFILER_INSTRUMENTATION_H_

#ifdef RUY_PROFILER
#include <cstdio>
#include <mutex>
#include <vector>
#endif

namespace ruy {
namespace profiler {

#ifdef RUY_PROFILER

// A label is how a code scope is annotated to appear in profiles.
// The stacks that are sampled by the profiler are stacks of such labels.
// A label consists of a literal string, plus optional integer arguments.
class Label {
 public:
  Label() {}
  template <typename... Args>
  explicit Label(Args... args) {
    Set(args...);
  }
  void Set(const char* format) {
    format_ = format;
    args_count_ = 0;
  }
  template <typename... Args>
  void Set(const char* format, Args... args) {
    format_ = format;
    args_count_ = sizeof...(args);
    SetArgs(0, args...);
  }

  void operator=(const Label& other);

  bool operator==(const Label& other) const;

  std::string Formatted() const;
  const char* format() const { return format_; }

 private:
  void SetArgs(int position, int arg0) { args_[position] = arg0; }

  template <typename... Args>
  void SetArgs(int position, int arg0, Args... args) {
    SetArgs(position, arg0);
    SetArgs(position + 1, args...);
  }

  static constexpr int kMaxArgs = 4;
  const char* format_ = nullptr;
  int args_count_ = 0;
  int args_[kMaxArgs];
};

namespace detail {

// Forward-declaration, see class ThreadStack below.
class ThreadStack;

bool& GlobalIsProfilerRunning();

// Returns the global vector of pointers to all stacks, there being one stack
// per thread executing instrumented code.
std::vector<ThreadStack*>* GlobalAllThreadStacks();

// Returns the mutex to be locked around any access to GlobalAllThreadStacks().
std::mutex* GlobalsMutex();

// Returns the thread-local stack, specific to the current thread.
ThreadStack* ThreadLocalThreadStack();

// This 'stack' is what may be more appropriately called a 'pseudostack':
// It contains Label entries that are 'manually' entered by instrumentation
// code. It's unrelated to real call stacks.
struct Stack {
  std::uint32_t id = 0;
  static constexpr int kMaxSize = 64;
  int size = 0;
  Label labels[kMaxSize];
};

// Returns the buffer byte size required by CopyToSample.
int GetBufferSize(const Stack& stack);

// Copies this Stack into a byte buffer, called a 'sample'.
void CopyToBuffer(const Stack& stack, char* dst);

// Populates this Stack from an existing sample buffer, typically
// produced by CopyToSample.
void ReadFromBuffer(const char* src, Stack* stack);

// ThreadStack is meant to be used as a thread-local singleton, assigning to
// each thread a Stack object holding its pseudo-stack of profile labels,
// plus a mutex allowing to synchronize accesses to this pseudo-stack between
// this thread and a possible profiler thread sampling it.
class ThreadStack {
 public:
  ThreadStack();
  ~ThreadStack();

  const Stack& stack() const { return stack_; }

  // Returns the mutex to lock around any access to this stack. Each stack is
  // accessed by potentially two threads: the thread that it belongs to
  // (which calls Push and Pop) and the profiler thread during profiling
  // (which calls CopyToSample).
  std::mutex& Mutex() const { return mutex_; }

  // Pushes a new label on the top of this Stack.
  template <typename... Args>
  void Push(Args... args) {
    // This mutex locking is needed to guard against race conditions as both
    // the current thread and the profiler thread may be concurrently accessing
    // this stack. In addition to that, this mutex locking also serves the other
    // purpose of acting as a barrier (of compiler code reordering, of runtime
    // CPU instruction reordering, and of memory access reordering), which
    // gives a measure of correctness to this profiler. The downside is some
    // latency. As this lock will be uncontended most of the times, the cost
    // should be roughly that of an sequentially-consistent atomic access,
    // comparable to an access to the level of CPU data cache that is shared
    // among all cores, typically 60 cycles on current ARM CPUs, plus side
    // effects from barrier instructions.
    std::lock_guard<std::mutex> lock(mutex_);
    // Avoid overrunning the stack, even in 'release' builds. This profiling
    // instrumentation code should not ship in release builds anyway, the
    // overhead of this check is negligible, and overrunning a stack array would
    // be bad.
    if (stack_.size >= Stack::kMaxSize) {
      abort();
    }
    stack_.labels[stack_.size++].Set(args...);
  }

  // Pops the top-most label from this Stack.
  void Pop() {
    // See the comment in Push about this lock. While it would be tempting to
    // try to remove this lock and just atomically decrement size_ with a
    // store-release, that would not necessarily be a substitute for all of the
    // purposes that this lock serves, or if it was done carefully to serve all
    // of the same purposes, then that wouldn't be faster than this (mostly
    // uncontended) lock.
    std::lock_guard<std::mutex> lock(mutex_);
    stack_.size--;
  }

 private:
  mutable std::mutex mutex_;
  Stack stack_;
};

}  // namespace detail

// RAII user-facing way to construct Labels associated with their life scope
// and get them pushed to / popped from the current thread stack.
class ScopeLabel {
 public:
  template <typename... Args>
  ScopeLabel(Args... args) : thread_stack_(detail::ThreadLocalThreadStack()) {
    thread_stack_->Push(args...);
  }

  ~ScopeLabel() { thread_stack_->Pop(); }

 private:
  detail::ThreadStack* thread_stack_;
};

#else  // no RUY_PROFILER

class ScopeLabel {
 public:
  template <typename... Args>
  explicit ScopeLabel(Args...) {}

  // This destructor is needed to consistently silence clang's -Wunused-variable
  // which seems to trigger semi-randomly.
  ~ScopeLabel() {}
};

#endif

}  // namespace profiler
}  // namespace ruy

#endif  // RUY_RUY_PROFILER_INSTRUMENTATION_H_
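Editor's note: instrumented ruy code uses the RAII ScopeLabel above; when RUY_PROFILER is undefined, the no-op variant compiles the labels away. A sketch of typical usage (hypothetical function, illustration only):

    // Editor's illustration: a label with integer arguments, pushed on entry
    // and popped automatically when `label` leaves scope.
    void PackBlock(int rows, int cols) {
      ruy::profiler::ScopeLabel label("PackBlock (%d x %d)", rows, cols);
      // ... actual packing work ...
    }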
@@ -21,7 +21,7 @@ limitations under the License.
 // Also update tensorflow/tensorflow.bzl and
 // tensorflow/tools/pip_package/setup.py
 #define TF_MAJOR_VERSION 2
-#define TF_MINOR_VERSION 1
+#define TF_MINOR_VERSION 5
 #define TF_PATCH_VERSION 0

 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
@@ -108,7 +108,7 @@ limitations under the License.

 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 389  // Updated: 2020/5/2
+#define TF_GRAPH_DEF_VERSION 578  // Updated: 2020/11/7

 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //
@@ -67,8 +67,9 @@ typedef struct {
 typedef enum {
   kTfLiteActNone = 0,
   kTfLiteActRelu,
-  kTfLiteActRelu1,  // min(max(-1, x), 1)
+  kTfLiteActReluN1To1,  // min(max(-1, x), 1)
+  kTfLiteActRelu1 = kTfLiteActReluN1To1,  // kTfLiteActRelu1 will be deprecated.
   kTfLiteActRelu6,  // min(max(0, x), 6)
   kTfLiteActTanh,
   kTfLiteActSignBit,
   kTfLiteActSigmoid,
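Editor's note: the rename above keeps source compatibility, since the old enumerator is retained as an alias of the new one; existing code such as

    TfLiteFusedActivation act = kTfLiteActRelu1;  // same value as kTfLiteActReluN1To1

continues to compile unchanged until kTfLiteActRelu1 is finally removed.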
@@ -198,6 +199,8 @@ typedef struct {

 typedef struct {
   TfLiteFusedActivation activation;
+  // Parameter added for the version 4.
+  bool pot_scale_int16;
 } TfLiteAddParams;

 typedef struct {
@@ -219,6 +222,8 @@ typedef struct {

 typedef struct {
   TfLiteFusedActivation activation;
+  // Parameter added for the version 5.
+  bool pot_scale_int16;
 } TfLiteSubParams;

 typedef struct {
@@ -297,6 +302,7 @@ typedef struct {

 typedef struct {
   bool align_corners;
+  bool half_pixel_centers;
 } TfLiteResizeNearestNeighborParams;

 typedef struct {
@@ -459,6 +465,15 @@ typedef struct {
   int body_subgraph_index;
 } TfLiteWhileParams;

+typedef struct {
+  bool exclusive;
+  bool reverse;
+} TfLiteCumsumParams;
+
+typedef struct {
+  int init_subgraph_index;
+} TfLiteCallOnceParams;
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
@@ -79,7 +79,8 @@ TfLiteFloatArray* TfLiteFloatArrayCreate(int size) {
 void TfLiteFloatArrayFree(TfLiteFloatArray* a) { free(a); }

 void TfLiteTensorDataFree(TfLiteTensor* t) {
-  if (t->allocation_type == kTfLiteDynamic) {
+  if (t->allocation_type == kTfLiteDynamic ||
+      t->allocation_type == kTfLitePersistentRo) {
     free(t->data.raw);
   }
   t->data.raw = NULL;
@@ -172,7 +173,8 @@ void TfLiteTensorReset(TfLiteType type, const char* name, TfLiteIntArray* dims,
 }

 void TfLiteTensorRealloc(size_t num_bytes, TfLiteTensor* tensor) {
-  if (tensor->allocation_type != kTfLiteDynamic) {
+  if (tensor->allocation_type != kTfLiteDynamic &&
+      tensor->allocation_type != kTfLitePersistentRo) {
     return;
   }
   // TODO(b/145340303): Tensor data should be aligned.
@@ -205,6 +207,8 @@ const char* TfLiteTypeGetName(TfLiteType type) {
       return "BOOL";
     case kTfLiteComplex64:
       return "COMPLEX64";
+    case kTfLiteComplex128:
+      return "COMPLEX128";
     case kTfLiteString:
       return "STRING";
    case kTfLiteFloat16:
@@ -29,6 +29,9 @@ limitations under the License.
 // TfLiteDelegate - allows delegation of nodes to alternative backends.
 //
 // Some abstractions in this file are created and managed by Interpreter.
+//
+// NOTE: The order of values in these structs are "semi-ABI stable". New values
+// should be added only to the end of structs and never reordered.

 #ifndef TENSORFLOW_LITE_C_COMMON_H_
 #define TENSORFLOW_LITE_C_COMMON_H_
@@ -43,8 +46,18 @@ extern "C" {

 typedef enum TfLiteStatus {
   kTfLiteOk = 0,
+
+  // Generally referring to an error in the runtime (i.e. interpreter)
   kTfLiteError = 1,
-  kTfLiteDelegateError = 2
+
+  // Generally referring to an error from a TfLiteDelegate itself.
+  kTfLiteDelegateError = 2,
+
+  // Generally referring to an error in applying a delegate due to
+  // incompatibility between runtime and delegate, e.g., this error is returned
+  // when trying to apply a TfLite delegate onto a model graph that's already
+  // immutable.
+  kTfLiteApplicationError = 3
 } TfLiteStatus;

 // The list of external context types known to TF Lite. This list exists solely
@@ -55,7 +68,7 @@ typedef enum TfLiteExternalContextType {
   kTfLiteEigenContext = 0,       // include eigen_support.h to use.
   kTfLiteGemmLowpContext = 1,    // include gemm_support.h to use.
   kTfLiteEdgeTpuContext = 2,     // Placeholder for Edge TPU support.
-  kTfLiteCpuBackendContext = 3,  // include cpu_backend_support.h to use.
+  kTfLiteCpuBackendContext = 3,  // include cpu_backend_context.h to use.
   kTfLiteMaxExternalContexts = 4
 } TfLiteExternalContextType;

@@ -83,8 +96,9 @@ typedef struct TfLiteIntArray {
   int size;
 // gcc 6.1+ have a bug where flexible members aren't properly handled
 // https://github.com/google/re2/commit/b94b7cd42e9f02673cd748c1ac1d16db4052514c
-#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ == 6 && \
-    __GNUC_MINOR__ >= 1
+#if (!defined(__clang__) && defined(__GNUC__) && __GNUC__ == 6 && \
+     __GNUC_MINOR__ >= 1) ||                                      \
+    defined(HEXAGON) || (__clang_major__ == 7 && __clang_minor__ == 1)
   int data[0];
 #else
   int data[];
@@ -122,6 +136,7 @@ typedef struct TfLiteFloatArray {
   int size;
 // gcc 6.1+ have a bug where flexible members aren't properly handled
 // https://github.com/google/re2/commit/b94b7cd42e9f02673cd748c1ac1d16db4052514c
+// This also applies to the toolchain used for Qualcomm Hexagon DSPs.
 #if !defined(__clang__) && defined(__GNUC__) && __GNUC__ == 6 && \
     __GNUC_MINOR__ >= 1
   float data[0];
@@ -200,6 +215,7 @@ void TfLiteFloatArrayFree(TfLiteFloatArray* a);
 // the current function, while also reporting the location of the error.
 // `a` and `b` may be evaluated more than once, so no side effects or
 // extremely expensive computations should be done.
+// NOTE: Use TF_LITE_ENSURE_TYPES_EQ if comparing TfLiteTypes.
 #define TF_LITE_ENSURE_EQ(context, a, b) \
   do { \
     if ((a) != (b)) { \
@@ -219,6 +235,17 @@ void TfLiteFloatArrayFree(TfLiteFloatArray* a);
     } \
   } while (0)

+#define TF_LITE_ENSURE_NEAR(context, a, b, epsilon)                          \
+  do {                                                                       \
+    auto delta = ((a) > (b)) ? ((a) - (b)) : ((b) - (a));                    \
+    if (delta > epsilon) {                                                   \
+      TF_LITE_KERNEL_LOG((context), "%s:%d %s not near %s (%f != %f)",       \
+                         __FILE__, __LINE__, #a, #b, static_cast<double>(a), \
+                         static_cast<double>(b));                            \
+      return kTfLiteError;                                                   \
+    }                                                                        \
+  } while (0)
+
 #define TF_LITE_ENSURE_OK(context, status) \
   do { \
     const TfLiteStatus s = (status); \
@@ -227,11 +254,32 @@ void TfLiteFloatArrayFree(TfLiteFloatArray* a);
     } \
   } while (0)

+// Define TFL_CAPI_EXPORT macro to export a function properly with a shared
+// library.
+#ifdef SWIG
+#define TFL_CAPI_EXPORT
+#else
+#if defined(_WIN32)
+#ifdef TFL_COMPILE_LIBRARY
+#define TFL_CAPI_EXPORT __declspec(dllexport)
+#else
+#define TFL_CAPI_EXPORT __declspec(dllimport)
+#endif  // TFL_COMPILE_LIBRARY
+#else
+#define TFL_CAPI_EXPORT __attribute__((visibility("default")))
+#endif  // _WIN32
+#endif  // SWIG
+
 // Single-precision complex data type compatible with the C99 definition.
 typedef struct TfLiteComplex64 {
   float re, im;  // real and imaginary parts, respectively.
 } TfLiteComplex64;

+// Double-precision complex data type compatible with the C99 definition.
+typedef struct TfLiteComplex128 {
+  double re, im;  // real and imaginary parts, respectively.
+} TfLiteComplex128;
+
 // Half precision data type compatible with the C99 definition.
 typedef struct TfLiteFloat16 {
   uint16_t data;
@@ -251,6 +299,7 @@ typedef enum {
   kTfLiteInt8 = 9,
   kTfLiteFloat16 = 10,
   kTfLiteFloat64 = 11,
+  kTfLiteComplex128 = 12,
 } TfLiteType;

 // Return the name of a given type, for error reporting purposes.
@@ -307,26 +356,39 @@ typedef union TfLitePtrUnion {
   int64_t* i64;
   float* f;
   TfLiteFloat16* f16;
+  double* f64;
   char* raw;
   const char* raw_const;
   uint8_t* uint8;
   bool* b;
   int16_t* i16;
   TfLiteComplex64* c64;
+  TfLiteComplex128* c128;
   int8_t* int8;
   /* Only use this member. */
   void* data;
 } TfLitePtrUnion;

-// Memory allocation strategies. kTfLiteMmapRo is for read-only memory-mapped
-// data (or data externally allocated). kTfLiteArenaRw is arena allocated
-// data. kTfLiteDynamic is for tensors that are allocated during evaluation.
+// Memory allocation strategies.
+//  * kTfLiteMmapRo: Read-only memory-mapped data, or data externally allocated.
+//  * kTfLiteArenaRw: Arena allocated with no guarantees about persistence,
+//        and available during eval.
+//  * kTfLiteArenaRwPersistent: Arena allocated but persistent across eval, and
+//        only available during eval.
+//  * kTfLiteDynamic: Allocated during eval, or for string tensors.
+//  * kTfLitePersistentRo: Allocated and populated during prepare. This is
+//        useful for tensors that can be computed during prepare and treated
+//        as constant inputs for downstream ops (also in prepare).
+//  * kTfLiteCustom: Custom memory allocation provided by the user. See
+//        TfLiteCustomAllocation below.
 typedef enum TfLiteAllocationType {
   kTfLiteMemNone = 0,
   kTfLiteMmapRo,
   kTfLiteArenaRw,
   kTfLiteArenaRwPersistent,
   kTfLiteDynamic,
+  kTfLitePersistentRo,
+  kTfLiteCustom,
 } TfLiteAllocationType;

 // The delegates should use zero or positive integers to represent handles.
@@ -359,8 +421,18 @@ typedef struct TfLiteSparsity {
   int dim_metadata_size;
 } TfLiteSparsity;

-// An tensor in the interpreter system which is a wrapper around a buffer of
+// Defines a custom memory allocation not owned by the runtime.
+// `data` should be aligned to kDefaultTensorAlignment defined in
+// lite/util.h. (Currently 64 bytes)
+// NOTE: See Interpreter.SetCustomAllocationForTensor for details on usage.
+typedef struct TfLiteCustomAllocation {
+  void* data;
+  size_t bytes;
+} TfLiteCustomAllocation;
+
+// A tensor in the interpreter system which is a wrapper around a buffer of
 // data including a dimensionality (or NULL if not currently defined).
+#ifndef TF_LITE_STATIC_MEMORY
 typedef struct TfLiteTensor {
   // The data type specification for data stored in `data`. This affects
   // what member of `data` union should be used.
@@ -426,31 +498,6 @@ typedef struct TfLiteTensor {
   const TfLiteIntArray* dims_signature;
 } TfLiteTensor;

-#ifndef TF_LITE_STATIC_MEMORY
-// Free data memory of tensor `t`.
-void TfLiteTensorDataFree(TfLiteTensor* t);
-
-// Free quantization data.
-void TfLiteQuantizationFree(TfLiteQuantization* quantization);
-
-// Free sparsity parameters.
-void TfLiteSparsityFree(TfLiteSparsity* sparsity);
-
-// Free memory of tensor `t`.
-void TfLiteTensorFree(TfLiteTensor* t);
-
-// Set all of a tensor's fields (and free any previously allocated data).
-void TfLiteTensorReset(TfLiteType type, const char* name, TfLiteIntArray* dims,
-                       TfLiteQuantizationParams quantization, char* buffer,
-                       size_t size, TfLiteAllocationType allocation_type,
-                       const void* allocation, bool is_variable,
-                       TfLiteTensor* tensor);
-
-// Resize the allocated data of a (dynamic) tensor. Tensors with allocation
-// types other than kTfLiteDynamic will be ignored.
-void TfLiteTensorRealloc(size_t num_bytes, TfLiteTensor* tensor);
-#endif  // TF_LITE_STATIC_MEMORY
-
 // A structure representing an instance of a node.
 // This structure only exhibits the inputs, outputs and user defined data, not
 // other features like the type.
@@ -487,6 +534,130 @@ typedef struct TfLiteNode {
   // WARNING: This is an experimental interface that is subject to change.
   struct TfLiteDelegate* delegate;
 } TfLiteNode;
+#else   // defined(TF_LITE_STATIC_MEMORY)?
+// NOTE: This flag is opt-in only at compile time.
+//
+// Specific reduced TfLiteTensor struct for TF Micro runtime. This struct
+// contains only the minimum fields required to initialize and prepare a micro
+// inference graph. The fields in this struct have been ordered from
+// largest-to-smallest for optimal struct sizeof.
+//
+// This struct does not use:
+// - allocation
+// - buffer_handle
+// - data_is_stale
+// - delegate
+// - dims_signature
+// - name
+// - sparsity
+typedef struct TfLiteTensor {
+  // TODO(b/155784997): Consider consolidating these quantization fields:
+  // Quantization information. Replaces params field above.
+  TfLiteQuantization quantization;
+
+  // Quantization information.
+  TfLiteQuantizationParams params;
+
+  // A union of data pointers. The appropriate type should be used for a typed
+  // tensor based on `type`.
+  TfLitePtrUnion data;
+
+  // A pointer to a structure representing the dimensionality interpretation
+  // that the buffer should have. NOTE: the product of elements of `dims`
+  // and the element datatype size should be equal to `bytes` below.
+  TfLiteIntArray* dims;
+
+  // The number of bytes required to store the data of this Tensor. I.e.
+  // (bytes of each element) * dims[0] * ... * dims[n-1].  For example, if
+  // type is kTfLiteFloat32 and dims = {3, 2} then
+  // bytes = sizeof(float) * 3 * 2 = 4 * 3 * 2 = 24.
+  size_t bytes;
+
+  // The data type specification for data stored in `data`. This affects
+  // what member of `data` union should be used.
+  TfLiteType type;
+
+  // How memory is mapped
+  //  kTfLiteMmapRo: Memory mapped read only.
+  //  i.e. weights
+  //  kTfLiteArenaRw: Arena allocated read write memory
+  //  (i.e. temporaries, outputs).
+  TfLiteAllocationType allocation_type;
+
+  // True if the tensor is a variable.
+  bool is_variable;
+} TfLiteTensor;
+
+// Specific reduced TfLiteNode struct for TF Micro runtime. This struct contains
+// only the minimum fields required to represent a node.
+//
+// This struct does not use:
+// - delegate
+// - intermediates
+// - temporaries
+typedef struct TfLiteNode {
+  // Inputs to this node expressed as indices into the simulator's tensors.
+  TfLiteIntArray* inputs;
+
+  // Outputs to this node expressed as indices into the simulator's tensors.
+  TfLiteIntArray* outputs;
+
+  // Opaque data provided by the node implementer through `Registration.init`.
+  void* user_data;
+
+  // Opaque data provided to the node if the node is a builtin. This is usually
+  // a structure defined in builtin_op_data.h
+  void* builtin_data;
+
+  // Custom initial data. This is the opaque data provided in the flatbuffer.
+  // WARNING: This is an experimental interface that is subject to change.
+  const void* custom_initial_data;
+  int custom_initial_data_size;
+} TfLiteNode;
+#endif  // TF_LITE_STATIC_MEMORY
+
+// Light-weight tensor struct for TF Micro runtime. Provides the minimal amount
+// of information required for a kernel to run during TfLiteRegistration::Eval.
+// TODO(b/160955687): Move this field into TF_LITE_STATIC_MEMORY when TFLM
+// builds with this flag by default internally.
+typedef struct TfLiteEvalTensor {
+  // A union of data pointers. The appropriate type should be used for a typed
+  // tensor based on `type`.
+  TfLitePtrUnion data;
+
+  // A pointer to a structure representing the dimensionality interpretation
+  // that the buffer should have.
+  TfLiteIntArray* dims;
+
+  // The data type specification for data stored in `data`. This affects
+  // what member of `data` union should be used.
+  TfLiteType type;
+} TfLiteEvalTensor;
+
+#ifndef TF_LITE_STATIC_MEMORY
+// Free data memory of tensor `t`.
+void TfLiteTensorDataFree(TfLiteTensor* t);
+
+// Free quantization data.
+void TfLiteQuantizationFree(TfLiteQuantization* quantization);
+
+// Free sparsity parameters.
+void TfLiteSparsityFree(TfLiteSparsity* sparsity);
+
+// Free memory of tensor `t`.
+void TfLiteTensorFree(TfLiteTensor* t);
+
+// Set all of a tensor's fields (and free any previously allocated data).
+void TfLiteTensorReset(TfLiteType type, const char* name, TfLiteIntArray* dims,
+                       TfLiteQuantizationParams quantization, char* buffer,
+                       size_t size, TfLiteAllocationType allocation_type,
+                       const void* allocation, bool is_variable,
+                       TfLiteTensor* tensor);
+
+// Resize the allocated data of a (dynamic) tensor. Tensors with allocation
+// types other than kTfLiteDynamic will be ignored.
+void TfLiteTensorRealloc(size_t num_bytes, TfLiteTensor* tensor);
+#endif  // TF_LITE_STATIC_MEMORY
+
 // WARNING: This is an experimental interface that is subject to change.
 //
@@ -578,12 +749,11 @@ typedef struct TfLiteContext {
   void* profiler;

   // Allocate persistent buffer which has the same life time as the interpreter.
+  // Returns nullptr on failure.
   // The memory is allocated from heap for TFL, and from tail in TFLM.
-  // If *ptr is not nullptr, the pointer will be reallocated.
-  // This method is only available in Prepare stage.
+  // This method is only available in Init or Prepare stage.
   // WARNING: This is an experimental interface that is subject to change.
-  TfLiteStatus (*AllocatePersistentBuffer)(struct TfLiteContext* ctx,
-                                           size_t bytes, void** ptr);
+  void* (*AllocatePersistentBuffer)(struct TfLiteContext* ctx, size_t bytes);

   // Allocate a buffer which will be deallocated right after invoke phase.
   // The memory is allocated from heap in TFL, and from volatile arena in TFLM.
@@ -638,6 +808,18 @@ typedef struct TfLiteContext {
   TfLiteStatus (*PreviewDelegatePartitioning)(
       struct TfLiteContext* context, const TfLiteIntArray* nodes_to_replace,
       TfLiteDelegateParams** partition_params_array, int* num_partitions);
+
+  // Returns a TfLiteTensor struct for a given index.
+  // WARNING: This is an experimental interface that is subject to change.
+  // WARNING: This method may not be available on all platforms.
+  TfLiteTensor* (*GetTensor)(const struct TfLiteContext* context,
+                             int tensor_idx);
+
+  // Returns a TfLiteEvalTensor struct for a given index.
+  // WARNING: This is an experimental interface that is subject to change.
+  // WARNING: This method may not be available on all platforms.
+  TfLiteEvalTensor* (*GetEvalTensor)(const struct TfLiteContext* context,
+                                     int tensor_idx);
 } TfLiteContext;

 typedef struct TfLiteRegistration {
@@ -712,7 +894,26 @@ typedef enum TfLiteDelegateFlags {
   //
   // If the delegate isn't capable to handle dynamic tensors, this flag need
   // to be set to false.
-  kTfLiteDelegateFlagsAllowDynamicTensors = 1
+  kTfLiteDelegateFlagsAllowDynamicTensors = 1,
+
+  // This flag can be used by delegates (that allow dynamic tensors) to ensure
+  // applicable tensor shapes are automatically propagated in the case of tensor
+  // resizing.
+  // This means that non-dynamic (allocation_type != kTfLiteDynamic) I/O tensors
+  // of a delegate kernel will have correct shapes before its Prepare() method
+  // is called. The runtime leverages TFLite builtin ops in the original
+  // execution plan to propagate shapes.
+  //
+  // A few points to note:
+  // 1. This requires kTfLiteDelegateFlagsAllowDynamicTensors. If that flag is
+  // false, this one is redundant since the delegate kernels are re-initialized
+  // every time tensors are resized.
+  // 2. Enabling this flag adds some overhead to AllocateTensors(), since extra
+  // work is required to prepare the original execution plan.
+  // 3. This flag requires that the original execution plan only have ops with
+  // valid registrations (and not 'dummy' custom ops like with Flex).
+  // WARNING: This feature is experimental and subject to change.
+  kTfLiteDelegateFlagsRequirePropagatedShapes = 2
 } TfLiteDelegateFlags;

 // WARNING: This is an experimental interface that is subject to change.
@@ -731,8 +932,9 @@ typedef struct TfLiteDelegate {
                                        struct TfLiteDelegate* delegate);

   // Copy the data from delegate buffer handle into raw memory of the given
-  // 'tensor'. This cannot be null. The delegate is allowed to allocate the raw
-  // bytes as long as it follows the rules for kTfLiteDynamic tensors.
+  // 'tensor'. Note that the delegate is allowed to allocate the raw bytes as
+  // long as it follows the rules for kTfLiteDynamic tensors, in which case this
+  // cannot be null.
   TfLiteStatus (*CopyFromBufferHandle)(TfLiteContext* context,
                                        struct TfLiteDelegate* delegate,
                                        TfLiteBufferHandle buffer_handle,
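Editor's note on the AllocatePersistentBuffer change above: the status-plus-out-parameter form is replaced by a direct return value, with nullptr signaling failure. A hedged migration sketch for kernel Init/Prepare code (OpData is a hypothetical per-op struct, illustration only):

    // Before: TfLiteStatus (*)(ctx, bytes, void** ptr)
    //   void* raw = nullptr;
    //   TF_LITE_ENSURE_OK(context, context->AllocatePersistentBuffer(
    //                                  context, sizeof(OpData), &raw));
    // After: void* (*)(ctx, bytes), returning nullptr on failure.
    OpData* data = static_cast<OpData*>(
        context->AllocatePersistentBuffer(context, sizeof(OpData)));
    if (data == nullptr) return kTfLiteError;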
File diff suppressed because it is too large
@@ -19,9 +19,12 @@ limitations under the License.
 // flatbuffer serialization format into in-memory values that are used by the
 // runtime API and interpreter.

+#include <cstddef>
+#include <new>
+#include <type_traits>
+
 #include "tensorflow/lite/c/common.h"
 #include "tensorflow/lite/core/api/error_reporter.h"
-#include "tensorflow/lite/core/api/op_resolver.h"
 #include "tensorflow/lite/schema/schema_generated.h"

 namespace tflite {
@@ -42,7 +45,7 @@ class BuiltinDataAllocator {
     // platform targets support that properly.
     static_assert(std::is_pod<T>::value, "Builtin data structure must be POD.");
     void* allocated_memory = this->Allocate(sizeof(T), alignof(T));
-    return new (allocated_memory) T;
+    return new (allocated_memory) T();
   }

   virtual ~BuiltinDataAllocator() {}
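Editor's note: the `T` to `T()` change above switches placement new from default-initialization to value-initialization, so the POD builtin-data structs come back zero-filled rather than holding indeterminate field values. A standalone illustration (hypothetical struct, not TFLite code):

    #include <new>

    struct Params { int axis; bool keep_dims; };  // hypothetical POD payload

    void demo(void* mem_a, void* mem_b) {
      Params* a = new (mem_a) Params;    // default-init: members indeterminate
      Params* b = new (mem_b) Params();  // value-init: members zeroed
      (void)a; (void)b;
    }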
@@ -66,6 +69,196 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
 TfLiteStatus ConvertTensorType(TensorType tensor_type, TfLiteType* type,
                                ErrorReporter* error_reporter);

+TfLiteStatus ParseAbs(const Operator* op, ErrorReporter* error_reporter,
+                      BuiltinDataAllocator* allocator, void** builtin_data);
+
+TfLiteStatus ParseAdd(const Operator* op, ErrorReporter* error_reporter,
+                      BuiltinDataAllocator* allocator, void** builtin_data);
+
+TfLiteStatus ParseArgMax(const Operator* op, ErrorReporter* error_reporter,
+                         BuiltinDataAllocator* allocator, void** builtin_data);
+
+TfLiteStatus ParseArgMin(const Operator* op, ErrorReporter* error_reporter,
+                         BuiltinDataAllocator* allocator, void** builtin_data);
+
+TfLiteStatus ParseCeil(const Operator* op, ErrorReporter* error_reporter,
+                       BuiltinDataAllocator* allocator, void** builtin_data);
+
+TfLiteStatus ParseConcatenation(const Operator* op,
+                                ErrorReporter* error_reporter,
+                                BuiltinDataAllocator* allocator,
+                                void** builtin_data);
+
+TfLiteStatus ParseConv2D(const Operator* op, ErrorReporter* error_reporter,
+                         BuiltinDataAllocator* allocator, void** builtin_data);
+
+TfLiteStatus ParseCos(const Operator* op, ErrorReporter* error_reporter,
+                      BuiltinDataAllocator* allocator, void** builtin_data);
+
+TfLiteStatus ParseDepthwiseConv2D(const Operator* op,
+                                  ErrorReporter* error_reporter,
+                                  BuiltinDataAllocator* allocator,
+                                  void** builtin_data);
+
+TfLiteStatus ParseDequantize(const Operator* op, ErrorReporter* error_reporter,
+                             BuiltinDataAllocator* allocator,
+                             void** builtin_data);
+
+TfLiteStatus ParseEqual(const Operator* op, ErrorReporter* error_reporter,
+                        BuiltinDataAllocator* allocator, void** builtin_data);
+
+TfLiteStatus ParseFloor(const Operator* op, ErrorReporter* error_reporter,
+                        BuiltinDataAllocator* allocator, void** builtin_data);
+
+TfLiteStatus ParseFullyConnected(const Operator* op,
+                                 ErrorReporter* error_reporter,
+                                 BuiltinDataAllocator* allocator,
+                                 void** builtin_data);
+
+TfLiteStatus ParseGreater(const Operator* op, ErrorReporter* error_reporter,
+                          BuiltinDataAllocator* allocator, void** builtin_data);
+
+TfLiteStatus ParseGreaterEqual(const Operator* op,
+                               ErrorReporter* error_reporter,
+                               BuiltinDataAllocator* allocator,
+                               void** builtin_data);
+
+TfLiteStatus ParseHardSwish(const Operator* op, ErrorReporter* error_reporter,
+                            BuiltinDataAllocator* allocator,
+                            void** builtin_data);
+
+TfLiteStatus ParseL2Normalization(const Operator* op,
+                                  ErrorReporter* error_reporter,
+                                  BuiltinDataAllocator* allocator,
+                                  void** builtin_data);
+
+TfLiteStatus ParseLess(const Operator* op, ErrorReporter* error_reporter,
+                       BuiltinDataAllocator* allocator, void** builtin_data);
+
+TfLiteStatus ParseLessEqual(const Operator* op, ErrorReporter* error_reporter,
+                            BuiltinDataAllocator* allocator,
+                            void** builtin_data);
+
+TfLiteStatus ParseLog(const Operator* op, ErrorReporter* error_reporter,
+                      BuiltinDataAllocator* allocator, void** builtin_data);
+
+TfLiteStatus ParseLogicalAnd(const Operator* op, ErrorReporter* error_reporter,
+                             BuiltinDataAllocator* allocator,
+                             void** builtin_data);
+
+TfLiteStatus ParseLogicalNot(const Operator* op, ErrorReporter* error_reporter,
+                             BuiltinDataAllocator* allocator,
+                             void** builtin_data);
+
+TfLiteStatus ParseLogicalOr(const Operator* op, ErrorReporter* error_reporter,
+                            BuiltinDataAllocator* allocator,
+                            void** builtin_data);
+
+TfLiteStatus ParseLogistic(const Operator* op, ErrorReporter* error_reporter,
+                           BuiltinDataAllocator* allocator,
+                           void** builtin_data);
+
+TfLiteStatus ParseMaximum(const Operator* op, ErrorReporter* error_reporter,
+                          BuiltinDataAllocator* allocator, void** builtin_data);
+
+TfLiteStatus ParseMinimum(const Operator* op, ErrorReporter* error_reporter,
+                          BuiltinDataAllocator* allocator, void** builtin_data);
+
+TfLiteStatus ParseMul(const Operator* op, ErrorReporter* error_reporter,
+                      BuiltinDataAllocator* allocator, void** builtin_data);
+
+TfLiteStatus ParseNeg(const Operator* op, ErrorReporter* error_reporter,
+                      BuiltinDataAllocator* allocator, void** builtin_data);
+
+TfLiteStatus ParseNotEqual(const Operator* op, ErrorReporter* error_reporter,
+                           BuiltinDataAllocator* allocator,
+                           void** builtin_data);
+
+TfLiteStatus ParsePack(const Operator* op, ErrorReporter* error_reporter,
+                       BuiltinDataAllocator* allocator, void** builtin_data);
+
+TfLiteStatus ParsePad(const Operator* op, ErrorReporter* error_reporter,
+                      BuiltinDataAllocator* allocator, void** builtin_data);
+
+TfLiteStatus ParsePadV2(const Operator* op, ErrorReporter* error_reporter,
+                        BuiltinDataAllocator* allocator, void** builtin_data);
+
+TfLiteStatus ParsePool(const Operator* op, ErrorReporter* error_reporter,
+                       BuiltinDataAllocator* allocator, void** builtin_data);
+
+TfLiteStatus ParsePrelu(const Operator* op, ErrorReporter* error_reporter,
+                        BuiltinDataAllocator* allocator, void** builtin_data);
+
+TfLiteStatus ParseQuantize(const Operator* op, ErrorReporter* error_reporter,
+                           BuiltinDataAllocator* allocator,
+                           void** builtin_data);
+
+TfLiteStatus ParseReducer(const Operator* op, ErrorReporter* error_reporter,
+                          BuiltinDataAllocator* allocator, void** builtin_data);
+
+TfLiteStatus ParseRelu(const Operator* op, ErrorReporter* error_reporter,
+                       BuiltinDataAllocator* allocator, void** builtin_data);
+
+TfLiteStatus ParseRelu6(const Operator* op, ErrorReporter* error_reporter,
+                        BuiltinDataAllocator* allocator, void** builtin_data);
+
+TfLiteStatus ParseReshape(const Operator* op, ErrorReporter* error_reporter,
+                          BuiltinDataAllocator* allocator, void** builtin_data);
+
+TfLiteStatus ParseResizeBilinear(const Operator* op,
+                                 ErrorReporter* error_reporter,
+                                 BuiltinDataAllocator* allocator,
+                                 void** builtin_data);
+
+TfLiteStatus ParseResizeNearestNeighbor(const Operator* op,
+                                        ErrorReporter* error_reporter,
+                                        BuiltinDataAllocator* allocator,
+                                        void** builtin_data);
+
+TfLiteStatus ParseRound(const Operator* op, ErrorReporter* error_reporter,
+                        BuiltinDataAllocator* allocator, void** builtin_data);
+
+TfLiteStatus ParseRsqrt(const Operator* op, ErrorReporter* error_reporter,
+                        BuiltinDataAllocator* allocator, void** builtin_data);
+
+TfLiteStatus ParseShape(const Operator* op, ErrorReporter* error_reporter,
+                        BuiltinDataAllocator* allocator, void** builtin_data);
+
+TfLiteStatus ParseSin(const Operator* op, ErrorReporter* error_reporter,
+                      BuiltinDataAllocator* allocator, void** builtin_data);
+
+TfLiteStatus ParseSoftmax(const Operator* op, ErrorReporter* error_reporter,
+                          BuiltinDataAllocator* allocator, void** builtin_data);
+
+TfLiteStatus ParseSplit(const Operator* op, ErrorReporter* error_reporter,
+                        BuiltinDataAllocator* allocator, void** builtin_data);
+
+TfLiteStatus ParseSplitV(const Operator* op, ErrorReporter* error_reporter,
+                         BuiltinDataAllocator* allocator, void** builtin_data);
+
+TfLiteStatus ParseSqrt(const Operator* op, ErrorReporter* error_reporter,
+                       BuiltinDataAllocator* allocator, void** builtin_data);
+
+TfLiteStatus ParseSquare(const Operator* op, ErrorReporter* error_reporter,
+                         BuiltinDataAllocator* allocator, void** builtin_data);
+
+TfLiteStatus ParseStridedSlice(const Operator* op,
+                               ErrorReporter* error_reporter,
+                               BuiltinDataAllocator* allocator,
+                               void** builtin_data);
+
+TfLiteStatus ParseSub(const Operator* op, ErrorReporter* error_reporter,
+                      BuiltinDataAllocator* allocator, void** builtin_data);
+
+TfLiteStatus ParseSvdf(const Operator* op, ErrorReporter* error_reporter,
+                       BuiltinDataAllocator* allocator, void** builtin_data);
+
+TfLiteStatus ParseTanh(const Operator* op, ErrorReporter* error_reporter,
+                       BuiltinDataAllocator* allocator, void** builtin_data);
+
+TfLiteStatus ParseUnpack(const Operator* op, ErrorReporter* error_reporter,
+                         BuiltinDataAllocator* allocator, void** builtin_data);
+
 }  // namespace tflite

 #endif  // TENSORFLOW_LITE_CORE_API_FLATBUFFER_CONVERSIONS_H_
@@ -15,6 +15,11 @@ limitations under the License.

 #include "tensorflow/lite/core/api/op_resolver.h"

+#include "flatbuffers/flatbuffers.h"  // from @flatbuffers
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/schema/schema_utils.h"
+
 namespace tflite {

 TfLiteStatus GetRegistrationFromOpCode(
@@ -22,7 +27,7 @@ TfLiteStatus GetRegistrationFromOpCode(
     ErrorReporter* error_reporter, const TfLiteRegistration** registration) {
   TfLiteStatus status = kTfLiteOk;
   *registration = nullptr;
-  auto builtin_code = opcode->builtin_code();
+  auto builtin_code = GetBuiltinCode(opcode);
   int version = opcode->version();

   if (builtin_code > BuiltinOperator_MAX ||
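Editor's note: GetBuiltinCode comes from the newly included schema_utils.h; as far as this editor knows, it reconciles the flatbuffer's old and new builtin-code fields (the schema was extended once builtin operator codes outgrew the original 8-bit field), which is why the direct opcode->builtin_code() accessor is replaced here.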
@@ -15,6 +15,8 @@ limitations under the License.
|
|||||||
#ifndef TENSORFLOW_LITE_CORE_API_OP_RESOLVER_H_
|
#ifndef TENSORFLOW_LITE_CORE_API_OP_RESOLVER_H_
|
||||||
#define TENSORFLOW_LITE_CORE_API_OP_RESOLVER_H_
|
#define TENSORFLOW_LITE_CORE_API_OP_RESOLVER_H_
|
||||||
|
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
#include "tensorflow/lite/c/common.h"
|
#include "tensorflow/lite/c/common.h"
|
||||||
#include "tensorflow/lite/core/api/error_reporter.h"
|
#include "tensorflow/lite/core/api/error_reporter.h"
|
||||||
#include "tensorflow/lite/schema/schema_generated.h"
|
#include "tensorflow/lite/schema/schema_generated.h"
|
||||||
@@ -32,6 +34,16 @@ class OpResolver {
|
|||||||
/// Finds the op registration of a custom operator by op name.
|
/// Finds the op registration of a custom operator by op name.
|
||||||
virtual const TfLiteRegistration* FindOp(const char* op,
|
virtual const TfLiteRegistration* FindOp(const char* op,
|
||||||
int version) const = 0;
|
int version) const = 0;
|
||||||
|
|
||||||
|
// Returns optional delegates for resolving and handling ops in the flatbuffer
|
||||||
|
// model. This may be used in addition to the standard TfLiteRegistration
|
||||||
|
// lookup for graph resolution.
|
||||||
|
using TfLiteDelegatePtrVector =
|
||||||
|
std::vector<std::unique_ptr<TfLiteDelegate, void (*)(TfLiteDelegate*)>>;
|
||||||
|
virtual TfLiteDelegatePtrVector GetDelegates(int num_threads) const {
|
||||||
|
return TfLiteDelegatePtrVector();
|
||||||
|
}
|
||||||
|
|
||||||
virtual ~OpResolver() {}
|
virtual ~OpResolver() {}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
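GetDelegates() is a non-pure virtual with a default empty result, so existing resolvers keep compiling unchanged. A resolver that wants delegate support overrides it; a minimal sketch, in which the delegate factory and destroy function are hypothetical:

class DelegatingResolverSketch : public tflite::OpResolver {
 public:
  const TfLiteRegistration* FindOp(tflite::BuiltinOperator op,
                                   int version) const override {
    return nullptr;  // lookup elided in this sketch
  }
  const TfLiteRegistration* FindOp(const char* op,
                                   int version) const override {
    return nullptr;  // lookup elided in this sketch
  }
  TfLiteDelegatePtrVector GetDelegates(int num_threads) const override {
    TfLiteDelegatePtrVector delegates;
    // e.g. delegates.emplace_back(CreateMyDelegate(num_threads),
    //                             &DestroyMyDelegate);  // hypothetical
    return delegates;
  }
};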
194  code/lib/tfmicro/tensorflow/lite/core/api/profiler.h  Normal file
@@ -0,0 +1,194 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_CORE_API_PROFILER_H_
+#define TENSORFLOW_LITE_CORE_API_PROFILER_H_
+
+#include <cstdint>
+
+namespace tflite {
+
+// A simple utility for enabling profiled event tracing in TensorFlow Lite.
+class Profiler {
+ public:
+  // As certain Profiler instance might be only interested in certain event
+  // types, we define each event type value to allow a Profiler to use
+  // bitmasking bitwise operations to determine whether an event should be
+  // recorded or not.
+  enum class EventType {
+    // Default event type, the metadata field has no special significance.
+    DEFAULT = 1,
+
+    // The event is an operator invocation and the event_metadata field is the
+    // index of operator node.
+    OPERATOR_INVOKE_EVENT = 2,
+
+    // The event is an invocation for an internal operator of a TFLite delegate.
+    // The event_metadata field is the index of operator node that's specific to
+    // the delegate.
+    DELEGATE_OPERATOR_INVOKE_EVENT = 4,
+
+    // The event is a recording of runtime instrumentation such as the overall
+    // TFLite runtime status, the TFLite delegate status (if a delegate
+    // is applied), and the overall model inference latency etc.
+    // Note, the delegate status and overall status are stored as separate
+    // event_metadata fields. In particular, the delegate status is encoded
+    // as DelegateStatus::full_status().
+    GENERAL_RUNTIME_INSTRUMENTATION_EVENT = 8,
+  };
+
+  virtual ~Profiler() {}
+
+  // Signals the beginning of an event and returns a handle to the profile
+  // event. The `event_metadata1` and `event_metadata2` have different
+  // interpretations based on the actual Profiler instance and the `event_type`.
+  // For example, as for the 'SubgraphAwareProfiler' defined in
+  // lite/core/subgraph.h, when the event_type is OPERATOR_INVOKE_EVENT,
+  // `event_metadata1` represents the index of a TFLite node, and
+  // `event_metadata2` represents the index of the subgraph that this event
+  // comes from.
+  virtual uint32_t BeginEvent(const char* tag, EventType event_type,
+                              int64_t event_metadata1,
+                              int64_t event_metadata2) = 0;
+  // Similar w/ the above, but `event_metadata2` defaults to 0.
+  uint32_t BeginEvent(const char* tag, EventType event_type,
+                      int64_t event_metadata) {
+    return BeginEvent(tag, event_type, event_metadata, /*event_metadata2*/ 0);
+  }
+
+  // Signals an end to the specified profile event with 'event_metadata's, This
+  // is useful when 'event_metadata's are not available when the event begins
+  // or when one wants to overwrite the 'event_metadata's set at the beginning.
+  virtual void EndEvent(uint32_t event_handle, int64_t event_metadata1,
+                        int64_t event_metadata2) {}
+  // Signals an end to the specified profile event.
+  virtual void EndEvent(uint32_t event_handle) = 0;
+
+  // Appends an event of type 'event_type' with 'tag' and 'event_metadata'
+  // which started at 'start' and ended at 'end'
+  // Note:
+  // In cases were ProfileSimmarizer and tensorflow::StatsCalculator are used
+  // they assume the value is in "usec", if in any case subclasses
+  // didn't put usec, then the values are not meaningful.
+  // TODO karimnosseir: Revisit and make the function more clear.
+  void AddEvent(const char* tag, EventType event_type, uint64_t start,
+                uint64_t end, int64_t event_metadata) {
+    AddEvent(tag, event_type, start, end, event_metadata,
+             /*event_metadata2*/ 0);
+  }
+
+  virtual void AddEvent(const char* tag, EventType event_type, uint64_t start,
+                        uint64_t end, int64_t event_metadata1,
+                        int64_t event_metadata2) {}
+
+ protected:
+  friend class ScopedProfile;
+};
+
+// Adds a profile event to `profiler` that begins with the construction
+// of the object and ends when the object goes out of scope.
+// The lifetime of tag should be at least the lifetime of `profiler`.
+// `profiler` may be null, in which case nothing is profiled.
+class ScopedProfile {
+ public:
+  ScopedProfile(Profiler* profiler, const char* tag,
+                Profiler::EventType event_type = Profiler::EventType::DEFAULT,
+                int64_t event_metadata = 0)
+      : profiler_(profiler), event_handle_(0) {
+    if (profiler) {
+      event_handle_ = profiler_->BeginEvent(tag, event_type, event_metadata);
+    }
+  }
+
+  ~ScopedProfile() {
+    if (profiler_) {
+      profiler_->EndEvent(event_handle_);
+    }
+  }
+
+ protected:
+  Profiler* profiler_;
+  uint32_t event_handle_;
+};
+
+class ScopedOperatorProfile : public ScopedProfile {
+ public:
+  ScopedOperatorProfile(Profiler* profiler, const char* tag, int node_index)
+      : ScopedProfile(profiler, tag, Profiler::EventType::OPERATOR_INVOKE_EVENT,
+                      static_cast<uint32_t>(node_index)) {}
+};
+
+class ScopedDelegateOperatorProfile : public ScopedProfile {
+ public:
+  ScopedDelegateOperatorProfile(Profiler* profiler, const char* tag,
+                                int node_index)
+      : ScopedProfile(profiler, tag,
+                      Profiler::EventType::DELEGATE_OPERATOR_INVOKE_EVENT,
+                      static_cast<uint32_t>(node_index)) {}
+};
+
+class ScopedRuntimeInstrumentationProfile : public ScopedProfile {
+ public:
+  ScopedRuntimeInstrumentationProfile(Profiler* profiler, const char* tag)
+      : ScopedProfile(
+            profiler, tag,
+            Profiler::EventType::GENERAL_RUNTIME_INSTRUMENTATION_EVENT, -1) {}
+
+  void set_runtime_status(int64_t delegate_status, int64_t interpreter_status) {
+    if (profiler_) {
+      delegate_status_ = delegate_status;
+      interpreter_status_ = interpreter_status;
+    }
+  }
+
+  ~ScopedRuntimeInstrumentationProfile() {
+    if (profiler_) {
+      profiler_->EndEvent(event_handle_, delegate_status_, interpreter_status_);
+    }
+  }
+
+ private:
+  int64_t delegate_status_;
+  int64_t interpreter_status_;
+};
+
+}  // namespace tflite
+
+#define TFLITE_VARNAME_UNIQ_IMPL(name, ctr) name##ctr
+#define TFLITE_VARNAME_UNIQ(name, ctr) TFLITE_VARNAME_UNIQ_IMPL(name, ctr)
+
+#define TFLITE_SCOPED_TAGGED_DEFAULT_PROFILE(profiler, tag)          \
+  tflite::ScopedProfile TFLITE_VARNAME_UNIQ(_profile_, __COUNTER__)( \
+      (profiler), (tag))
+
+#define TFLITE_SCOPED_TAGGED_OPERATOR_PROFILE(profiler, tag, node_index)     \
+  tflite::ScopedOperatorProfile TFLITE_VARNAME_UNIQ(_profile_, __COUNTER__)( \
+      (profiler), (tag), (node_index))
+
+#define TFLITE_SCOPED_DELEGATE_OPERATOR_PROFILE(profiler, tag, node_index) \
+  tflite::ScopedDelegateOperatorProfile TFLITE_VARNAME_UNIQ(               \
+      _profile_, __COUNTER__)((profiler), (tag), (node_index))
+
+#define TFLITE_ADD_RUNTIME_INSTRUMENTATION_EVENT(                          \
+    profiler, tag, delegate_status, interpreter_status)                    \
+  do {                                                                     \
+    if (!profiler) {                                                       \
+      const auto handle = profiler->BeginEvent(                            \
+          tag, Profiler::EventType::GENERAL_RUNTIME_INSTRUMENTATION_EVENT, \
+          delegate_status, interpreter_status);                            \
+      profiler->EndEvent(handle);                                          \
+    }                                                                      \
+  } while (false);
+
+#endif  // TENSORFLOW_LITE_CORE_API_PROFILER_H_
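Only `BeginEvent` (four-argument form) and `EndEvent(uint32_t)` are pure, so a concrete profiler stays small. One thing to watch in this snapshot: `TFLITE_ADD_RUNTIME_INSTRUMENTATION_EVENT` guards its body with `if (!profiler)` and then dereferences `profiler`, which looks inverted; later upstream revisions flip the condition to `if (profiler)`. A minimal sketch of a concrete Profiler plus a scoped use; the stdout sink is an arbitrary choice here:

#include <cstdint>
#include <cstdio>

// Sketch: implements only the two pure virtuals and logs tags.
class LoggingProfiler : public tflite::Profiler {
 public:
  uint32_t BeginEvent(const char* tag, EventType event_type,
                      int64_t event_metadata1,
                      int64_t event_metadata2) override {
    std::printf("begin %u: %s\n", next_handle_, tag);
    return next_handle_++;
  }
  void EndEvent(uint32_t event_handle) override {
    std::printf("end   %u\n", event_handle);
  }

 private:
  uint32_t next_handle_ = 1;
};

void TracedWork(tflite::Profiler* profiler) {
  // The event spans this scope; ScopedProfile tolerates a null profiler.
  tflite::ScopedProfile profile(profiler, "TracedWork");
  // ... work to be measured ...
}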
code/lib/tfmicro/tensorflow/lite/core/api/tensor_utils.cc
@@ -17,6 +17,8 @@ limitations under the License.
 
 #include <string.h>
 
+#include "tensorflow/lite/c/common.h"
+
 namespace tflite {
 
 TfLiteStatus ResetVariableTensor(TfLiteTensor* tensor) {
code/lib/tfmicro/tensorflow/lite/kernels/internal/common.h
@@ -55,9 +55,12 @@ inline void GetActivationMinMax(FusedActivationFunctionType ac,
   }
 }
 
-inline float ActivationFunctionWithMinMax(float x, float output_activation_min,
-                                          float output_activation_max) {
-  return std::min(std::max(x, output_activation_min), output_activation_max);
+template <typename T>
+inline T ActivationFunctionWithMinMax(T x, T output_activation_min,
+                                      T output_activation_max) {
+  using std::max;
+  using std::min;
+  return min(max(x, output_activation_min), output_activation_max);
 }
 
 // Legacy function, left for compatibility only.
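Templating `ActivationFunctionWithMinMax` lets the same clamp serve the float kernels and the quantized integer kernels; the unqualified `min`/`max` calls (after the `using` declarations) let argument-dependent lookup pick up user overloads while `std::` provides the fallback. A two-line usage sketch:

float clamped_f =
    tflite::ActivationFunctionWithMinMax(7.5f, 0.0f, 6.0f);  // == 6.0f
int8_t clamped_q =
    tflite::ActivationFunctionWithMinMax<int8_t>(-90, -64, 63);  // == -64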
@@ -135,23 +138,24 @@ inline void BiasAndClamp(float clamp_min, float clamp_max, int bias_size,
 #endif
 }
 
-inline int32 MultiplyByQuantizedMultiplierSmallerThanOneExp(
-    int32 x, int32 quantized_multiplier, int left_shift) {
+inline int32_t MultiplyByQuantizedMultiplierSmallerThanOneExp(
+    int32_t x, int32_t quantized_multiplier, int left_shift) {
   using gemmlowp::RoundingDivideByPOT;
   using gemmlowp::SaturatingRoundingDoublingHighMul;
   return RoundingDivideByPOT(
       SaturatingRoundingDoublingHighMul(x, quantized_multiplier), -left_shift);
 }
 
-inline int32 MultiplyByQuantizedMultiplierGreaterThanOne(
-    int32 x, int32 quantized_multiplier, int left_shift) {
+inline int32_t MultiplyByQuantizedMultiplierGreaterThanOne(
+    int32_t x, int32_t quantized_multiplier, int left_shift) {
   using gemmlowp::SaturatingRoundingDoublingHighMul;
   return SaturatingRoundingDoublingHighMul(x * (1 << left_shift),
                                            quantized_multiplier);
 }
 
-inline int32 MultiplyByQuantizedMultiplier(int32 x, int32 quantized_multiplier,
-                                           int shift) {
+inline int32_t MultiplyByQuantizedMultiplier(int32_t x,
+                                             int32_t quantized_multiplier,
+                                             int shift) {
   using gemmlowp::RoundingDivideByPOT;
   using gemmlowp::SaturatingRoundingDoublingHighMul;
   int left_shift = shift > 0 ? shift : 0;
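These helpers implement the standard TFLite fixed-point rescale: a real multiplier m is stored as a Q31 integer plus a power-of-two exponent, and applied via a saturating doubling high-multiply followed by a rounding shift. A worked sketch for m = 0.25, whose Q31 mantissa is 1 << 30 with exponent -1:

int32_t ScaleByQuarter(int32_t x) {
  // 0.25 == (1 << 30) / 2^31 * 2^-1: mantissa 1 << 30, left_shift -1.
  return tflite::MultiplyByQuantizedMultiplierSmallerThanOneExp(
      x, 1 << 30, /*left_shift=*/-1);
}
// ScaleByQuarter(1000) == 250, i.e. round(1000 * 0.25).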
@@ -161,16 +165,16 @@ inline int32 MultiplyByQuantizedMultiplier(int32 x, int32 quantized_multiplier,
       right_shift);
 }
 
-inline int32 MultiplyByQuantizedMultiplier(int64_t x,
-                                           int32 quantized_multiplier,
-                                           int shift) {
+inline int32_t MultiplyByQuantizedMultiplier(int64_t x,
+                                             int32_t quantized_multiplier,
+                                             int shift) {
   // Inputs:
   // - quantized_multiplier has fixed point at bit 31
   // - shift is -31 to +7 (negative for right shift)
   //
   // Assumptions: The following input ranges are assumed
   // - quantize_scale>=0  (the usual range is (1<<30) to (1>>31)-1)
-  // - scaling is chosen so final scaled result fits in int32
+  // - scaling is chosen so final scaled result fits in int32_t
   // - input x is in the range -(1<<47) <= x < (1<<47)
   assert(quantized_multiplier >= 0);
   assert(shift >= -31 && shift < 8);
@@ -215,9 +219,9 @@ inline int CountLeadingSignBits(T integer_input) {
   using U = typename std::make_unsigned<T>::type;
   return integer_input >= 0
              ? CountLeadingZeros(static_cast<U>(integer_input)) - 1
          : integer_input != std::numeric_limits<T>::min()
              ? CountLeadingZeros(2 * static_cast<U>(-integer_input) - 1)
              : 0;
 #endif
 }
 
@@ -237,8 +241,12 @@ inline Integer FloorLog2(Integer n) {
 
 // generate INT16 LUT for function(), e.g., table exp(x) and 1/(1+x) used in
 // softmax
-inline void gen_lut(const std::function<double(double)>& func, double min,
-                    double max, int16_t* table, const int num) {
+// func - the function to build the LUT for (e.g exp(x))
+// min,max - table limits
+// table - pointer to buffer
+// num - number of elements in the LUT
+inline void gen_lut(double (*func)(double), double min, double max,
+                    int16_t* table, const int num) {
   // size of table should equal to num + 1
   // last element only for slope calculation
   double step = (max - min) / (num - 1);
@@ -259,7 +267,35 @@ inline void gen_lut(const std::function<double(double)>& func, double min,
       std::min(std::max(TfLiteRound(func(max) * 32768.0), -32768.0), 32767.0);
 }
 
-// int16 func table lookup, e.g., lookup exp() and 1/(1+x) used in softmax
+// generate INT16 LUT for function(), e.g., table exp(x) and 1/(1+x) used in
+// softmax
+// func - the function to build the LUT for (e.g exp(x))
+// min,max - table limits
+// table - pointer to buffer
+// num - number of elements in the LUT
+inline void gen_lut(float (*func)(float), float min, float max, int16_t* table,
+                    const int num) {
+  // size of table should equal to num + 1
+  // last element only for slope calculation
+  float step = (max - min) / (num - 1);
+  float half_step = step / 2.0f;
+  for (int i = 0; i < num - 1; i++) {
+    float sample_val = TfLiteRound(func(min + i * step) * 32768.0f);
+    float midpoint_interp_val =
+        TfLiteRound((func(min + (i + 1) * step) * 32768.0f +
+                     TfLiteRound(func(min + i * step) * 32768.0f)) /
+                    2.0f);
+    float midpoint_val =
+        TfLiteRound(func(min + i * step + half_step) * 32768.0f);
+    float midpoint_err = midpoint_interp_val - midpoint_val;
+    float bias = TfLiteRound(midpoint_err / 2.0f);
+    table[i] = std::min(std::max(sample_val - bias, -32768.0f), 32767.0f);
+  }
+  table[num - 1] = std::min(
+      std::max(TfLiteRound(func(max) * 32768.0f), -32768.0f), 32767.0f);
+}
+
+// int16_t func table lookup, e.g., lookup exp() and 1/(1+x) used in softmax
 inline int16_t generic_int16_table_lookup(int16_t value, const int16_t* lut) {
   // 512 base value, lut[513] only for calculate slope
   uint16_t index = static_cast<uint16_t>(256 + (value >> 7));
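The new float overload of gen_lut mirrors the double one but also bias-corrects each entry by half the midpoint interpolation error; together with generic_int16_table_lookup it supplies the int16 softmax's exp and reciprocal tables. A usage sketch, assuming the 513-entry table convention (512 base values plus one slope entry) and with illustrative wrapper names:

#include <cmath>

int16_t exp_table[513];  // 512 base entries + 1 extra for slope

void BuildExpTableSketch() {
  // A captureless lambda decays to the float (*)(float) the overload takes.
  tflite::gen_lut([](float x) { return std::exp(x); }, -10.0f, 0.0f,
                  exp_table, 513);
}

int16_t LookupExpSketch(int16_t q) {
  return tflite::generic_int16_table_lookup(q, exp_table);
}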
@@ -410,6 +446,23 @@ SaturatingRoundingMultiplyByPOTParam(
       SaturatingRoundingMultiplyByPOTParam(a.raw(), exponent));
 }
 
+// Convert int32_t multiplier to int16_t with rounding.
+inline void DownScaleInt32ToInt16Multiplier(int32_t multiplier_int32_t,
+                                            int16_t* multiplier_int16_t) {
+  TFLITE_DCHECK_GE(multiplier_int32_t, 0);
+  static constexpr int32_t kRoundingOffset = 1 << 15;
+  if (multiplier_int32_t >=
+      std::numeric_limits<int32_t>::max() - kRoundingOffset) {
+    *multiplier_int16_t = std::numeric_limits<int16_t>::max();
+    return;
+  }
+  const int32_t result = (multiplier_int32_t + kRoundingOffset) >> 16;
+  TFLITE_DCHECK_LE(result << 16, multiplier_int32_t + kRoundingOffset);
+  TFLITE_DCHECK_GT(result << 16, multiplier_int32_t - kRoundingOffset);
+  *multiplier_int16_t = result;
+  TFLITE_DCHECK_EQ(*multiplier_int16_t, result);
+}
+
 // Minimum output bits to accommodate log of maximum input range.  It actually
 // does not matter if one considers, say, [-64,64] or [-64,64).
 //
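DownScaleInt32ToInt16Multiplier converts a Q31 multiplier to Q15 by adding half a ULP of the target format before shifting, saturating near the top of the int32_t range. A worked sketch:

void DownScaleDemo() {
  int16_t m16;
  // Q31 encoding of 0.5 is 1 << 30; (2^30 + 2^15) >> 16 == 16384 == 1 << 14,
  // which is exactly 0.5 in Q15.
  tflite::DownScaleInt32ToInt16Multiplier(1 << 30, &m16);
}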
@@ -418,15 +471,13 @@ SaturatingRoundingMultiplyByPOTParam(
 //   ceil(log(abs( log(2.^(0:127))+1 ))/log(2)); ...
 //   ceil(log(abs( log(2.^(0:127))+1 ))/log(2))]
 constexpr int min_log_x_output_bits(int input_bits) {
-  return input_bits > 90
-             ? 7
-             : input_bits > 44
-                   ? 6
-                   : input_bits > 21
-                         ? 5
-                         : input_bits > 10
-                               ? 4
-                               : input_bits > 4 ? 3 : input_bits > 1 ? 2 : 1;
+  return input_bits > 90   ? 7
+         : input_bits > 44 ? 6
+         : input_bits > 21 ? 5
+         : input_bits > 10 ? 4
+         : input_bits > 4  ? 3
+         : input_bits > 1  ? 2
+                           : 1;
 }
 
 // Although currently the name of this function says that it cannot handle
@@ -434,17 +485,17 @@ constexpr int min_log_x_output_bits(int input_bits) {
 // x_max is the largest representable input.  In other words, the output range
 // is symmetric.
 template <int OutputIntegerBits, int InputIntegerBits>
-inline gemmlowp::FixedPoint<int32, OutputIntegerBits>
+inline gemmlowp::FixedPoint<int32_t, OutputIntegerBits>
 log_x_for_x_greater_than_or_equal_to_1_impl(
-    gemmlowp::FixedPoint<int32, InputIntegerBits> input_val) {
-  // assert(__builtin_clz(0u) >= std::numeric_limits<uint32>::digits - 1);
-  // assert(__builtin_clz(0u) <= std::numeric_limits<uint32>::digits);
-  using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
+    gemmlowp::FixedPoint<int32_t, InputIntegerBits> input_val) {
+  // assert(__builtin_clz(0u) >= std::numeric_limits<uint32_t>::digits - 1);
+  // assert(__builtin_clz(0u) <= std::numeric_limits<uint32_t>::digits);
+  using FixedPoint0 = gemmlowp::FixedPoint<int32_t, 0>;
   // The reason for accumulating the result with an extra bit of headroom is
   // that z_pow_2_adj * log_2 might be saturated, and adding num_scaled *
   // recip_denom will otherwise introduce an error.
   static constexpr int kAccumIntegerBits = OutputIntegerBits + 1;
-  using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumIntegerBits>;
+  using FixedPointAccum = gemmlowp::FixedPoint<int32_t, kAccumIntegerBits>;
 
   const FixedPoint0 log_2 = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
       FixedPoint0, 1488522236, std::log(2.0));
@@ -472,10 +523,10 @@ log_x_for_x_greater_than_or_equal_to_1_impl(
   // required shift "ourselves" instead of using, say, Rescale.
   FixedPoint0 z_a = FixedPoint0::FromRaw(input_val.raw());
   // z_a_pow_2 = input_integer_bits - z_a_headroom;
-  int z_a_headroom_plus_1 = CountLeadingZeros(static_cast<uint32>(z_a.raw()));
+  int z_a_headroom_plus_1 = CountLeadingZeros(static_cast<uint32_t>(z_a.raw()));
   FixedPoint0 r_a_tmp =
       SaturatingRoundingMultiplyByPOTParam(z_a, (z_a_headroom_plus_1 - 1));
-  const int32 r_a_raw =
+  const int32_t r_a_raw =
       SaturatingRoundingMultiplyByPOTParam((r_a_tmp * sqrt_half).raw(), 1);
   // z_pow_2_adj = max(z_pow_2_a - 0.75, z_pow_2_b - 0.25);
   // z_pow_2_adj = max(InputIntegerBits - z_a_headroom_plus_1 + 0.25,
@@ -487,8 +538,8 @@ log_x_for_x_greater_than_or_equal_to_1_impl(
 
   // z_b is treated like z_a, but premultiplying by sqrt(0.5).
   FixedPoint0 z_b = z_a * sqrt_half;
-  int z_b_headroom = CountLeadingZeros(static_cast<uint32>(z_b.raw())) - 1;
-  const int32 r_b_raw =
+  int z_b_headroom = CountLeadingZeros(static_cast<uint32_t>(z_b.raw())) - 1;
+  const int32_t r_b_raw =
       SaturatingRoundingMultiplyByPOTParam(z_a.raw(), z_b_headroom);
   const FixedPointAccum z_b_pow_2_adj = SaturatingSub(
       FixedPointAccum::FromRaw(SaturatingRoundingMultiplyByPOTParam(
@@ -516,9 +567,9 @@ log_x_for_x_greater_than_or_equal_to_1_impl(
 }
 
 template <int OutputIntegerBits, int InputIntegerBits>
-inline gemmlowp::FixedPoint<int32, OutputIntegerBits>
+inline gemmlowp::FixedPoint<int32_t, OutputIntegerBits>
 log_x_for_x_greater_than_or_equal_to_1(
-    gemmlowp::FixedPoint<int32, InputIntegerBits> input_val) {
+    gemmlowp::FixedPoint<int32_t, InputIntegerBits> input_val) {
   static_assert(
       OutputIntegerBits >= min_log_x_output_bits(InputIntegerBits),
       "Output integer bits must be sufficient to accommodate logs of inputs.");
@@ -527,25 +578,25 @@ log_x_for_x_greater_than_or_equal_to_1(
       input_val);
 }
 
-inline int32 GetReciprocal(int32 x, int x_integer_digits,
-                           int* num_bits_over_unit) {
-  int headroom_plus_one = CountLeadingZeros(static_cast<uint32>(x));
+inline int32_t GetReciprocal(int32_t x, int x_integer_digits,
+                             int* num_bits_over_unit) {
+  int headroom_plus_one = CountLeadingZeros(static_cast<uint32_t>(x));
   // This is the number of bits to the left of the binary point above 1.0.
   // Consider x=1.25.  In that case shifted_scale=0.8 and
   // no later adjustment will be needed.
   *num_bits_over_unit = x_integer_digits - headroom_plus_one;
-  const int32 shifted_sum_minus_one =
-      static_cast<int32>((static_cast<uint32>(x) << headroom_plus_one) -
-                         (static_cast<uint32>(1) << 31));
+  const int32_t shifted_sum_minus_one =
+      static_cast<int32_t>((static_cast<uint32_t>(x) << headroom_plus_one) -
+                           (static_cast<uint32_t>(1) << 31));
 
-  gemmlowp::FixedPoint<int32, 0> shifted_scale =
+  gemmlowp::FixedPoint<int32_t, 0> shifted_scale =
       gemmlowp::one_over_one_plus_x_for_x_in_0_1(
-          gemmlowp::FixedPoint<int32, 0>::FromRaw(shifted_sum_minus_one));
+          gemmlowp::FixedPoint<int32_t, 0>::FromRaw(shifted_sum_minus_one));
   return shifted_scale.raw();
 }
 
-inline void GetInvSqrtQuantizedMultiplierExp(int32 input, int reverse_shift,
-                                             int32* output_inv_sqrt,
+inline void GetInvSqrtQuantizedMultiplierExp(int32_t input, int reverse_shift,
+                                             int32_t* output_inv_sqrt,
                                              int* output_shift) {
   TFLITE_DCHECK_GE(input, 0);
   if (input <= 1) {
@@ -565,7 +616,7 @@ inline void GetInvSqrtQuantizedMultiplierExp(int32 input, int reverse_shift,
     ++*output_shift;
   }
   const unsigned max_left_shift_bits =
-      CountLeadingZeros(static_cast<uint32>(input)) - 1;
+      CountLeadingZeros(static_cast<uint32_t>(input)) - 1;
   const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2;
   const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1;
   *output_shift -= left_shift_bit_pairs;
@@ -577,8 +628,8 @@ inline void GetInvSqrtQuantizedMultiplierExp(int32 input, int reverse_shift,
   using gemmlowp::SaturatingRoundingMultiplyByPOT;
   // Using 3 integer bits gives us enough room for the internal arithmetic in
   // this Newton-Raphson iteration.
-  using F3 = FixedPoint<int32, 3>;
-  using F0 = FixedPoint<int32, 0>;
+  using F3 = FixedPoint<int32_t, 3>;
+  using F0 = FixedPoint<int32_t, 0>;
   const F3 fixedpoint_input = F3::FromRaw(input >> 1);
   const F3 fixedpoint_half_input =
       SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input);
@@ -645,6 +696,13 @@ inline int SubscriptToIndex(const NdArrayDesc<5>& desc, int indexes[5]) {
          indexes[4] * desc.strides[4];
 }
 
+inline int SubscriptToIndex(const NdArrayDesc<8>& desc, int indexes[8]) {
+  return indexes[0] * desc.strides[0] + indexes[1] * desc.strides[1] +
+         indexes[2] * desc.strides[2] + indexes[3] * desc.strides[3] +
+         indexes[4] * desc.strides[4] + indexes[5] * desc.strides[5] +
+         indexes[6] * desc.strides[6] + indexes[7] * desc.strides[7];
+}
+
 // Given the dimensions of the operands for an element-wise binary broadcast,
 // adjusts them so that they can be directly iterated over with simple loops.
 // Returns the adjusted dims as instances of NdArrayDesc in 'desc0_out' and
code/lib/tfmicro/tensorflow/lite/kernels/internal/compatibility.h
@@ -76,13 +76,15 @@ limitations under the License.
 #define TFLITE_CHECK_LT(x, y) ((x) < (y)) ? (void)0 : TFLITE_ABORT
 #endif
 
-// TODO(ahentz): Clean up.
+#ifndef TF_LITE_STATIC_MEMORY
+// TODO(b/162019032): Consider removing these type-aliases.
 using int8 = std::int8_t;
 using uint8 = std::uint8_t;
 using int16 = std::int16_t;
 using uint16 = std::uint16_t;
 using int32 = std::int32_t;
 using uint32 = std::uint32_t;
+#endif  // !defined(TF_LITE_STATIC_MEMORY)
 
 // TFLITE_DEPRECATED()
 //
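Wrapping the legacy aliases in `#ifndef TF_LITE_STATIC_MEMORY` means builds that define that macro (as the micro build does) no longer inject `int8`/`uint8`/`int32`/... into namespace tflite, which is what the int32 -> int32_t sweep throughout these hunks prepares for. A sketch of the effect:

namespace tflite {
void StaticMemoryBuildSketch() {
  int32_t a = 0;  // always fine
#if !defined(TF_LITE_STATIC_MEMORY)
  int32 b = a;    // the legacy alias only exists in non-static-memory builds
  (void)b;
#endif
  (void)a;
}
}  // namespace tflite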
code/lib/tfmicro/tensorflow/lite/kernels/internal/cppmath.h
@@ -19,8 +19,9 @@ limitations under the License.
 
 namespace tflite {
 
 #if defined(TF_LITE_USE_GLOBAL_CMATH_FUNCTIONS) || \
-    (defined(__ANDROID__) && !defined(__NDK_MAJOR__)) || defined(ARDUINO)
+    (defined(__ANDROID__) && !defined(__NDK_MAJOR__)) || defined(ARDUINO) || \
+    defined(__ZEPHYR__)
 #define TF_LITE_GLOBAL_STD_PREFIX
 #else
 #define TF_LITE_GLOBAL_STD_PREFIX std
code/lib/tfmicro/tensorflow/lite/micro/micro_optional_debug_tools.h -> code/lib/tfmicro/tensorflow/lite/kernels/internal/max.h
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,16 +12,24 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-// Optional debugging functionality. For small sized binaries, these are not
-// needed.
-#ifndef TENSORFLOW_LITE_MICRO_MICRO_OPTIONAL_DEBUG_TOOLS_H_
-#define TENSORFLOW_LITE_MICRO_MICRO_OPTIONAL_DEBUG_TOOLS_H_
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_MAX_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_MAX_H_
 
-#include "tensorflow/lite/micro/micro_interpreter.h"
+#include <cmath>
 
 namespace tflite {
-// Prints a dump of what tensors and what nodes are in the interpreter.
-void PrintInterpreterState(MicroInterpreter* interpreter);
+
+#if defined(TF_LITE_USE_GLOBAL_MAX) || defined(__ZEPHYR__)
+inline float TfLiteMax(const float& x, const float& y) {
+  return std::max(x, y);
+}
+#else
+template <class T>
+inline T TfLiteMax(const T& x, const T& y) {
+  return std::fmax(x, y);
+}
+#endif
+
 }  // namespace tflite
 
-#endif  // TENSORFLOW_LITE_MICRO_MICRO_OPTIONAL_DEBUG_TOOLS_H_
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_MAX_H_
35  code/lib/tfmicro/tensorflow/lite/kernels/internal/min.h  Normal file
@@ -0,0 +1,35 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_MIN_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_MIN_H_
+
+#include <cmath>
+
+namespace tflite {
+
+#if defined(TF_LITE_USE_GLOBAL_MIN) || defined(__ZEPHYR__)
+inline float TfLiteMin(const float& x, const float& y) {
+  return std::min(x, y);
+}
+#else
+template <class T>
+inline T TfLiteMin(const T& x, const T& y) {
+  return std::fmin(x, y);
+}
+#endif
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_MIN_H_
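TfLiteMax and TfLiteMin exist so kernels can clamp without caring whether the toolchain puts fmax/fmin in std:: (the __ZEPHYR__ branch falls back to the plain std::max/std::min float overload). Typical use, sketched as a ReLU6:

inline float Relu6Sketch(float x) {
  return tflite::TfLiteMin(tflite::TfLiteMax(x, 0.0f), 6.0f);
}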
code/lib/tfmicro/tensorflow/lite/kernels/internal/tensor.h -> code/lib/tfmicro/tensorflow/lite/kernels/internal/portable_tensor.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_H_
-#define TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_H_
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_PORTABLE_TENSOR_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_PORTABLE_TENSOR_H_
 
 #include <complex>
 #include <vector>
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/lite/c/common.h"
 #include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
 #include "tensorflow/lite/kernels/internal/types.h"
-#include "tensorflow/lite/string_util.h"
 
 namespace tflite {
 
@@ -76,12 +75,12 @@ class VectorOfTensors {
 
 // A list of quantized tensors in a format that can be used by kernels like
 // split and concatenation.
-class VectorOfQuantizedTensors : public VectorOfTensors<uint8> {
+class VectorOfQuantizedTensors : public VectorOfTensors<uint8_t> {
  public:
   // Build with the tensors in 'tensor_list'.
   VectorOfQuantizedTensors(const TfLiteContext& context,
                            const TfLiteIntArray& tensor_list)
-      : VectorOfTensors<uint8>(context, tensor_list) {
+      : VectorOfTensors<uint8_t>(context, tensor_list) {
     for (int i = 0; i < tensor_list.size; ++i) {
       TfLiteTensor* t = &context.tensors[tensor_list.data[i]];
       zero_point_.push_back(t->params.zero_point);
@@ -90,10 +89,10 @@ class VectorOfQuantizedTensors : public VectorOfTensors<uint8> {
   }
 
   const float* scale() const { return scale_.data(); }
-  const int32* zero_point() const { return zero_point_.data(); }
+  const int32_t* zero_point() const { return zero_point_.data(); }
 
  private:
-  std::vector<int32> zero_point_;
+  std::vector<int32_t> zero_point_;
   std::vector<float> scale_;
 };
 
@@ -119,26 +118,6 @@ class SequentialTensorWriter {
   T* output_ptr_;
 };
 
-template <>
-class SequentialTensorWriter<string> {
- public:
-  SequentialTensorWriter(const TfLiteTensor* input, TfLiteTensor* output)
-      : input_(input), output_(output) {}
-  ~SequentialTensorWriter() { buffer_.WriteToTensor(output_, nullptr); }
-
-  void Write(int position) { this->WriteN(position, 1); }
-  void WriteN(int position, int len) {
-    for (int i = 0; i < len; i++) {
-      buffer_.AddString(GetString(input_, position + i));
-    }
-  }
-
- private:
-  const TfLiteTensor* input_;
-  TfLiteTensor* output_;
-  DynamicBuffer buffer_;
-};
-
 }  // namespace tflite
 
-#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_H_
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_PORTABLE_TENSOR_H_
code/lib/tfmicro/tensorflow/lite/kernels/internal/quantization_util.cc
@@ -342,13 +342,13 @@ void NudgeQuantizationRange(const float min, const float max,
   const float quant_max_float = static_cast<float>(quant_max);
   *nudged_scale = (max - min) / (quant_max_float - quant_min_float);
   const float zero_point_from_min = quant_min_float - min / *nudged_scale;
-  uint16 nudged_zero_point;
+  uint16_t nudged_zero_point;
   if (zero_point_from_min < quant_min_float) {
-    nudged_zero_point = static_cast<uint16>(quant_min);
+    nudged_zero_point = static_cast<uint16_t>(quant_min);
   } else if (zero_point_from_min > quant_max_float) {
-    nudged_zero_point = static_cast<uint16>(quant_max);
+    nudged_zero_point = static_cast<uint16_t>(quant_max);
   } else {
-    nudged_zero_point = static_cast<uint16>(TfLiteRound(zero_point_from_min));
+    nudged_zero_point = static_cast<uint16_t>(TfLiteRound(zero_point_from_min));
   }
   *nudged_min = (quant_min_float - nudged_zero_point) * (*nudged_scale);
   *nudged_max = (quant_max_float - nudged_zero_point) * (*nudged_scale);
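The uint16 -> uint16_t change is mechanical, but the function is worth a worked example. For min = -1.0f, max = 1.0f on the uint8 range [0, 255]: scale = 2/255 ≈ 0.00784 and zero_point_from_min = 127.5, which rounds to a nudged zero point of 128, so the nudged range becomes roughly [-1.0039, 0.9961] and 0.0 is exactly representable:

void NudgeDemo() {
  float nudged_min, nudged_max, nudged_scale;
  tflite::NudgeQuantizationRange(-1.0f, 1.0f, /*quant_min=*/0,
                                 /*quant_max=*/255, &nudged_min, &nudged_max,
                                 &nudged_scale);
  // nudged_scale ≈ 0.00784f, nudged_min ≈ -1.0039f, nudged_max ≈ 0.9961f
}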
code/lib/tfmicro/tensorflow/lite/kernels/internal/reference/add.h
@@ -51,34 +51,39 @@ inline void Add(const ArithmeticParams& params,
 
 // Element-wise add that can often be used for inner loop of broadcast add as
 // well as the non-broadcast add.
+
+// This function is used for 8-bit as well as for 16-bit, but the accumulator
+// is 32-bit for both cases. The overflow does not happen due to the
+// choice of the shift (20 or 15, accordingly - see add.cc for more comments).
+template <typename T>
 inline void AddElementwise(int size, const ArithmeticParams& params,
-                           const uint8* input1_data, const uint8* input2_data,
-                           uint8* output_data) {
-  TFLITE_DCHECK_GT(params.input1_offset, -256);
-  TFLITE_DCHECK_GT(params.input2_offset, -256);
-  TFLITE_DCHECK_LT(params.input1_offset, 256);
-  TFLITE_DCHECK_LT(params.input2_offset, 256);
+                           const T* input1_data, const T* input2_data,
+                           T* output_data) {
+  TFLITE_DCHECK_GT(params.input1_offset, -std::numeric_limits<T>::max());
+  TFLITE_DCHECK_GT(params.input2_offset, -std::numeric_limits<T>::max());
+  TFLITE_DCHECK_LT(params.input1_offset, std::numeric_limits<T>::max());
+  TFLITE_DCHECK_LT(params.input2_offset, std::numeric_limits<T>::max());
 
   for (int i = 0; i < size; ++i) {
-    const int32 input1_val = params.input1_offset + input1_data[i];
-    const int32 input2_val = params.input2_offset + input2_data[i];
-    const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
-    const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
-    const int32 scaled_input1_val =
+    const int32_t input1_val = params.input1_offset + input1_data[i];
+    const int32_t input2_val = params.input2_offset + input2_data[i];
+    const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
+    const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
+    const int32_t scaled_input1_val =
         MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input1_val, params.input1_multiplier, params.input1_shift);
-    const int32 scaled_input2_val =
+    const int32_t scaled_input2_val =
         MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input2_val, params.input2_multiplier, params.input2_shift);
-    const int32 raw_sum = scaled_input1_val + scaled_input2_val;
-    const int32 raw_output =
+    const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
+    const int32_t raw_output =
         MultiplyByQuantizedMultiplierSmallerThanOneExp(
            raw_sum, params.output_multiplier, params.output_shift) +
        params.output_offset;
-    const int32 clamped_output =
+    const int32_t clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, raw_output));
-    output_data[i] = static_cast<uint8>(clamped_output);
+    output_data[i] = static_cast<T>(clamped_output);
   }
 }
 
@@ -86,40 +91,40 @@ inline void AddElementwise(int size, const ArithmeticParams& params,
 // broadcast add, so that, for example, scalar-broadcast with batch will still
 // be fast.
 inline void AddScalarBroadcast(int size, const ArithmeticParams& params,
-                               uint8 input1_data, const uint8* input2_data,
-                               uint8* output_data) {
+                               uint8_t input1_data, const uint8_t* input2_data,
+                               uint8_t* output_data) {
   TFLITE_DCHECK_GT(params.input1_offset, -256);
   TFLITE_DCHECK_GT(params.input2_offset, -256);
   TFLITE_DCHECK_LT(params.input1_offset, 256);
   TFLITE_DCHECK_LT(params.input2_offset, 256);
 
-  const int32 input1_val = params.input1_offset + input1_data;
-  const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
-  const int32 scaled_input1_val =
+  const int32_t input1_val = params.input1_offset + input1_data;
+  const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
+  const int32_t scaled_input1_val =
       MultiplyByQuantizedMultiplierSmallerThanOneExp(
           shifted_input1_val, params.input1_multiplier, params.input1_shift);
   for (int i = 0; i < size; ++i) {
-    const int32 input2_val = params.input2_offset + input2_data[i];
-    const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
-    const int32 scaled_input2_val =
+    const int32_t input2_val = params.input2_offset + input2_data[i];
+    const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
+    const int32_t scaled_input2_val =
         MultiplyByQuantizedMultiplierSmallerThanOneExp(
             shifted_input2_val, params.input2_multiplier, params.input2_shift);
-    const int32 raw_sum = scaled_input1_val + scaled_input2_val;
-    const int32 raw_output =
+    const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
+    const int32_t raw_output =
         MultiplyByQuantizedMultiplierSmallerThanOneExp(
             raw_sum, params.output_multiplier, params.output_shift) +
         params.output_offset;
-    const int32 clamped_output =
+    const int32_t clamped_output =
         std::min(params.quantized_activation_max,
                  std::max(params.quantized_activation_min, raw_output));
-    output_data[i] = static_cast<uint8>(clamped_output);
+    output_data[i] = static_cast<uint8_t>(clamped_output);
  }
 }
 
 inline void Add(const ArithmeticParams& params,
-                const RuntimeShape& input1_shape, const uint8* input1_data,
-                const RuntimeShape& input2_shape, const uint8* input2_data,
-                const RuntimeShape& output_shape, uint8* output_data) {
+                const RuntimeShape& input1_shape, const uint8_t* input1_data,
+                const RuntimeShape& input2_shape, const uint8_t* input2_data,
+                const RuntimeShape& output_shape, uint8_t* output_data) {
   TFLITE_DCHECK_LE(params.quantized_activation_min,
                    params.quantized_activation_max);
   const int flat_size =
@@ -132,24 +137,53 @@ inline void Add(const ArithmeticParams& params,
   AddElementwise(flat_size, params, input1_data, input2_data, output_data);
 }
 
+inline void AddGeneralParamScale(const ArithmeticParams& params,
+                                 const RuntimeShape& input1_shape,
+                                 const int16_t* input1_data,
+                                 const RuntimeShape& input2_shape,
+                                 const int16_t* input2_data,
+                                 const RuntimeShape& output_shape,
+                                 int16_t* output_data) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+  const int flat_size =
+      MatchingElementsSize(input1_shape, input2_shape, output_shape);
+
+  int max_value = std::numeric_limits<int16_t>::max();
+
+  TFLITE_DCHECK_GT(params.input1_offset, -max_value);
+  TFLITE_DCHECK_GT(params.input2_offset, -max_value);
+  TFLITE_DCHECK_LT(params.input1_offset, max_value);
+  TFLITE_DCHECK_LT(params.input2_offset, max_value);
+  AddElementwise(flat_size, params, input1_data, input2_data, output_data);
+}
+
 inline void Add(const ArithmeticParams& params,
-                const RuntimeShape& input1_shape, const int16* input1_data,
-                const RuntimeShape& input2_shape, const int16* input2_data,
-                const RuntimeShape& output_shape, int16* output_data) {
+                const RuntimeShape& input1_shape, const int16_t* input1_data,
+                const RuntimeShape& input2_shape, const int16_t* input2_data,
+                const RuntimeShape& output_shape, int16_t* output_data,
+                bool pot_scale = true) {
+  if (!pot_scale) {
+    AddGeneralParamScale(params, input1_shape, input1_data, input2_shape,
+                         input2_data, output_shape, output_data);
+    return;
+  }
+
   TFLITE_DCHECK_LE(params.quantized_activation_min,
                    params.quantized_activation_max);
 
   const int input1_shift = params.input1_shift;
   const int flat_size =
       MatchingElementsSize(input1_shape, input2_shape, output_shape);
-  const int16 output_activation_min = params.quantized_activation_min;
-  const int16 output_activation_max = params.quantized_activation_max;
+  const int16_t output_activation_min = params.quantized_activation_min;
+  const int16_t output_activation_max = params.quantized_activation_max;
 
   TFLITE_DCHECK(input1_shift == 0 || params.input2_shift == 0);
   TFLITE_DCHECK_LE(input1_shift, 0);
   TFLITE_DCHECK_LE(params.input2_shift, 0);
-  const int16* not_shift_input = input1_shift == 0 ? input1_data : input2_data;
-  const int16* shift_input = input1_shift == 0 ? input2_data : input1_data;
+  const int16_t* not_shift_input =
+      input1_shift == 0 ? input1_data : input2_data;
+  const int16_t* shift_input = input1_shift == 0 ? input2_data : input1_data;
   const int input_right_shift =
       input1_shift == 0 ? -params.input2_shift : -input1_shift;
 
@@ -161,8 +195,8 @@ inline void Add(const ArithmeticParams& params,
     F0 scaled_input = F0::FromRaw(
         gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift));
     F0 result = gemmlowp::SaturatingAdd(scaled_input, input_ready_scaled);
-    const int16 raw_output = result.raw();
-    const int16 clamped_output = std::min(
+    const int16_t raw_output = result.raw();
+    const int16_t clamped_output = std::min(
         output_activation_max, std::max(output_activation_min, raw_output));
     output_data[i] = clamped_output;
   }
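The new `pot_scale` default keeps the original int16 path (inputs rescaled by pure right shifts, i.e. power-of-two scales) intact for existing callers, while `pot_scale = false` reroutes through AddGeneralParamScale and the templated AddElementwise, which applies full quantized multipliers. A dispatch sketch; shapes and buffers are placeholders:

void AddInt16Sketch(const tflite::ArithmeticParams& params,
                    const tflite::RuntimeShape& shape, const int16_t* in1,
                    const int16_t* in2, int16_t* out) {
  // Default: power-of-two input scales, legacy fixed-point path.
  tflite::reference_ops::Add(params, shape, in1, shape, in2, shape, out);
  // General scales: routed to AddGeneralParamScale.
  tflite::reference_ops::Add(params, shape, in1, shape, in2, shape, out,
                             /*pot_scale=*/false);
}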
@@ -218,11 +252,11 @@ inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
 
 inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
                                const RuntimeShape& input1_shape,
-                               const int32* input1_data,
+                               const int32_t* input1_data,
                                const RuntimeShape& input2_shape,
-                               const int32* input2_data,
+                               const int32_t* input2_data,
                                const RuntimeShape& output_shape,
-                               int32* output_data) {
+                               int32_t* output_data) {
   NdArrayDesc<4> desc1;
   NdArrayDesc<4> desc2;
   NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
@@ -257,13 +291,14 @@ inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
     }
   }
 }
 
-inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
-                               const RuntimeShape& input1_shape,
-                               const uint8* input1_data,
-                               const RuntimeShape& input2_shape,
-                               const uint8* input2_data,
-                               const RuntimeShape& output_shape,
-                               uint8* output_data) {
+// This function is used for 8-bit as well as for 16-bit, but the accumulator
+// is 32-bit for both cases. The overflow does not happen due to the
+// choice of the shift (20 or 15, accordingly - see add.cc for more comments).
+template <typename T>
+inline void BroadcastAdd4DSlow(
+    const ArithmeticParams& params, const RuntimeShape& input1_shape,
+    const T* input1_data, const RuntimeShape& input2_shape,
+    const T* input2_data, const RuntimeShape& output_shape, T* output_data) {
   NdArrayDesc<4> desc1;
   NdArrayDesc<4> desc2;
   NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
@@ -286,34 +321,34 @@ inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
     for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
       for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
         for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
-          const int32 input1_val =
+          const int32_t input1_val =
              params.input1_offset +
              input1_data[SubscriptToIndex(desc1, b, y, x, c)];
-          const int32 input2_val =
+          const int32_t input2_val =
              params.input2_offset +
              input2_data[SubscriptToIndex(desc2, b, y, x, c)];
-          const int32 shifted_input1_val =
+          const int32_t shifted_input1_val =
              input1_val * (1 << params.left_shift);
-          const int32 shifted_input2_val =
+          const int32_t shifted_input2_val =
              input2_val * (1 << params.left_shift);
-          const int32 scaled_input1_val =
+          const int32_t scaled_input1_val =
              MultiplyByQuantizedMultiplierSmallerThanOneExp(
                  shifted_input1_val, params.input1_multiplier,
                  params.input1_shift);
-          const int32 scaled_input2_val =
+          const int32_t scaled_input2_val =
              MultiplyByQuantizedMultiplierSmallerThanOneExp(
                  shifted_input2_val, params.input2_multiplier,
                  params.input2_shift);
-          const int32 raw_sum = scaled_input1_val + scaled_input2_val;
-          const int32 raw_output =
+          const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
+          const int32_t raw_output =
              MultiplyByQuantizedMultiplierSmallerThanOneExp(
                  raw_sum, params.output_multiplier, params.output_shift) +
              params.output_offset;
-          const int32 clamped_output =
+          const int32_t clamped_output =
              std::min(params.quantized_activation_max,
                       std::max(params.quantized_activation_min, raw_output));
           output_data[Offset(extended_output_shape, b, y, x, c)] =
-              static_cast<uint8>(clamped_output);
+              static_cast<T>(clamped_output);
         }
       }
     }
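For orientation, the integer pipeline in this hunk implements, entirely in `int32_t` arithmetic, the following real-valued reference: dequantize each operand as real = scale * (q - zero_point), add the reals, and requantize into the output scale. The offsets recenter the codes, `left_shift` adds headroom, and the multiplier/shift pairs are a fixed-point encoding of the three scales. A hedged floating-point sketch (the helper is ours, not a TFLite API):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

// Floating-point reference for quantized addition; the kernel above
// reproduces this using precomputed integer multipliers and shifts only.
uint8_t QuantizedAddReference(uint8_t q1, float scale1, int32_t zp1,
                              uint8_t q2, float scale2, int32_t zp2,
                              float out_scale, int32_t out_zp) {
  const float real = scale1 * (q1 - zp1) + scale2 * (q2 - zp2);
  const int32_t q =
      static_cast<int32_t>(std::lround(real / out_scale)) + out_zp;
  return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}
```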
@@ -322,11 +357,11 @@ inline void BroadcastAdd4DSlow(const ArithmeticParams& params,

 inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
                                  const RuntimeShape& unswitched_input1_shape,
-                                 const uint8* unswitched_input1_data,
+                                 const uint8_t* unswitched_input1_data,
                                  const RuntimeShape& unswitched_input2_shape,
-                                 const uint8* unswitched_input2_data,
+                                 const uint8_t* unswitched_input2_data,
                                  const RuntimeShape& output_shape,
-                                 uint8* output_data) {
+                                 uint8_t* output_data) {
   ArithmeticParams switched_params = unswitched_params;
   switched_params.input1_offset = unswitched_params.input2_offset;
   switched_params.input1_multiplier = unswitched_params.input2_multiplier;
@@ -341,18 +376,18 @@ inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,

   const ArithmeticParams& params =
       use_unswitched ? unswitched_params : switched_params;
-  const uint8* input1_data =
+  const uint8_t* input1_data =
       use_unswitched ? unswitched_input1_data : unswitched_input2_data;
-  const uint8* input2_data =
+  const uint8_t* input2_data =
       use_unswitched ? unswitched_input2_data : unswitched_input1_data;

   // Fivefold nested loops. The second input resets its position for each
   // iteration of the second loop. The first input resets its position at the
   // beginning of the fourth loop. The innermost loop is an elementwise add of
   // sections of the arrays.
-  uint8* output_data_ptr = output_data;
-  const uint8* input1_data_ptr = input1_data;
-  const uint8* input2_data_reset = input2_data;
+  uint8_t* output_data_ptr = output_data;
+  const uint8_t* input1_data_ptr = input1_data;
+  const uint8_t* input2_data_reset = input2_data;
   // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared
   // between input shapes. y3 for input 1 is always broadcast, and so the
   // dimension there is 1, whereas optionally y1 might be broadcast for input 2.
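A design note on the switching logic above: the fivefold kernel only implements one broadcast orientation, so when operands arrive in the other order they are swapped, and the per-input quantization parameters must travel in lockstep with the data pointers, which is exactly what `switched_params` does. A sketch of the idea with our own minimal types (not TFLite's `ArithmeticParams`):

```cpp
#include <cstdint>

struct InputQuant {
  int32_t offset;
  int32_t multiplier;
  int32_t shift;
};

struct AddOperands {
  InputQuant quant1, quant2;
  const uint8_t* data1;
  const uint8_t* data2;
};

AddOperands MaybeSwitch(const AddOperands& in, bool use_unswitched) {
  // Swapping data without swapping quantization parameters would apply the
  // wrong offset and scale to each operand.
  return use_unswitched
             ? in
             : AddOperands{in.quant2, in.quant1, in.data2, in.data1};
}
```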
@@ -368,7 +403,7 @@ inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
     // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner
     // dimension.
     for (int i0 = 0; i0 < y0; ++i0) {
-      const uint8* input2_data_ptr;
+      const uint8_t* input2_data_ptr;
       for (int i1 = 0; i1 < y1; ++i1) {
         input2_data_ptr = input2_data_reset;
         for (int i2 = 0; i2 < y2; ++i2) {
@@ -397,7 +432,7 @@ inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
     // for y4 == 1 and the loop over y3 is contained within the
     // AddScalarBroadcast function.
     for (int i0 = 0; i0 < y0; ++i0) {
-      const uint8* input2_data_ptr;
+      const uint8_t* input2_data_ptr;
       for (int i1 = 0; i1 < y1; ++i1) {
         input2_data_ptr = input2_data_reset;
         for (int i2 = 0; i2 < y2; ++i2) {
@@ -18,7 +18,6 @@ limitations under the License.
 #include "tensorflow/lite/c/common.h"
 #include "tensorflow/lite/kernels/internal/common.h"
 #include "tensorflow/lite/kernels/internal/types.h"
-#include "tensorflow/lite/string_util.h"

 namespace tflite {

@@ -51,18 +50,6 @@ inline bool LessEqualFn(T lhs, T rhs) {
   return lhs <= rhs;
 }

-inline bool StringRefEqualFn(const StringRef& lhs, const StringRef& rhs) {
-  if (lhs.len != rhs.len) return false;
-  for (int i = 0; i < lhs.len; ++i) {
-    if (lhs.str[i] != rhs.str[i]) return false;
-  }
-  return true;
-}
-
-inline bool StringRefNotEqualFn(const StringRef& lhs, const StringRef& rhs) {
-  return !StringRefEqualFn(lhs, rhs);
-}
-
 template <typename T>
 using ComparisonFn = bool (*)(T, T);

@@ -78,22 +65,6 @@ inline void ComparisonImpl(
   }
 }

-template <bool (*F)(const StringRef&, const StringRef&)>
-inline void ComparisonStringImpl(const RuntimeShape& input1_shape,
-                                 const TfLiteTensor* input1,
-                                 const RuntimeShape& input2_shape,
-                                 const TfLiteTensor* input2,
-                                 const RuntimeShape& output_shape,
-                                 bool* output_data) {
-  const int64_t flatsize =
-      MatchingFlatSize(input1_shape, input2_shape, output_shape);
-  for (int64_t i = 0; i < flatsize; ++i) {
-    const auto lhs = GetString(input1, i);
-    const auto rhs = GetString(input2, i);
-    output_data[i] = F(lhs, rhs);
-  }
-}
-
 template <ComparisonFn<float> F>
 inline void Comparison(const ComparisonParams& op_params,
                        const RuntimeShape& input1_shape,
@@ -105,30 +76,30 @@ inline void Comparison(const ComparisonParams& op_params,
                     input2_data, output_shape, output_data);
 }

-template <typename T, ComparisonFn<int32> F>
+template <typename T, ComparisonFn<int32_t> F>
 inline void ComparisonWithScaling(
     const ComparisonParams& op_params, const RuntimeShape& input1_shape,
     const T* input1_data, const RuntimeShape& input2_shape,
     const T* input2_data, const RuntimeShape& output_shape, bool* output_data) {
   int left_shift = op_params.left_shift;
-  int32 input1_offset = op_params.input1_offset;
-  int32 input1_multiplier = op_params.input1_multiplier;
+  int32_t input1_offset = op_params.input1_offset;
+  int32_t input1_multiplier = op_params.input1_multiplier;
   int input1_shift = op_params.input1_shift;
-  int32 input2_offset = op_params.input2_offset;
-  int32 input2_multiplier = op_params.input2_multiplier;
+  int32_t input2_offset = op_params.input2_offset;
+  int32_t input2_multiplier = op_params.input2_multiplier;
   int input2_shift = op_params.input2_shift;

   const int64_t flatsize =
       MatchingFlatSize(input1_shape, input2_shape, output_shape);
   for (int64_t i = 0; i < flatsize; ++i) {
-    const int32 input1_val = input1_offset + input1_data[i];
-    const int32 input2_val = input2_offset + input2_data[i];
-    const int32 shifted_input1_val = input1_val * (1 << left_shift);
-    const int32 shifted_input2_val = input2_val * (1 << left_shift);
-    const int32 scaled_input1_val =
+    const int32_t input1_val = input1_offset + input1_data[i];
+    const int32_t input2_val = input2_offset + input2_data[i];
+    const int32_t shifted_input1_val = input1_val * (1 << left_shift);
+    const int32_t shifted_input2_val = input2_val * (1 << left_shift);
+    const int32_t scaled_input1_val =
         MultiplyByQuantizedMultiplierSmallerThanOneExp(
             shifted_input1_val, input1_multiplier, input1_shift);
-    const int32 scaled_input2_val =
+    const int32_t scaled_input2_val =
         MultiplyByQuantizedMultiplierSmallerThanOneExp(
             shifted_input2_val, input2_multiplier, input2_shift);
     output_data[i] = F(scaled_input1_val, scaled_input2_val);
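The rescaling in `ComparisonWithScaling` exists because two quantized codes are only comparable after both are mapped back to a common real scale. The real-arithmetic reference it approximates, as a short sketch (the helper name is ours):

```cpp
#include <cstdint>

// Reference semantics: compare the real values real = scale * (q - zp).
// The kernel above performs the same rescale in pure integer arithmetic
// using the precomputed multiplier/shift pairs.
bool QuantizedLessReference(uint8_t q1, float scale1, int32_t zp1,
                            uint8_t q2, float scale2, int32_t zp2) {
  return scale1 * (q1 - zp1) < scale2 * (q2 - zp2);
}
```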
@@ -180,31 +151,6 @@ inline void BroadcastComparison4DSlowImpl(
   }
 }

-template <bool (*F)(const StringRef&, const StringRef&)>
-inline void BroadcastComparison4DSlowStringImpl(
-    const RuntimeShape& unextended_input1_shape, const TfLiteTensor* input1,
-    const RuntimeShape& unextended_input2_shape, const TfLiteTensor* input2,
-    const RuntimeShape& unextended_output_shape, bool* output_data) {
-  const BroadcastComparison4DSlowCommon dims =
-      BroadcastComparison4DSlowPreprocess(unextended_input1_shape,
-                                          unextended_input2_shape,
-                                          unextended_output_shape);
-
-  for (int b = 0; b < dims.output_shape.Dims(0); ++b) {
-    for (int y = 0; y < dims.output_shape.Dims(1); ++y) {
-      for (int x = 0; x < dims.output_shape.Dims(2); ++x) {
-        for (int c = 0; c < dims.output_shape.Dims(3); ++c) {
-          const auto lhs =
-              GetString(input1, SubscriptToIndex(dims.desc1, b, y, x, c));
-          const auto rhs =
-              GetString(input2, SubscriptToIndex(dims.desc2, b, y, x, c));
-          output_data[Offset(dims.output_shape, b, y, x, c)] = F(lhs, rhs);
-        }
-      }
-    }
-  }
-}
-
 template <ComparisonFn<float> F>
 inline void BroadcastComparison4DSlow(const ComparisonParams& op_params,
                                       const RuntimeShape& input1_shape,
@@ -218,7 +164,7 @@ inline void BroadcastComparison4DSlow(const ComparisonParams& op_params,
                                output_shape, output_data);
 }

-template <typename T, ComparisonFn<int32> F>
+template <typename T, ComparisonFn<int32_t> F>
 inline void BroadcastComparison4DSlowWithScaling(
     const ComparisonParams& op_params,
     const RuntimeShape& unextended_input1_shape, const T* input1_data,
@@ -230,29 +176,29 @@ inline void BroadcastComparison4DSlowWithScaling(
                                          unextended_output_shape);

   int left_shift = op_params.left_shift;
-  int32 input1_offset = op_params.input1_offset;
-  int32 input1_multiplier = op_params.input1_multiplier;
+  int32_t input1_offset = op_params.input1_offset;
+  int32_t input1_multiplier = op_params.input1_multiplier;
   int input1_shift = op_params.input1_shift;
-  int32 input2_offset = op_params.input2_offset;
-  int32 input2_multiplier = op_params.input2_multiplier;
+  int32_t input2_offset = op_params.input2_offset;
+  int32_t input2_multiplier = op_params.input2_multiplier;
   int input2_shift = op_params.input2_shift;

   for (int b = 0; b < dims.output_shape.Dims(0); ++b) {
     for (int y = 0; y < dims.output_shape.Dims(1); ++y) {
       for (int x = 0; x < dims.output_shape.Dims(2); ++x) {
         for (int c = 0; c < dims.output_shape.Dims(3); ++c) {
-          const int32 input1_val =
+          const int32_t input1_val =
              input1_offset +
              input1_data[SubscriptToIndex(dims.desc1, b, y, x, c)];
-          const int32 input2_val =
+          const int32_t input2_val =
              input2_offset +
              input2_data[SubscriptToIndex(dims.desc2, b, y, x, c)];
-          const int32 shifted_input1_val = input1_val * (1 << left_shift);
-          const int32 shifted_input2_val = input2_val * (1 << left_shift);
-          const int32 scaled_input1_val =
+          const int32_t shifted_input1_val = input1_val * (1 << left_shift);
+          const int32_t shifted_input2_val = input2_val * (1 << left_shift);
+          const int32_t scaled_input1_val =
              MultiplyByQuantizedMultiplierSmallerThanOneExp(
                  shifted_input1_val, input1_multiplier, input1_shift);
-          const int32 scaled_input2_val =
+          const int32_t scaled_input2_val =
              MultiplyByQuantizedMultiplierSmallerThanOneExp(
                  shifted_input2_val, input2_multiplier, input2_shift);
           output_data[Offset(dims.output_shape, b, y, x, c)] =
@@ -74,14 +74,14 @@ inline void Concatenation(const ConcatenationParams& params,
 // when optimizng this routine further.
 inline void ConcatenationWithScaling(const ConcatenationParams& params,
                                      const RuntimeShape* const* input_shapes,
-                                     const uint8* const* input_data,
+                                     const uint8_t* const* input_data,
                                      const RuntimeShape& output_shape,
-                                     uint8* output_data) {
+                                     uint8_t* output_data) {
   int axis = params.axis;
-  const int32* input_zeropoint = params.input_zeropoint;
+  const int32_t* input_zeropoint = params.input_zeropoint;
   const float* input_scale = params.input_scale;
   int inputs_count = params.inputs_count;
-  const int32 output_zeropoint = params.output_zeropoint;
+  const int32_t output_zeropoint = params.output_zeropoint;
   const float output_scale = params.output_scale;

   const int concat_dimensions = output_shape.DimensionsCount();
@@ -110,11 +110,11 @@ inline void ConcatenationWithScaling(const ConcatenationParams& params,
   }

   const float inverse_output_scale = 1.f / output_scale;
-  uint8* output_ptr = output_data;
+  uint8_t* output_ptr = output_data;
   for (int k = 0; k < outer_size; k++) {
     for (int i = 0; i < inputs_count; ++i) {
       const int copy_size = input_shapes[i]->Dims(axis) * base_inner_size;
-      const uint8* input_ptr = input_data[i] + k * copy_size;
+      const uint8_t* input_ptr = input_data[i] + k * copy_size;
       if (input_zeropoint[i] == output_zeropoint &&
           input_scale[i] == output_scale) {
         memcpy(output_ptr, input_ptr, copy_size);
@@ -59,28 +59,31 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
   const int output_width = output_shape.Dims(2);
   for (int batch = 0; batch < batches; ++batch) {
     for (int out_y = 0; out_y < output_height; ++out_y) {
+      const int in_y_origin = (out_y * stride_height) - pad_height;
       for (int out_x = 0; out_x < output_width; ++out_x) {
+        const int in_x_origin = (out_x * stride_width) - pad_width;
         for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
-          const int in_x_origin = (out_x * stride_width) - pad_width;
-          const int in_y_origin = (out_y * stride_height) - pad_height;
           float total = 0.f;
           for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+            const int in_y = in_y_origin + dilation_height_factor * filter_y;
             for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+              const int in_x = in_x_origin + dilation_width_factor * filter_x;
+
+              // Zero padding by omitting the areas outside the image.
+              const bool is_point_inside_image =
+                  (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                  (in_y < input_height);
+
+              if (!is_point_inside_image) {
+                continue;
+              }
+
               for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
-                const int in_x = in_x_origin + dilation_width_factor * filter_x;
-                const int in_y =
-                    in_y_origin + dilation_height_factor * filter_y;
-                // If the location is outside the bounds of the input image,
-                // use zero as a default value.
-                if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
-                    (in_y < input_height)) {
-                  float input_value = input_data[Offset(
-                      input_shape, batch, in_y, in_x, in_channel)];
-                  float filter_value =
-                      filter_data[Offset(filter_shape, out_channel, filter_y,
-                                         filter_x, in_channel)];
-                  total += (input_value * filter_value);
-                }
+                float input_value = input_data[Offset(input_shape, batch, in_y,
+                                                      in_x, in_channel)];
+                float filter_value = filter_data[Offset(
+                    filter_shape, out_channel, filter_y, filter_x, in_channel)];
+                total += (input_value * filter_value);
               }
             }
           }
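The restructuring above hoists the input-coordinate computation out of the channel loop and replaces the bounds `if` with an early `continue`; the zero-padding contribution is simply omitted. The underlying coordinate mapping, isolated (the helper name is ours):

```cpp
// Each filter tap reads the input at out * stride - pad + dilation * tap,
// per axis; taps that land outside [0, input_dim) are zero padding and are
// skipped.
int InputCoord(int out_coord, int stride, int pad, int dilation,
               int filter_coord) {
  return out_coord * stride - pad + dilation * filter_coord;
}
// Example: out_x = 0, stride 1, pad 1, dilation 1, filter_x = 0 gives
// in_x = -1, i.e. the top-left tap of a SAME-padded 3x3 filter is off-image.
```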
@@ -99,11 +102,11 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
 }

 inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
-                 const uint8* input_data, const RuntimeShape& filter_shape,
-                 const uint8* filter_data, const RuntimeShape& bias_shape,
-                 const int32* bias_data, const RuntimeShape& output_shape,
-                 uint8* output_data, const RuntimeShape& im2col_shape,
-                 uint8* im2col_data, void* cpu_backend_context) {
+                 const uint8_t* input_data, const RuntimeShape& filter_shape,
+                 const uint8_t* filter_data, const RuntimeShape& bias_shape,
+                 const int32_t* bias_data, const RuntimeShape& output_shape,
+                 uint8_t* output_data, const RuntimeShape& im2col_shape,
+                 uint8_t* im2col_data, void* cpu_backend_context) {
   (void)cpu_backend_context;  // only used in optimized code.
   (void)im2col_data;   // only used in optimized code.
   (void)im2col_shape;  // only used in optimized code.
@@ -113,13 +116,13 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
   const int dilation_height_factor = params.dilation_height_factor;
   const int pad_width = params.padding_values.width;
   const int pad_height = params.padding_values.height;
-  const int32 input_offset = params.input_offset;
-  const int32 filter_offset = params.weights_offset;
-  const int32 output_offset = params.output_offset;
-  const int32 output_multiplier = params.output_multiplier;
+  const int32_t input_offset = params.input_offset;
+  const int32_t filter_offset = params.weights_offset;
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_multiplier = params.output_multiplier;
   const int output_shift = params.output_shift;
-  const int32 output_activation_min = params.quantized_activation_min;
-  const int32 output_activation_max = params.quantized_activation_max;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
   TFLITE_DCHECK_LE(output_activation_min, output_activation_max);

   TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
@@ -139,29 +142,32 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
   const int output_width = output_shape.Dims(2);
   for (int batch = 0; batch < batches; ++batch) {
     for (int out_y = 0; out_y < output_height; ++out_y) {
+      const int in_y_origin = (out_y * stride_height) - pad_height;
       for (int out_x = 0; out_x < output_width; ++out_x) {
+        const int in_x_origin = (out_x * stride_width) - pad_width;
         for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
-          const int in_x_origin = (out_x * stride_width) - pad_width;
-          const int in_y_origin = (out_y * stride_height) - pad_height;
-          int32 acc = 0;
+          int32_t acc = 0;
           for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+            const int in_y = in_y_origin + dilation_height_factor * filter_y;
             for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+              const int in_x = in_x_origin + dilation_width_factor * filter_x;
+
+              // Zero padding by omitting the areas outside the image.
+              const bool is_point_inside_image =
+                  (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                  (in_y < input_height);
+
+              if (!is_point_inside_image) {
+                continue;
+              }
+
               for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
-                const int in_x = in_x_origin + dilation_width_factor * filter_x;
-                const int in_y =
-                    in_y_origin + dilation_height_factor * filter_y;
-                // If the location is outside the bounds of the input image,
-                // use zero as a default value.
-                if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
-                    (in_y < input_height)) {
-                  int32 input_val = input_data[Offset(input_shape, batch, in_y,
-                                                      in_x, in_channel)];
-                  int32 filter_val =
-                      filter_data[Offset(filter_shape, out_channel, filter_y,
-                                         filter_x, in_channel)];
-                  acc +=
-                      (filter_val + filter_offset) * (input_val + input_offset);
-                }
+                int32_t input_val = input_data[Offset(input_shape, batch, in_y,
+                                                      in_x, in_channel)];
+                int32_t filter_val = filter_data[Offset(
+                    filter_shape, out_channel, filter_y, filter_x, in_channel)];
+                acc +=
+                    (filter_val + filter_offset) * (input_val + input_offset);
               }
             }
           }
@@ -174,7 +180,7 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
           acc = std::max(acc, output_activation_min);
           acc = std::min(acc, output_activation_max);
           output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
-              static_cast<uint8>(acc);
+              static_cast<uint8_t>(acc);
         }
       }
     }
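After the accumulation loop, the `int32_t` accumulator is scaled back into the output's quantized domain, offset, and clamped, which is what the lines above show. In real arithmetic the applied factor is roughly input_scale * filter_scale / output_scale, carried at runtime as the integer pair (output_multiplier, output_shift). A hedged floating-point sketch of that requantization step (the helper is ours):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

// Reference for the requantize, offset and clamp sequence applied to `acc`.
uint8_t RequantizeReference(int32_t acc, float real_multiplier,
                            int32_t output_offset) {
  const int32_t scaled =
      static_cast<int32_t>(std::lround(acc * real_multiplier));
  return static_cast<uint8_t>(
      std::min(255, std::max(0, scaled + output_offset)));
}
```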
@@ -220,7 +226,7 @@ inline void HybridConvPerChannel(
         for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
           const int in_x_origin = (out_x * stride_width) - pad_width;
           const int in_y_origin = (out_y * stride_height) - pad_height;
-          int32 acc = 0;
+          int32_t acc = 0;
           for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
             for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
               for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
@@ -231,9 +237,9 @@ inline void HybridConvPerChannel(
                 // use zero as a default value.
                 if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
                     (in_y < input_height)) {
-                  int32 input_val = input_data[Offset(input_shape, batch, in_y,
-                                                      in_x, in_channel)];
-                  int32 filter_val =
+                  int32_t input_val = input_data[Offset(
+                      input_shape, batch, in_y, in_x, in_channel)];
+                  int32_t filter_val =
                       filter_data[Offset(filter_shape, out_channel, filter_y,
                                          filter_x, in_channel)];
                   acc += filter_val * (input_val - input_offset[batch]);
@@ -258,5 +264,4 @@ inline void HybridConvPerChannel(
 }  // namespace reference_ops
 }  // namespace tflite

-
 #endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CONV_H_
@@ -62,21 +62,21 @@ namespace reference_ops {
 namespace depthwise_conv {

 template <DepthwiseConvOutputRounding output_rounding>
-inline int32 DepthwiseConvRound(int32 x, int32 quantized_multiplier,
-                                int shift) {
+inline int32_t DepthwiseConvRound(int32_t x, int32_t quantized_multiplier,
+                                  int shift) {
   TFLITE_DCHECK_NE(output_rounding, DepthwiseConvOutputRounding::kNone);
   return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
 }

 template <>
-inline int32 DepthwiseConvRound<DepthwiseConvOutputRounding::kAwayFromZero>(
-    int32 x, int32 quantized_multiplier, int shift) {
+inline int32_t DepthwiseConvRound<DepthwiseConvOutputRounding::kAwayFromZero>(
+    int32_t x, int32_t quantized_multiplier, int shift) {
   return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
 }

 template <>
-inline int32 DepthwiseConvRound<DepthwiseConvOutputRounding::kUpward>(
-    int32 x, int32 quantized_multiplier, int shift) {
+inline int32_t DepthwiseConvRound<DepthwiseConvOutputRounding::kUpward>(
+    int32_t x, int32_t quantized_multiplier, int shift) {
   using gemmlowp::SaturatingRoundingDoublingHighMul;
   const int left_shift = shift > 0 ? shift : 0;
   const int right_shift = shift > 0 ? 0 : -shift;
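The two specializations above select how the scaled accumulator is rounded. As far as the names and the half-biasing code suggest, `kAwayFromZero` breaks .5 ties away from zero while `kUpward` always biases upward before shifting. A standalone sketch of the two behaviours (helpers are ours, exponent > 0 assumed, arithmetic right shift on negative values assumed):

```cpp
#include <cstdint>

// Both divide by 2^exponent; they differ only in how x.5 ties are broken.
int32_t RoundAwayFromZero(int32_t x, int exponent) {
  const int64_t half = int64_t{1} << (exponent - 1);
  const int64_t biased = (x >= 0) ? (x + half) : (x - half);
  return static_cast<int32_t>(biased >> exponent);
}

int32_t RoundUpward(int32_t x, int exponent) {
  const int64_t biased = int64_t{x} + (int64_t{1} << (exponent - 1));
  return static_cast<int32_t>(biased >> exponent);
}
// RoundAwayFromZero(-5, 1) == -3, while RoundUpward(-5, 1) == -2.
```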
@@ -89,13 +89,12 @@ inline int32 DepthwiseConvRound<DepthwiseConvOutputRounding::kUpward>(

 template <DepthwiseConvOutputRounding output_rounding>
 struct DepthwiseConvBasicKernel {
-  static inline void Run(const DepthwiseParams& params,
-                         const RuntimeShape& input_shape,
-                         const uint8* input_data,
-                         const RuntimeShape& filter_shape,
-                         const uint8* filter_data,
-                         const RuntimeShape& bias_shape, const int32* bias_data,
-                         const RuntimeShape& output_shape, uint8* output_data) {
+  static inline void Run(
+      const DepthwiseParams& params, const RuntimeShape& input_shape,
+      const uint8_t* input_data, const RuntimeShape& filter_shape,
+      const uint8_t* filter_data, const RuntimeShape& bias_shape,
+      const int32_t* bias_data, const RuntimeShape& output_shape,
+      uint8_t* output_data) {
     const int stride_width = params.stride_width;
     const int stride_height = params.stride_height;
     const int dilation_width_factor = params.dilation_width_factor;
@@ -103,12 +102,12 @@ struct DepthwiseConvBasicKernel {
     const int pad_width = params.padding_values.width;
     const int pad_height = params.padding_values.height;
     const int depth_multiplier = params.depth_multiplier;
-    const int32 output_activation_min = params.quantized_activation_min;
-    const int32 output_activation_max = params.quantized_activation_max;
-    const int32 input_offset = params.input_offset;
-    const int32 filter_offset = params.weights_offset;
-    const int32 output_offset = params.output_offset;
-    const int32 output_multiplier = params.output_multiplier;
+    const int32_t output_activation_min = params.quantized_activation_min;
+    const int32_t output_activation_max = params.quantized_activation_max;
+    const int32_t input_offset = params.input_offset;
+    const int32_t filter_offset = params.weights_offset;
+    const int32_t output_offset = params.output_offset;
+    const int32_t output_multiplier = params.output_multiplier;
     const int output_shift = params.output_shift;
     TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
     TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
@@ -135,7 +134,7 @@ struct DepthwiseConvBasicKernel {
             const int oc = m + ic * depth_multiplier;
             const int in_x_origin = (out_x * stride_width) - pad_width;
             const int in_y_origin = (out_y * stride_height) - pad_height;
-            int32 acc = 0;
+            int32_t acc = 0;
             for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
               for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
                 const int in_x =
@@ -146,9 +145,9 @@ struct DepthwiseConvBasicKernel {
                 // use zero as a default value.
                 if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
                     (in_y < input_height)) {
-                  int32 input_val =
+                  int32_t input_val =
                       input_data[Offset(input_shape, b, in_y, in_x, ic)];
-                  int32 filter_val = filter_data[Offset(
+                  int32_t filter_val = filter_data[Offset(
                       filter_shape, 0, filter_y, filter_x, oc)];
                   acc += (filter_val + filter_offset) *
                          (input_val + input_offset);
@@ -164,7 +163,7 @@ struct DepthwiseConvBasicKernel {
             acc = std::max(acc, output_activation_min);
             acc = std::min(acc, output_activation_max);
             output_data[Offset(output_shape, b, out_y, out_x, oc)] =
-                static_cast<uint8>(acc);
+                static_cast<uint8_t>(acc);
           }
         }
       }
@@ -176,10 +175,10 @@ struct DepthwiseConvBasicKernel {
   // MultiplyByQuantizedMultiplier or DepthwiseConvRound function.
   static inline void RunPerChannel(
       const DepthwiseParams& params, const RuntimeShape& input_shape,
-      const int8* input_data, const RuntimeShape& filter_shape,
-      const int8* filter_data, const RuntimeShape& bias_shape,
-      const int32* bias_data, const RuntimeShape& output_shape,
-      int8* output_data) {
+      const int8_t* input_data, const RuntimeShape& filter_shape,
+      const int8_t* filter_data, const RuntimeShape& bias_shape,
+      const int32_t* bias_data, const RuntimeShape& output_shape,
+      int8_t* output_data) {
     // Get parameters.
     // TODO(b/141565753): Re-introduce ScopedProfilingLabel on Micro.
     const int stride_width = params.stride_width;
@@ -189,12 +188,12 @@ struct DepthwiseConvBasicKernel {
     const int pad_width = params.padding_values.width;
     const int pad_height = params.padding_values.height;
     const int depth_multiplier = params.depth_multiplier;
-    const int32 input_offset = params.input_offset;
-    const int32 output_offset = params.output_offset;
-    const int32 output_activation_min = params.quantized_activation_min;
-    const int32 output_activation_max = params.quantized_activation_max;
-    const int32* output_multiplier = params.output_multiplier_per_channel;
-    const int32* output_shift = params.output_shift_per_channel;
+    const int32_t input_offset = params.input_offset;
+    const int32_t output_offset = params.output_offset;
+    const int32_t output_activation_min = params.quantized_activation_min;
+    const int32_t output_activation_max = params.quantized_activation_max;
+    const int32_t* output_multiplier = params.output_multiplier_per_channel;
+    const int32_t* output_shift = params.output_shift_per_channel;

     // Check dimensions of the tensors.
     TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
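`output_multiplier_per_channel` and `output_shift_per_channel` are what distinguish this path from the per-tensor kernel: every output channel carries its own requantization pair. A sketch of how such arrays are consumed (helper names are ours; the stub approximates TFLite's fixed-point primitive with doubles, assuming the usual Q31 multiplier convention, and the clamp is simplified to the full int8_t range):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

// Stand-in for the fixed-point scaling primitive: multiplier encodes a Q31
// fraction, shift a power of two.
int32_t ScaleStub(int32_t x, int32_t multiplier, int32_t shift) {
  const double real = static_cast<double>(multiplier) / (1LL << 31);
  return static_cast<int32_t>(std::lround(x * real * std::pow(2.0, shift)));
}

int8_t RequantizePerChannel(int32_t acc, int channel,
                            const int32_t* output_multiplier,
                            const int32_t* output_shift,
                            int32_t output_offset) {
  // Channel c is scaled with entry c of the arrays, not a per-tensor pair.
  const int32_t scaled = ScaleStub(acc, output_multiplier[channel],
                                   output_shift[channel]) +
                         output_offset;
  return static_cast<int8_t>(std::min(127, std::max(-128, scaled)));
}
```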
@@ -222,7 +221,7 @@ struct DepthwiseConvBasicKernel {
               const int output_channel = m + in_channel * depth_multiplier;
               const int in_x_origin = (out_x * stride_width) - pad_width;
               const int in_y_origin = (out_y * stride_height) - pad_height;
-              int32 acc = 0;
+              int32_t acc = 0;
               for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
                 for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
                   const int in_x =
@@ -234,17 +233,18 @@ struct DepthwiseConvBasicKernel {
                       (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
                       (in_y < input_height);
                   if (is_point_inside_image) {
-                    int32 input_val = input_data[Offset(
+                    int32_t input_val = input_data[Offset(
                         input_shape, batch, in_y, in_x, in_channel)];
-                    int32 filter_val = filter_data[Offset(
+                    int32_t filter_val = filter_data[Offset(
                         filter_shape, 0, filter_y, filter_x, output_channel)];
                     // Accumulate with 32 bits accumulator.
                     // In the nudging process during model quantization, we
                     // force real value of 0.0 be represented by a quantized
-                    // value. This guarantees that the input_offset is a int8,
-                    // even though it is represented using int32. int32 += int8
-                    // * (int8 - int8) so the highest value we can get from each
-                    // accumulation is [-127, 127] * ([-128, 127] -
+                    // value. This guarantees that the input_offset is a int8_t,
+                    // even though it is represented using int32_t. int32_t +=
+                    // int8_t
+                    // * (int8_t - int8_t) so the highest value we can get from
+                    // each accumulation is [-127, 127] * ([-128, 127] -
                     // [-128, 127]), which is [-32512, 32512]. log2(32512)
                     // = 14.98, which means we can accumulate at least 2^16
                     // multiplications without overflow. The accumulator is
@@ -279,10 +279,10 @@ struct DepthwiseConvBasicKernel {

 inline void DepthwiseConv(
     const DepthwiseParams& params, const RuntimeShape& input_shape,
-    const uint8* input_data, const RuntimeShape& filter_shape,
-    const uint8* filter_data, const RuntimeShape& bias_shape,
-    const int32* bias_data, const RuntimeShape& output_shape,
-    uint8* output_data) {
+    const uint8_t* input_data, const RuntimeShape& filter_shape,
+    const uint8_t* filter_data, const RuntimeShape& bias_shape,
+    const int32_t* bias_data, const RuntimeShape& output_shape,
+    uint8_t* output_data) {
   return depthwise_conv::DepthwiseConvBasicKernel<
       DepthwiseConvOutputRounding::kAwayFromZero>::Run(params, input_shape,
                                                        input_data, filter_shape,
@@ -32,12 +32,12 @@ inline void Dequantize(const tflite::DequantizationParams& op_params,
                        const RuntimeShape& input_shape,
                        const InputT* input_data,
                        const RuntimeShape& output_shape, OutputT* output_data) {
-  int32 zero_point = op_params.zero_point;
+  int32_t zero_point = op_params.zero_point;
   const double scale = op_params.scale;
   const int flat_size = MatchingFlatSize(input_shape, output_shape);

   for (int i = 0; i < flat_size; i++) {
-    const int32 val = input_data[i];
+    const int32_t val = input_data[i];
     const OutputT result = static_cast<OutputT>(scale * (val - zero_point));
     output_data[i] = result;
   }
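A worked instance of the affine dequantization in this hunk: with scale 0.5 and zero_point 10, the quantized code 14 maps to 0.5 * (14 - 10) = 2.0.

```cpp
#include <cassert>
#include <cstdint>

int main() {
  const double scale = 0.5;
  const int32_t zero_point = 10;
  const int32_t val = 14;  // quantized code
  const float result = static_cast<float>(scale * (val - zero_point));
  assert(result == 2.0f);
  return 0;
}
```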
@@ -52,11 +52,11 @@ inline void PerChannelDequantize(
   // Ensure flat size is same.
   MatchingFlatSize(input_shape, output_shape);

-  const int32* zero_point = op_params.zero_point;
+  const int32_t* zero_point = op_params.zero_point;
   const float* scale = op_params.scale;
-  const int32 quantized_dimension = op_params.quantized_dimension;
-  const int32 num_dims = input_shape.DimensionsCount();
-  const int32* dims_data = input_shape.DimsData();
+  const int32_t quantized_dimension = op_params.quantized_dimension;
+  const int32_t num_dims = input_shape.DimensionsCount();
+  const int32_t* dims_data = input_shape.DimsData();
   std::vector<int> current_dim(num_dims, 0);

   do {
@@ -64,7 +64,7 @@ inline void PerChannelDequantize(
         ReducedOutputOffset(num_dims, reinterpret_cast<const int*>(dims_data),
                             current_dim.data(), 0, nullptr);
     const int channel = current_dim[quantized_dimension];
-    const int32 val = input_data[offset];
+    const int32_t val = input_data[offset];
     const float result =
         static_cast<float>(scale[channel] * (val - zero_point[channel]));
     output_data[offset] = result;
@@ -61,17 +61,17 @@ inline void FullyConnected(

 inline void FullyConnected(
     const FullyConnectedParams& params, const RuntimeShape& input_shape,
-    const uint8* input_data, const RuntimeShape& filter_shape,
-    const uint8* filter_data, const RuntimeShape& bias_shape,
-    const int32* bias_data, const RuntimeShape& output_shape,
-    uint8* output_data) {
-  const int32 input_offset = params.input_offset;
-  const int32 filter_offset = params.weights_offset;
-  const int32 output_offset = params.output_offset;
-  const int32 output_multiplier = params.output_multiplier;
+    const uint8_t* input_data, const RuntimeShape& filter_shape,
+    const uint8_t* filter_data, const RuntimeShape& bias_shape,
+    const int32_t* bias_data, const RuntimeShape& output_shape,
+    uint8_t* output_data) {
+  const int32_t input_offset = params.input_offset;
+  const int32_t filter_offset = params.weights_offset;
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_multiplier = params.output_multiplier;
   const int output_shift = params.output_shift;
-  const int32 output_activation_min = params.quantized_activation_min;
-  const int32 output_activation_max = params.quantized_activation_max;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
   TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
   TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);

@@ -89,10 +89,10 @@ inline void FullyConnected(
   const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
   for (int b = 0; b < batches; ++b) {
     for (int out_c = 0; out_c < output_depth; ++out_c) {
-      int32 acc = 0;
+      int32_t acc = 0;
       for (int d = 0; d < accum_depth; ++d) {
-        int32 input_val = input_data[b * accum_depth + d];
-        int32 filter_val = filter_data[out_c * accum_depth + d];
+        int32_t input_val = input_data[b * accum_depth + d];
+        int32_t filter_val = filter_data[out_c * accum_depth + d];
         acc += (filter_val + filter_offset) * (input_val + input_offset);
       }
       if (bias_data) {
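The loop nest above is an offset-corrected dot product per (batch, output channel) pair. Isolated, with a name of our own choosing:

```cpp
#include <cstdint>

// Raw quantized inner product: the offsets recenter the uint8_t codes onto
// their real zero before multiplying; the int32_t accumulator is later
// requantized with the output multiplier/shift.
int32_t QuantizedDot(const uint8_t* input, const uint8_t* filter, int depth,
                     int32_t input_offset, int32_t filter_offset) {
  int32_t acc = 0;
  for (int d = 0; d < depth; ++d) {
    acc += (static_cast<int32_t>(filter[d]) + filter_offset) *
           (static_cast<int32_t>(input[d]) + input_offset);
  }
  return acc;
}
```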
@@ -102,24 +102,24 @@ inline void FullyConnected(
       acc += output_offset;
       acc = std::max(acc, output_activation_min);
       acc = std::min(acc, output_activation_max);
-      output_data[out_c + output_depth * b] = static_cast<uint8>(acc);
+      output_data[out_c + output_depth * b] = static_cast<uint8_t>(acc);
     }
   }
 }

 inline void FullyConnected(
     const FullyConnectedParams& params, const RuntimeShape& input_shape,
-    const uint8* input_data, const RuntimeShape& filter_shape,
-    const uint8* filter_data, const RuntimeShape& bias_shape,
-    const int32* bias_data, const RuntimeShape& output_shape,
-    int16* output_data) {
-  const int32 input_offset = params.input_offset;
-  const int32 filter_offset = params.weights_offset;
-  const int32 output_offset = params.output_offset;
-  const int32 output_multiplier = params.output_multiplier;
+    const uint8_t* input_data, const RuntimeShape& filter_shape,
+    const uint8_t* filter_data, const RuntimeShape& bias_shape,
+    const int32_t* bias_data, const RuntimeShape& output_shape,
+    int16_t* output_data) {
+  const int32_t input_offset = params.input_offset;
+  const int32_t filter_offset = params.weights_offset;
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_multiplier = params.output_multiplier;
   const int output_shift = params.output_shift;
-  const int32 output_activation_min = params.quantized_activation_min;
-  const int32 output_activation_max = params.quantized_activation_max;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;

   TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
   TFLITE_DCHECK_EQ(output_offset, 0);
@@ -138,20 +138,21 @@ inline void FullyConnected(
     for (int out_c = 0; out_c < output_depth; ++out_c) {
       // Internal accumulation.
      // Initialize accumulator with the bias-value.
-      int32 accum = bias_data[out_c];
+      int32_t accum = bias_data[out_c];
       // Accumulation loop.
       for (int d = 0; d < accum_depth; ++d) {
-        int16 input_val = input_data[b * accum_depth + d] + input_offset;
-        int16 filter_val = filter_data[out_c * accum_depth + d] + filter_offset;
+        int16_t input_val = input_data[b * accum_depth + d] + input_offset;
+        int16_t filter_val =
+            filter_data[out_c * accum_depth + d] + filter_offset;
         accum += filter_val * input_val;
       }
-      // Down-scale the final int32 accumulator to the scale used by our
+      // Down-scale the final int32_t accumulator to the scale used by our
       // (16-bit, typically 3 integer bits) fixed-point format. The quantized
       // multiplier and shift here have been pre-computed offline
       // (e.g. by toco).
       accum =
           MultiplyByQuantizedMultiplier(accum, output_multiplier, output_shift);
-      // Saturate, cast to int16, and store to output array.
+      // Saturate, cast to int16_t, and store to output array.
       accum = std::max(accum, output_activation_min - output_offset);
       accum = std::min(accum, output_activation_max - output_offset);
       accum += output_offset;
@@ -162,14 +163,14 @@ inline void FullyConnected(

 inline void ShuffledFullyConnected(
     const FullyConnectedParams& params, const RuntimeShape& input_shape,
-    const uint8* input_data, const RuntimeShape& weights_shape,
-    const uint8* shuffled_weights_data, const RuntimeShape& bias_shape,
-    const int32* bias_data, const RuntimeShape& output_shape,
-    int16* output_data, uint8* shuffled_input_workspace_data) {
-  const int32 output_multiplier = params.output_multiplier;
+    const uint8_t* input_data, const RuntimeShape& weights_shape,
+    const uint8_t* shuffled_weights_data, const RuntimeShape& bias_shape,
+    const int32_t* bias_data, const RuntimeShape& output_shape,
+    int16_t* output_data, uint8_t* shuffled_input_workspace_data) {
+  const int32_t output_multiplier = params.output_multiplier;
   const int output_shift = params.output_shift;
-  const int32 output_activation_min = params.quantized_activation_min;
-  const int32 output_activation_max = params.quantized_activation_max;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
   TFLITE_DCHECK_LE(output_activation_min, output_activation_max);

   TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1);
@@ -190,7 +191,7 @@ inline void ShuffledFullyConnected(
   TFLITE_DCHECK((output_depth % 4) == 0);
 
   // Shuffling and xoring of input activations into the workspace buffer
-  uint8* shuffled_input_workspace_ptr = shuffled_input_workspace_data;
+  uint8_t* shuffled_input_workspace_ptr = shuffled_input_workspace_data;
   if (batches == 1) {
     for (int i = 0; i < accum_depth; i++) {
       shuffled_input_workspace_data[i] = input_data[i] ^ 0x80;
@@ -198,13 +199,13 @@ inline void ShuffledFullyConnected(
   } else if (batches == 4) {
     for (int c = 0; c < accum_depth; c += 16) {
       for (int b = 0; b < 4; b++) {
-        const uint8* src_data_ptr = input_data + b * accum_depth + c;
+        const uint8_t* src_data_ptr = input_data + b * accum_depth + c;
         for (int j = 0; j < 16; j++) {
-          uint8 src_val = *src_data_ptr++;
+          uint8_t src_val = *src_data_ptr++;
           // Flip the sign bit, so that the kernel will only need to
-          // reinterpret these uint8 values as int8, getting for free the
+          // reinterpret these uint8_t values as int8_t, getting for free the
           // subtraction of the zero_point value 128.
-          uint8 dst_val = src_val ^ 0x80;
+          uint8_t dst_val = src_val ^ 0x80;
           *shuffled_input_workspace_ptr++ = dst_val;
         }
       }
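The `^ 0x80` trick above works because flipping the top bit of a `uint8_t` and then reinterpreting the byte as two's-complement `int8_t` is exactly a subtraction of the zero point 128. A self-contained check of that identity (assumes two's-complement `int8_t`, which this code relies on):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  for (int v = 0; v < 256; ++v) {
    uint8_t flipped = static_cast<uint8_t>(v) ^ 0x80;     // flip sign bit
    int8_t reinterpreted = static_cast<int8_t>(flipped);  // two's complement
    assert(static_cast<int>(reinterpreted) == v - 128);   // == subtract 128
  }
  return 0;
}
```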
@@ -216,62 +217,62 @@ inline void ShuffledFullyConnected(
 
   // Actual computation
   if (batches == 1) {
-    int16* output_ptr = output_data;
+    int16_t* output_ptr = output_data;
     // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
-    // so that just reinterpreting them as int8 values is equivalent to
+    // so that just reinterpreting them as int8_t values is equivalent to
     // subtracting 128 from them, thus implementing for free the subtraction of
     // the zero_point value 128.
-    const int8* shuffled_weights_ptr =
-        reinterpret_cast<const int8*>(shuffled_weights_data);
+    const int8_t* shuffled_weights_ptr =
+        reinterpret_cast<const int8_t*>(shuffled_weights_data);
     // Likewise, we preshuffled and pre-xored the input data above.
-    const int8* shuffled_input_data =
-        reinterpret_cast<const int8*>(shuffled_input_workspace_data);
+    const int8_t* shuffled_input_data =
+        reinterpret_cast<const int8_t*>(shuffled_input_workspace_data);
     for (int c = 0; c < output_depth; c += 4) {
       // Internal accumulation.
       // Initialize accumulator with the bias-value.
-      int32 accum[4] = {0};
+      int32_t accum[4] = {0};
       // Accumulation loop.
      for (int d = 0; d < accum_depth; d += 16) {
        for (int i = 0; i < 4; i++) {
          for (int j = 0; j < 16; j++) {
-            int8 input_val = shuffled_input_data[d + j];
-            int8 weights_val = *shuffled_weights_ptr++;
+            int8_t input_val = shuffled_input_data[d + j];
+            int8_t weights_val = *shuffled_weights_ptr++;
            accum[i] += weights_val * input_val;
          }
        }
      }
      for (int i = 0; i < 4; i++) {
        // Add bias value
-        int32 acc = accum[i] + bias_data[c + i];
-        // Down-scale the final int32 accumulator to the scale used by our
+        int32_t acc = accum[i] + bias_data[c + i];
+        // Down-scale the final int32_t accumulator to the scale used by our
        // (16-bit, typically 3 integer bits) fixed-point format. The quantized
        // multiplier and shift here have been pre-computed offline
        // (e.g. by toco).
        acc =
            MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
-        // Saturate, cast to int16, and store to output array.
+        // Saturate, cast to int16_t, and store to output array.
        acc = std::max(acc, output_activation_min);
        acc = std::min(acc, output_activation_max);
        output_ptr[c + i] = acc;
      }
    }
  } else if (batches == 4) {
-    int16* output_ptr = output_data;
+    int16_t* output_ptr = output_data;
    // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
-    // so that just reinterpreting them as int8 values is equivalent to
+    // so that just reinterpreting them as int8_t values is equivalent to
    // subtracting 128 from them, thus implementing for free the subtraction of
    // the zero_point value 128.
-    const int8* shuffled_weights_ptr =
-        reinterpret_cast<const int8*>(shuffled_weights_data);
+    const int8_t* shuffled_weights_ptr =
+        reinterpret_cast<const int8_t*>(shuffled_weights_data);
    // Likewise, we preshuffled and pre-xored the input data above.
-    const int8* shuffled_input_data =
-        reinterpret_cast<const int8*>(shuffled_input_workspace_data);
+    const int8_t* shuffled_input_data =
+        reinterpret_cast<const int8_t*>(shuffled_input_workspace_data);
    for (int c = 0; c < output_depth; c += 4) {
-      const int8* shuffled_input_ptr = shuffled_input_data;
+      const int8_t* shuffled_input_ptr = shuffled_input_data;
      // Accumulation loop.
      // Internal accumulation.
      // Initialize accumulator with the bias-value.
-      int32 accum[4][4];
+      int32_t accum[4][4];
      for (int i = 0; i < 4; i++) {
        for (int b = 0; b < 4; b++) {
          accum[i][b] = 0;
@@ -281,8 +282,8 @@ inline void ShuffledFullyConnected(
       for (int i = 0; i < 4; i++) {
         for (int b = 0; b < 4; b++) {
           for (int j = 0; j < 16; j++) {
-            int8 input_val = shuffled_input_ptr[16 * b + j];
-            int8 weights_val = shuffled_weights_ptr[16 * i + j];
+            int8_t input_val = shuffled_input_ptr[16 * b + j];
+            int8_t weights_val = shuffled_weights_ptr[16 * i + j];
             accum[i][b] += weights_val * input_val;
           }
         }
@@ -293,14 +294,14 @@ inline void ShuffledFullyConnected(
       for (int i = 0; i < 4; i++) {
         for (int b = 0; b < 4; b++) {
           // Add bias value
-          int32 acc = accum[i][b] + bias_data[c + i];
-          // Down-scale the final int32 accumulator to the scale used by our
+          int32_t acc = accum[i][b] + bias_data[c + i];
+          // Down-scale the final int32_t accumulator to the scale used by our
           // (16-bit, typically 3 integer bits) fixed-point format. The
           // quantized multiplier and shift here have been pre-computed offline
           // (e.g. by toco).
           acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
                                               output_shift);
-          // Saturate, cast to int16, and store to output array.
+          // Saturate, cast to int16_t, and store to output array.
           acc = std::max(acc, output_activation_min);
           acc = std::min(acc, output_activation_max);
           output_ptr[b * output_depth + c + i] = acc;
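The batch-of-4 path above consumes the shuffled weights strictly sequentially: for every 16-deep block, the 16 weights of each of 4 output channels are stored back to back. A rough standalone sketch of that access pattern, under that assumed layout (not the kernel itself):

```cpp
#include <cstdint>

// Rough sketch of the 4x16 blocked dot product above; `w` is assumed
// to be laid out in consumption order (per 16-deep block: 16 weights
// for each of 4 output channels). `acc` must be zeroed (or seeded with
// biases) by the caller, as the kernel does.
void BlockedDot4x16(const int8_t* w, const int8_t* x, int depth,
                    int32_t acc[4]) {
  const int8_t* w_ptr = w;
  for (int d = 0; d < depth; d += 16) {
    for (int i = 0; i < 4; ++i) {     // four output channels per block
      for (int j = 0; j < 16; ++j) {  // sixteen-deep inner block
        acc[i] += (*w_ptr++) * x[d + j];
      }
    }
  }
}
```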
@@ -0,0 +1,166 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ACTIVATIONS_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ACTIVATIONS_H_
+
+#include "ruy/profiler/instrumentation.h"  // from @ruy
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace reference_ops {
+
+inline int16_t SaturatingLeftShift(int16_t value, int amount) {
+  int32_t result = static_cast<int32_t>(value) * (1 << amount);
+  result = std::min<int32_t>(result, std::numeric_limits<int16_t>::max());
+  result = std::max<int32_t>(result, std::numeric_limits<int16_t>::min());
+  return result;
+}
+
+// Similar to ARM instruction SQDMULH.
+// Similar to gemmlowp::SaturatingRoundingDoublingHighMul except
+// rounding to zero instead of to nearest (SQRDMULH).
+inline std::int16_t SaturatingDoublingHighMul(std::int16_t a, std::int16_t b) {
+  bool overflow = a == b && a == std::numeric_limits<std::int16_t>::min();
+  std::int32_t a_32(a);
+  std::int32_t b_32(b);
+  std::int32_t ab_32 = a_32 * b_32;
+  std::int16_t ab_x2_high16 = static_cast<std::int16_t>((ab_32) / (1 << 15));
+  return overflow ? std::numeric_limits<std::int16_t>::max() : ab_x2_high16;
+}
+
+template <typename T>
+inline void HardSwish(const RuntimeShape& input_shape, const T* input_data,
+                      const RuntimeShape& output_shape, T* output_data) {
+  ruy::profiler::ScopeLabel label("ReferenceHardSwish/Float");
+  auto matching_size = MatchingFlatSize(input_shape, output_shape);
+  const T* in_end = input_data + matching_size;
+  for (; input_data < in_end; input_data++, output_data++) {
+    const float in = *input_data;
+    *output_data =
+        in * std::min(static_cast<T>(6), std::max(static_cast<T>(0), in + 3)) /
+        6;
+  }
+}
+
+template <typename T>
+inline void HardSwish(const HardSwishParams& params,
+                      const RuntimeShape& input_shape, const T* input_data,
+                      const RuntimeShape& output_shape, T* output_data) {
+  ruy::profiler::ScopeLabel label("ReferenceHardSwish/Quantized");
+
+  const int flat_size = MatchingFlatSize(input_shape, output_shape);
+
+  for (int i = 0; i < flat_size; i++) {
+    const int16_t input_value = input_data[i] - params.input_zero_point;
+    // Left-shift as much as we can without overflow/saturation to put
+    // significant bits in the high bits of our 16-bit fixedpoint values, so
+    // that fixed-point approximate computations below are as accurate as
+    // possible.
+    const int16_t input_value_on_hires_input_scale = input_value * (1 << 7);
+    // Compute the input value on essentially the output scale, just not
+    // right-shifted yet. This is the value that we'll use in the (x >= +3)
+    // case, and that in the general case we'll multiply against the "relu-ish"
+    // fixed-point multiplier in [0, 1].
+    const int16_t input_value_on_preshift_output_scale =
+        gemmlowp::SaturatingRoundingDoublingHighMul(
+            input_value_on_hires_input_scale,
+            params.output_multiplier_fixedpoint_int16);
+    // Now compute the "relu-ish multiplier". In the (-3 <= x <= +3) case, that
+    // is just an affine rescaling of x from [-3, 3] to [0, 1]. In the general
+    // case, it is just that plus saturation at the boundaries of [-3, 3].
+    // First, we rescale from [-3, 3] to [-1, 1], saturating.
+    // That is done by rescaling the input value with a fixed-point multiplier
+    // (reluish_multiplier_fixedpoint) and bit-shift such that we represent
+    // that input value on the scale where the real value 3.0f is represented
+    // by the quantized value 32768. (+32768 is actually not representable as
+    // int16_t, so this saturates at +32767, and that is seen empirically to be
+    // a negligible contribution to numerical error/bias).
+    //
+    // This code is careful to correctly implement any magnitude of multiplier,
+    // involving either a right shift or a left shift, with correct saturation
+    // behavior in the left-shift case. This forces this code to be more
+    // complicated, but is necessary for real applications: a partially
+    // trained quantized MobileNet v3-small model that motivated this code
+    // exhibits some large [min, max] range boundaries, of the order of
+    // magnitude of 10 or 100 depending on layers.
+    //
+    // The next few lines are basically just an ordinary
+    // MultiplyByQuantizedMultiplier, except that we are more careful here
+    // about the fine details of saturation when left-shifting, because here
+    // overflow in left-shift is a common case, not an anomaly as
+    // MultiplyByQuantizedMultiplier assumes.
+    int16_t reluish_value = input_value_on_hires_input_scale;
+    // Shift left, saturating, as much as we can while ensuring that this
+    // saturation will not contribute to the result. That is, left shift amount
+    // reduced by 1.
+    if (params.reluish_multiplier_exponent > 0) {
+      reluish_value = SaturatingLeftShift(
+          reluish_value, params.reluish_multiplier_exponent - 1);
+    }
+    // Apply the fixed-point multiplier, dividing the value by a divisor
+    // ranging in [1, 2].
+    reluish_value = gemmlowp::SaturatingRoundingDoublingHighMul(
+        reluish_value, params.reluish_multiplier_fixedpoint_int16);
+    // Apply the last bit of left-shift. Thus, in the left-shifting case, if
+    // any saturation affects the result, it is happening here --- any
+    // saturation having occurred above is overwritten here, not affecting the
+    // result.
+    if (params.reluish_multiplier_exponent > 0) {
+      reluish_value = SaturatingLeftShift(reluish_value, 1);
+    }
+    // Shift right, in the right-shifting case.
+    if (params.reluish_multiplier_exponent < 0) {
+      reluish_value = gemmlowp::RoundingDivideByPOT(
+          reluish_value, -params.reluish_multiplier_exponent);
+    }
+    // At this point we have rescaled the value into a 16bit fixedpoint
+    // reluish_value in [-1, 1].
+    // We now convert that to a 16bit fixedpoint value in [0, 1].
+    reluish_value = (reluish_value + (1 << 15)) >> 1;
+    // Use of SaturatingDoublingHighMul here is important to cancel the biases
+    // from the above SaturatingRoundingDoublingHighMul.
+    //
+    // On a partially trained MobileNet-v3-small,
+    //
+    //                                       | bias on    | ImageNet
+    //                                       | quantized  | Top-1
+    // Operation used here                   | values     | accuracy (50k)
+    // --------------------------------------+------------+-----------
+    // SaturatingDoublingHighMul             | -0.0024    | 58.920
+    // SaturatingRoundingDoublingHighMul     | -0.0067    | 58.064
+    //
+    // In activations_test, this is covered by this testcase:
+    // QuantizedActivationsOpTest.HardSwishBias
+    //
+    const int16_t preshift_output_value = SaturatingDoublingHighMul(
+        reluish_value, input_value_on_preshift_output_scale);
+    // We were so far operating on the pre-shift output scale. Now we finally
+    // apply that output shift, arriving at the final output scale.
+    int16_t output_value = gemmlowp::RoundingDivideByPOT(
+        preshift_output_value, -params.output_multiplier_exponent);
+    output_value += params.output_zero_point;
+    output_value =
+        std::min<int16_t>(output_value, std::numeric_limits<T>::max());
+    output_value =
+        std::max<int16_t>(output_value, std::numeric_limits<T>::min());
+    output_data[i] = output_value;
+  }
+}
+
+}  // namespace reference_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ACTIVATIONS_H_
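The float HardSwish above reduces to `x * relu6(x + 3) / 6`; the quantized path approximates the same curve in 16-bit fixed point. A plain-float reference of the formula for comparison (a standalone sketch, not the template above):

```cpp
#include <algorithm>
#include <cstdio>

// hswish(x) = x * min(6, max(0, x + 3)) / 6, the same formula the float
// template above evaluates element-wise.
float HardSwishRef(float x) {
  return x * std::min(6.0f, std::max(0.0f, x + 3.0f)) / 6.0f;
}

int main() {
  for (float x : {-4.0f, -1.0f, 0.0f, 1.0f, 4.0f}) {
    std::printf("hswish(% .1f) = % .4f\n", x, HardSwishRef(x));
  }
  return 0;
}
```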
@@ -23,34 +23,41 @@ limitations under the License.
 namespace tflite {
 namespace reference_integer_ops {
 
+inline void CheckArithmeticParams(const ArithmeticParams& params) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+  // Input offset is negative input zero point. Activation tensors are
+  // asymmetric quantized so they span the full int8 range.
+  TFLITE_DCHECK_GE(-params.input1_offset, std::numeric_limits<int8_t>::min());
+  TFLITE_DCHECK_GE(-params.input2_offset, std::numeric_limits<int8_t>::min());
+  TFLITE_DCHECK_LE(-params.input1_offset, std::numeric_limits<int8_t>::max());
+  TFLITE_DCHECK_LE(-params.input2_offset, std::numeric_limits<int8_t>::max());
+}
+
 // Element-wise add that can often be used for inner loop of broadcast add as
 // well as the non-broadcast add.
 inline void AddElementwise(int size, const ArithmeticParams& params,
                            const int8_t* input1_data, const int8_t* input2_data,
                            int8_t* output_data) {
-  const int32_t int8_max_value = std::numeric_limits<int8_t>::max();
-  TFLITE_DCHECK_GE(params.input1_offset, -1 * int8_max_value);
-  TFLITE_DCHECK_GE(params.input2_offset, -1 * int8_max_value);
-  TFLITE_DCHECK_LE(params.input1_offset, int8_max_value);
-  TFLITE_DCHECK_LE(params.input2_offset, int8_max_value);
+  CheckArithmeticParams(params);
 
   for (int i = 0; i < size; ++i) {
-    const int32 input1_val = params.input1_offset + input1_data[i];
-    const int32 input2_val = params.input2_offset + input2_data[i];
-    const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
-    const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
-    const int32 scaled_input1_val =
+    const int32_t input1_val = params.input1_offset + input1_data[i];
+    const int32_t input2_val = params.input2_offset + input2_data[i];
+    const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
+    const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
+    const int32_t scaled_input1_val =
         MultiplyByQuantizedMultiplierSmallerThanOneExp(
             shifted_input1_val, params.input1_multiplier, params.input1_shift);
-    const int32 scaled_input2_val =
+    const int32_t scaled_input2_val =
         MultiplyByQuantizedMultiplierSmallerThanOneExp(
             shifted_input2_val, params.input2_multiplier, params.input2_shift);
-    const int32 raw_sum = scaled_input1_val + scaled_input2_val;
-    const int32 raw_output =
+    const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
+    const int32_t raw_output =
         MultiplyByQuantizedMultiplierSmallerThanOneExp(
             raw_sum, params.output_multiplier, params.output_shift) +
         params.output_offset;
-    const int32 clamped_output =
+    const int32_t clamped_output =
         std::min(params.quantized_activation_max,
                  std::max(params.quantized_activation_min, raw_output));
     output_data[i] = static_cast<int8_t>(clamped_output);
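The fixed-point arithmetic in `AddElementwise` implements the usual dequantize-add-requantize identity entirely in integers. A conceptual float version of the same computation, with assumed scale/zero-point names (a sketch, not the kernel):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

// Conceptual float equivalent of the integer Add above:
//   real_i = s_i * (q_i - z_i)
//   q_out  = round((real1 + real2) / s_out) + z_out, saturated to int8_t.
int8_t AddDequantRequant(int8_t q1, float s1, int z1, int8_t q2, float s2,
                         int z2, float s_out, int z_out) {
  const float sum = s1 * (q1 - z1) + s2 * (q2 - z2);  // real-valued sum
  int q = static_cast<int>(std::lround(sum / s_out)) + z_out;
  q = std::min(127, std::max(-128, q));  // saturate to int8_t range
  return static_cast<int8_t>(q);
}
```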
@@ -61,16 +68,11 @@ inline void Add(const ArithmeticParams& params,
                 const RuntimeShape& input1_shape, const int8_t* input1_data,
                 const RuntimeShape& input2_shape, const int8_t* input2_data,
                 const RuntimeShape& output_shape, int8_t* output_data) {
-  TFLITE_DCHECK_LE(params.quantized_activation_min,
-                   params.quantized_activation_max);
+  CheckArithmeticParams(params);
   const int flat_size =
       MatchingElementsSize(input1_shape, input2_shape, output_shape);
 
-  const int32_t int8_max_value = std::numeric_limits<int8_t>::max();
-  TFLITE_DCHECK_GE(params.input1_offset, -1 * int8_max_value);
-  TFLITE_DCHECK_GE(params.input2_offset, -1 * int8_max_value);
-  TFLITE_DCHECK_LE(params.input1_offset, int8_max_value);
-  TFLITE_DCHECK_LE(params.input2_offset, int8_max_value);
   AddElementwise(flat_size, params, input1_data, input2_data, output_data);
 }
@@ -22,27 +22,27 @@ namespace reference_integer_ops {
 
 // Fixed-point per-channel-quantization convolution reference kernel.
 inline void ConvPerChannel(
-    const ConvParams& params, const int32* output_multiplier,
-    const int32* output_shift, const RuntimeShape& input_shape,
-    const int8* input_data, const RuntimeShape& filter_shape,
-    const int8* filter_data, const RuntimeShape& bias_shape,
-    const int32* bias_data, const RuntimeShape& output_shape,
-    int8* output_data) {
+    const ConvParams& params, const int32_t* output_multiplier,
+    const int32_t* output_shift, const RuntimeShape& input_shape,
+    const int8_t* input_data, const RuntimeShape& filter_shape,
+    const int8_t* filter_data, const RuntimeShape& bias_shape,
+    const int32_t* bias_data, const RuntimeShape& output_shape,
+    int8_t* output_data) {
   // Get parameters.
-  const int32 input_offset = params.input_offset;  // r = s(q - Z)
+  const int32_t input_offset = params.input_offset;  // r = s(q - Z)
   const int stride_width = params.stride_width;
   const int stride_height = params.stride_height;
   const int dilation_width_factor = params.dilation_width_factor;
   const int dilation_height_factor = params.dilation_height_factor;
   const int pad_width = params.padding_values.width;
   const int pad_height = params.padding_values.height;
-  const int32 output_offset = params.output_offset;
+  const int32_t output_offset = params.output_offset;
 
   // Set min and max value of the output.
-  const int32 output_activation_min = params.quantized_activation_min;
-  const int32 output_activation_max = params.quantized_activation_max;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
 
-  // Sanity check.
+  // Consistency check.
   TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
   TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
   TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
@@ -63,45 +63,47 @@ inline void ConvPerChannel(
   const int output_width = output_shape.Dims(2);
   for (int batch = 0; batch < batches; ++batch) {
     for (int out_y = 0; out_y < output_height; ++out_y) {
+      const int in_y_origin = (out_y * stride_height) - pad_height;
       for (int out_x = 0; out_x < output_width; ++out_x) {
+        const int in_x_origin = (out_x * stride_width) - pad_width;
         for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
-          const int in_x_origin = (out_x * stride_width) - pad_width;
-          const int in_y_origin = (out_y * stride_height) - pad_height;
-          int32 acc = 0;
+          int32_t acc = 0;
           for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+            const int in_y = in_y_origin + dilation_height_factor * filter_y;
             for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+              const int in_x = in_x_origin + dilation_width_factor * filter_x;
+
+              // Zero padding by omitting the areas outside the image.
+              const bool is_point_inside_image =
+                  (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                  (in_y < input_height);
+
+              if (!is_point_inside_image) {
+                continue;
+              }
+
               for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
-                const int in_x = in_x_origin + dilation_width_factor * filter_x;
-                const int in_y =
-                    in_y_origin + dilation_height_factor * filter_y;
-                // Zero padding by omitting the areas outside the image.
-                const bool is_point_inside_image =
-                    (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
-                    (in_y < input_height);
-                if (is_point_inside_image) {
-                  int32 input_val = input_data[Offset(input_shape, batch, in_y,
-                                                      in_x, in_channel)];
-                  int32 filter_val =
-                      filter_data[Offset(filter_shape, out_channel, filter_y,
-                                         filter_x, in_channel)];
-                  // Accumulate with 32 bits accumulator.
-                  // In the nudging process during model quantization, we force
-                  // real value of 0.0 be represented by a quantized value. This
-                  // guarantees that the input_offset is a int8, even though it
-                  // is represented using int32.
-                  // int32 += int8 * (int8 - int8) so the highest value we can
-                  // get from each accumulation is [-127, 127] * ([-128, 127] -
-                  // [-128, 127]), which is [-32512, 32512]. log2(32512)
-                  // = 14.98, which means we can accumulate at least 2^16
-                  // multiplications without overflow. The accumulator is
-                  // applied to a filter so the accumulation logic will hold as
-                  // long as the filter size (filter_y * filter_x * in_channel)
-                  // does not exceed 2^16, which is the case in all the models
-                  // we have seen so far.
-                  // TODO(jianlijianli): Add a check to make sure the
-                  // accumulator depth is smaller than 2^16.
-                  acc += filter_val * (input_val + input_offset);
-                }
+                int32_t input_val = input_data[Offset(input_shape, batch, in_y,
+                                                      in_x, in_channel)];
+                int32_t filter_val = filter_data[Offset(
+                    filter_shape, out_channel, filter_y, filter_x, in_channel)];
+                // Accumulate with 32 bits accumulator.
+                // In the nudging process during model quantization, we force
+                // real value of 0.0 be represented by a quantized value. This
+                // guarantees that the input_offset is a int8_t, even though
+                // it is represented using int32_t. int32_t += int8_t *
+                // (int8_t - int8_t) so the highest value we can get from each
+                // accumulation is [-127, 127] * ([-128, 127] -
+                // [-128, 127]), which is [-32512, 32512]. log2(32512)
+                // = 14.98, which means we can accumulate at least 2^16
+                // multiplications without overflow. The accumulator is
+                // applied to a filter so the accumulation logic will hold as
+                // long as the filter size (filter_y * filter_x * in_channel)
+                // does not exceed 2^16, which is the case in all the models
+                // we have seen so far.
+                // TODO(jianlijianli): Add a check to make sure the
+                // accumulator depth is smaller than 2^16.
+                acc += filter_val * (input_val + input_offset);
               }
             }
           }
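Besides the `int32_t` rename, this hunk hoists the loop-invariant `in_x`/`in_y` origins out of the channel loop and replaces the nested `if (is_point_inside_image)` body with an early `continue`, so the padding test runs once per filter tap instead of once per tap per input channel. The pattern in isolation, as a single-channel toy (generic names, not the kernel):

```cpp
#include <cstdint>

// Toy single-channel version of the restructured loop above: hoist the
// invariant coordinates, skip out-of-image taps early, accumulate the rest.
int32_t AccumulateValidTaps(const int8_t* img, int w, int h,
                            const int8_t* filt, int fw, int fh,
                            int ix_origin, int iy_origin) {
  int32_t acc = 0;
  for (int fy = 0; fy < fh; ++fy) {
    const int iy = iy_origin + fy;  // invariant for the whole fx loop
    for (int fx = 0; fx < fw; ++fx) {
      const int ix = ix_origin + fx;
      if (ix < 0 || ix >= w || iy < 0 || iy >= h) {
        continue;  // zero padding: tap contributes nothing
      }
      acc += filt[fy * fw + fx] * img[iy * w + ix];
    }
  }
  return acc;
}
```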
@@ -125,12 +127,12 @@ inline void ConvPerChannel(
 // Fixed-point per-channel-quantization convolution reference kernel.
 // 16-bit data and 8-bit filter
 inline void ConvPerChannel(
-    const ConvParams& params, const int32* output_multiplier,
-    const int32* output_shift, const RuntimeShape& input_shape,
-    const int16* input_data, const RuntimeShape& filter_shape,
-    const int8* filter_data, const RuntimeShape& bias_shape,
+    const ConvParams& params, const int32_t* output_multiplier,
+    const int32_t* output_shift, const RuntimeShape& input_shape,
+    const int16_t* input_data, const RuntimeShape& filter_shape,
+    const int8_t* filter_data, const RuntimeShape& bias_shape,
     const std::int64_t* bias_data, const RuntimeShape& output_shape,
-    int16* output_data) {
+    int16_t* output_data) {
   // Get parameters.
   const int stride_width = params.stride_width;
   const int stride_height = params.stride_height;
@@ -140,10 +142,10 @@ inline void ConvPerChannel(
   const int pad_height = params.padding_values.height;
 
   // Set min and max value of the output.
-  const int32 output_activation_min = params.quantized_activation_min;
-  const int32 output_activation_max = params.quantized_activation_max;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
 
-  // Sanity check.
+  // Consistency check.
   TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
   TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
   TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
@@ -164,35 +166,37 @@ inline void ConvPerChannel(
   const int output_width = output_shape.Dims(2);
   for (int batch = 0; batch < batches; ++batch) {
     for (int out_y = 0; out_y < output_height; ++out_y) {
+      const int in_y_origin = (out_y * stride_height) - pad_height;
       for (int out_x = 0; out_x < output_width; ++out_x) {
+        const int in_x_origin = (out_x * stride_width) - pad_width;
         for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
-          const int in_x_origin = (out_x * stride_width) - pad_width;
-          const int in_y_origin = (out_y * stride_height) - pad_height;
           std::int64_t acc = 0;
           for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+            const int in_y = in_y_origin + dilation_height_factor * filter_y;
             for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+              const int in_x = in_x_origin + dilation_width_factor * filter_x;
+
+              // Zero padding by omitting the areas outside the image.
+              const bool is_point_inside_image =
+                  (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                  (in_y < input_height);
+
+              if (!is_point_inside_image) {
+                continue;
+              }
+
               for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
-                const int in_x = in_x_origin + dilation_width_factor * filter_x;
-                const int in_y =
-                    in_y_origin + dilation_height_factor * filter_y;
-                // Zero padding by omitting the areas outside the image.
-                const bool is_point_inside_image =
-                    (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
-                    (in_y < input_height);
-                if (is_point_inside_image) {
-                  int32 input_val = input_data[Offset(input_shape, batch, in_y,
-                                                      in_x, in_channel)];
-                  int32 filter_val =
-                      filter_data[Offset(filter_shape, out_channel, filter_y,
-                                         filter_x, in_channel)];
-                  // Accumulate with 64 bits accumulator.
-                  // int64 += int8 * int16 so the highest value we can
-                  // get from each accumulation is [-127, 127] * ([-32768,
-                  // 32767] -
-                  // [-32768, 32767]), which is [-8322945, 8322945].
-                  // log2(8322945) = 22.99.
-                  acc += filter_val * input_val;
-                }
+                int32_t input_val = input_data[Offset(input_shape, batch, in_y,
+                                                      in_x, in_channel)];
+                int32_t filter_val = filter_data[Offset(
+                    filter_shape, out_channel, filter_y, filter_x, in_channel)];
+                // Accumulate with 64 bits accumulator.
+                // int64_t += int8_t * int16_t so the highest value we can
+                // get from each accumulation is [-127, 127] * ([-32768,
+                // 32767] -
+                // [-32768, 32767]), which is [-8322945, 8322945].
+                // log2(8322945) = 22.99.
+                acc += filter_val * input_val;
              }
            }
          }
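The headroom arithmetic in the 16-bit comment checks out: the largest per-tap product is 127 * 65535 = 8322945, which needs 23 bits, so a 64-bit accumulator has roughly 2^40 products of headroom. A compile-time restatement of the same bound:

```cpp
#include <cstdint>

// |int8_t * (int16_t - int16_t)| <= 127 * 65535 = 8322945 < 2^23, so an
// int64_t accumulator can absorb on the order of 2^(63-23) = 2^40 such
// products -- far beyond any realistic filter size.
static_assert(127LL * 65535LL == 8322945LL, "max per-tap product");
static_assert(8322945LL < (1LL << 23), "fits in 23 bits");

int main() { return 0; }
```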
@@ -20,12 +20,12 @@ limitations under the License.
 namespace tflite {
 namespace reference_integer_ops {
 inline void DepthwiseConvPerChannel(
-    const DepthwiseParams& params, const int32* output_multiplier,
-    const int32* output_shift, const RuntimeShape& input_shape,
-    const int8* input_data, const RuntimeShape& filter_shape,
-    const int8* filter_data, const RuntimeShape& bias_shape,
-    const int32* bias_data, const RuntimeShape& output_shape,
-    int8* output_data) {
+    const DepthwiseParams& params, const int32_t* output_multiplier,
+    const int32_t* output_shift, const RuntimeShape& input_shape,
+    const int8_t* input_data, const RuntimeShape& filter_shape,
+    const int8_t* filter_data, const RuntimeShape& bias_shape,
+    const int32_t* bias_data, const RuntimeShape& output_shape,
+    int8_t* output_data) {
   // Get parameters.
   // TODO(b/141565753): Re-introduce ScopedProfilingLabel on Micro.
   const int stride_width = params.stride_width;
@@ -35,10 +35,10 @@ inline void DepthwiseConvPerChannel(
   const int pad_width = params.padding_values.width;
   const int pad_height = params.padding_values.height;
   const int depth_multiplier = params.depth_multiplier;
-  const int32 input_offset = params.input_offset;
-  const int32 output_offset = params.output_offset;
-  const int32 output_activation_min = params.quantized_activation_min;
-  const int32 output_activation_max = params.quantized_activation_max;
+  const int32_t input_offset = params.input_offset;
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
 
   // Check dimensions of the tensors.
   TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
@@ -66,7 +66,7 @@ inline void DepthwiseConvPerChannel(
           const int output_channel = m + in_channel * depth_multiplier;
           const int in_x_origin = (out_x * stride_width) - pad_width;
           const int in_y_origin = (out_y * stride_height) - pad_height;
-          int32 acc = 0;
+          int32_t acc = 0;
           for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
             for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
               const int in_x = in_x_origin + dilation_width_factor * filter_x;
@@ -77,17 +77,17 @@ inline void DepthwiseConvPerChannel(
                   (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
                   (in_y < input_height);
               if (is_point_inside_image) {
-                int32 input_val = input_data[Offset(input_shape, batch, in_y,
-                                                    in_x, in_channel)];
-                int32 filter_val = filter_data[Offset(
+                int32_t input_val = input_data[Offset(
+                    input_shape, batch, in_y, in_x, in_channel)];
+                int32_t filter_val = filter_data[Offset(
                     filter_shape, 0, filter_y, filter_x, output_channel)];
                 // Accumulate with 32 bits accumulator.
                 // In the nudging process during model quantization, we force
                 // real value of 0.0 be represented by a quantized value. This
-                // guarantees that the input_offset is a int8, even though it
-                // is represented using int32.
-                // int32 += int8 * (int8 - int8) so the highest value we can
-                // get from each accumulation is [-127, 127] * ([-128, 127] -
+                // guarantees that the input_offset is a int8_t, even though
+                // it is represented using int32_t. int32_t += int8_t *
+                // (int8_t - int8_t) so the highest value we can get from each
+                // accumulation is [-127, 127] * ([-128, 127] -
                 // [-128, 127]), which is [-32512, 32512]. log2(32512)
                 // = 14.98, which means we can accumulate at least 2^16
                 // multiplications without overflow. The accumulator is
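The depthwise kernels map channels as `output_channel = m + in_channel * depth_multiplier`, i.e. each input channel fans out to `depth_multiplier` consecutive output channels. A tiny demonstration of the mapping:

```cpp
#include <cstdio>

int main() {
  const int input_depth = 3;
  const int depth_multiplier = 2;
  for (int in_c = 0; in_c < input_depth; ++in_c) {
    for (int m = 0; m < depth_multiplier; ++m) {
      // Same mapping as the kernel's inner loop above.
      const int out_c = m + in_c * depth_multiplier;
      std::printf("in_channel=%d m=%d -> out_channel=%d\n", in_c, m, out_c);
    }
  }
  return 0;
}
```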
@@ -120,12 +120,12 @@ inline void DepthwiseConvPerChannel(
 }
 
 inline void DepthwiseConvPerChannel(
-    const DepthwiseParams& params, const int32* output_multiplier,
-    const int32* output_shift, const RuntimeShape& input_shape,
-    const int16* input_data, const RuntimeShape& filter_shape,
-    const int8* filter_data, const RuntimeShape& bias_shape,
+    const DepthwiseParams& params, const int32_t* output_multiplier,
+    const int32_t* output_shift, const RuntimeShape& input_shape,
+    const int16_t* input_data, const RuntimeShape& filter_shape,
+    const int8_t* filter_data, const RuntimeShape& bias_shape,
     const std::int64_t* bias_data, const RuntimeShape& output_shape,
-    int16* output_data) {
+    int16_t* output_data) {
   // Get parameters.
   const int stride_width = params.stride_width;
   const int stride_height = params.stride_height;
@@ -134,8 +134,8 @@ inline void DepthwiseConvPerChannel(
   const int pad_width = params.padding_values.width;
   const int pad_height = params.padding_values.height;
   const int depth_multiplier = params.depth_multiplier;
-  const int32 output_activation_min = params.quantized_activation_min;
-  const int32 output_activation_max = params.quantized_activation_max;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
 
   // Check dimensions of the tensors.
   TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
@@ -174,9 +174,9 @@ inline void DepthwiseConvPerChannel(
                   (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
                   (in_y < input_height);
               if (is_point_inside_image) {
-                int32 input_val = input_data[Offset(input_shape, batch, in_y,
-                                                    in_x, in_channel)];
-                int32 filter_val = filter_data[Offset(
+                int32_t input_val = input_data[Offset(
+                    input_shape, batch, in_y, in_x, in_channel)];
+                int32_t filter_val = filter_data[Offset(
                     filter_shape, 0, filter_y, filter_x, output_channel)];
                 // Accumulate with 64 bits accumulator.
                 // We assume maximum of 2^16 accumulations as with the 8-bit
@@ -190,7 +190,7 @@ inline void DepthwiseConvPerChannel(
               if (bias_data) {
                 acc += bias_data[output_channel];
               }
-              int32 scaled_acc = MultiplyByQuantizedMultiplier(
+              int32_t scaled_acc = MultiplyByQuantizedMultiplier(
                   acc, output_multiplier[output_channel],
                   output_shift[output_channel]);
               scaled_acc = std::max(scaled_acc, output_activation_min);
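Per-channel requantization picks a separate multiplier/shift pair per output channel and feeds them to `MultiplyByQuantizedMultiplier`. A simplified standalone version of that rescaling, as a sketch (round-half-up rather than the library's round-half-away-from-zero, and without its `INT32_MIN` saturation corner case):

```cpp
#include <cstdint>

// Simplified stand-in for MultiplyByQuantizedMultiplier: multiply by a
// Q31 fixed-point multiplier, then apply a power-of-two shift
// (positive = left shift). Rounding here is cruder than the library's.
int32_t RequantizeApprox(int32_t acc, int32_t quantized_multiplier,
                         int shift) {
  const int64_t prod = static_cast<int64_t>(acc) * quantized_multiplier;
  const int32_t high = static_cast<int32_t>((prod + (1LL << 30)) >> 31);
  if (shift >= 0) {
    return high * (1 << shift);
  }
  const int n = -shift;
  return (high + (1 << (n - 1))) >> n;  // rounding right shift
}
```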
@@ -207,8 +207,8 @@ inline void DepthwiseConvPerChannel(
 
 inline void DepthwiseConvHybridPerChannel(
     const DepthwiseParams& params, float* scaling_factors_ptr,
-    const RuntimeShape& input_shape, const int8* input_data,
-    const RuntimeShape& filter_shape, const int8* filter_data,
+    const RuntimeShape& input_shape, const int8_t* input_data,
+    const RuntimeShape& filter_shape, const int8_t* filter_data,
     const RuntimeShape& bias_shape, const float* bias_data,
     const RuntimeShape& output_shape, float* output_data,
     const float* per_channel_scale, int32_t* input_offset) {
@@ -247,7 +247,7 @@ inline void DepthwiseConvHybridPerChannel(
           const int output_channel = m + in_channel * depth_multiplier;
           const int in_x_origin = (out_x * stride_width) - pad_width;
           const int in_y_origin = (out_y * stride_height) - pad_height;
-          int32 acc = 0;
+          int32_t acc = 0;
           for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
             for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
               const int in_x = in_x_origin + dilation_width_factor * filter_x;
@@ -258,9 +258,9 @@ inline void DepthwiseConvHybridPerChannel(
                   (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
                   (in_y < input_height);
               if (is_point_inside_image) {
-                int32 input_val = input_data[Offset(input_shape, batch, in_y,
-                                                    in_x, in_channel)];
-                int32 filter_val = filter_data[Offset(
+                int32_t input_val = input_data[Offset(
+                    input_shape, batch, in_y, in_x, in_channel)];
+                int32_t filter_val = filter_data[Offset(
                     filter_shape, 0, filter_y, filter_x, output_channel)];
                 acc += filter_val * (input_val - input_offset[batch]);
               }
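The hybrid depthwise kernel keeps weights in `int8_t` but produces float output: the `int32_t` accumulator is scaled back by a per-batch input scaling factor and a per-channel weight scale. The conceptual step, with illustrative parameter names (a sketch, not the kernel's exact locals):

```cpp
#include <cstdint>

// Conceptual dequantization step of the hybrid kernel above.
inline float DequantizeAccumulator(int32_t acc, float batch_scaling_factor,
                                   float per_channel_scale) {
  return static_cast<float>(acc) * batch_scaling_factor * per_channel_scale;
}
```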
@@ -24,15 +24,15 @@ inline void FullyConnected(
     const FullyConnectedParams& params, const RuntimeShape& input_shape,
     const int8_t* input_data, const RuntimeShape& filter_shape,
     const int8_t* filter_data, const RuntimeShape& bias_shape,
-    const int32* bias_data, const RuntimeShape& output_shape,
+    const int32_t* bias_data, const RuntimeShape& output_shape,
     int8_t* output_data) {
-  const int32 input_offset = params.input_offset;
-  const int32 filter_offset = params.weights_offset;
-  const int32 output_offset = params.output_offset;
-  const int32 output_multiplier = params.output_multiplier;
+  const int32_t input_offset = params.input_offset;
+  const int32_t filter_offset = params.weights_offset;
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_multiplier = params.output_multiplier;
   const int output_shift = params.output_shift;
-  const int32 output_activation_min = params.quantized_activation_min;
-  const int32 output_activation_max = params.quantized_activation_max;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
   TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
   TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 2);
 
@@ -44,10 +44,10 @@ inline void FullyConnected(
   const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
   for (int b = 0; b < batches; ++b) {
     for (int out_c = 0; out_c < output_depth; ++out_c) {
-      int32 acc = 0;
+      int32_t acc = 0;
       for (int d = 0; d < accum_depth; ++d) {
-        int32 input_val = input_data[b * accum_depth + d];
-        int32 filter_val = filter_data[out_c * accum_depth + d];
+        int32_t input_val = input_data[b * accum_depth + d];
+        int32_t filter_val = filter_data[out_c * accum_depth + d];
         acc += (filter_val + filter_offset) * (input_val + input_offset);
       }
       if (bias_data) {
@@ -68,11 +68,11 @@ inline void FullyConnected(
     const int8_t* filter_data, const RuntimeShape& bias_shape,
     const int64_t* bias_data, const RuntimeShape& output_shape,
     int16_t* output_data) {
-  const int32 filter_offset = params.weights_offset;
-  const int32 output_multiplier = params.output_multiplier;
+  const int32_t filter_offset = params.weights_offset;
+  const int32_t output_multiplier = params.output_multiplier;
   const int output_shift = params.output_shift;
-  const int32 output_activation_min = params.quantized_activation_min;
-  const int32 output_activation_max = params.quantized_activation_max;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
   TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
   TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 2);
 
@@ -86,8 +86,8 @@ inline void FullyConnected(
     for (int out_c = 0; out_c < output_depth; ++out_c) {
       int64_t acc = 0;
       for (int d = 0; d < accum_depth; ++d) {
-        int32 input_val = input_data[b * accum_depth + d];
-        int32 filter_val = filter_data[out_c * accum_depth + d];
+        int32_t input_val = input_data[b * accum_depth + d];
+        int32_t filter_val = filter_data[out_c * accum_depth + d];
         acc += (filter_val + filter_offset) * input_val;
       }
       if (bias_data) {
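In the `int8_t` fully-connected path both offsets are added in `int32_t` before the multiply, so each product stays well within 32 bits. The accumulation skeleton on its own (a standalone copy of the pattern, not the kernel):

```cpp
#include <cstdint>

// Skeleton of the offset-corrected dot product above:
// acc += (w + filter_offset) * (x + input_offset), all in int32_t.
int32_t DotWithOffsets(const int8_t* x, const int8_t* w, int depth,
                       int32_t input_offset, int32_t filter_offset) {
  int32_t acc = 0;
  for (int d = 0; d < depth; ++d) {
    acc += (w[d] + filter_offset) * (x[d] + input_offset);
  }
  return acc;
}
```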
@@ -21,8 +21,8 @@ namespace tflite {
 namespace reference_integer_ops {
 
 inline void L2Normalization(int32_t input_zero_point, int32_t outer_size,
-                            int32_t depth, const int8* input_data,
-                            int8* output_data) {
+                            int32_t depth, const int8_t* input_data,
+                            int8_t* output_data) {
   static constexpr int8_t kMinInt8 = std::numeric_limits<int8_t>::min();
   static constexpr int8_t kMaxInt8 = std::numeric_limits<int8_t>::max();
   // The output scale must be in sync with Prepare().
@@ -30,7 +30,7 @@ inline void L2Normalization(int32_t input_zero_point, int32_t outer_size,
   // to [-1, 127/128].
   static constexpr int32_t kOutputScale = 7;
   for (int outer_index = 0; outer_index < outer_size; ++outer_index) {
-    // int32 = (int8 - int8) ^ 2.
+    // int32_t = (int8_t - int8_t) ^ 2.
     // ([-128, 127] - [-128, 127]) ^ 2 = [0, (2^8 - 1)^2] so the accumulator is
     // safe from overflowing in at least 2^16 steps.
     int32_t acc = 0;
@@ -55,7 +55,7 @@ inline void L2Normalization(int32_t input_zero_point, int32_t outer_size,
         std::min(static_cast<int32_t>(kMaxInt8),
                  std::max(static_cast<int32_t>(kMinInt8), output_in_q24));
     output_data[depth * outer_index + inner_index] =
-        static_cast<int8>(output_in_q24);
+        static_cast<int8_t>(output_in_q24);
   }
 }
 }
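`L2Normalization` divides each element by the vector's Euclidean norm and emits the result at scale 1/128 with zero point 0, which is what the q24-to-q7 shift above achieves. A float reference for one element (a conceptual sketch, not the integer code path; assumes a non-zero input vector):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

// Float reference for one output element of L2Normalization above:
// y_i = x_i / ||x||_2, quantized with scale 1/128 and zero point 0.
int8_t L2NormalizeElem(const float* x, int depth, int i) {
  float sum_sq = 0.0f;
  for (int j = 0; j < depth; ++j) sum_sq += x[j] * x[j];
  const float y = x[i] / std::sqrt(sum_sq);           // in [-1, 1]
  int q = static_cast<int>(std::lround(y * 128.0f));  // scale 1/128
  return static_cast<int8_t>(std::min(127, std::max(-128, q)));
}
```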
@@ -58,12 +58,15 @@ inline void Logistic(int32_t input_zero_point, int32_t input_range_radius,
   }
 }
 
-inline void Logistic(int32_t input_size, const int16_t* ptr_input_data,
-                     int16_t* ptr_output_data) {
+inline void Logistic(int32_t input_multiplier, int32_t input_size,
+                     const int16_t* ptr_input_data, int16_t* ptr_output_data) {
   // We use the LUT for sigmoid and take into account, that
   // tanh(x) = 2*sigmoid(2*x) - 1
 
+  int32_t input_data_mul = (input_multiplier > 0) ? input_multiplier : 1;
+
   for (int i = 0; i < input_size; ++i, ptr_input_data++, ptr_output_data++) {
-    int32_t input_data = *ptr_input_data;
+    int32_t input_data = (*ptr_input_data) * input_data_mul;
 
     // Scale by 3/4 to expand range [-8,8]->[-10.7,10.7] and
     // we do interpolation on unsigned values.
@@ -72,13 +75,20 @@ inline void Logistic(int32_t input_size, const int16_t* ptr_input_data,
|
|||||||
// We divide by 2 power of 9, because
|
// We divide by 2 power of 9, because
|
||||||
// we need to divide by 2 in power of 7 for
|
// we need to divide by 2 in power of 7 for
|
||||||
// the input conversion + 1/4 from the scale above.
|
// the input conversion + 1/4 from the scale above.
|
||||||
uint8_t uh = abs_input_data >> 9;
|
// Define uh as uint32_t type not to make this function overflow.
|
||||||
uint32_t ua = sigmoid_table_uint16[uh];
|
uint32_t uh = abs_input_data >> 9;
|
||||||
uint32_t ub = sigmoid_table_uint16[uh + 1];
|
uint32_t result;
|
||||||
uint32_t ut = abs_input_data & 0x1ff;
|
|
||||||
|
|
||||||
// Interpolation is done using the fractional bit.
|
if (uh >= 255) {
|
||||||
uint32_t result = (ua << 9) + ut * (ub - ua);
|
// Saturate to maximum.
|
||||||
|
result = 0x7FFF << 10;
|
||||||
|
} else {
|
||||||
|
uint32_t ua = sigmoid_table_uint16[uh];
|
||||||
|
uint32_t ub = sigmoid_table_uint16[uh + 1];
|
||||||
|
uint32_t ut = abs_input_data & 0x1ff;
|
||||||
|
// Interpolation is done using the fractional bit.
|
||||||
|
result = (ua << 9) + ut * (ub - ua);
|
||||||
|
}
|
||||||
|
|
||||||
result = (input_data >= 0) ? (result + (1 << 9))
|
result = (input_data >= 0) ? (result + (1 << 9))
|
||||||
: ((1 << (16 + 9)) - result + (1 << 9) - 1);
|
: ((1 << (16 + 9)) - result + (1 << 9) - 1);
|
||||||
|
|||||||
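The saturation branch added above guards the sigmoid LUT: with the new input multiplier, abs_input_data can push the table index to or past the last entry, so uh is widened from uint8_t to uint32_t and clamped at 255. A minimal standalone sketch of the lookup pattern, with a hypothetical table standing in for sigmoid_table_uint16:

#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for the 257-entry sigmoid_table_uint16.
static uint16_t table[257];

uint32_t LookupInterpolated(uint32_t abs_input_data) {
  uint32_t uh = abs_input_data >> 9;  // high bits select the table segment
  uint32_t result;
  if (uh >= 255) {
    result = 0x7FFF << 10;  // saturate instead of reading past the table
  } else {
    uint32_t ua = table[uh];
    uint32_t ub = table[uh + 1];
    uint32_t ut = abs_input_data & 0x1ff;  // low 9 bits: fractional position
    result = (ua << 9) + ut * (ub - ua);   // linear interpolation
  }
  return result;
}

int main() {
  for (int i = 0; i < 257; ++i) table[i] = static_cast<uint16_t>(i * 255);
  printf("%u %u\n", LookupInterpolated(1000), LookupInterpolated(200000));
}
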
@@ -0,0 +1,77 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MEAN_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MEAN_H_
+
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+template <typename integer_type>
+inline void Mean(const tflite::MeanParams& op_params, int32_t multiplier,
+int32_t shift, const RuntimeShape& unextended_input_shape,
+const integer_type* input_data, int32_t input_zero_point,
+const RuntimeShape& unextended_output_shape,
+integer_type* output_data, int32_t output_zero_point) {
+// Current implementation only supports dimension equals 4 and simultaneous
+// reduction over width and height.
+TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4);
+TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4);
+const RuntimeShape input_shape =
+RuntimeShape::ExtendedShape(4, unextended_input_shape);
+const RuntimeShape output_shape =
+RuntimeShape::ExtendedShape(4, unextended_output_shape);
+const int output_batch = output_shape.Dims(0);
+const int output_height = output_shape.Dims(1);
+const int output_width = output_shape.Dims(2);
+const int output_depth = output_shape.Dims(3);
+const int input_height = input_shape.Dims(1);
+const int input_width = input_shape.Dims(2);
+const int num_elements_in_axis = input_width * input_height;
+
+TFLITE_CHECK_EQ(op_params.axis_count, 2);
+TFLITE_CHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
+(op_params.axis[0] == 2 && op_params.axis[1] == 1));
+TFLITE_CHECK_EQ(output_height, 1);
+TFLITE_CHECK_EQ(output_width, 1);
+
+static constexpr int32_t kMinInt = std::numeric_limits<integer_type>::min();
+static constexpr int32_t kMaxInt = std::numeric_limits<integer_type>::max();
+
+for (int out_b = 0; out_b < output_batch; ++out_b) {
+for (int out_d = 0; out_d < output_depth; ++out_d) {
+int32_t acc = 0;
+for (int in_h = 0; in_h < input_height; ++in_h) {
+for (int in_w = 0; in_w < input_width; ++in_w) {
+acc += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)] -
+input_zero_point;
+}
+}
+acc = MultiplyByQuantizedMultiplier(acc, multiplier, shift);
+acc = acc > 0 ? (acc + num_elements_in_axis / 2) / num_elements_in_axis
+: (acc - num_elements_in_axis / 2) / num_elements_in_axis;
+acc += output_zero_point;
+acc = std::min(std::max(acc, kMinInt), kMaxInt);
+output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
+static_cast<integer_type>(acc);
+}
+}
+}
+
+} // namespace reference_integer_ops
+} // namespace tflite
+
+#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MEAN_H_

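The new integer Mean accumulates zero-point-corrected inputs over the reduced window, rescales the sum with the quantized multiplier, and then divides by the window size with round-to-nearest (ties away from zero). The rounding step in isolation, as a sketch rather than the kernel itself:

#include <cstdint>
#include <cstdio>

// Integer division rounded to nearest, ties away from zero, as in Mean above.
int32_t RoundedDiv(int32_t acc, int32_t n) {
  return acc > 0 ? (acc + n / 2) / n : (acc - n / 2) / n;
}

int main() {
  // Averaging a 2x2 window: a sum of 7 rounds to 2, a sum of -7 to -2.
  printf("%d %d\n", RoundedDiv(7, 4), RoundedDiv(-7, 4));
}
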
@@ -27,14 +27,14 @@ inline void MulElementwise(int size, const ArithmeticParams& params,
 const T* input1_data, const T* input2_data,
 T* output_data) {
 for (int i = 0; i < size; ++i) {
-const int32 input1_val = params.input1_offset + input1_data[i];
-const int32 input2_val = params.input2_offset + input2_data[i];
-const int32 unclamped_result =
+const int32_t input1_val = params.input1_offset + input1_data[i];
+const int32_t input2_val = params.input2_offset + input2_data[i];
+const int32_t unclamped_result =
 params.output_offset +
 MultiplyByQuantizedMultiplier(input1_val * input2_val,
 params.output_multiplier,
 params.output_shift);
-const int32 clamped_output =
+const int32_t clamped_output =
 std::min(params.quantized_activation_max,
 std::max(params.quantized_activation_min, unclamped_result));
 output_data[i] = static_cast<T>(clamped_output);
@@ -57,13 +57,13 @@ inline void Mul(const ArithmeticParams& params,
 
 // Mul with 16 bit inputs and int8_t outputs.
 inline void Mul(const ArithmeticParams& params,
-const RuntimeShape& input1_shape, const int16* input1_data,
-const RuntimeShape& input2_shape, const int16* input2_data,
+const RuntimeShape& input1_shape, const int16_t* input1_data,
+const RuntimeShape& input2_shape, const int16_t* input2_data,
 const RuntimeShape& output_shape, int8_t* output_data) {
 ruy::profiler::ScopeLabel label("Mul/Int16Int8");
-int32 output_offset = params.output_offset;
-int32 output_activation_min = params.quantized_activation_min;
-int32 output_activation_max = params.quantized_activation_max;
+int32_t output_offset = params.output_offset;
+int32_t output_activation_min = params.quantized_activation_min;
+int32_t output_activation_max = params.quantized_activation_max;
 TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
 
 const int flat_size =
@@ -75,12 +75,12 @@ inline void Mul(const ArithmeticParams& params,
 
 F0 unclamped_result =
 F0::FromRaw(input1_data[i]) * F0::FromRaw(input2_data[i]);
-int16 rescaled_result =
+int16_t rescaled_result =
 gemmlowp::RoundingDivideByPOT(unclamped_result.raw(), 8);
-int16 clamped_result =
-std::min<int16>(output_activation_max - output_offset, rescaled_result);
-clamped_result =
-std::max<int16>(output_activation_min - output_offset, clamped_result);
+int16_t clamped_result = std::min<int16_t>(
+output_activation_max - output_offset, rescaled_result);
+clamped_result = std::max<int16_t>(output_activation_min - output_offset,
+clamped_result);
 output_data[i] = output_offset + clamped_result;
 }
 }
@@ -104,18 +104,18 @@ inline void BroadcastMul4DSlow(
 for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
 for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
 for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
-const int32 input1_val =
+const int32_t input1_val =
 params.input1_offset +
 input1_data[SubscriptToIndex(desc1, b, y, x, c)];
-const int32 input2_val =
+const int32_t input2_val =
 params.input2_offset +
 input2_data[SubscriptToIndex(desc2, b, y, x, c)];
-const int32 unclamped_result =
+const int32_t unclamped_result =
 params.output_offset +
 MultiplyByQuantizedMultiplier(input1_val * input2_val,
 params.output_multiplier,
 params.output_shift);
-const int32 clamped_output = std::min(
+const int32_t clamped_output = std::min(
 params.quantized_activation_max,
 std::max(params.quantized_activation_min, unclamped_result));
 output_data[Offset(extended_output_shape, b, y, x, c)] =

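In the 16-bit Mul above, F0 is a gemmlowp fixed-point type with zero integer bits, so the raw int16_t values are read as Q0.15 fractions; the product is then rounding-shifted right by 8 to fit the int8_t output range before the offset and clamping. A rough standalone approximation of that path (gemmlowp's SaturatingRoundingDoublingHighMul also saturates the -32768 * -32768 corner case, which this sketch ignores):

#include <cstdint>
#include <cstdio>

// Multiply two Q0.15 fixed-point values, rounding the result back to Q0.15.
int16_t MulQ0_15(int16_t a, int16_t b) {
  int32_t prod = static_cast<int32_t>(a) * b;             // Q0.30 product
  return static_cast<int16_t>((prod + (1 << 14)) >> 15);  // round to Q0.15
}

int main() {
  int16_t half = 1 << 14;            // 0.5 in Q0.15
  int16_t p = MulQ0_15(half, half);  // 0.25 -> 8192
  // Rounding shift by 8, as gemmlowp::RoundingDivideByPOT does in the diff.
  printf("%d %d\n", p, (p + (1 << 7)) >> 8);
}
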
@@ -22,8 +22,9 @@ namespace tflite {
 namespace reference_integer_ops {
 
 inline void AveragePool(const PoolParams& params,
-const RuntimeShape& input_shape, const int8* input_data,
-const RuntimeShape& output_shape, int8* output_data) {
+const RuntimeShape& input_shape,
+const int8_t* input_data,
+const RuntimeShape& output_shape, int8_t* output_data) {
 TFLITE_DCHECK_LE(params.quantized_activation_min,
 params.quantized_activation_max);
 TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
@@ -52,7 +53,7 @@ inline void AveragePool(const PoolParams& params,
 const int filter_y_start = std::max(0, -in_y_origin);
 const int filter_y_end =
 std::min(params.filter_height, input_height - in_y_origin);
-int32 acc = 0;
+int32_t acc = 0;
 int filter_count = 0;
 for (int filter_y = filter_y_start; filter_y < filter_y_end;
 ++filter_y) {
@@ -71,7 +72,7 @@ inline void AveragePool(const PoolParams& params,
 acc = std::max(acc, params.quantized_activation_min);
 acc = std::min(acc, params.quantized_activation_max);
 output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
-static_cast<int8>(acc);
+static_cast<int8_t>(acc);
 }
 }
 }
@@ -79,8 +80,8 @@ inline void AveragePool(const PoolParams& params,
 }
 
 inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
-const int8* input_data, const RuntimeShape& output_shape,
-int8* output_data) {
+const int8_t* input_data, const RuntimeShape& output_shape,
+int8_t* output_data) {
 TFLITE_DCHECK_LE(params.quantized_activation_min,
 params.quantized_activation_max);
 TFLITE_DCHECK_GE(params.quantized_activation_min,
@@ -137,8 +138,9 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
 
 inline void AveragePool(const PoolParams& params,
 const RuntimeShape& input_shape,
-const int16* input_data,
-const RuntimeShape& output_shape, int16* output_data) {
+const int16_t* input_data,
+const RuntimeShape& output_shape,
+int16_t* output_data) {
 TFLITE_DCHECK_LE(params.quantized_activation_min,
 params.quantized_activation_max);
 TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
@@ -167,7 +169,7 @@ inline void AveragePool(const PoolParams& params,
 const int filter_y_start = std::max(0, -in_y_origin);
 const int filter_y_end =
 std::min(params.filter_height, input_height - in_y_origin);
-int32 acc = 0;
+int32_t acc = 0;
 int filter_count = 0;
 for (int filter_y = filter_y_start; filter_y < filter_y_end;
 ++filter_y) {
@@ -186,7 +188,7 @@ inline void AveragePool(const PoolParams& params,
 acc = std::max(acc, params.quantized_activation_min);
 acc = std::min(acc, params.quantized_activation_max);
 output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
-static_cast<int16>(acc);
+static_cast<int16_t>(acc);
 }
 }
 }
@@ -194,8 +196,8 @@ inline void AveragePool(const PoolParams& params,
 }
 
 inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
-const int16* input_data, const RuntimeShape& output_shape,
-int16* output_data) {
+const int16_t* input_data, const RuntimeShape& output_shape,
+int16_t* output_data) {
 TFLITE_DCHECK_LE(params.quantized_activation_min,
 params.quantized_activation_max);
 TFLITE_DCHECK_GE(params.quantized_activation_min,

@@ -0,0 +1,110 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TANH_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TANH_H_
+
+#include <limits>
+
+#include "fixedpoint/fixedpoint.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+inline void Tanh(int32_t input_zero_point, int32_t input_range_radius,
+int32_t input_multiplier, int32_t input_shift,
+const RuntimeShape& input_shape, const int8_t* input_data,
+const RuntimeShape& output_shape, int8_t* output_data) {
+// Integer bits must be in sync with Prepare() function.
+static constexpr int32_t kInputIntegerBits = 4;
+static constexpr int32_t kOutputScale = 7;
+static constexpr int32_t kMinInt8 = std::numeric_limits<int8_t>::min();
+static constexpr int32_t kMaxInt8 = std::numeric_limits<int8_t>::max();
+using F4 = gemmlowp::FixedPoint<int32_t, kInputIntegerBits>;
+
+const int flat_size = MatchingFlatSize(input_shape, output_shape);
+
+for (int i = 0; i < flat_size; ++i) {
+const int32_t input =
+static_cast<int32_t>(input_data[i]) - input_zero_point;
+if (input <= -input_range_radius) {
+output_data[i] = kMinInt8;
+} else if (input >= input_range_radius) {
+output_data[i] = kMaxInt8;
+} else {
+const int32_t input_in_q4 =
+MultiplyByQuantizedMultiplier(input, input_multiplier, input_shift);
+const int32_t output_in_q0 =
+gemmlowp::tanh(F4::FromRaw(input_in_q4)).raw();
+
+// Rescale and downcast.
+using gemmlowp::RoundingDivideByPOT;
+int32_t output_in_q24 =
+RoundingDivideByPOT(output_in_q0, 31 - kOutputScale);
+output_in_q24 = std::min(std::max(output_in_q24, kMinInt8), kMaxInt8);
+output_data[i] = static_cast<int8_t>(output_in_q24);
+}
+}
+}
+
+inline void Tanh(int32_t input_multiplier, int32_t input_left_shift,
+const RuntimeShape& input_shape, const int16_t* ptr_input_data,
+const RuntimeShape& output_shape, int16_t* ptr_output_data) {
+// We use the LUT for sigmoid and take into account, that
+// tanh(x) = 2*sigmoid(2*x) - 1
+
+int32_t input_data_mul = (input_multiplier > 0) ? input_multiplier : 1;
+
+int flat_size = MatchingFlatSize(input_shape, output_shape);
+
+for (int i = 0; i < flat_size; ++i, ptr_input_data++, ptr_output_data++) {
+int32_t input_data = (*ptr_input_data) * input_data_mul;
+
+if (input_left_shift == 1) {
+input_data <<= 1;
+}
+
+// Scale by 3/4 to expand range [-8,8]->[-10.7,10.7].
+uint32_t abs_input_data = 3 * abs(input_data);
+uint32_t uh = abs_input_data >> 8;
+int32_t result;
+
+if (uh >= 255) {
+// Saturate to maximum.
+result = 0xFFFF << 8;
+} else {
+uint32_t ua = sigmoid_table_uint16[uh];
+uint32_t ub = sigmoid_table_uint16[uh + 1];
+
+uint8_t ut = abs_input_data & 0xFF;
+
+result = (ua << 8) + ut * (ub - ua);
+}
+
+result = (input_data >= 0)
+? (result - (1 << (14 + 9)) + (1 << (9 - 2)))
+: (-result + (1 << (14 + 9)) + (1 << (9 - 2)) - 1);
+
+// Convert back to 16-bit.
+result >>= (9 - 1);
+
+*ptr_output_data = result;
+}
+}
+
+} // namespace reference_integer_ops
+} // namespace tflite
+
+#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TANH_H_

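Both new Tanh kernels reuse the sigmoid machinery through the identity tanh(x) = 2*sigmoid(2*x) - 1, which the comments cite; in the int16_t version the "- (1 << (14 + 9))" term is the "- 1" of that identity at the kernel's fixed-point scale. A quick float check of the identity itself:

#include <cmath>
#include <cstdio>

int main() {
  for (double x : {-2.0, -0.5, 0.0, 1.5}) {
    double via_sigmoid = 2.0 / (1.0 + std::exp(-2.0 * x)) - 1.0;
    printf("x=%5.2f tanh=%8.5f 2*sigmoid(2x)-1=%8.5f\n",
           x, std::tanh(x), via_sigmoid);
  }
}
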
@@ -52,40 +52,39 @@ inline void L2Normalization(const tflite::L2NormalizationParams& op_params,
 
 inline void L2Normalization(const tflite::L2NormalizationParams& op_params,
 const RuntimeShape& input_shape,
-const uint8* input_data,
+const uint8_t* input_data,
 const RuntimeShape& output_shape,
-uint8* output_data) {
+uint8_t* output_data) {
 const int trailing_dim = input_shape.DimensionsCount() - 1;
 const int depth =
 MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
 const int outer_size =
 MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
-const int32 input_zero_point = op_params.input_zero_point;
+const int32_t input_zero_point = op_params.input_zero_point;
 
 for (int i = 0; i < outer_size; ++i) {
-int32 square_l2_norm = 0;
+int32_t square_l2_norm = 0;
 for (int c = 0; c < depth; c++) {
-int32 diff = input_data[depth * i + c] - input_zero_point;
+int32_t diff = input_data[depth * i + c] - input_zero_point;
 square_l2_norm += diff * diff;
 }
-int32 inv_l2norm_multiplier;
+int32_t inv_l2norm_multiplier;
 int inv_l2norm_shift;
 GetInvSqrtQuantizedMultiplierExp(square_l2_norm, kReverseShift,
 &inv_l2norm_multiplier, &inv_l2norm_shift);
 for (int c = 0; c < depth; c++) {
-int32 diff = input_data[depth * i + c] - input_zero_point;
-int32 rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp(
+int32_t diff = input_data[depth * i + c] - input_zero_point;
+int32_t rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp(
 128 * diff, inv_l2norm_multiplier, inv_l2norm_shift);
-int32 unclamped_output_val = 128 + rescaled_diff;
-int32 output_val =
-std::min(static_cast<int32>(255),
-std::max(static_cast<int32>(0), unclamped_output_val));
-output_data[depth * i + c] = static_cast<uint8>(output_val);
+int32_t unclamped_output_val = 128 + rescaled_diff;
+int32_t output_val =
+std::min(static_cast<int32_t>(255),
+std::max(static_cast<int32_t>(0), unclamped_output_val));
+output_data[depth * i + c] = static_cast<uint8_t>(output_val);
 }
 }
 }
 
 
 } // namespace reference_ops
 } // namespace tflite
 #endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_L2NORMALIZATION_H_

@@ -66,8 +66,8 @@ inline void Logistic(const LogisticParams&, const RuntimeShape& input_shape,
 }
 
 inline void Logistic(const LogisticParams& params,
-const RuntimeShape& input_shape, const int16* input_data,
-const RuntimeShape& output_shape, int16* output_data) {
+const RuntimeShape& input_shape, const int16_t* input_data,
+const RuntimeShape& output_shape, int16_t* output_data) {
 const int flat_size = MatchingFlatSize(input_shape, output_shape);
 
 for (int i = 0; i < flat_size; i++) {
@@ -84,12 +84,12 @@ inline void Logistic(const LogisticParams& params,
 }
 }
 
-// Quantized int8 logistic activation. Cheats by dequantizing and requantizing
-// around the floating point logistic method. This implementation is slow on
-// platforms without a floating point unit.
+// Quantized int8_t logistic activation. Cheats by dequantizing and
+// requantizing around the floating point logistic method. This implementation
+// is slow on platforms without a floating point unit.
 
-// TODO(b/141211002): Delete this int8 implementation once we can reuse the
-// approach used in TFLite for int8 Logistic.
+// TODO(b/141211002): Delete this int8_t implementation once we can reuse the
+// approach used in TFLite for int8_t Logistic.
 inline void Logistic(const RuntimeShape& input_shape, const int8_t* input_data,
 float input_scale, int input_zero_point,
 const RuntimeShape& output_shape, int8_t* output_data,

@@ -24,20 +24,20 @@ namespace reference_ops {
 // Element-wise mul that can often be used for inner loop of broadcast Mul as
 // well as the non-broadcast Mul.
 inline void MulElementwise(int size, const ArithmeticParams& params,
-const uint8* input1_data, const uint8* input2_data,
-uint8* output_data) {
+const uint8_t* input1_data,
+const uint8_t* input2_data, uint8_t* output_data) {
 for (int i = 0; i < size; ++i) {
-const int32 input1_val = params.input1_offset + input1_data[i];
-const int32 input2_val = params.input2_offset + input2_data[i];
-const int32 unclamped_result =
+const int32_t input1_val = params.input1_offset + input1_data[i];
+const int32_t input2_val = params.input2_offset + input2_data[i];
+const int32_t unclamped_result =
 params.output_offset +
 MultiplyByQuantizedMultiplier(input1_val * input2_val,
 params.output_multiplier,
 params.output_shift);
-const int32 clamped_output =
+const int32_t clamped_output =
 std::min(params.quantized_activation_max,
 std::max(params.quantized_activation_min, unclamped_result));
-output_data[i] = static_cast<uint8>(clamped_output);
+output_data[i] = static_cast<uint8_t>(clamped_output);
 }
 }
 
@@ -60,9 +60,9 @@ inline void Mul(const ArithmeticParams& params,
 }
 
 inline void Mul(const ArithmeticParams& params,
-const RuntimeShape& input1_shape, const uint8* input1_data,
-const RuntimeShape& input2_shape, const uint8* input2_data,
-const RuntimeShape& output_shape, uint8* output_data) {
+const RuntimeShape& input1_shape, const uint8_t* input1_data,
+const RuntimeShape& input2_shape, const uint8_t* input2_data,
+const RuntimeShape& output_shape, uint8_t* output_data) {
 TFLITE_DCHECK_LE(params.quantized_activation_min,
 params.quantized_activation_max);
 const int flat_size =
@@ -73,11 +73,11 @@ inline void Mul(const ArithmeticParams& params,
 
 inline void BroadcastMul4DSlow(const ArithmeticParams& params,
 const RuntimeShape& input1_shape,
-const uint8* input1_data,
+const uint8_t* input1_data,
 const RuntimeShape& input2_shape,
-const uint8* input2_data,
+const uint8_t* input2_data,
 const RuntimeShape& output_shape,
-uint8* output_data) {
+uint8_t* output_data) {
 NdArrayDesc<4> desc1;
 NdArrayDesc<4> desc2;
 NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
@@ -89,22 +89,22 @@ inline void BroadcastMul4DSlow(const ArithmeticParams& params,
 for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
 for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
 for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
-const int32 input1_val =
+const int32_t input1_val =
 params.input1_offset +
 input1_data[SubscriptToIndex(desc1, b, y, x, c)];
-const int32 input2_val =
+const int32_t input2_val =
 params.input2_offset +
 input2_data[SubscriptToIndex(desc2, b, y, x, c)];
-const int32 unclamped_result =
+const int32_t unclamped_result =
 params.output_offset +
 MultiplyByQuantizedMultiplier(input1_val * input2_val,
 params.output_multiplier,
 params.output_shift);
-const int32 clamped_output = std::min(
+const int32_t clamped_output = std::min(
 params.quantized_activation_max,
 std::max(params.quantized_activation_min, unclamped_result));
 output_data[Offset(extended_output_shape, b, y, x, c)] =
-static_cast<uint8>(clamped_output);
+static_cast<uint8_t>(clamped_output);
 }
 }
 }

@@ -32,8 +32,8 @@ constexpr int PadKernelMaxDimensionCount() { return 4; }
 // equivalent to a simple input1_data. For Pad, it should point to a zero
 // value.
 //
-// Note that two typenames are required, so that T=P=int32 is considered a
-// specialization distinct from P=int32.
+// Note that two typenames are required, so that T=P=int32_t is considered a
+// specialization distinct from P=int32_t.
 template <typename T, typename P>
 inline void PadImpl(const tflite::PadParams& op_params,
 const RuntimeShape& input_shape, const T* input_data,
@@ -116,11 +116,11 @@ inline void Pad(const tflite::PadParams& op_params,
 output_data);
 }
 
-// The second (pad-value) input can be int32 when, say, the first is uint8.
+// The second (pad-value) input can be int32_t when, say, the first is uint8_t.
 template <typename T>
 inline void Pad(const tflite::PadParams& op_params,
 const RuntimeShape& input_shape, const T* input_data,
-const int32* pad_value_ptr, const RuntimeShape& output_shape,
+const int32_t* pad_value_ptr, const RuntimeShape& output_shape,
 T* output_data) {
 const T converted_pad_value = static_cast<T>(*pad_value_ptr);
 PadImpl(op_params, input_shape, input_data, &converted_pad_value,
@@ -130,40 +130,18 @@ inline void Pad(const tflite::PadParams& op_params,
 // This version avoids conflicting template matching.
 template <>
 inline void Pad(const tflite::PadParams& op_params,
-const RuntimeShape& input_shape, const int32* input_data,
-const int32* pad_value_ptr, const RuntimeShape& output_shape,
-int32* output_data) {
+const RuntimeShape& input_shape, const int32_t* input_data,
+const int32_t* pad_value_ptr, const RuntimeShape& output_shape,
+int32_t* output_data) {
 PadImpl(op_params, input_shape, input_data, pad_value_ptr, output_shape,
 output_data);
 }
 
-// One could make all PadImageStyle calls simply delegate the work to the
-// ordinary Pad. However, it is better that the reference code asserts false in
-// similar cases.
 template <typename T, typename P>
 inline void PadImageStyle(const tflite::PadParams& op_params,
 const RuntimeShape& input_shape, const T* input_data,
 const P* pad_value_ptr,
 const RuntimeShape& output_shape, T* output_data) {
-TFLITE_ASSERT_FALSE;
-}
-
-template <typename P>
-inline void PadImageStyle(const tflite::PadParams& op_params,
-const RuntimeShape& input_shape,
-const uint8* input_data, const P* pad_value_ptr,
-const RuntimeShape& output_shape,
-uint8* output_data) {
-Pad(op_params, input_shape, input_data, pad_value_ptr, output_shape,
-output_data);
-}
-
-template <typename P>
-inline void PadImageStyle(const tflite::PadParams& op_params,
-const RuntimeShape& input_shape,
-const int8_t* input_data, const P* pad_value_ptr,
-const RuntimeShape& output_shape,
-int8_t* output_data) {
 Pad(op_params, input_shape, input_data, pad_value_ptr, output_shape,
 output_data);
 }

@@ -78,8 +78,9 @@ inline void AveragePool(const PoolParams& params,
 
 inline void AveragePool(const PoolParams& params,
 const RuntimeShape& input_shape,
-const uint8* input_data,
-const RuntimeShape& output_shape, uint8* output_data) {
+const uint8_t* input_data,
+const RuntimeShape& output_shape,
+uint8_t* output_data) {
 TFLITE_DCHECK_LE(params.quantized_activation_min,
 params.quantized_activation_max);
 TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
@@ -108,7 +109,7 @@ inline void AveragePool(const PoolParams& params,
 const int filter_y_start = std::max(0, -in_y_origin);
 const int filter_y_end =
 std::min(params.filter_height, input_height - in_y_origin);
-int32 acc = 0;
+int32_t acc = 0;
 int filter_count = 0;
 for (int filter_y = filter_y_start; filter_y < filter_y_end;
 ++filter_y) {
@@ -125,7 +126,7 @@ inline void AveragePool(const PoolParams& params,
 acc = std::max(acc, params.quantized_activation_min);
 acc = std::min(acc, params.quantized_activation_max);
 output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
-static_cast<uint8>(acc);
+static_cast<uint8_t>(acc);
 }
 }
 }
@@ -237,8 +238,8 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
 }
 
 inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
-const uint8* input_data, const RuntimeShape& output_shape,
-uint8* output_data) {
+const uint8_t* input_data, const RuntimeShape& output_shape,
+uint8_t* output_data) {
 TFLITE_DCHECK_LE(params.quantized_activation_min,
 params.quantized_activation_max);
 TFLITE_DCHECK_GE(params.quantized_activation_min, 0);
@@ -269,7 +270,7 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
 const int filter_y_start = std::max(0, -in_y_origin);
 const int filter_y_end =
 std::min(params.filter_height, input_height - in_y_origin);
-uint8 max = 0;
+uint8_t max = 0;
 for (int filter_y = filter_y_start; filter_y < filter_y_end;
 ++filter_y) {
 for (int filter_x = filter_x_start; filter_x < filter_x_end;
@@ -281,10 +282,10 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
 input_data[Offset(input_shape, batch, in_y, in_x, channel)]);
 }
 }
-max = std::max<uint8>(max, params.quantized_activation_min);
-max = std::min<uint8>(max, params.quantized_activation_max);
+max = std::max<uint8_t>(max, params.quantized_activation_min);
+max = std::min<uint8_t>(max, params.quantized_activation_max);
 output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
-static_cast<uint8>(max);
+static_cast<uint8_t>(max);
 }
 }
 }

@@ -23,7 +23,7 @@ namespace tflite {
 
 namespace reference_ops {
 
-// Broadcast prelu to output_shape for quantized uint8/int8 data.
+// Broadcast prelu to output_shape for quantized uint8_t/int8_t data.
 template <typename T>
 inline void BroadcastPrelu4DSlow(
 const PreluParams& params, const RuntimeShape& input_shape,
@@ -44,24 +44,26 @@ inline void BroadcastPrelu4DSlow(
 for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
 int output_index = Offset(extended_output_shape, b, y, x, c);
 int input_index = SubscriptToIndex(desc1, b, y, x, c);
-const int32 input_value =
+const int32_t input_value =
 params.input_offset + input_data[input_index];
-int32 output_value;
+int32_t output_value;
 if (input_value >= 0) {
-output_value = input_value;
+output_value = MultiplyByQuantizedMultiplier(
+input_value, params.output_multiplier_1, params.output_shift_1);
 } else {
 auto alpha_index = SubscriptToIndex(desc2, b, y, x, c);
-const int32 alpha_value =
+const int32_t alpha_value =
 params.alpha_offset + alpha_data[alpha_index];
 
 output_value = MultiplyByQuantizedMultiplier(
-input_value * alpha_value, params.output_multiplier,
-params.output_shift);
+input_value * alpha_value, params.output_multiplier_2,
+params.output_shift_2);
 }
 output_value += params.output_offset;
 
-const int32 quantized_min = std::numeric_limits<T>::min();
-const int32 quantized_max = std::numeric_limits<T>::max();
-const int32 clamped_output =
+const int32_t quantized_min = std::numeric_limits<T>::min();
+const int32_t quantized_max = std::numeric_limits<T>::max();
+const int32_t clamped_output =
 std::min(quantized_max, std::max(quantized_min, output_value));
 output_data[output_index] = static_cast<T>(clamped_output);
 }
@@ -70,6 +72,37 @@ inline void BroadcastPrelu4DSlow(
 }
 }
 
+template <typename T>
+inline void Prelu(const PreluParams& params, const RuntimeShape& input_shape,
+const T* input_data, const RuntimeShape& alpha_shape,
+const T* alpha_data, const RuntimeShape& output_shape,
+T* output_data) {
+const int32_t quantized_min = std::numeric_limits<T>::min();
+const int32_t quantized_max = std::numeric_limits<T>::max();
+
+const int flat_size =
+MatchingElementsSize(input_shape, alpha_shape, output_shape);
+for (int i = 0; i < flat_size; ++i) {
+const int32_t input_value = params.input_offset + input_data[i];
+int32_t output_value;
+if (input_value >= 0) {
+output_value = MultiplyByQuantizedMultiplier(
+input_value, params.output_multiplier_1, params.output_shift_1);
+} else {
+const int32_t alpha_value = params.alpha_offset + alpha_data[i];
+
+output_value = MultiplyByQuantizedMultiplier(input_value * alpha_value,
+params.output_multiplier_2,
+params.output_shift_2);
+}
+output_value += params.output_offset;
+
+const int32_t clamped_output =
+std::min(quantized_max, std::max(quantized_min, output_value));
+output_data[i] = static_cast<T>(clamped_output);
+}
+}
+
 } // namespace reference_ops
 } // namespace tflite
 

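With this change both PRelu branches requantize: the positive branch through output_multiplier_1/output_shift_1 (input scale over output scale) and the negative branch through output_multiplier_2/output_shift_2, which additionally folds in the alpha scale; previously the positive branch passed the value through unscaled. A simplified per-element sketch, with a naive stand-in for MultiplyByQuantizedMultiplier (no saturation, simplified rounding):

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Naive stand-in for MultiplyByQuantizedMultiplier:
// roughly x * multiplier * 2^shift / 2^31, rounded.
int32_t MulByQMult(int32_t x, int32_t multiplier, int shift) {
  int64_t v = static_cast<int64_t>(x) * multiplier;
  const int total_shift = 31 - shift;
  v += int64_t{1} << (total_shift - 1);
  return static_cast<int32_t>(v >> total_shift);
}

int8_t PreluOne(int8_t input, int8_t alpha, int32_t input_offset,
                int32_t alpha_offset, int32_t output_offset,
                int32_t mult1, int shift1, int32_t mult2, int shift2) {
  const int32_t input_value = input_offset + input;
  int32_t output_value;
  if (input_value >= 0) {
    output_value = MulByQMult(input_value, mult1, shift1);  // rescale identity
  } else {
    const int32_t alpha_value = alpha_offset + alpha;
    output_value = MulByQMult(input_value * alpha_value, mult2, shift2);
  }
  output_value += output_offset;
  return static_cast<int8_t>(std::min(127, std::max(-128, output_value)));
}

int main() {
  // mult1 = 1 << 30 with shift1 = 1 makes the positive branch an identity.
  printf("%d\n", PreluOne(10, 64, 0, 0, 0, 1 << 30, 1, 1 << 30, 1));
}
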
@@ -76,6 +76,10 @@ inline bool ProcessBroadcastShapes(const RuntimeShape& shape0,
 BroadcastableOpCategory::kFirstInputBroadcastsFast &&
 params->broadcast_category !=
 BroadcastableOpCategory::kSecondInputBroadcastsFast) {
+// This is unreachable because at least one else clause in the above loop
+// must be reached.
+TFLITE_DCHECK(false);
+params->broadcast_category = BroadcastableOpCategory::kNonBroadcast;
 return false;
 }
 

@@ -15,7 +15,11 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_QUANTIZE_H_
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_QUANTIZE_H_
 
+#include <algorithm>
+#include <limits>
+
 #include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
 #include "tensorflow/lite/kernels/internal/cppmath.h"
 #include "tensorflow/lite/kernels/internal/types.h"
 
@@ -29,18 +33,18 @@ inline void AffineQuantize(const tflite::QuantizationParams& op_params,
 const InputT* input_data,
 const RuntimeShape& output_shape,
 OutputT* output_data) {
-const int32 zero_point = op_params.zero_point;
+const int32_t zero_point = op_params.zero_point;
 const double scale = op_params.scale;
 const int flat_size = MatchingFlatSize(input_shape, output_shape);
-static constexpr int32 min_val = std::numeric_limits<OutputT>::min();
-static constexpr int32 max_val = std::numeric_limits<OutputT>::max();
+static constexpr int32_t min_val = std::numeric_limits<OutputT>::min();
+static constexpr int32_t max_val = std::numeric_limits<OutputT>::max();
 
 for (int i = 0; i < flat_size; i++) {
 const InputT val = input_data[i];
-int32 unclamped =
-static_cast<int32>(TfLiteRound(val / static_cast<float>(scale))) +
+int32_t unclamped =
+static_cast<int32_t>(TfLiteRound(val / static_cast<float>(scale))) +
 zero_point;
-int32 clamped = std::min(std::max(unclamped, min_val), max_val);
+int32_t clamped = std::min(std::max(unclamped, min_val), max_val);
 output_data[i] = clamped;
 }
 }

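AffineQuantize above applies the standard affine rule q = clamp(round(v / scale) + zero_point, min, max) over OutputT's range. The same rule in standalone form, with std::lround standing in for TfLiteRound:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <limits>

template <typename OutputT>
OutputT AffineQuantizeOne(float v, float scale, int32_t zero_point) {
  constexpr int32_t min_val = std::numeric_limits<OutputT>::min();
  constexpr int32_t max_val = std::numeric_limits<OutputT>::max();
  const int32_t unclamped =
      static_cast<int32_t>(std::lround(v / scale)) + zero_point;
  return static_cast<OutputT>(std::min(std::max(unclamped, min_val), max_val));
}

int main() {
  // scale 0.5, zero point -128: 1.0f maps to -126; 1000.0f saturates at 127.
  printf("%d %d\n", AffineQuantizeOne<int8_t>(1.0f, 0.5f, -128),
         AffineQuantizeOne<int8_t>(1000.0f, 0.5f, -128));
}
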
@@ -18,6 +18,8 @@ limitations under the License.
 #include "ruy/profiler/instrumentation.h" // from @ruy
 #include "tensorflow/lite/kernels/internal/common.h"
 #include "tensorflow/lite/kernels/internal/cppmath.h"
+#include "tensorflow/lite/kernels/internal/max.h"
+#include "tensorflow/lite/kernels/internal/min.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
 #include "tensorflow/lite/kernels/internal/types.h"
 
@@ -68,6 +70,9 @@ inline bool ResolveAxis(const int num_dims, const int* axis,
 // eg: For num_dims=3, [0, 1, 2] is the same as [-3, -2, -1] */
 int current = axis[idx] < 0 ? (axis[idx] + num_dims) : axis[idx];
 TFLITE_DCHECK(current >= 0 && current < num_dims);
+if (current < 0 || current >= num_dims) {
+return false;
+}
 bool is_dup = false;
 for (int j = 0; j < *out_num_axis; ++j) {
 if (out_axis[j] == current) {
@@ -127,6 +132,11 @@ inline bool ReduceGeneric(const T* input_data, const int* input_dims,
 bool keep_dims, int* temp_index, int* resolved_axis,
 T init_value,
 T reducer(const T current, const T in)) {
+// Return early when input shape has zero dim.
+for (int i = 0; i < input_num_dims; ++i) {
+if (input_dims[i] == 0) return true;
+}
+
 // Reset output data.
 if (!InitTensorDataForReduce(output_dims, output_num_dims, init_value,
 output_data)) {
@@ -184,11 +194,11 @@ inline bool Mean(const T* input_data, const int* input_dims,
 }
 
 // Calculate mean by dividing output_data by num of aggregated element.
-U num_elements_in_axis = 1;
+size_t num_elements_in_axis = 1;
 for (int idx = 0; idx < num_resolved_axis; ++idx) {
 size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]);
 // Overflow prevention.
-if (current > (std::numeric_limits<U>::max() / num_elements_in_axis)) {
+if (current > (std::numeric_limits<size_t>::max() / num_elements_in_axis)) {
 return false;
 }
 num_elements_in_axis *= current;
@@ -249,9 +259,9 @@ inline void Mean(const tflite::MeanParams& op_params,
 
 inline void Mean(const tflite::MeanParams& op_params,
 const RuntimeShape& unextended_input_shape,
-const uint8_t* input_data, int32 input_zero_point,
+const uint8_t* input_data, int32_t input_zero_point,
 float input_scale, const RuntimeShape& unextended_output_shape,
-uint8_t* output_data, int32 output_zero_point,
+uint8_t* output_data, int32_t output_zero_point,
 float output_scale) {
 ruy::profiler::ScopeLabel label("Mean4D/Uint8");
 
@@ -280,9 +290,9 @@ inline void Mean(const tflite::MeanParams& op_params,
 constexpr int32_t kMinValue = std::numeric_limits<uint8_t>::min();
 constexpr int32_t kMaxValue = std::numeric_limits<uint8_t>::max();
 
-int32 bias =
+int32_t bias =
 output_zero_point -
-static_cast<int32>(input_zero_point * input_scale / output_scale);
+static_cast<int32_t>(input_zero_point * input_scale / output_scale);
 double real_scale =
 static_cast<double>(input_scale / (num_elements_in_axis * output_scale));
 
@@ -291,7 +301,7 @@ inline void Mean(const tflite::MeanParams& op_params,
 QuantizeMultiplier(real_scale, &multiplier, &shift);
 for (int out_b = 0; out_b < output_batch; ++out_b) {
 for (int out_d = 0; out_d < output_depth; ++out_d) {
-int32 acc = 0;
+int32_t acc = 0;
 for (int in_h = 0; in_h < input_height; ++in_h) {
 for (int in_w = 0; in_w < input_width; ++in_w) {
 acc += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)];
@@ -310,18 +320,21 @@ inline void Mean(const tflite::MeanParams& op_params,
 // It does so in two stages, first calculates the sum of elements along the axis
 // then divides it by the number of element in axis for quantized values.
 template <typename T, typename U>
-inline bool QuantizedMeanOrSum(const T* input_data, int32 input_zero_point,
+inline bool QuantizedMeanOrSum(const T* input_data, int32_t input_zero_point,
 float input_scale, const int* input_dims,
 const int input_num_dims, T* output_data,
-int32 output_zero_point, float output_scale,
+int32_t output_zero_point, float output_scale,
 const int* output_dims,
 const int output_num_dims, const int* axis,
 const int num_axis_dimensions, bool keep_dims,
 int* temp_index, int* resolved_axis, U* temp_sum,
 bool compute_sum) {
-const bool uint8_case = std::is_same<T, int8_t>::value;
+const bool uint8_case = std::is_same<T, uint8_t>::value;
+const bool int16_case = std::is_same<T, int16_t>::value;
 if (uint8_case) {
 ruy::profiler::ScopeLabel label(compute_sum ? "Sum/Uint8" : "Mean/Uint8");
+} else if (int16_case) {
+ruy::profiler::ScopeLabel label(compute_sum ? "Sum/Int16" : "Mean/Int16");
 } else {
 ruy::profiler::ScopeLabel label(compute_sum ? "Sum/Int8" : "Mean/Int8");
 }
@@ -354,11 +367,11 @@ inline bool QuantizedMeanOrSum(const T* input_data, int32 input_zero_point,
 }
 
 // Calculate mean by dividing output_data by num of aggregated element.
-U num_elements_in_axis = 1;
+size_t num_elements_in_axis = 1;
 for (int idx = 0; idx < num_resolved_axis; ++idx) {
 size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]);
 // Overflow prevention.
-if (current > (std::numeric_limits<U>::max() / num_elements_in_axis)) {
+if (current > (std::numeric_limits<size_t>::max() / num_elements_in_axis)) {
 return false;
 }
 num_elements_in_axis *= current;
@@ -368,8 +381,7 @@ inline bool QuantizedMeanOrSum(const T* input_data, int32 input_zero_point,
 const float scale = input_scale / output_scale;
 if (compute_sum) {
 // TODO(b/116341117): Eliminate float and do this completely in 8bit.
-const float bias =
--input_zero_point * scale * num_elements_in_axis + 0.5f;
+const float bias = -input_zero_point * scale * num_elements_in_axis;
 for (size_t idx = 0; idx < num_outputs; ++idx) {
 const U value =
 static_cast<U>(TfLiteRound(temp_sum[idx] * scale + bias)) +
@@ -377,15 +389,15 @@ inline bool QuantizedMeanOrSum(const T* input_data, int32 input_zero_point,
 output_data[idx] = static_cast<T>(value);
 }
 } else {
-const float bias = -input_zero_point * scale + 0.5f;
+const float bias = -input_zero_point * scale;
 for (size_t idx = 0; idx < num_outputs; ++idx) {
 float float_mean = static_cast<float>(temp_sum[idx]) /
 static_cast<float>(num_elements_in_axis);
-float result =
+float result = TfLiteMin(
|
||||||
std::min(TfLiteRound(float_mean * scale + bias) + output_zero_point,
|
TfLiteRound(float_mean * scale + bias) + output_zero_point,
|
||||||
static_cast<float>(std::numeric_limits<T>::max()));
|
static_cast<float>(std::numeric_limits<T>::max()));
|
||||||
result =
|
result = TfLiteMax(result,
|
||||||
std::max(result, static_cast<float>(std::numeric_limits<T>::min()));
|
static_cast<float>(std::numeric_limits<T>::min()));
|
||||||
output_data[idx] = static_cast<T>(result);
|
output_data[idx] = static_cast<T>(result);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
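
// Editorial sketch (standalone C++, not part of the upstream diff): the two
// QuantizedMeanOrSum hunks above drop a stale "+ 0.5f" from the bias. The
// result already goes through TfLiteRound (effectively std::round), so the
// extra half unit would be counted twice and shift every result upward:
#include <cmath>
#include <cstdio>

int main() {
  const float samples[] = {1.49f, -0.5f, -2.5f};
  for (float x : samples) {
    int old_style = static_cast<int>(std::round(x + 0.5f));  // biased upward
    int new_style = static_cast<int>(std::round(x));         // unbiased
    std::printf("x=%5.2f  round(x+0.5)=%2d  round(x)=%2d\n", x, old_style,
                new_style);  // 1.49 -> 2 vs 1;  -2.50 -> -2 vs -3
  }
}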
@@ -17,28 +17,30 @@ limitations under the License.

 #include <cmath>

+#include "tensorflow/lite/kernels/internal/cppmath.h"
 #include "tensorflow/lite/kernels/internal/types.h"

 namespace tflite {

 namespace reference_ops {

-inline int32 GetNearestNeighbor(const int input_value, const int32 input_size,
-                                const int32 output_size,
-                                const bool align_corners,
-                                const bool half_pixel_centers) {
+inline int32_t GetNearestNeighbor(const int input_value,
+                                  const int32_t input_size,
+                                  const int32_t output_size,
+                                  const bool align_corners,
+                                  const bool half_pixel_centers) {
   const float scale =
       (align_corners && output_size > 1)
           ? (input_size - 1) / static_cast<float>(output_size - 1)
           : input_size / static_cast<float>(output_size);
   const float offset = half_pixel_centers ? 0.5f : 0.0f;
-  int32 output_value = std::min(
+  int32_t output_value = std::min(
       align_corners
-          ? static_cast<int32>(std::round((input_value + offset) * scale))
-          : static_cast<int32>(std::floor((input_value + offset) * scale)),
+          ? static_cast<int32_t>(TfLiteRound((input_value + offset) * scale))
+          : static_cast<int32_t>(std::floor((input_value + offset) * scale)),
       input_size - 1);
   if (half_pixel_centers) {
-    output_value = std::max(static_cast<int32>(0), output_value);
+    output_value = std::max(static_cast<int32_t>(0), output_value);
   }
   return output_value;
 }

@@ -47,7 +49,7 @@ template <typename T>
 inline void ResizeNearestNeighbor(
     const tflite::ResizeNearestNeighborParams& op_params,
     const RuntimeShape& unextended_input_shape, const T* input_data,
-    const RuntimeShape& output_size_shape, const int32* output_size_data,
+    const RuntimeShape& output_size_shape, const int32_t* output_size_data,
     const RuntimeShape& unextended_output_shape, T* output_data) {
   TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
   TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);

@@ -57,16 +59,16 @@ inline void ResizeNearestNeighbor(
   const RuntimeShape output_shape =
       RuntimeShape::ExtendedShape(4, unextended_output_shape);

-  int32 batches = MatchingDim(input_shape, 0, output_shape, 0);
-  int32 input_height = input_shape.Dims(1);
-  int32 input_width = input_shape.Dims(2);
-  int32 depth = MatchingDim(input_shape, 3, output_shape, 3);
+  int32_t batches = MatchingDim(input_shape, 0, output_shape, 0);
+  int32_t input_height = input_shape.Dims(1);
+  int32_t input_width = input_shape.Dims(2);
+  int32_t depth = MatchingDim(input_shape, 3, output_shape, 3);

   // The Tensorflow version of this op allows resize on the width and height
   // axis only.
   TFLITE_DCHECK_EQ(output_size_shape.FlatSize(), 2);
-  int32 output_height = output_size_data[0];
-  int32 output_width = output_size_data[1];
+  int32_t output_height = output_size_data[0];
+  int32_t output_width = output_size_data[1];

   const int col_offset = input_shape.Dims(3);
   const int row_offset = input_shape.Dims(2) * col_offset;

@@ -76,14 +78,14 @@ inline void ResizeNearestNeighbor(
   T* output_ptr = output_data;
   for (int b = 0; b < batches; ++b) {
     for (int y = 0; y < output_height; ++y) {
-      int32 in_y = GetNearestNeighbor(y, input_height, output_height,
-                                      op_params.align_corners,
-                                      op_params.half_pixel_centers);
+      int32_t in_y = GetNearestNeighbor(y, input_height, output_height,
+                                        op_params.align_corners,
+                                        op_params.half_pixel_centers);
       const T* y_input_ptr = input_ptr + in_y * row_offset;
       for (int x = 0; x < output_width; ++x) {
-        int32 in_x = GetNearestNeighbor(x, input_width, output_width,
-                                        op_params.align_corners,
-                                        op_params.half_pixel_centers);
+        int32_t in_x = GetNearestNeighbor(x, input_width, output_width,
+                                          op_params.align_corners,
+                                          op_params.half_pixel_centers);
         const T* x_input_ptr = y_input_ptr + in_x * col_offset;
         memcpy(output_ptr, x_input_ptr, depth * sizeof(T));
         output_ptr += depth;
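
// Editorial sketch (standalone C++, not from the TFLite headers): the
// nearest-neighbor index math above, reduced to one axis, so the alignment
// conventions are easy to try out.
#include <algorithm>
#include <cmath>
#include <cstdio>

int NearestIndex(int out_idx, int in_size, int out_size, bool align_corners,
                 bool half_pixel_centers) {
  const float scale = (align_corners && out_size > 1)
                          ? (in_size - 1) / static_cast<float>(out_size - 1)
                          : in_size / static_cast<float>(out_size);
  const float offset = half_pixel_centers ? 0.5f : 0.0f;
  int idx = std::min(
      align_corners ? static_cast<int>(std::round((out_idx + offset) * scale))
                    : static_cast<int>(std::floor((out_idx + offset) * scale)),
      in_size - 1);
  if (half_pixel_centers) idx = std::max(0, idx);  // lower clamp, as above
  return idx;
}

int main() {
  for (int x = 0; x < 8; ++x)  // upscale a 4-wide row to 8: 0 0 1 1 2 2 3 3
    std::printf("%d ", NearestIndex(x, 4, 8, false, false));
  std::printf("\n");
}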
@@ -16,7 +16,6 @@ limitations under the License.
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SOFTMAX_H_

 #include <limits>
-#include <vector>

 #include "fixedpoint/fixedpoint.h"
 #include "tensorflow/lite/kernels/internal/common.h"

@@ -49,26 +48,27 @@ inline void Softmax(const SoftmaxParams& params,
     // Compute sum.
     float sum = 0.f;
     for (int c = 0; c < depth; ++c) {
-      sum += std::exp((input_data[i * depth + c] - max) *
-                      static_cast<float>(params.beta));
+      const float exp_c = std::exp((input_data[i * depth + c] - max) *
+                                   static_cast<float>(params.beta));
+      output_data[i * depth + c] = exp_c;
+      sum += exp_c;
     }

     // Compute result.
     for (int c = 0; c < depth; ++c) {
-      output_data[i * depth + c] = std::exp((input_data[i * depth + c] - max) *
-                                            static_cast<float>(params.beta)) /
-                                   sum;
+      output_data[i * depth + c] = output_data[i * depth + c] / sum;
     }
   }
 }

-// Quantized softmax with int8/uint8 input and int8/uint8/int16 output.
+// Quantized softmax with int8_t/uint8_t input and int8_t/uint8_t/int16_t
+// output.
 template <typename InputT, typename OutputT>
 inline void Softmax(const SoftmaxParams& params,
                     const RuntimeShape& input_shape, const InputT* input_data,
                     const RuntimeShape& output_shape, OutputT* output_data) {
-  const int32 input_beta_multiplier = params.input_multiplier;
-  const int32 input_beta_left_shift = params.input_left_shift;
+  const int32_t input_beta_multiplier = params.input_multiplier;
+  const int32_t input_beta_left_shift = params.input_left_shift;
   const int diff_min = params.diff_min;
   // The representation chosen for the input to the exp() function is Q5.26.
   // We need to leave extra space since values that we skip might be as large as
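
// Editorial sketch (standalone C++): the float Softmax hunk above switches to
// a single exp() pass by caching the exponentials in the output buffer. The
// same structure in isolation, for one row:
#include <algorithm>
#include <cmath>

void SoftmaxRow(const float* input, float* output, int depth, float beta) {
  const float max = *std::max_element(input, input + depth);  // stability shift
  float sum = 0.f;
  for (int c = 0; c < depth; ++c) {
    const float exp_c = std::exp((input[c] - max) * beta);
    output[c] = exp_c;  // cache: exp() runs once per element, not twice
    sum += exp_c;
  }
  for (int c = 0; c < depth; ++c) output[c] /= sum;  // normalize in place
}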
@@ -78,9 +78,10 @@ inline void Softmax(const SoftmaxParams& params,
   static const int kScaledDiffIntegerBits = 5;
   static const int kAccumulationIntegerBits = 12;
   using FixedPointScaledDiff =
-      gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>;
-  using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
-  using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
+      gemmlowp::FixedPoint<int32_t, kScaledDiffIntegerBits>;
+  using FixedPointAccum =
+      gemmlowp::FixedPoint<int32_t, kAccumulationIntegerBits>;
+  using FixedPoint0 = gemmlowp::FixedPoint<int32_t, 0>;

   const int trailing_dim = input_shape.DimensionsCount() - 1;
   const int outer_size =

@@ -96,10 +97,10 @@ inline void Softmax(const SoftmaxParams& params,

     FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
     for (int c = 0; c < depth; ++c) {
-      int32 input_diff =
-          static_cast<int32>(input_data[i * depth + c]) - max_in_row;
+      int32_t input_diff =
+          static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
       if (input_diff >= diff_min) {
-        const int32 input_diff_rescaled =
+        const int32_t input_diff_rescaled =
             MultiplyByQuantizedMultiplierGreaterThanOne(
                 input_diff, input_beta_multiplier, input_beta_left_shift);
         const FixedPointScaledDiff scaled_diff_f8 =

@@ -114,28 +115,28 @@ inline void Softmax(const SoftmaxParams& params,
         sum_of_exps.raw(), kAccumulationIntegerBits, &num_bits_over_unit));

     for (int c = 0; c < depth; ++c) {
-      int32 input_diff =
-          static_cast<int32>(input_data[i * depth + c]) - max_in_row;
+      int32_t input_diff =
+          static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
       if (input_diff >= diff_min) {
-        const int32 input_diff_rescaled =
+        const int32_t input_diff_rescaled =
             MultiplyByQuantizedMultiplierGreaterThanOne(
                 input_diff, input_beta_multiplier, input_beta_left_shift);
         const FixedPointScaledDiff scaled_diff_f8 =
             FixedPointScaledDiff::FromRaw(input_diff_rescaled);

         FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
-        int32 unsat_output = gemmlowp::RoundingDivideByPOT(
+        int32_t unsat_output = gemmlowp::RoundingDivideByPOT(
             (shifted_scale * exp_in_0).raw(),
             num_bits_over_unit + 31 - (sizeof(OutputT) * 8));

-        const int32 shifted_output =
+        const int32_t shifted_output =
             unsat_output +
-            static_cast<int32>(std::numeric_limits<OutputT>::min());
+            static_cast<int32_t>(std::numeric_limits<OutputT>::min());

         output_data[i * depth + c] = static_cast<OutputT>(std::max(
             std::min(shifted_output,
-                     static_cast<int32>(std::numeric_limits<OutputT>::max())),
-            static_cast<int32>(std::numeric_limits<OutputT>::min())));
+                     static_cast<int32_t>(std::numeric_limits<OutputT>::max())),
+            static_cast<int32_t>(std::numeric_limits<OutputT>::min())));
       } else {
         output_data[i * depth + c] = std::numeric_limits<OutputT>::min();
       }
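
// Editorial sketch (standalone C++, no gemmlowp): the Q5.26 representation
// named in the comments above is simply 5 integer bits and 26 fractional bits
// packed into an int32_t; the fixed-point aliases in the hunk are typed
// wrappers around this raw encoding.
#include <cstdint>
#include <cstdio>

int32_t ToQ5_26(double real) { return static_cast<int32_t>(real * (1 << 26)); }
double FromQ5_26(int32_t raw) { return raw / static_cast<double>(1 << 26); }

int main() {
  int32_t x = ToQ5_26(-3.75);  // representable range is roughly [-32, 32)
  std::printf("raw=%d real=%f\n", x, FromQ5_26(x));  // round-trips to -3.750000
}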
@@ -143,7 +144,24 @@ inline void Softmax(const SoftmaxParams& params,
   }
 }

-// Quantized softmax with int16 input and int16 output.
+// Computes exp(input - max_input)
+inline int16_t SoftMaxCalculateExp(const SoftmaxParams& params,
+                                   const int16_t* input_data, const int depth,
+                                   int16_t max_in_row, int i, int c) {
+  int32_t input_diff = input_data[i * depth + c] - max_in_row;
+  // scale the input_diff such that [-65535, 0] correspond to [-10.0, 0.0]
+  // exp lut generated with range [-10, 0], as exp(-10) is negligible.
+  int32_t scaled_diff = MultiplyByQuantizedMultiplier(
+      input_diff, params.input_multiplier, params.input_left_shift);
+  // recenter to [-32768, 32767]
+  int32_t sym_scaled_diff = scaled_diff + 32767;
+  int16_t sat_sym_scaled_diff =
+      std::min(std::max(sym_scaled_diff, static_cast<int32_t>(-32768)),
+               static_cast<int32_t>(32767));
+  // apply the exp() LUT activation function
+  return generic_int16_table_lookup(sat_sym_scaled_diff, params.exp_lut);
+}
+// Quantized softmax with int16_t input and int16_t output.
 inline void SoftmaxInt16(const SoftmaxParams& params,
                          const RuntimeShape& input_shape,
                          const int16_t* input_data,

@@ -162,28 +180,16 @@ inline void SoftmaxInt16(const SoftmaxParams& params,
       max_in_row = std::max(max_in_row, input_data[i * depth + c]);
     }

-    // Compute exp(input - max_input)
-    std::vector<int16_t> exp_result_Q015(depth);
+    // This loops computes the exp values and their sum. We will need the exp
+    // values later on in the function so we cache them in the output_data
+    // buffer. This is an optimization done to avoid calculating the exp values
+    // twice making use of the output_data buffer as scratch memory.
+    int32_t sum_of_exps = 0;  // Q16.15 fixed point format.
+    int16_t* exp_results_Q015 = output_data + i * depth;
     for (int c = 0; c < depth; ++c) {
-      int32_t input_diff = input_data[i * depth + c] - max_in_row;
-      // scale the input_diff such that [-65535, 0] correspond to [-10.0, 0.0]
-      int32_t scaled_diff = MultiplyByQuantizedMultiplier(
-          input_diff, params.input_multiplier, params.input_left_shift);
-      // recenter to [-32768, 32767]
-      int32_t sym_scaled_diff = scaled_diff + 32767;
-      int16_t sat_sym_scaled_diff =
-          std::min(std::max(sym_scaled_diff, static_cast<int32_t>(-32768)),
-                   static_cast<int32_t>(32767));
-      // apply the exp() LUT activation function
-      exp_result_Q015[c] =
-          generic_int16_table_lookup(sat_sym_scaled_diff, params.exp_lut);
-    }
-
-    // sum_of_exps is a Q16.15 fixed point format.
-    int32_t sum_of_exps = 0;
-    for (int c = 0; c < depth; ++c) {
-      // Q16.15 + Q0.15
-      sum_of_exps += exp_result_Q015[c];
+      exp_results_Q015[c] =
+          SoftMaxCalculateExp(params, input_data, depth, max_in_row, i, c);
+      sum_of_exps += exp_results_Q015[c];
     }

     // Compute the reciprocal 1/sum_of_exps

@@ -209,7 +215,7 @@ inline void SoftmaxInt16(const SoftmaxParams& params,
     for (int c = 0; c < depth; ++c) {
       uint8_t right_shift = 31 - headroom_plus_one;
       int64_t round = 1 << (right_shift - 1);
-      int32_t result = (static_cast<int64_t>(exp_result_Q015[c]) *
+      int32_t result = (static_cast<int64_t>(exp_results_Q015[c]) *
                         static_cast<int64_t>(reciprocal_scale_Q015) +
                         round) >>
                        right_shift;
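
// Editorial sketch (standalone C++): the Q0.15/Q16.15 bookkeeping used by
// SoftMaxCalculateExp and SoftmaxInt16 above, with std::exp standing in for
// the kernel's generic_int16_table_lookup LUT. exp(diff) with diff <= 0 lies
// in (0, 1], so it fits Q0.15 (raw = real * 2^15); summing those raw values
// into an int32_t gives the Q16.15 accumulator.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

int16_t ExpQ015(float diff) {  // diff already scaled into [-10, 0]
  const int32_t raw = static_cast<int32_t>(std::exp(diff) * 32768.0f + 0.5f);
  return static_cast<int16_t>(std::min(raw, 32767));  // saturate at Q0.15 max
}

int main() {
  int32_t sum_q1615 = 0;  // Q16.15, like sum_of_exps in SoftmaxInt16
  const float diffs[] = {0.f, -1.f, -2.f};
  for (float d : diffs) sum_q1615 += ExpQ015(d);
  std::printf("sum ~= %f\n", sum_q1615 / 32768.0);  // ~ 1 + e^-1 + e^-2 ~ 1.503
}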
@@ -16,8 +16,10 @@ limitations under the License.
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_STRIDED_SLICE_H_

 #include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
 #include "tensorflow/lite/kernels/internal/strided_slice_logic.h"
 #include "tensorflow/lite/kernels/internal/types.h"

 namespace tflite {

 namespace reference_ops {
@@ -15,9 +15,15 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SUB_H_
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SUB_H_

-#include "fixedpoint/fixedpoint.h"
+#include <stdint.h>
+
+#include <algorithm>
+#include <limits>

 #include "ruy/profiler/instrumentation.h"  // from @ruy
 #include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/types.h"

 namespace tflite {
@@ -41,11 +47,11 @@ inline void SubNonBroadcast(const ArithmeticParams& params,

 inline void SubNonBroadcast(const ArithmeticParams& params,
                             const RuntimeShape& input1_shape,
-                            const int32* input1_data,
+                            const int32_t* input1_data,
                             const RuntimeShape& input2_shape,
-                            const int32* input2_data,
+                            const int32_t* input2_data,
                             const RuntimeShape& output_shape,
-                            int32* output_data) {
+                            int32_t* output_data) {
   const int flat_size =
       MatchingElementsSize(input1_shape, input2_shape, output_shape);
   for (int i = 0; i < flat_size; ++i) {

@@ -106,12 +112,12 @@ inline void BroadcastSubSlow(const ArithmeticParams& params,
 template <int N = 5>
 inline void BroadcastSubSlow(const ArithmeticParams& params,
                              const RuntimeShape& input1_shape,
-                             const uint8* input1_data,
+                             const uint8_t* input1_data,
                              const RuntimeShape& input2_shape,
-                             const uint8* input2_data,
+                             const uint8_t* input2_data,
                              const RuntimeShape& output_shape,
-                             uint8* output_data) {
-  ruy::profiler::ScopeLabel label("BroadcastSubSlow/uint8");
+                             uint8_t* output_data) {
+  ruy::profiler::ScopeLabel label("BroadcastSubSlow/uint8_t");
   TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
   TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
   TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);

@@ -134,28 +140,28 @@ inline void BroadcastSubSlow(const ArithmeticParams& params,
   // nesting loops such that the innermost loop has the smallest stride for the
   // best cache behavior.
   auto sub_func = [&](int indexes[N]) {
-    const int32 input1_val =
+    const int32_t input1_val =
         params.input1_offset + input1_data[SubscriptToIndex(desc1, indexes)];
-    const int32 input2_val =
+    const int32_t input2_val =
         params.input2_offset + input2_data[SubscriptToIndex(desc2, indexes)];
-    const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
-    const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
-    const int32 scaled_input1_val =
+    const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
+    const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
+    const int32_t scaled_input1_val =
         MultiplyByQuantizedMultiplierSmallerThanOneExp(
             shifted_input1_val, params.input1_multiplier, params.input1_shift);
-    const int32 scaled_input2_val =
+    const int32_t scaled_input2_val =
         MultiplyByQuantizedMultiplierSmallerThanOneExp(
             shifted_input2_val, params.input2_multiplier, params.input2_shift);
-    const int32 raw_sub = scaled_input1_val - scaled_input2_val;
-    const int32 raw_output =
+    const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
+    const int32_t raw_output =
         MultiplyByQuantizedMultiplierSmallerThanOneExp(
             raw_sub, params.output_multiplier, params.output_shift) +
         params.output_offset;
-    const int32 clamped_output =
+    const int32_t clamped_output =
         std::min(params.quantized_activation_max,
                  std::max(params.quantized_activation_min, raw_output));
     output_data[SubscriptToIndex(output_desc, indexes)] =
-        static_cast<uint8>(clamped_output);
+        static_cast<uint8_t>(clamped_output);
   };
   NDOpsHelper<N>(output_desc, sub_func);
 }
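
// Editorial sketch (standalone C++): a float reference for what the integer
// pipeline inside sub_func computes. The scales and zero points below are
// made-up example values, not ones a converter would emit; the real kernel
// reaches the same result with offsets, a shared left shift and fixed-point
// multipliers so that no float arithmetic runs on the target.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

uint8_t QuantizedSubReference(uint8_t a, uint8_t b, float scale1, int zero1,
                              float scale2, int zero2, float out_scale,
                              int out_zero) {
  const float real = (a - zero1) * scale1 - (b - zero2) * scale2;  // dequantize
  const int32_t q =
      static_cast<int32_t>(std::round(real / out_scale)) + out_zero;  // requantize
  return static_cast<uint8_t>(std::min(255, std::max(0, q)));         // clamp
}

int main() {
  // 0.5 - 0.25 = 0.25 under a 1/128 scale centered on zero point 128:
  std::printf("%u\n", QuantizedSubReference(192, 160, 1 / 128.f, 128,
                                            1 / 128.f, 128, 1 / 128.f, 128));
  // prints 160, i.e. (160 - 128) / 128 = 0.25
}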
@@ -163,12 +169,12 @@ inline void BroadcastSubSlow(const ArithmeticParams& params,
 template <int N = 5>
 inline void BroadcastSubSlow(const ArithmeticParams& params,
                              const RuntimeShape& input1_shape,
-                             const int32* input1_data,
+                             const int32_t* input1_data,
                              const RuntimeShape& input2_shape,
-                             const int32* input2_data,
+                             const int32_t* input2_data,
                              const RuntimeShape& output_shape,
-                             int32* output_data) {
-  ruy::profiler::ScopeLabel label("BroadcastSubSlow/int32");
+                             int32_t* output_data) {
+  ruy::profiler::ScopeLabel label("BroadcastSubSlow/int32_t");
   TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
   TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
   TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);

@@ -208,7 +214,7 @@ inline void BroadcastSubSlow(const ArithmeticParams& params,
                              const int8_t* input2_data,
                              const RuntimeShape& output_shape,
                              int8_t* output_data) {
-  ruy::profiler::ScopeLabel label("BroadcastSubSlow/int8");
+  ruy::profiler::ScopeLabel label("BroadcastSubSlow/int8_t");
   NdArrayDesc<N> desc1;
   NdArrayDesc<N> desc2;
   NdArrayDesc<N> output_desc;
@@ -254,6 +260,45 @@ inline void BroadcastSubSlow(const ArithmeticParams& params,
   NDOpsHelper<N>(output_desc, sub_func);
 }

+template <int N = 5>
+void BroadcastSubSlow(const ArithmeticParams& params,
+                      const RuntimeShape& input1_shape,
+                      const int64_t* input1_data,
+                      const RuntimeShape& input2_shape,
+                      const int64_t* input2_data,
+                      const RuntimeShape& output_shape, int64_t* output_data) {
+  ruy::profiler::ScopeLabel label("BroadcastSubSlow/int64_t");
+  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
+  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
+  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
+  NdArrayDesc<N> desc1;
+  NdArrayDesc<N> desc2;
+  NdArrayDesc<N> output_desc;
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+                                      &desc2);
+  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  auto sub_func = [&](int indexes[N]) {
+    output_data[SubscriptToIndex(output_desc, indexes)] =
+        ActivationFunctionWithMinMax(
+            input1_data[SubscriptToIndex(desc1, indexes)] -
+                input2_data[SubscriptToIndex(desc2, indexes)],
+            params.int64_activation_min, params.int64_activation_max);
+  };
+  NDOpsHelper<N>(output_desc, sub_func);
+}
+
 template <typename T, int N = 5>
 void BroadcastSubSlow(const ArithmeticParams& params,
                       const RuntimeShape& input1_shape, const T* input1_data,
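
// Editorial sketch (standalone C++): the broadcasting idea behind
// NdArrayDescsForElementwiseBroadcast, reduced to one dimension. A dimension
// of extent 1 is walked with stride 0, so every output index re-reads the
// same element; the strides below are hypothetical stand-ins for the
// NdArrayDesc bookkeeping.
#include <cstdio>

int main() {
  const long long a[4] = {10, 20, 30, 40};  // shape [4]
  const long long b[1] = {7};               // shape [1], broadcast over [4]
  const int stride_a = 1, stride_b = 0;     // extent-1 dimension -> stride 0
  for (int i = 0; i < 4; ++i)
    std::printf("%lld ", a[i * stride_a] - b[i * stride_b]);  // 3 13 23 33
  std::printf("\n");
}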
@@ -294,33 +339,33 @@ void BroadcastSubSlow(const ArithmeticParams& params,
 // Element-wise Sub that can often be used for inner loop of broadcast sub as
 // well as the non-broadcast sub.
 inline void SubElementwise(int size, const ArithmeticParams& params,
-                           const uint8* input1_data, const uint8* input2_data,
-                           uint8* output_data) {
+                           const uint8_t* input1_data,
+                           const uint8_t* input2_data, uint8_t* output_data) {
   TFLITE_DCHECK_GT(params.input1_offset, -256);
   TFLITE_DCHECK_GT(params.input2_offset, -256);
   TFLITE_DCHECK_LT(params.input1_offset, 256);
   TFLITE_DCHECK_LT(params.input2_offset, 256);

   for (int i = 0; i < size; ++i) {
-    const int32 input1_val = params.input1_offset + input1_data[i];
-    const int32 input2_val = params.input2_offset + input2_data[i];
-    const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
-    const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
-    const int32 scaled_input1_val =
+    const int32_t input1_val = params.input1_offset + input1_data[i];
+    const int32_t input2_val = params.input2_offset + input2_data[i];
+    const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
+    const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
+    const int32_t scaled_input1_val =
         MultiplyByQuantizedMultiplierSmallerThanOneExp(
             shifted_input1_val, params.input1_multiplier, params.input1_shift);
-    const int32 scaled_input2_val =
+    const int32_t scaled_input2_val =
         MultiplyByQuantizedMultiplierSmallerThanOneExp(
             shifted_input2_val, params.input2_multiplier, params.input2_shift);
-    const int32 raw_sub = scaled_input1_val - scaled_input2_val;
-    const int32 raw_output =
+    const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
+    const int32_t raw_output =
         MultiplyByQuantizedMultiplierSmallerThanOneExp(
             raw_sub, params.output_multiplier, params.output_shift) +
         params.output_offset;
-    const int32 clamped_output =
+    const int32_t clamped_output =
         std::min(params.quantized_activation_max,
                  std::max(params.quantized_activation_min, raw_output));
-    output_data[i] = static_cast<uint8>(clamped_output);
+    output_data[i] = static_cast<uint8_t>(clamped_output);
   }
 }

@@ -336,22 +381,22 @@ inline void SubElementwise(int size, const ArithmeticParams& params,
   TFLITE_DCHECK_LE(params.input2_offset, int8_max_value);

   for (int i = 0; i < size; ++i) {
-    const int32 input1_val = params.input1_offset + input1_data[i];
-    const int32 input2_val = params.input2_offset + input2_data[i];
-    const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
-    const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
-    const int32 scaled_input1_val =
+    const int32_t input1_val = params.input1_offset + input1_data[i];
+    const int32_t input2_val = params.input2_offset + input2_data[i];
+    const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
+    const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
+    const int32_t scaled_input1_val =
         MultiplyByQuantizedMultiplierSmallerThanOneExp(
             shifted_input1_val, params.input1_multiplier, params.input1_shift);
-    const int32 scaled_input2_val =
+    const int32_t scaled_input2_val =
         MultiplyByQuantizedMultiplierSmallerThanOneExp(
             shifted_input2_val, params.input2_multiplier, params.input2_shift);
-    const int32 raw_sub = scaled_input1_val - scaled_input2_val;
-    const int32 raw_output =
+    const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
+    const int32_t raw_output =
         MultiplyByQuantizedMultiplierSmallerThanOneExp(
             raw_sub, params.output_multiplier, params.output_shift) +
         params.output_offset;
-    const int32 clamped_output =
+    const int32_t clamped_output =
         std::min(params.quantized_activation_max,
                  std::max(params.quantized_activation_min, raw_output));
     output_data[i] = static_cast<int8_t>(clamped_output);

@@ -359,9 +404,9 @@ inline void SubElementwise(int size, const ArithmeticParams& params,
   }
 }

 inline void Sub(const ArithmeticParams& params,
-                const RuntimeShape& input1_shape, const uint8* input1_data,
-                const RuntimeShape& input2_shape, const uint8* input2_data,
-                const RuntimeShape& output_shape, uint8* output_data) {
+                const RuntimeShape& input1_shape, const uint8_t* input1_data,
+                const RuntimeShape& input2_shape, const uint8_t* input2_data,
+                const RuntimeShape& output_shape, uint8_t* output_data) {
   TFLITE_DCHECK_LE(params.quantized_activation_min,
                    params.quantized_activation_max);
   const int flat_size =

@@ -428,40 +473,43 @@ void Sub(const ArithmeticParams& params, const RuntimeShape& input1_shape,
   }
 }

-inline void SubWithActivation(const ArithmeticParams& params,
-                              const RuntimeShape& input1_shape,
-                              const int32* input1_data,
-                              const RuntimeShape& input2_shape,
-                              const int32* input2_data,
-                              const RuntimeShape& output_shape,
-                              int32* output_data) {
+inline void SetActivationMinMax(const ArithmeticParams& params,
+                                int32_t* activation_min,
+                                int32_t* activation_max) {
+  *activation_min = params.quantized_activation_min;
+  *activation_max = params.quantized_activation_max;
+}
+
+inline void SetActivationMinMax(const ArithmeticParams& params,
+                                float* activation_min, float* activation_max) {
+  *activation_min = params.float_activation_min;
+  *activation_max = params.float_activation_max;
+}
+
+inline void SetActivationMinMax(const ArithmeticParams& params,
+                                int64_t* activation_min,
+                                int64_t* activation_max) {
+  *activation_min = params.int64_activation_min;
+  *activation_max = params.int64_activation_max;
+}
+
+template <typename T>
+inline void SubWithActivation(
+    const ArithmeticParams& params, const RuntimeShape& input1_shape,
+    const T* input1_data, const RuntimeShape& input2_shape,
+    const T* input2_data, const RuntimeShape& output_shape, T* output_data) {
   ruy::profiler::ScopeLabel label("SubWithActivation");
   const int flat_size =
       MatchingElementsSize(input1_shape, input2_shape, output_shape);
+  T activation_min, activation_max;
+  SetActivationMinMax(params, &activation_min, &activation_max);
+
   for (int i = 0; i < flat_size; ++i) {
     output_data[i] = ActivationFunctionWithMinMax(
-        input1_data[i] - input2_data[i], params.quantized_activation_min,
-        params.quantized_activation_max);
+        input1_data[i] - input2_data[i], activation_min, activation_max);
   }
 }

-inline void SubWithActivation(const ArithmeticParams& params,
-                              const RuntimeShape& input1_shape,
-                              const float* input1_data,
-                              const RuntimeShape& input2_shape,
-                              const float* input2_data,
-                              const RuntimeShape& output_shape,
-                              float* output_data) {
-  const int flat_size =
-      MatchingElementsSize(input1_shape, input2_shape, output_shape);
-  for (int i = 0; i < flat_size; ++i) {
-    output_data[i] = ActivationFunctionWithMinMax(
-        input1_data[i] - input2_data[i], params.float_activation_min,
-        params.float_activation_max);
-  }
-}
-
 }  // namespace reference_ops
 }  // namespace tflite
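
// Editorial sketch (standalone C++): the refactoring above folds three copies
// of SubWithActivation into one template that fetches its clamp bounds through
// the SetActivationMinMax overload set. The same shape in miniature, with a
// stand-in Params struct instead of ArithmeticParams:
#include <cstdio>

struct Params {
  int imin = -128, imax = 127;
  float fmin = -1.f, fmax = 1.f;
};
void GetMinMax(const Params& p, int* lo, int* hi) { *lo = p.imin; *hi = p.imax; }
void GetMinMax(const Params& p, float* lo, float* hi) { *lo = p.fmin; *hi = p.fmax; }

template <typename T>
T ClampedSub(const Params& p, T a, T b) {
  T lo, hi;
  GetMinMax(p, &lo, &hi);  // the overload is picked from T at compile time
  T v = a - b;
  return v < lo ? lo : (v > hi ? hi : v);
}

int main() {
  Params p;
  std::printf("%d %.1f\n", ClampedSub(p, 300, 10), ClampedSub(p, 0.5f, 2.0f));
  // prints "127 -1.0"; supporting a new type now means one overload, not
  // another hand-written copy of the loop.
}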
@@ -0,0 +1,129 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TANH_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TANH_H_
+
+#include <cmath>
+
+#include "fixedpoint/fixedpoint.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/cppmath.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace reference_ops {
+
+inline void Tanh(const RuntimeShape& input_shape, const float* input_data,
+                 const RuntimeShape& output_shape, float* output_data) {
+  const int flat_size = MatchingFlatSize(input_shape, output_shape);
+
+  for (int i = 0; i < flat_size; i++) {
+    float val = input_data[i];
+    float result = std::tanh(val);
+    output_data[i] = result;
+  }
+}
+
+// Convenience version that allows, for example, generated-code calls to be
+// uniform between data types.
+inline void Tanh(const TanhParams&, const RuntimeShape& input_shape,
+                 const float* input_data, const RuntimeShape& output_shape,
+                 float* output_data) {
+  // Drop params: not needed.
+  Tanh(input_shape, input_data, output_shape, output_data);
+}
+
+inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape,
+                 const int16_t* input_data, const RuntimeShape& output_shape,
+                 int16_t* output_data) {
+  const int input_left_shift = params.input_left_shift;
+  // Support for shifts is limited until we have a parameterized version of
+  // SaturatingRoundingMultiplyByPOT().
+  TFLITE_DCHECK_GE(input_left_shift, 0);
+  TFLITE_DCHECK_LE(input_left_shift, 1);
+
+  const int flat_size = MatchingFlatSize(input_shape, output_shape);
+
+  // F0 uses 0 integer bits, range [-1, 1].
+  // This is the return type of math functions such as tanh, logistic,
+  // whose range is in [-1, 1].
+  using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+  // F3 uses 3 integer bits, range [-8, 8], the input range expected here.
+  using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
+
+  if (input_left_shift == 0) {
+    for (int i = 0; i < flat_size; i++) {
+      F3 input = F3::FromRaw(input_data[i]);
+      F0 output = gemmlowp::tanh(input);
+      output_data[i] = output.raw();
+    }
+  } else {
+    for (int i = 0; i < flat_size; i++) {
+      F3 input = F3::FromRaw(
+          gemmlowp::SaturatingRoundingMultiplyByPOT<1>(input_data[i]));
+      F0 output = gemmlowp::tanh(input);
+      output_data[i] = output.raw();
+    }
+  }
+}
+
+inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape,
+                 const uint8_t* input_data, const RuntimeShape& output_shape,
+                 uint8_t* output_data) {
+  const int32_t input_zero_point = params.input_zero_point;
+  const int32_t input_range_radius = params.input_range_radius;
+  const int32_t input_multiplier = params.input_multiplier;
+  const int input_left_shift = params.input_left_shift;
+  const int32_t output_zero_point = 128;
+  const int flat_size = MatchingFlatSize(input_shape, output_shape);
+
+  for (int i = 0; i < flat_size; i++) {
+    const uint8_t input_val_u8 = input_data[i];
+    const int32_t input_val_centered =
+        static_cast<int32_t>(input_val_u8) - input_zero_point;
+    uint8_t output_val;
+    if (input_val_centered <= -input_range_radius) {
+      output_val = 0;
+    } else if (input_val_centered >= input_range_radius) {
+      output_val = 255;
+    } else {
+      const int32_t input_val_rescaled =
+          MultiplyByQuantizedMultiplierGreaterThanOne(
+              input_val_centered, input_multiplier, input_left_shift);
+      using FixedPoint4 = gemmlowp::FixedPoint<int32_t, 4>;
+      using FixedPoint0 = gemmlowp::FixedPoint<int32_t, 0>;
+      const FixedPoint4 input_val_f4 = FixedPoint4::FromRaw(input_val_rescaled);
+      const FixedPoint0 output_val_f0 = gemmlowp::tanh(input_val_f4);
+      // Convert from Q0.31 to Q24.7.
+      using gemmlowp::RoundingDivideByPOT;
+      int32_t output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 24);
+      output_val_s32 += output_zero_point;
+      if (output_val_s32 == 256) {
+        output_val_s32 = 255;
+      }
+      // Reinterpret as Q0.7, encoded in uint8_t.
+      TFLITE_DCHECK_GE(output_val_s32, 0);
+      TFLITE_DCHECK_LE(output_val_s32, 255);
+      output_val = static_cast<uint8_t>(output_val_s32);
+    }
+    output_data[i] = output_val;
+  }
+}
+
+}  // namespace reference_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TANH_H_
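
// Editorial sketch (standalone C++): a float reference for the new uint8_t
// Tanh above. tanh() lands in [-1, 1]; with the fixed output zero point of
// 128 that maps onto [0, 256], and the kernel saturates the single overflow
// value 256 down to 255. The input scale below is a made-up example value.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

uint8_t TanhUint8Reference(uint8_t in, int in_zero, float in_scale) {
  const float x = (in - in_zero) * in_scale;  // dequantize
  const int32_t q =
      static_cast<int32_t>(std::round(std::tanh(x) * 128)) + 128;
  return static_cast<uint8_t>(std::min(q, 255));  // 256 -> 255, as in the kernel
}

int main() {
  std::printf("%u %u %u\n",
              TanhUint8Reference(128, 128, 0.0625f),  // tanh(0)  -> 128
              TanhUint8Reference(255, 128, 0.0625f),  // tanh(~8) -> 255
              TanhUint8Reference(0, 128, 0.0625f));   // tanh(-8) -> 0
}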
@@ -18,6 +18,7 @@ limitations under the License.

 #include <limits>
 #include <vector>

+#include "tensorflow/lite/kernels/internal/compatibility.h"
 #include "tensorflow/lite/kernels/internal/types.h"

@@ -69,8 +70,8 @@ inline void StridedSlicePadIndices(tflite::StridedSliceParams* p,
 }

 // Return the index for the first element along that axis. This index will be a
-// positive integer between [0, axis_size - 1] that can be used to index
-// directly into the data.
+// positive integer between [0, axis_size] (or [-1, axis_size -1] if stride < 0)
+// that can be used to index directly into the data.
 inline int StartForAxis(const tflite::StridedSliceParams& params,
                         const RuntimeShape& input_shape, int axis) {
   const auto begin_mask = params.begin_mask;

@@ -102,7 +103,13 @@ inline int StartForAxis(const tflite::StridedSliceParams& params,
   }

   // Clamping
-  start = Clamp(start, 0, axis_size - 1);
+  if (strides[axis] > 0) {
+    // Forward iteration
+    start = Clamp(start, 0, axis_size);
+  } else {
+    // Backward iteration
+    start = Clamp(start, -1, axis_size - 1);
+  }

   return start;
 }
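
// Editorial sketch (standalone C++): why the backward clamp bound above is -1
// rather than 0. With a negative stride the walk runs toward the front of the
// axis, and -1 is the position one step past the first element, so a start
// clamped to -1 can coincide with the stop position and yield an empty slice:
#include <cstdio>

int main() {
  const int data[5] = {1, 2, 3, 4, 5};
  const int stride = -1;
  // Full reversed slice: start at 4, stop (exclusive) at -1.
  for (int i = 4; i != -1; i += stride) std::printf("%d ", data[i]);  // 5 4 3 2 1
  std::printf("\n");
  // Empty reversed slice: start already clamped to -1 == stop, loop never runs.
  for (int i = -1; i != -1; i += stride) std::printf("unreachable");
}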
@@ -24,24 +24,29 @@ limitations under the License.

 namespace tflite {

-enum class FusedActivationFunctionType : uint8 { kNone, kRelu6, kRelu1, kRelu };
-enum class PaddingType : uint8 { kNone, kSame, kValid };
+enum class FusedActivationFunctionType : uint8_t {
+  kNone,
+  kRelu6,
+  kRelu1,
+  kRelu
+};
+enum class PaddingType : uint8_t { kNone, kSame, kValid };

 struct PaddingValues {
-  int16 width;
-  int16 height;
+  int16_t width;
+  int16_t height;
   // offset is used for calculating "remaining" padding, for example, `width`
   // is 1 and `width_offset` is 1, so padding_left is 1 while padding_right is
   // 1 + 1 = 2.
-  int16 width_offset;
+  int16_t width_offset;
   // Same as width_offset except it's over the height dimension.
-  int16 height_offset;
+  int16_t height_offset;
 };

 // This enumeration allows for non-default formats for the weights array
 // of a fully-connected operator, allowing the use of special optimized
 // runtime paths.
-enum class FullyConnectedWeightsFormat : uint8 {
+enum class FullyConnectedWeightsFormat : uint8_t {
   // Default format (flat 2D layout, the inner contiguous dimension
   // is input_depth, the outer non-contiguous dimension is output_depth)
   kDefault,

@@ -88,11 +93,11 @@ enum class FullyConnectedWeightsFormat : uint8 {
 // maximize arithmetic throughput.
 //
 // Finally, the 'Int8' part in the name refers to the fact that this
-// weights format has each weights value encoded as a signed int8 value,
-// even if the data type of the weights buffer is uint8. This is intended
+// weights format has each weights value encoded as a signed int8_t value,
+// even if the data type of the weights buffer is uint8_t. This is intended
 // to save runtime kernels the effort to have to XOR the top bit of these
 // bytes before using them in signed arithmetic, see this file for more
-// explanations on the 'signed int8 trick' in matrix multiplication kernels:
+// explanations on the 'signed int8_t trick' in matrix multiplication kernels:
 //
 // tensorflow/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc
 //

@@ -111,7 +116,7 @@ enum class FullyConnectedWeightsFormat : uint8 {
 // the real 0 value, and scale designates the difference between the real values
 // corresponding to consecutive quantized values differing by 1.
 struct QuantizationParams {
-  int32 zero_point = 0;
+  int32_t zero_point = 0;
   double scale = 0.0;
 };
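
// Editorial note as a sketch (standalone C++): the uint8/int16/int32 spellings
// replaced above were TensorFlow-internal aliases for the <cstdint> types, so
// these renames are behavior-neutral. The explicit underlying type on the
// enums, present before and after, is what pins them to one byte:
#include <cstdint>
#include <cstdio>

enum class PaddingKind : uint8_t { kNone, kSame, kValid };  // mirrors PaddingType

int main() {
  std::printf("%zu\n", sizeof(PaddingKind));  // always 1
}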
@@ -140,20 +145,20 @@ class RuntimeShape {
     if (dimensions_count > kMaxSmallSize) {
 #ifdef TF_LITE_STATIC_MEMORY
       TFLITE_CHECK(false && "No shape resizing supported on this platform");
 #else  // TF_LITE_STATIC_MEMORY
-      dims_pointer_ = new int32[dimensions_count];
+      dims_pointer_ = new int32_t[dimensions_count];
 #endif  // TF_LITE_STATIC_MEMORY
     }
   }

-  RuntimeShape(int shape_size, int32 value) : size_(0) {
+  RuntimeShape(int shape_size, int32_t value) : size_(0) {
     Resize(shape_size);
     for (int i = 0; i < shape_size; ++i) {
       SetDim(i, value);
     }
   }

-  RuntimeShape(int dimensions_count, const int32* dims_data) : size_(0) {
+  RuntimeShape(int dimensions_count, const int32_t* dims_data) : size_(0) {
     ReplaceWith(dimensions_count, dims_data);
   }

@@ -165,33 +170,34 @@ class RuntimeShape {
   // rolls out.
   RuntimeShape(RuntimeShape const& other) : size_(other.DimensionsCount()) {
     if (size_ > kMaxSmallSize) {
-      dims_pointer_ = new int32[size_];
+      dims_pointer_ = new int32_t[size_];
     }
-    std::memcpy(DimsData(), other.DimsData(), sizeof(int32) * size_);
+    std::memcpy(DimsData(), other.DimsData(), sizeof(int32_t) * size_);
   }

   bool operator==(const RuntimeShape& comp) const {
     return this->size_ == comp.size_ &&
-           std::memcmp(DimsData(), comp.DimsData(), size_ * sizeof(int32)) == 0;
+           std::memcmp(DimsData(), comp.DimsData(), size_ * sizeof(int32_t)) ==
+               0;
   }

   ~RuntimeShape() {
     if (size_ > kMaxSmallSize) {
 #ifdef TF_LITE_STATIC_MEMORY
       TFLITE_CHECK(false && "No shape resizing supported on this platform");
 #else  // TF_LITE_STATIC_MEMORY
       delete[] dims_pointer_;
 #endif  // TF_LITE_STATIC_MEMORY
     }
   }

-  inline int32 DimensionsCount() const { return size_; }
-  inline int32 Dims(int i) const {
+  inline int32_t DimensionsCount() const { return size_; }
+  inline int32_t Dims(int i) const {
     TFLITE_DCHECK_GE(i, 0);
     TFLITE_DCHECK_LT(i, size_);
     return size_ > kMaxSmallSize ? dims_pointer_[i] : dims_[i];
   }
-  inline void SetDim(int i, int32 val) {
+  inline void SetDim(int i, int32_t val) {
     TFLITE_DCHECK_GE(i, 0);
     TFLITE_DCHECK_LT(i, size_);
     if (size_ > kMaxSmallSize) {

@@ -201,20 +207,20 @@ class RuntimeShape {
     }
   }

-  inline int32* DimsData() {
+  inline int32_t* DimsData() {
     return size_ > kMaxSmallSize ? dims_pointer_ : dims_;
   }
-  inline const int32* DimsData() const {
+  inline const int32_t* DimsData() const {
     return size_ > kMaxSmallSize ? dims_pointer_ : dims_;
   }
   // The caller must ensure that the shape is no bigger than 5-D.
-  inline const int32* DimsDataUpTo5D() const { return dims_; }
+  inline const int32_t* DimsDataUpTo5D() const { return dims_; }

   inline void Resize(int dimensions_count) {
     if (size_ > kMaxSmallSize) {
 #ifdef TF_LITE_STATIC_MEMORY
       TFLITE_CHECK(false && "No shape resizing supported on this platform");
 #else  // TF_LITE_STATIC_MEMORY
       delete[] dims_pointer_;
 #endif  // TF_LITE_STATIC_MEMORY
     }

@@ -222,16 +228,16 @@ class RuntimeShape {
     if (dimensions_count > kMaxSmallSize) {
 #ifdef TF_LITE_STATIC_MEMORY
       TFLITE_CHECK(false && "No shape resizing supported on this platform");
 #else  // TF_LITE_STATIC_MEMORY
-      dims_pointer_ = new int32[dimensions_count];
+      dims_pointer_ = new int32_t[dimensions_count];
 #endif  // TF_LITE_STATIC_MEMORY
     }
   }

-  inline void ReplaceWith(int dimensions_count, const int32* dims_data) {
+  inline void ReplaceWith(int dimensions_count, const int32_t* dims_data) {
     Resize(dimensions_count);
-    int32* dst_dims = DimsData();
+    int32_t* dst_dims = DimsData();
-    std::memcpy(dst_dims, dims_data, dimensions_count * sizeof(int32));
|
std::memcpy(dst_dims, dims_data, dimensions_count * sizeof(int32_t));
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
@@ -239,7 +245,7 @@ class RuntimeShape {
|
|||||||
const int dimensions_count =
|
const int dimensions_count =
|
||||||
std::distance(src_iterable.begin(), src_iterable.end());
|
std::distance(src_iterable.begin(), src_iterable.end());
|
||||||
Resize(dimensions_count);
|
Resize(dimensions_count);
|
||||||
int32* data = DimsData();
|
int32_t* data = DimsData();
|
||||||
for (auto it : src_iterable) {
|
for (auto it : src_iterable) {
|
||||||
*data = it;
|
*data = it;
|
||||||
++data;
|
++data;
|
||||||
@@ -288,13 +294,13 @@ class RuntimeShape {
|
|||||||
SetDim(i, pad_value);
|
SetDim(i, pad_value);
|
||||||
}
|
}
|
||||||
std::memcpy(DimsData() + size_increase, shape.DimsData(),
|
std::memcpy(DimsData() + size_increase, shape.DimsData(),
|
||||||
sizeof(int32) * shape.DimensionsCount());
|
sizeof(int32_t) * shape.DimensionsCount());
|
||||||
}
|
}
|
||||||
|
|
||||||
int32 size_;
|
int32_t size_;
|
||||||
union {
|
union {
|
||||||
int32 dims_[kMaxSmallSize];
|
int32_t dims_[kMaxSmallSize];
|
||||||
int32* dims_pointer_;
|
int32_t* dims_pointer_;
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
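Reviewer note: the `int32` → `int32_t` sweep above also touches `RuntimeShape`'s small-buffer optimization: up to `kMaxSmallSize` dimensions live in the inline `dims_` array, and only larger shapes heap-allocate through `dims_pointer_`, which is why `TF_LITE_STATIC_MEMORY` builds forbid resizing past the small size. A minimal usage sketch (the shape values are made up; assumes the TFLite headers are on the include path):

```cpp
#include <cstdint>

#include "tensorflow/lite/kernels/internal/types.h"

// Builds a 4-D shape the way kernels typically do. A small dimension count
// stays in the inline dims_ array, so no heap allocation happens and the
// TF_LITE_STATIC_MEMORY check above is never hit.
tflite::RuntimeShape MakeNhwcShape() {
  const int32_t dims[4] = {1, 96, 96, 1};  // example NHWC dims (made up)
  return tflite::RuntimeShape(4, dims);    // copies via ReplaceWith()
}
```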
@@ -432,7 +438,7 @@ int MatchingArraySize(const ArrayType1& array1, int index1,
 inline int MatchingDim(const RuntimeShape& shape1, int index1,
                        const RuntimeShape& shape2, int index2) {
   TFLITE_DCHECK_EQ(shape1.Dims(index1), shape2.Dims(index2));
-  return shape1.Dims(index1);
+  return std::min(shape1.Dims(index1), shape2.Dims(index2));
 }
 
 template <typename... Args>
@@ -713,7 +719,7 @@ void ComputeStrides(Dims<N>* dims) {
   }
 }
 
-enum class BroadcastableOpCategory : uint8 {
+enum class BroadcastableOpCategory : uint8_t {
   kNone,
   kNonBroadcast,               // Matching input shapes.
   kFirstInputBroadcastsFast,   // Fivefold nested loops.
@@ -729,21 +735,21 @@ static_assert(sizeof(MinMax) == 8, "");
 
 struct ActivationParams {
   FusedActivationFunctionType activation_type;
-  // uint8, etc, activation params.
-  int32 quantized_activation_min;
-  int32 quantized_activation_max;
+  // uint8_t, etc, activation params.
+  int32_t quantized_activation_min;
+  int32_t quantized_activation_max;
 };
 
 struct ReluParams : public ActivationParams {
-  int32 input_offset;
-  int32 output_offset;
-  int32 output_multiplier;
-  int32 output_shift;
+  int32_t input_offset;
+  int32_t output_offset;
+  int32_t output_multiplier;
+  int output_shift;
 };
 
 // Styles of resizing op usages. For example, kImageStyle can be used with a Pad
 // op for pattern-specific optimization.
-enum class ResizingCategory : uint8 {
+enum class ResizingCategory : uint8_t {
   kNone,
   kImageStyle,  // 4D, operating on inner dimensions, say {0, a, b, 0}.
   kGenericResize,
@@ -753,24 +759,29 @@ enum class ResizingCategory : uint8 {
 struct ArithmeticParams {
   // Shape dependent / common to data / op types.
   BroadcastableOpCategory broadcast_category;
-  // uint8 inference params.
-  int32 input1_offset;
-  int32 input2_offset;
-  int32 output_offset;
-  int32 output_multiplier;
+  // uint8_t inference params.
+  int32_t input1_offset;
+  int32_t input2_offset;
+  int32_t output_offset;
+  int32_t output_multiplier;
   int output_shift;
-  // Add / Sub, not Mul, uint8 inference params.
+  // Add / Sub, not Mul, uint8_t inference params.
   int left_shift;
-  int32 input1_multiplier;
+  int32_t input1_multiplier;
   int input1_shift;
-  int32 input2_multiplier;
+  int32_t input2_multiplier;
   int input2_shift;
-  // uint8, etc, activation params.
-  int32 quantized_activation_min;
-  int32 quantized_activation_max;
+
+  // TODO(b/158622529): Union the following activation params.
+  // uint8_t, etc, activation params.
+  int32_t quantized_activation_min;
+  int32_t quantized_activation_max;
   // float activation params.
   float float_activation_min;
   float float_activation_max;
+  // int64_t activation params.
+  int64_t int64_activation_min;
+  int64_t int64_activation_max;
 
   // Processed output dimensions.
   // Let input "a" be the one that broadcasts in the faster-changing dimension.
@@ -785,22 +796,22 @@ struct ArithmeticParams {
 };
 
 struct ConcatenationParams {
-  int8 axis;
-  const int32* input_zeropoint;
+  int8_t axis;
+  const int32_t* input_zeropoint;
   const float* input_scale;
-  uint16 inputs_count;
-  int32 output_zeropoint;
+  uint16_t inputs_count;
+  int32_t output_zeropoint;
   float output_scale;
 };
 
 struct ComparisonParams {
-  // uint8 inference params.
+  // uint8_t inference params.
   int left_shift;
-  int32 input1_offset;
-  int32 input1_multiplier;
+  int32_t input1_offset;
+  int32_t input1_multiplier;
   int input1_shift;
-  int32 input2_offset;
-  int32 input2_multiplier;
+  int32_t input2_offset;
+  int32_t input2_multiplier;
   int input2_shift;
   // Shape dependent / common to inference types.
   bool is_broadcast;
@@ -810,81 +821,81 @@ struct ConvParams {
   PaddingType padding_type;
   PaddingValues padding_values;
   // TODO(starka): This was just "stride", so check that width+height is OK.
-  int16 stride_width;
-  int16 stride_height;
-  int16 dilation_width_factor;
-  int16 dilation_height_factor;
-  // uint8 inference params.
+  int16_t stride_width;
+  int16_t stride_height;
+  int16_t dilation_width_factor;
+  int16_t dilation_height_factor;
+  // uint8_t inference params.
   // TODO(b/65838351): Use smaller types if appropriate.
-  int32 input_offset;
-  int32 weights_offset;
-  int32 output_offset;
-  int32 output_multiplier;
+  int32_t input_offset;
+  int32_t weights_offset;
+  int32_t output_offset;
+  int32_t output_multiplier;
   int output_shift;
-  // uint8, etc, activation params.
-  int32 quantized_activation_min;
-  int32 quantized_activation_max;
+  // uint8_t, etc, activation params.
+  int32_t quantized_activation_min;
+  int32_t quantized_activation_max;
   // float activation params.
   float float_activation_min;
   float float_activation_max;
 };
 
 struct DepthToSpaceParams {
-  int32 block_size;
+  int32_t block_size;
 };
 
 struct DepthwiseParams {
   PaddingType padding_type;
   PaddingValues padding_values;
-  int16 stride_width;
-  int16 stride_height;
-  int16 dilation_width_factor;
-  int16 dilation_height_factor;
-  int16 depth_multiplier;
-  // uint8 inference params.
+  int16_t stride_width;
+  int16_t stride_height;
+  int16_t dilation_width_factor;
+  int16_t dilation_height_factor;
+  int16_t depth_multiplier;
+  // uint8_t inference params.
   // TODO(b/65838351): Use smaller types if appropriate.
-  int32 input_offset;
-  int32 weights_offset;
-  int32 output_offset;
-  int32 output_multiplier;
+  int32_t input_offset;
+  int32_t weights_offset;
+  int32_t output_offset;
+  int32_t output_multiplier;
   int output_shift;
-  // uint8, etc, activation params.
-  int32 quantized_activation_min;
-  int32 quantized_activation_max;
+  // uint8_t, etc, activation params.
+  int32_t quantized_activation_min;
+  int32_t quantized_activation_max;
   // float activation params.
   float float_activation_min;
   float float_activation_max;
-  const int32* output_multiplier_per_channel;
-  const int32* output_shift_per_channel;
+  const int32_t* output_multiplier_per_channel;
+  const int32_t* output_shift_per_channel;
 };
 
 struct DequantizationParams {
   double scale;
-  int32 zero_point;
+  int32_t zero_point;
 };
 
 struct PerChannelDequantizationParams {
   const float* scale;
-  const int32* zero_point;
-  int32 quantized_dimension;
+  const int32_t* zero_point;
+  int32_t quantized_dimension;
 };
 
 struct FakeQuantParams {
   MinMax minmax;
-  int32 num_bits;
+  int32_t num_bits;
 };
 
 struct FullyConnectedParams {
-  // uint8 inference params.
+  // uint8_t inference params.
   // TODO(b/65838351): Use smaller types if appropriate.
-  int32 input_offset;
-  int32 weights_offset;
-  int32 output_offset;
-  int32 output_multiplier;
+  int32_t input_offset;
+  int32_t weights_offset;
+  int32_t output_offset;
+  int32_t output_multiplier;
   int output_shift;
-  // uint8, etc, activation params.
-  int32 quantized_activation_min;
-  int32 quantized_activation_max;
+  // uint8_t, etc, activation params.
+  int32_t quantized_activation_min;
+  int32_t quantized_activation_max;
   // float activation params.
   float float_activation_min;
   float float_activation_max;
@@ -895,16 +906,16 @@ struct FullyConnectedParams {
 };
 
 struct GatherParams {
-  int16 axis;
+  int16_t axis;
 };
 
 struct L2NormalizationParams {
-  // uint8 inference params.
-  int32 input_zero_point;
+  // uint8_t inference params.
+  int32_t input_zero_point;
 };
 
 struct LocalResponseNormalizationParams {
-  int32 range;
+  int32_t range;
   double bias;
   double alpha;
   double beta;
@@ -932,48 +943,50 @@ struct HardSwishParams {
 };
 
 struct LogisticParams {
-  // uint8 inference params.
-  int32 input_zero_point;
-  int32 input_range_radius;
-  int32 input_multiplier;
+  // uint8_t inference params.
+  int32_t input_zero_point;
+  int32_t input_range_radius;
+  int32_t input_multiplier;
   int input_left_shift;
 };
 
 struct LstmCellParams {
-  int32 weights_zero_point;
-  int32 accum_multiplier;
+  int32_t weights_zero_point;
+  int32_t accum_multiplier;
   int accum_shift;
   int state_integer_bits;
 };
 
 struct MeanParams {
-  int8 axis_count;
-  int16 axis[4];
+  int8_t axis_count;
+  int16_t axis[4];
 };
 
 struct PackParams {
-  int8 axis;
-  const int32* input_zeropoint;
+  int8_t axis;
+  const int32_t* input_zeropoint;
   const float* input_scale;
-  uint16 inputs_count;
-  int32 output_zeropoint;
+  uint16_t inputs_count;
+  int32_t output_zeropoint;
   float output_scale;
 };
 
 struct PadParams {
-  int8 left_padding_count;
-  int32 left_padding[4];
-  int8 right_padding_count;
-  int32 right_padding[4];
+  int8_t left_padding_count;
+  int32_t left_padding[4];
+  int8_t right_padding_count;
+  int32_t right_padding[4];
   ResizingCategory resizing_category;
 };
 
 struct PreluParams {
-  int32 input_offset;
-  int32 alpha_offset;
-  int32 output_offset;
-  int32 output_multiplier;
-  int output_shift;
+  int32_t input_offset;
+  int32_t alpha_offset;
+  int32_t output_offset;
+  int32_t output_multiplier_1;
+  int output_shift_1;
+  int32_t output_multiplier_2;
+  int output_shift_2;
 };
 
 struct PoolParams {
@@ -984,17 +997,17 @@ struct PoolParams {
   int stride_width;
   int filter_height;
   int filter_width;
-  // uint8, etc, activation params.
-  int32 quantized_activation_min;
-  int32 quantized_activation_max;
+  // uint8_t, etc, activation params.
+  int32_t quantized_activation_min;
+  int32_t quantized_activation_max;
   // float activation params.
   float float_activation_min;
   float float_activation_max;
 };
 
 struct ReshapeParams {
-  int8 shape_count;
-  int32 shape[4];
+  int8_t shape_count;
+  int32_t shape[4];
 };
 
 struct ResizeBilinearParams {
@@ -1011,91 +1024,95 @@ struct ResizeNearestNeighborParams {
 };
 
 struct SliceParams {
-  int8 begin_count;
-  int32 begin[4];
-  int8 size_count;
-  int32 size[4];
+  int8_t begin_count;
+  int32_t begin[4];
+  int8_t size_count;
+  int32_t size[4];
 };
 
 struct SoftmaxParams {
   // beta is not really used (not a Tensorflow parameter) and not implemented
   // for LogSoftmax.
   double beta;
-  // uint8 inference params. Used even when beta defaults to 1.0.
-  int32 input_multiplier;
-  int32 input_left_shift;
+  // uint8_t inference params. Used even when beta defaults to 1.0.
+  int32_t input_multiplier;
+  int32_t input_left_shift;
   // Reverse scaling is only used by LogSoftmax.
-  int32 reverse_scaling_divisor;
-  int32 reverse_scaling_right_shift;
+  int32_t reverse_scaling_divisor;
+  int32_t reverse_scaling_right_shift;
   int diff_min;
   int32_t zero_point;
   float scale;
   float* table;
+  // int16 LUT for exp(x), where x uniform distributed between [-10.0 , 0.0]
   int16_t* exp_lut;
+  // int16 LUT for 1 / (1 + x), where x uniform distributed between [0.0 , 1.0]
   int16_t* one_over_one_plus_x_lut;
+  uint8_t* uint8_table1;
+  uint8_t* uint8_table2;
 };
 
 struct SpaceToBatchParams {
-  // "Zero" padding for uint8 means padding with the output offset.
-  int32 output_offset;
+  // "Zero" padding for uint8_t means padding with the output offset.
+  int32_t output_offset;
 };
 
 struct SpaceToDepthParams {
-  int32 block_size;
+  int32_t block_size;
 };
 
 struct SplitParams {
   // Graphs that split into, say, 2000 nodes are encountered. The indices in
-  // OperatorEdges are of type uint16.
-  uint16 num_split;
-  int16 axis;
+  // OperatorEdges are of type uint16_t.
+  uint16_t num_split;
+  int16_t axis;
 };
 
 struct SqueezeParams {
-  int8 squeeze_dims_count;
-  int32 squeeze_dims[4];
+  int8_t squeeze_dims_count;
+  int32_t squeeze_dims[4];
 };
 
 struct StridedSliceParams {
-  int8 start_indices_count;
-  int32 start_indices[5];
-  int8 stop_indices_count;
-  int32 stop_indices[5];
-  int8 strides_count;
-  int32 strides[5];
+  int8_t start_indices_count;
+  int32_t start_indices[5];
+  int8_t stop_indices_count;
+  int32_t stop_indices[5];
+  int8_t strides_count;
+  int32_t strides[5];
 
-  int16 begin_mask;
-  int16 ellipsis_mask;
-  int16 end_mask;
-  int16 new_axis_mask;
-  int16 shrink_axis_mask;
+  int16_t begin_mask;
+  int16_t ellipsis_mask;
+  int16_t end_mask;
+  int16_t new_axis_mask;
+  int16_t shrink_axis_mask;
 };
 
 struct TanhParams {
-  int32 input_zero_point;
-  int32 input_range_radius;
-  int32 input_multiplier;
+  int32_t input_zero_point;
+  int32_t input_range_radius;
+  int32_t input_multiplier;
   int input_left_shift;
 };
 
 struct TransposeParams {
-  int8 perm_count;
-  int32 perm[5];
+  int8_t perm_count;
+  int32_t perm[5];
 };
 
 struct UnpackParams {
-  uint16 num_split;
-  int16 axis;
+  uint16_t num_split;
+  int16_t axis;
 };
 
 struct LeakyReluParams {
   float alpha;
-  int32 input_offset;
-  int32 output_offset;
-  int32 output_multiplier_alpha;
-  int32 output_shift_alpha;
-  int32 output_multiplier_identity;
-  int32 output_shift_identity;
+  int32_t input_offset;
+  int32_t output_offset;
+  int32_t output_multiplier_alpha;
+  int32_t output_shift_alpha;
+  int32_t output_multiplier_identity;
+  int32_t output_shift_identity;
 };
 
 template <typename P>
@@ -1105,13 +1122,19 @@ inline void SetActivationParams(float min, float max, P* params) {
 }
 
 template <typename P>
-inline void SetActivationParams(int32 min, int32 max, P* params) {
+inline void SetActivationParams(int32_t min, int32_t max, P* params) {
   params->quantized_activation_min = min;
   params->quantized_activation_max = max;
 }
 
 template <typename P>
-inline void GetActivationParams(const P& params, int32* min, int32* max) {
+inline void SetActivationParams(int64_t min, int64_t max, P* params) {
+  params->int64_activation_min = min;
+  params->int64_activation_max = max;
+}
+
+template <typename P>
+inline void GetActivationParams(const P& params, int32_t* min, int32_t* max) {
   *min = params.quantized_activation_min;
   *max = params.quantized_activation_max;
 }
@@ -1122,6 +1145,11 @@ inline void GetActivationParams(const P& params, float* min, float* max) {
   *max = params.float_activation_max;
 }
 
+template <typename P>
+inline void GetActivationParams(const P& params, int64_t* min, int64_t* max) {
+  *min = params.int64_activation_min;
+  *max = params.int64_activation_max;
+}
 }  // namespace tflite
 
 #endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_TYPES_H_
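Reviewer note: the new `int64_t` overloads of `SetActivationParams`/`GetActivationParams` are selected by ordinary overload resolution, so kernels with 64-bit accumulators keep calling the same helper names. A short sketch, assuming `ArithmeticParams` as the params type (the clamp values are made up):

```cpp
#include <cstdint>

#include "tensorflow/lite/kernels/internal/types.h"

void ClampSetupExample() {
  tflite::ArithmeticParams params;
  // The int64_t arguments pick the new overload, which fills the
  // int64_activation_min/max fields added to ArithmeticParams above.
  tflite::SetActivationParams(int64_t{0}, int64_t{255}, &params);
  int64_t min = 0, max = 0;
  tflite::GetActivationParams(params, &min, &max);  // min == 0, max == 255
}
```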
@@ -14,15 +14,176 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/lite/kernels/kernel_util.h"
 
+#include <stdint.h>
+#include <stdlib.h>
+
 #include <algorithm>
-#include <cmath>
+#include <complex>
+#include <limits>
 #include <memory>
 
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/common.h"
 #include "tensorflow/lite/kernels/internal/cppmath.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
 
 namespace tflite {
 
+namespace {
+
+// Assumes tensor_index is a valid index (in bounds)
+inline TfLiteTensor* GetTensorAtIndex(const TfLiteContext* context,
+                                      int tensor_index) {
+  if (context->tensors != nullptr) {
+    return &context->tensors[tensor_index];
+  } else {
+    return context->GetTensor(context, tensor_index);
+  }
+}
+
+// Validate in a single place to reduce binary size
+inline TfLiteStatus ValidateTensorIndexingSafe(const TfLiteContext* context,
+                                               int index, int max_size,
+                                               const int* tensor_indices,
+                                               int* tensor_index) {
+  if (index < 0 || index >= max_size) {
+    TF_LITE_KERNEL_LOG(const_cast<TfLiteContext*>(context),
+                       "Invalid tensor index %d (not in [0, %d))\n", index,
+                       max_size);
+    return kTfLiteError;
+  }
+  if (tensor_indices[index] == kTfLiteOptionalTensor) {
+    TF_LITE_KERNEL_LOG(const_cast<TfLiteContext*>(context),
+                       "Tensor at index %d was optional but was expected\n",
+                       index);
+    return kTfLiteError;
+  }
+
+  *tensor_index = tensor_indices[index];
+  return kTfLiteOk;
+}
+
+// Same as above but returns -1 for invalid inputs instead of status + logging
+// error.
+inline int ValidateTensorIndexing(const TfLiteContext* context, int index,
+                                  int max_size, const int* tensor_indices) {
+  if (index >= 0 && index < max_size) {
+    const int tensor_index = tensor_indices[index];
+    if (tensor_index != kTfLiteOptionalTensor) {
+      return tensor_index;
+    }
+  }
+  return -1;
+}
+
+inline TfLiteTensor* GetMutableInput(const TfLiteContext* context,
+                                     const TfLiteNode* node, int index) {
+  const int tensor_index = ValidateTensorIndexing(
+      context, index, node->inputs->size, node->inputs->data);
+  if (tensor_index < 0) {
+    return nullptr;
+  }
+  return GetTensorAtIndex(context, tensor_index);
+}
+
+inline TfLiteStatus GetMutableInputSafe(const TfLiteContext* context,
+                                        const TfLiteNode* node, int index,
+                                        const TfLiteTensor** tensor) {
+  int tensor_index;
+  TF_LITE_ENSURE_OK(
+      context, ValidateTensorIndexingSafe(context, index, node->inputs->size,
+                                          node->inputs->data, &tensor_index));
+  *tensor = GetTensorAtIndex(context, tensor_index);
+  return kTfLiteOk;
+}
+
+}  // anonymous namespace.
+
+const TfLiteTensor* GetInput(const TfLiteContext* context,
+                             const TfLiteNode* node, int index) {
+  return GetMutableInput(context, node, index);
+}
+
+TfLiteStatus GetInputSafe(const TfLiteContext* context, const TfLiteNode* node,
+                          int index, const TfLiteTensor** tensor) {
+  return GetMutableInputSafe(context, node, index, tensor);
+}
+
+TfLiteTensor* GetVariableInput(TfLiteContext* context, const TfLiteNode* node,
+                               int index) {
+  TfLiteTensor* tensor = GetMutableInput(context, node, index);
+  return tensor->is_variable ? tensor : nullptr;
+}
+
+TfLiteTensor* GetOutput(TfLiteContext* context, const TfLiteNode* node,
+                        int index) {
+  const int tensor_index = ValidateTensorIndexing(
+      context, index, node->outputs->size, node->outputs->data);
+  if (tensor_index < 0) {
+    return nullptr;
+  }
+  return GetTensorAtIndex(context, tensor_index);
+}
+
+TfLiteStatus GetOutputSafe(const TfLiteContext* context, const TfLiteNode* node,
+                           int index, TfLiteTensor** tensor) {
+  int tensor_index;
+  TF_LITE_ENSURE_OK(
+      context, ValidateTensorIndexingSafe(context, index, node->outputs->size,
+                                          node->outputs->data, &tensor_index));
+  *tensor = GetTensorAtIndex(context, tensor_index);
+  return kTfLiteOk;
+}
+
+const TfLiteTensor* GetOptionalInputTensor(const TfLiteContext* context,
+                                           const TfLiteNode* node, int index) {
+  return GetInput(context, node, index);
+}
+
+#ifndef TF_LITE_STATIC_MEMORY
+TfLiteTensor* GetTemporary(TfLiteContext* context, const TfLiteNode* node,
+                           int index) {
+  const int tensor_index = ValidateTensorIndexing(
+      context, index, node->temporaries->size, node->temporaries->data);
+  if (tensor_index < 0) {
+    return nullptr;
+  }
+  return GetTensorAtIndex(context, tensor_index);
+}
+
+TfLiteStatus GetTemporarySafe(const TfLiteContext* context,
+                              const TfLiteNode* node, int index,
+                              TfLiteTensor** tensor) {
+  int tensor_index;
+  TF_LITE_ENSURE_OK(context, ValidateTensorIndexingSafe(
+                                 context, index, node->temporaries->size,
+                                 node->temporaries->data, &tensor_index));
+  *tensor = GetTensorAtIndex(context, tensor_index);
+  return kTfLiteOk;
+}
+
+const TfLiteTensor* GetIntermediates(TfLiteContext* context,
+                                     const TfLiteNode* node, int index) {
+  const int tensor_index = ValidateTensorIndexing(
+      context, index, node->intermediates->size, node->intermediates->data);
+  if (tensor_index < 0) {
+    return nullptr;
+  }
+  return GetTensorAtIndex(context, tensor_index);
+}
+
+TfLiteStatus GetIntermediatesSafe(const TfLiteContext* context,
+                                  const TfLiteNode* node, int index,
+                                  TfLiteTensor** tensor) {
+  int tensor_index;
+  TF_LITE_ENSURE_OK(context, ValidateTensorIndexingSafe(
+                                 context, index, node->intermediates->size,
+                                 node->intermediates->data, &tensor_index));
+  *tensor = GetTensorAtIndex(context, tensor_index);
+  return kTfLiteOk;
+}
+#endif  // TF_LITE_STATIC_MEMORY
+
 // Per-axis
 TfLiteStatus PopulateConvolutionQuantizationParams(
     TfLiteContext* context, const TfLiteTensor* input,
@@ -126,11 +287,27 @@ TfLiteStatus GetQuantizedConvolutionMultipler(TfLiteContext* context,
   // pipeline.
   if (bias) {
     const double bias_scale = static_cast<double>(bias->params.scale);
-    // Here we're making sure the input_product_scale & bias_scale the same.
-    // Normally this should be guaranteed by the training pipeline, we are
-    // setting the threshold to be 2e-6 to allow some numeric stability
-    // difference.
-    TF_LITE_ENSURE(context, std::abs(input_product_scale - bias_scale) <= 2e-6);
+    // Here we're making sure the input_product_scale & bias_scale are about the
+    // same. Since we have:
+    // (output - output_zp) * output_scale =
+    // input_product_scale * input_product + bias * bias_scale ---- (0)
+    //
+    // (0) equals:
+    // (input_product + bias) * input_product_scale ----- (1)
+    //           +
+    // bias * (bias_scale - input_product_scale)   ------ (2)
+    //
+    // For the real kernel computation, we're doing (1), so we really need to
+    // make sure (2) has minimum impact on the output, so:
+    // bias * (bias_scale - input_product_scale) / output_scale should be
+    // a small number for an integer.
+    // Since normally bias should be within a small range.
+    // We should expect (bias_scale - input_product_scale) / output_scale to
+    // be a small number like 0.02.
+    const double scale_diff = std::abs(input_product_scale - bias_scale);
+    const double output_scale = static_cast<double>(output->params.scale);
+
+    TF_LITE_ENSURE(context, scale_diff / output_scale <= 0.02);
   }
   return GetQuantizedConvolutionMultipler(context, input, filter, output,
                                           multiplier);
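Reviewer note: to see why the relative check above is the better test, here is a worked example with made-up scales; the old absolute threshold rejects a harmless mismatch that the new relative threshold accepts:

```cpp
#include <cmath>
#include <cstdio>

int main() {
  const double input_product_scale = 1.00e-4;  // input_scale * filter_scale
  const double bias_scale = 1.05e-4;           // slightly off after training
  const double output_scale = 5.00e-3;
  const double scale_diff = std::abs(input_product_scale - bias_scale);
  // Old test: 5e-6 <= 2e-6 fails. New test: 5e-6 / 5e-3 = 1e-3 <= 0.02
  // passes, because the mismatch is tiny relative to the output scale.
  std::printf("old: %s, new: %s\n", scale_diff <= 2e-6 ? "pass" : "fail",
              scale_diff / output_scale <= 0.02 ? "pass" : "fail");
  return 0;
}
```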
@@ -167,7 +344,7 @@ void CalculateActivationRangeQuantizedImpl(TfLiteFusedActivation activation,
   } else if (activation == kTfLiteActRelu6) {
     *act_min = std::max(qmin, quantize(0.0));
     *act_max = std::min(qmax, quantize(6.0));
-  } else if (activation == kTfLiteActRelu1) {
+  } else if (activation == kTfLiteActReluN1To1) {
     *act_min = std::max(qmin, quantize(-1.0));
     *act_max = std::min(qmax, quantize(1.0));
   } else {
@@ -258,4 +435,44 @@ TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
 }
 #endif  // TF_LITE_STATIC_MEMORY
 
+// Size of string is not constant, return 0 in such case.
+int TfLiteTypeGetSize(TfLiteType type) {
+  switch (type) {
+    case kTfLiteUInt8:
+      TF_LITE_ASSERT_EQ(sizeof(uint8_t), 1);
+      return 1;
+    case kTfLiteInt8:
+      TF_LITE_ASSERT_EQ(sizeof(int8_t), 1);
+      return 1;
+    case kTfLiteBool:
+      return sizeof(bool);
+    case kTfLiteInt16:
+      TF_LITE_ASSERT_EQ(sizeof(int16_t), 2);
+      return 2;
+    case kTfLiteFloat16:
+      TF_LITE_ASSERT_EQ(sizeof(int16_t), 2);
+      return 2;
+    case kTfLiteFloat32:
+      TF_LITE_ASSERT_EQ(sizeof(float), 4);
+      return 4;
+    case kTfLiteInt32:
+      TF_LITE_ASSERT_EQ(sizeof(int32_t), 4);
+      return 4;
+    case kTfLiteInt64:
+      TF_LITE_ASSERT_EQ(sizeof(int64_t), 8);
+      return 8;
+    case kTfLiteFloat64:
+      TF_LITE_ASSERT_EQ(sizeof(double), 8);
+      return 8;
+    case kTfLiteComplex64:
+      TF_LITE_ASSERT_EQ(sizeof(std::complex<float>), 8);
+      return 8;
+    case kTfLiteComplex128:
+      TF_LITE_ASSERT_EQ(sizeof(std::complex<double>), 16);
+      return 16;
+    default:
+      return 0;
+  }
+}
+
 }  // namespace tflite
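Reviewer note: a short sketch of what the new `TfLiteTypeGetSize` helper enables, e.g. computing a flat buffer size from a tensor's element type (the helper comes from the diff above; the wrapper below is illustrative):

```cpp
#include <cstddef>
#include <cstdint>

#include "tensorflow/lite/kernels/kernel_util.h"

// Byte size of a flat buffer holding `count` elements of `type`. Yields 0
// for strings, whose size is not constant (see the switch above).
size_t BufferBytes(TfLiteType type, int64_t count) {
  return static_cast<size_t>(tflite::TfLiteTypeGetSize(type)) * count;
}
// e.g. BufferBytes(kTfLiteFloat32, 96 * 96) == 36864
```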
@@ -15,52 +15,148 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_KERNELS_KERNEL_UTIL_H_
 #define TENSORFLOW_LITE_KERNELS_KERNEL_UTIL_H_
 
-#include <algorithm>
+#include <stdint.h>
+
 #include <limits>
 
-#include "flatbuffers/flatbuffers.h"
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/common.h"
 
 namespace tflite {
 
+// A fair number of functions in this header have historically been inline.
+// It is ok to change functions to not be inline if the latency with
+// benchmark_model for MobileNet + MobileBERT is unaffected. If such a change is
+// made, move the newly non-inlined function declarations to the top of this
+// header file.
+
+// Note: You must check if result is not null:
+//
+//   TfLiteTensor* my_tensor = GetInput(context, node, kMyTensorIdx);
+//   TF_LITE_ENSURE(context, my_tensor != nullptr);
+//
+// This is because the index might point to the optional tensor constant
+// (kTfLiteOptionalTensor) in which case there is no tensor to return.
+const TfLiteTensor* GetInput(const TfLiteContext* context,
+                             const TfLiteNode* node, int index);
+
+// Same as `GetInput` but returns boolean and uses output argument for tensor.
+//
+//   TfLiteTensor* my_tensor;
+//   TF_LITE_ENSURE_OK(context,
+//                     GetInputSafe(context, node, kMyTensorIdx, &my_tensor));
+//   // can use my_tensor directly from here onwards, it is not nullptr
+//
+// Should be used in cases where the binary size is too large.
+TfLiteStatus GetInputSafe(const TfLiteContext* context, const TfLiteNode* node,
+                          int index, const TfLiteTensor** tensor);
+
+// Note: You must check if result is not null:
+//
+//   TfLiteTensor* my_tensor = GetVariableInput(context, node, kMyTensorIdx);
+//   TF_LITE_ENSURE(context, my_tensor != nullptr);
+//
+// This is because the index might point to the optional tensor constant
+// (kTfLiteOptionalTensor) in which case there is no tensor to return.
+TfLiteTensor* GetVariableInput(TfLiteContext* context, const TfLiteNode* node,
+                               int index);
+
+// Note: You must check if result is not null:
+//
+//   TfLiteTensor* my_tensor = GetOutput(context, node, kMyTensorIdx);
+//   TF_LITE_ENSURE(context, my_tensor != nullptr);
+//
+// This is because the index might point to the optional tensor constant
+// (kTfLiteOptionalTensor) in which case there is no tensor to return.
+TfLiteTensor* GetOutput(TfLiteContext* context, const TfLiteNode* node,
+                        int index);
+
+// Same as `GetOutput` but returns boolean and uses output argument for tensor.
+//
+//   TfLiteTensor* my_tensor;
+//   TF_LITE_ENSURE_OK(context,
+//                     GetOutputSafe(context, node, kMyTensorIdx, &my_tensor));
+//   // can use my_tensor directly from here onwards, it is not nullptr
+//
+// Should be used in cases where the binary size is too large.
+TfLiteStatus GetOutputSafe(const TfLiteContext* context, const TfLiteNode* node,
+                           int index, TfLiteTensor** tensor);
+
+// Note: You must check if result is not null:
+//
+//   TfLiteTensor* my_tensor = GetOptionalInputTensor(context, node, kIdx);
+//   TF_LITE_ENSURE(context, my_tensor != nullptr);
+//
+// This is because the index might point to the optional tensor constant
+// (kTfLiteOptionalTensor) in which case there is no tensor to return.
+//
+// Deprecated. GetInput has the same functionality.
+const TfLiteTensor* GetOptionalInputTensor(const TfLiteContext* context,
+                                           const TfLiteNode* node, int index);
+
+#ifndef TF_LITE_STATIC_MEMORY
+// Note: You must check if result is not null:
+//
+//   TfLiteTensor* my_tensor = GetTemporary(context, node, kMyTensorIdx);
+//   TF_LITE_ENSURE(context, my_tensor != nullptr);
+//
+// This is because the index might point to the optional tensor constant
+// (kTfLiteOptionalTensor) in which case there is no tensor to return.
+TfLiteTensor* GetTemporary(TfLiteContext* context, const TfLiteNode* node,
+                           int index);
+
+// Same as `GetTemporary` but returns boolean and uses output argument for
+// tensor.
+//
+//   TfLiteTensor* my_tensor;
+//   TF_LITE_ENSURE_OK(context,
+//                     GetTemporarySafe(context, node, kMyTensorIdx,
+//                     &my_tensor));
+//   // can use my_tensor directly from here onwards, it is not nullptr
+//
+// Should be used in cases where the binary size is too large.
+TfLiteStatus GetTemporarySafe(const TfLiteContext* context,
+                              const TfLiteNode* node, int index,
+                              TfLiteTensor** tensor);
+
+// Note: You must check if result is not null:
+//
+//   TfLiteTensor* my_tensor = GetIntermediates(context, node, kMyTensorIdx);
+//   TF_LITE_ENSURE(context, my_tensor != nullptr);
+//
+// This is because the index might point to the optional tensor constant
+// (kTfLiteOptionalTensor) in which case there is no tensor to return.
+const TfLiteTensor* GetIntermediates(TfLiteContext* context,
+                                     const TfLiteNode* node, int index);
+
+// Same as `GetIntermediates` but returns boolean and uses output argument for
+// tensor.
+//
+//   TfLiteTensor* my_tensor;
+//   TF_LITE_ENSURE_OK(context,
+//                     GetIntermediatesSafe(context, node, kMyTensorIdx,
+//                     &my_tensor));
+//   // can use my_tensor directly from here onwards, it is not nullptr
+//
+// Should be used in cases where the binary size is too large.
+TfLiteStatus GetIntermediatesSafe(const TfLiteContext* context,
+                                  const TfLiteNode* node, int index,
+                                  TfLiteTensor** tensor);
+#endif  // TF_LITE_STATIC_MEMORY
+
 inline int NumDimensions(const TfLiteTensor* t) { return t->dims->size; }
 inline int SizeOfDimension(const TfLiteTensor* t, int dim) {
   return t->dims->data[dim];
 }
-inline const TfLiteTensor* GetInput(TfLiteContext* context,
-                                    const TfLiteNode* node, int index) {
-  return &context
-              ->tensors[flatbuffers::EndianScalar(node->inputs->data[index])];
-}
-// Note: You must check if result is not null:
-// TfLiteTensor* my_tensor = GetVariableInput(context, node, kMyTensorIdx);
-// TF_LITE_ENSURE(context, my_tensor != nullptr);
-inline TfLiteTensor* GetVariableInput(TfLiteContext* context,
-                                      const TfLiteNode* node, int index) {
-  TfLiteTensor* tensor =
-      &context->tensors[flatbuffers::EndianScalar(node->inputs->data[index])];
-  return (tensor->is_variable) ? tensor : nullptr;
-}
-inline TfLiteTensor* GetOutput(TfLiteContext* context, const TfLiteNode* node,
-                               int index) {
-  return &context
-              ->tensors[flatbuffers::EndianScalar(node->outputs->data[index])];
-}
-inline TfLiteTensor* GetTemporary(TfLiteContext* context,
-                                  const TfLiteNode* node, int index) {
-  return &context->tensors[flatbuffers::EndianScalar(
-      node->temporaries->data[index])];
-}
-inline const TfLiteTensor* GetIntermediates(TfLiteContext* context,
-                                            const TfLiteNode* node, int index) {
-  return &context->tensors[node->intermediates->data[index]];
-}
 inline int NumInputs(const TfLiteNode* node) { return node->inputs->size; }
 inline int NumOutputs(const TfLiteNode* node) { return node->outputs->size; }
 
+#ifndef TF_LITE_STATIC_MEMORY
 inline int NumIntermediates(const TfLiteNode* node) {
   return node->intermediates->size;
 }
+#endif  // TF_LITE_STATIC_MEMORY
 
 inline int64_t NumElements(const TfLiteIntArray* dims) {
   int64_t count = 1;
@@ -74,19 +170,11 @@ inline int64_t NumElements(const TfLiteTensor* t) {
   return NumElements(t->dims);
 }
 
-inline const TfLiteTensor* GetOptionalInputTensor(TfLiteContext* context,
-                                                  const TfLiteNode* node,
-                                                  int index) {
-  const bool use_tensor = index < node->inputs->size &&
-                          node->inputs->data[index] != kTfLiteOptionalTensor;
-  if (use_tensor) {
-    return &context
-                ->tensors[flatbuffers::EndianScalar(node->inputs->data[index])];
-  }
-  return nullptr;
-}
-
 // Determines whether tensor is constant.
+// TODO(b/138199592): Introduce new query which checks for constant OR
+// persistent-read-only, which would be useful for most tensor kernels that
+// are potentially dynamic based on the input tensor value availability at the
+// time of prepare.
 inline bool IsConstantTensor(const TfLiteTensor* tensor) {
   return tensor->allocation_type == kTfLiteMmapRo;
 }
@@ -105,6 +193,14 @@ inline void SetTensorToDynamic(TfLiteTensor* tensor) {
   }
 }
 
+// Sets tensor to persistent and read-only.
+inline void SetTensorToPersistentRo(TfLiteTensor* tensor) {
+  if (tensor->allocation_type != kTfLitePersistentRo) {
+    tensor->allocation_type = kTfLitePersistentRo;
+    tensor->data.raw = nullptr;
+  }
+}
+
 // Determines whether it is a hybrid op - one that has float inputs and
 // quantized weights.
 inline bool IsHybridOp(const TfLiteTensor* input, const TfLiteTensor* weight) {
@@ -162,7 +258,7 @@ void CalculateActivationRange(TfLiteFusedActivation activation,
   } else if (activation == kTfLiteActRelu6) {
     *activation_min = 0;
     *activation_max = 6;
-  } else if (activation == kTfLiteActRelu1) {
+  } else if (activation == kTfLiteActReluN1To1) {
     *activation_min = -1;
     *activation_max = 1;
   } else {
@@ -188,6 +284,10 @@ TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
                                         const TfLiteTensor* input2,
                                         const TfLiteTensor* input3,
                                         TfLiteIntArray** output_shape);
+
+// Return the size of given type in bytes. Return 0 in in case of string.
+int TfLiteTypeGetSize(TfLiteType type);
+
 }  // namespace tflite
 
 #endif  // TENSORFLOW_LITE_KERNELS_KERNEL_UTIL_H_
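Reviewer note: taken together, these declarations change the kernel idiom from "index straight into `context->tensors`" to "validate, then fetch". A hedged sketch of a `Prepare` written against the new Safe API (the op and its tensor indices are made up):

```cpp
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/kernel_util.h"

// Assumed single-input, single-output op; indices are illustrative.
TfLiteStatus PrepareExample(TfLiteContext* context, TfLiteNode* node) {
  const TfLiteTensor* input;
  TF_LITE_ENSURE_OK(context, tflite::GetInputSafe(context, node, 0, &input));
  TfLiteTensor* output;
  TF_LITE_ENSURE_OK(context, tflite::GetOutputSafe(context, node, 0, &output));
  // Past this point both pointers are non-null; an out-of-range index or an
  // optional tensor at slot 0 would have returned kTfLiteError above.
  TF_LITE_ENSURE_EQ(context, input->type, output->type);
  return kTfLiteOk;
}
```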
@@ -19,7 +19,7 @@ limitations under the License.
 // non-portable function.
 #ifdef TF_LITE_MCU_DEBUG_LOG
 
-#include "tensorflow/lite/micro/micro_error_reporter.h"
+#include "tensorflow/lite/micro/debug_log.h"
 
 #define DEBUG_LOG(x) \
   do { \
@@ -36,7 +36,6 @@ inline void InfiniteLoop() {
 
 #else  // TF_LITE_MCU_DEBUG_LOG
 
-#include <cassert>
 #include <cstdio>
 #include <cstdlib>
 
@@ -45,6 +44,15 @@ inline void InfiniteLoop() {
     fprintf(stderr, "%s", (x)); \
   } while (0)
 
+// Report Error for unsupported type by op 'op_name' and returns kTfLiteError.
+#define TF_LITE_UNSUPPORTED_TYPE(context, type, op_name)                    \
+  do {                                                                      \
+    TF_LITE_KERNEL_LOG((context), "%s:%d Type %s is unsupported by op %s.", \
+                       __FILE__, __LINE__, TfLiteTypeGetName(type),         \
+                       (op_name));                                          \
+    return kTfLiteError;                                                    \
+  } while (0)
+
 #define TFLITE_ABORT abort()
 
 #endif  // TF_LITE_MCU_DEBUG_LOG
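Reviewer note: a hedged sketch of how a kernel's type dispatch might use the new macro (the op name and the surrounding function are made-up stand-ins, and the header path is an assumption based on the hunk above, not code from this commit):

```cpp
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/op_macros.h"  // assumed home of the macro

TfLiteStatus EvalExample(TfLiteContext* context, const TfLiteTensor* input) {
  switch (input->type) {
    case kTfLiteFloat32:
      return kTfLiteOk;  // a real kernel would run its float path here
    default:
      // Logs file, line, type name and op name, then returns kTfLiteError.
      TF_LITE_UNSUPPORTED_TYPE(context, input->type, "MY_OP");
  }
}
```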
code/lib/tfmicro/tensorflow/lite/micro/all_ops_resolver.cc (new file, 94 lines)
@@ -0,0 +1,94 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/micro/all_ops_resolver.h"
+
+#include "tensorflow/lite/micro/kernels/micro_ops.h"
+
+namespace tflite {
+namespace ops {
+namespace micro {
+namespace custom {
+TfLiteRegistration* Register_ETHOSU();
+const char* GetString_ETHOSU();
+}  // namespace custom
+}  // namespace micro
+}  // namespace ops
+
+AllOpsResolver::AllOpsResolver() {
+  // Please keep this list of Builtin Operators in alphabetical order.
+  AddAbs();
+  AddAdd();
+  AddArgMax();
+  AddArgMin();
+  AddAveragePool2D();
+  AddCeil();
+  AddConcatenation();
+  AddConv2D();
+  AddCos();
+  AddDepthwiseConv2D();
+  AddDequantize();
+  AddEqual();
+  AddFloor();
+  AddFullyConnected();
+  AddGreater();
+  AddGreaterEqual();
+  AddHardSwish();
+  AddL2Normalization();
+  AddLess();
+  AddLessEqual();
+  AddLog();
+  AddLogicalAnd();
+  AddLogicalNot();
+  AddLogicalOr();
+  AddLogistic();
+  AddMaximum();
+  AddMaxPool2D();
+  AddMean();
+  AddMinimum();
+  AddMul();
+  AddNeg();
+  AddNotEqual();
+  AddPack();
+  AddPad();
+  AddPadV2();
+  AddPrelu();
+  AddQuantize();
+  AddReduceMax();
+  AddRelu();
+  AddRelu6();
+  AddReshape();
+  AddResizeNearestNeighbor();
+  AddRound();
+  AddRsqrt();
+  AddShape();
+  AddSin();
+  AddSoftmax();
+  AddSplit();
+  AddSplitV();
+  AddSqrt();
+  AddSquare();
+  AddStridedSlice();
+  AddSub();
+  AddSvdf();
+  AddTanh();
+  AddUnpack();
+
+  // TODO(b/159644355): Figure out if custom Ops belong in AllOpsResolver.
+  TfLiteRegistration* registration =
+      tflite::ops::micro::custom::Register_ETHOSU();
+  if (registration) {
+    AddCustom(tflite::ops::micro::custom::GetString_ETHOSU(), registration);
+  }
+}
+
+}  // namespace tflite
@@ -9,17 +9,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_MICRO_KERNELS_ALL_OPS_RESOLVER_H_
#ifndef TENSORFLOW_LITE_MICRO_ALL_OPS_RESOLVER_H_
#define TENSORFLOW_LITE_MICRO_KERNELS_ALL_OPS_RESOLVER_H_
#define TENSORFLOW_LITE_MICRO_ALL_OPS_RESOLVER_H_

#include "tensorflow/lite/micro/compatibility.h"
#include "tensorflow/lite/micro/micro_mutable_op_resolver.h"

namespace tflite {
namespace ops {
namespace micro {

class AllOpsResolver : public MicroMutableOpResolver {
// The magic number in the template parameter is the maximum number of ops that
// can be added to AllOpsResolver. It can be increased if needed. And most
// applications that care about the memory footprint will want to directly use
// MicroMutableOpResolver and have an application specific template parameter.
// The examples directory has sample code for this.
class AllOpsResolver : public MicroMutableOpResolver<128> {
 public:
  AllOpsResolver();
@@ -27,8 +30,6 @@ class AllOpsResolver : public MicroMutableOpResolver {
  TF_LITE_REMOVE_VIRTUAL_DELETE
};

}  // namespace micro
}  // namespace ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_MICRO_KERNELS_ALL_OPS_RESOLVER_H_
#endif  // TENSORFLOW_LITE_MICRO_ALL_OPS_RESOLVER_H_
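The new header's comment steers applications toward MicroMutableOpResolver with an application-specific op count. A minimal sketch of that pattern, assuming a hypothetical four-op model (the op set and function names here are illustrative, not from this repository):

#include <cstddef>
#include <cstdint>

#include "tensorflow/lite/micro/micro_interpreter.h"
#include "tensorflow/lite/micro/micro_mutable_op_resolver.h"

// Capacity 4: exactly the ops the hypothetical model graph contains, so the
// resolver's op table stays far smaller than AllOpsResolver's 128 slots.
void SetupInterpreter(const tflite::Model* model, uint8_t* tensor_arena,
                      size_t arena_size, tflite::ErrorReporter* reporter) {
  static tflite::MicroMutableOpResolver<4> resolver;
  resolver.AddConv2D();
  resolver.AddMaxPool2D();
  resolver.AddFullyConnected();
  resolver.AddSoftmax();
  static tflite::MicroInterpreter interpreter(model, resolver, tensor_arena,
                                              arena_size, reporter);
  interpreter.AllocateTensors();
}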
File diff suppressed because it is too large
@@ -0,0 +1,22 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_MICRO_BENCHMARKS_KEYWORD_SCRAMBLED_MODEL_DATA_H_
#define TENSORFLOW_LITE_MICRO_BENCHMARKS_KEYWORD_SCRAMBLED_MODEL_DATA_H_

extern const unsigned char g_keyword_scrambled_model_data[];
extern const unsigned int g_keyword_scrambled_model_data_length;

#endif  // TENSORFLOW_LITE_MICRO_BENCHMARKS_KEYWORD_SCRAMBLED_MODEL_DATA_H_
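The new header only declares the benchmark model's raw flatbuffer bytes. A hedged sketch of the usual way such a blob is mapped to a model (the include path is inferred from the header guard; the version check mirrors standard TFLM usage):

#include "tensorflow/lite/micro/benchmarks/keyword_scrambled_model_data.h"
#include "tensorflow/lite/schema/schema_generated.h"
#include "tensorflow/lite/version.h"

const tflite::Model* LoadKeywordModel() {
  const tflite::Model* model =
      tflite::GetModel(g_keyword_scrambled_model_data);
  // Checking the schema version is the usual first step before building an
  // interpreter on top of the mapped buffer.
  if (model->version() != TFLITE_SCHEMA_VERSION) {
    return nullptr;
  }
  return model;
}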
@@ -1,4 +1,4 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -36,6 +36,15 @@ limitations under the License.
#include "tensorflow/lite/micro/debug_log.h"

#ifndef TF_LITE_STRIP_ERROR_STRINGS
#include <cstdio>
#endif

extern "C" void DebugLog(const char* s) { fprintf(stderr, "%s", s); }
extern "C" void DebugLog(const char* s) {
#ifndef TF_LITE_STRIP_ERROR_STRINGS
  // Reusing TF_LITE_STRIP_ERROR_STRINGS to disable DebugLog completely to get
  // maximum reduction in binary size. This is because we have DebugLog calls
  // via TF_LITE_CHECK that are not stubbed out by TF_LITE_REPORT_ERROR.
  fprintf(stderr, "%s", s);
#endif
}
@@ -15,9 +15,17 @@ limitations under the License.
#ifndef TENSORFLOW_LITE_MICRO_DEBUG_LOG_H_
#define TENSORFLOW_LITE_MICRO_DEBUG_LOG_H_

#ifdef __cplusplus
extern "C" {
#endif  // __cplusplus

// This function should be implemented by each target platform, and provide a
// way for strings to be output to some text stream. For more information, see
// tensorflow/lite/micro/debug_log.cc.
extern "C" void DebugLog(const char* s);
void DebugLog(const char* s);

#ifdef __cplusplus
}  // extern "C"
#endif  // __cplusplus

#endif  // TENSORFLOW_LITE_MICRO_DEBUG_LOG_H_
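debug_log.h states that each target platform should implement DebugLog itself. A minimal sketch of such an override for a port whose console is reachable through printf (the routing choice is an assumption, not this project's actual port):

#include <cstdio>

#include "tensorflow/lite/micro/debug_log.h"

// Platform override: forward TFLM diagnostics to the target's stdio console.
// Honouring TF_LITE_STRIP_ERROR_STRINGS preserves the binary-size reduction
// described in the debug_log.cc comment above.
extern "C" void DebugLog(const char* s) {
#ifndef TF_LITE_STRIP_ERROR_STRINGS
  printf("%s", s);
#endif
}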
@@ -21,6 +21,8 @@ limitations under the License.
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/max.h"
#include "tensorflow/lite/kernels/internal/min.h"

namespace tflite {
namespace ops {
@@ -32,11 +34,11 @@ inline float ActivationValFloat(TfLiteFusedActivation act, float a) {
    case kTfLiteActNone:
      return a;
    case kTfLiteActRelu:
      return std::max(0.0f, a);
      return TfLiteMax(0.0f, a);
    case kTfLiteActRelu1:
    case kTfLiteActReluN1To1:
      return std::max(-1.0f, std::min(a, 1.0f));
      return TfLiteMax(-1.0f, TfLiteMin(a, 1.0f));
    case kTfLiteActRelu6:
      return std::max(0.0f, std::min(a, 6.0f));
      return TfLiteMax(0.0f, TfLiteMin(a, 6.0f));
    case kTfLiteActTanh:
      return std::tanh(a);
    case kTfLiteActSignBit:
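A quick spot-check of the clamping behaviour above (arbitrary values; the namespace nesting is assumed from the surrounding diff):

#include <cassert>

void ActivationClampSpotCheck() {
  using tflite::ops::micro::ActivationValFloat;
  assert(ActivationValFloat(kTfLiteActRelu6, 7.5f) == 6.0f);        // clamped down to 6
  assert(ActivationValFloat(kTfLiteActReluN1To1, -2.0f) == -1.0f);  // clamped up to -1
  assert(ActivationValFloat(kTfLiteActNone, 3.0f) == 3.0f);         // pass-through
}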
@@ -18,30 +18,82 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/internal/types.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/op_macros.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"
#include "tensorflow/lite/micro/micro_utils.h"

namespace tflite {
namespace ops {
namespace micro {
namespace activations {
namespace {

struct ReluOpData {
  ReluParams params;
};

struct Relu6OpData {
  int8_t six_int8;
  int8_t zero_int8;
  uint8_t six_uint8;
  uint8_t zero_uint8;
};

}  // namespace

constexpr int kInputTensor = 0;
constexpr int kOutputTensor = 0;

template <typename Q>
template <typename T>
inline void ReluQuantized(int32_t lower, const RuntimeShape& input_shape,
inline void ReluQuantized(const ReluOpData& data,
                          const Q* input_data, const RuntimeShape& output_shape,
                          const RuntimeShape& input_shape,
                          Q* output_data) {
                          const RuntimeShape& output_shape, const T* input_data,
                          T* output_data) {
  const int flat_size = MatchingFlatSize(input_shape, output_shape);
  for (int i = 0; i < flat_size; ++i) {
    const Q val = input_data[i];
    const int32_t val = static_cast<int32_t>(input_data[i]);
    const Q clamped = val < lower ? lower : val;
    int32_t clamped =
    output_data[i] = clamped;
        data.params.output_offset +
        MultiplyByQuantizedMultiplier(val - data.params.input_offset,
                                      data.params.output_multiplier,
                                      data.params.output_shift);
    clamped = std::max(data.params.quantized_activation_min, clamped);
    clamped = std::min(data.params.quantized_activation_max, clamped);
    output_data[i] = static_cast<T>(clamped);
  }
}

template <typename T>
inline void CalculateReluOpData(const TfLiteTensor* input, TfLiteTensor* output,
                                ReluOpData* data) {
  float act_min = 0.0;
  float act_max = std::numeric_limits<float>::infinity();
  double real_multiplier =
      static_cast<double>(input->params.scale / output->params.scale);

  const RuntimeShape input_shape = GetTensorShape(input);
  const RuntimeShape output_shape = GetTensorShape(output);

  QuantizeMultiplier(real_multiplier, &data->params.output_multiplier,
                     &data->params.output_shift);

  data->params.quantized_activation_min = std::max(
      static_cast<int32_t>(std::numeric_limits<T>::min()),
      output->params.zero_point +
          static_cast<int32_t>(roundf(act_min / output->params.scale)));
  data->params.quantized_activation_max =
      act_max == std::numeric_limits<float>::infinity()
          ? static_cast<int32_t>(std::numeric_limits<T>::max())
          : std::min(static_cast<int32_t>(std::numeric_limits<T>::max()),
                     output->params.zero_point +
                         static_cast<int32_t>(
                             roundf(act_max / output->params.scale)));
  data->params.input_offset = input->params.zero_point;
  data->params.output_offset = output->params.zero_point;
}

inline void ReluFloat(const RuntimeShape& input_shape, const float* input_data,
                      const RuntimeShape& output_shape, float* output_data) {
  const int flat_size = MatchingFlatSize(input_shape, output_shape);
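The rewritten ReluQuantized no longer just clamps at the zero point: it requantizes each element. A standalone scalar replica of the new per-element math, for illustration only (the helper and struct names are taken from the diff above):

#include <algorithm>
#include <cstdint>

#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/types.h"

// out = clamp(output_offset + MultiplyByQuantizedMultiplier(
//                 in - input_offset, output_multiplier, output_shift))
int8_t QuantizedReluScalar(int8_t in, const tflite::ReluParams& p) {
  const int32_t val = static_cast<int32_t>(in);
  int32_t clamped =
      p.output_offset + tflite::MultiplyByQuantizedMultiplier(
                            val - p.input_offset, p.output_multiplier,
                            p.output_shift);
  clamped = std::max(p.quantized_activation_min, clamped);
  clamped = std::min(p.quantized_activation_max, clamped);
  return static_cast<int8_t>(clamped);
}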
@@ -77,33 +129,59 @@ inline void Relu6Quantized(Q lower, Q upper, const RuntimeShape& input_shape,
  }
}

void* ReluInit(TfLiteContext* context, const char* buffer, size_t length) {
  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
  return context->AllocatePersistentBuffer(context, sizeof(ReluOpData));
}

TfLiteStatus ReluPrepare(TfLiteContext* context, TfLiteNode* node) {
  TFLITE_DCHECK(node->user_data != nullptr);
  ReluOpData* data = static_cast<ReluOpData*>(node->user_data);

  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
  TF_LITE_ENSURE(context, input != nullptr);
  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
  TF_LITE_ENSURE(context, output != nullptr);

  if (input->type == kTfLiteInt8) {
    CalculateReluOpData<int8_t>(input, output, data);
  } else if (input->type == kTfLiteUInt8) {
    CalculateReluOpData<uint8_t>(input, output, data);
  }

  return kTfLiteOk;
}

TfLiteStatus ReluEval(TfLiteContext* context, TfLiteNode* node) {
  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
  TFLITE_DCHECK(node->user_data != nullptr);
  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
  const ReluOpData& data = *(static_cast<const ReluOpData*>(node->user_data));

  const TfLiteEvalTensor* input =
      tflite::micro::GetEvalInput(context, node, kInputTensor);
  TfLiteEvalTensor* output =
      tflite::micro::GetEvalOutput(context, node, kOutputTensor);

  switch (input->type) {
    case kTfLiteFloat32: {
      ReluFloat(GetTensorShape(input), GetTensorData<float>(input),
      ReluFloat(tflite::micro::GetTensorShape(input),
                GetTensorShape(output), GetTensorData<float>(output));
                tflite::micro::GetTensorData<float>(input),
                tflite::micro::GetTensorShape(output),
                tflite::micro::GetTensorData<float>(output));

      return kTfLiteOk;
    }
    case kTfLiteInt8: {
      ReluQuantized<int8_t>(input->params.zero_point, GetTensorShape(input),
      ReluQuantized<int8_t>(data, tflite::micro::GetTensorShape(input),
                            GetTensorData<int8_t>(input),
                            tflite::micro::GetTensorShape(output),
                            GetTensorShape(output),
                            tflite::micro::GetTensorData<int8_t>(input),
                            GetTensorData<int8_t>(output));
                            tflite::micro::GetTensorData<int8_t>(output));
      return kTfLiteOk;
    }
    case kTfLiteUInt8: {
      ReluQuantized<uint8_t>(input->params.zero_point, GetTensorShape(input),
      ReluQuantized<uint8_t>(data, tflite::micro::GetTensorShape(input),
                             GetTensorData<uint8_t>(input),
                             tflite::micro::GetTensorShape(output),
                             GetTensorShape(output),
                             tflite::micro::GetTensorData<uint8_t>(input),
                             GetTensorData<uint8_t>(output));
                             tflite::micro::GetTensorData<uint8_t>(output));
      return kTfLiteOk;
    }
    default: {
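ReluInit/ReluPrepare/ReluEval above show the TFLM kernel lifecycle: Init carves per-node state out of the persistent arena, Prepare derives parameters from full TfLiteTensor metadata, and Eval touches only the lightweight TfLiteEvalTensor. A bare skeleton of the same pattern for a hypothetical op (all names illustrative):

namespace hypothetical_op {

struct OpData {
  int32_t precomputed;  // anything Eval should not have to re-derive
};

void* Init(TfLiteContext* context, const char* buffer, size_t length) {
  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
  return context->AllocatePersistentBuffer(context, sizeof(OpData));
}

TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
  auto* data = static_cast<OpData*>(node->user_data);
  data->precomputed = 0;  // derive quantization params, shapes, etc. here
  return kTfLiteOk;
}

TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
  const auto* data = static_cast<const OpData*>(node->user_data);
  (void)data;  // run the kernel using only the precomputed parameters
  return kTfLiteOk;
}

}  // namespace hypothetical_op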
@@ -114,37 +192,63 @@ TfLiteStatus ReluEval(TfLiteContext* context, TfLiteNode* node) {
  }
}

void* Relu6Init(TfLiteContext* context, const char* buffer, size_t length) {
  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
  return context->AllocatePersistentBuffer(context, sizeof(Relu6OpData));
}

TfLiteStatus Relu6Prepare(TfLiteContext* context, TfLiteNode* node) {
  TFLITE_DCHECK(node->user_data != nullptr);
  Relu6OpData* data = static_cast<Relu6OpData*>(node->user_data);

  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
  TF_LITE_ENSURE(context, input != nullptr);

  if (input->type == kTfLiteInt8) {
    data->six_int8 = FloatToQuantizedType<int8_t>(6.0f, input->params.scale,
                                                  input->params.zero_point);
    data->zero_int8 = input->params.zero_point;
  } else if (input->type == kTfLiteUInt8) {
    data->six_uint8 = FloatToQuantizedType<uint8_t>(6.0f, input->params.scale,
                                                    input->params.zero_point);
    data->zero_uint8 = input->params.zero_point;
  }

  return kTfLiteOk;
}

TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) {
  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
  TFLITE_DCHECK(node->user_data != nullptr);
  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
  const Relu6OpData& data = *(static_cast<const Relu6OpData*>(node->user_data));

  const TfLiteEvalTensor* input =
      tflite::micro::GetEvalInput(context, node, kInputTensor);
  TfLiteEvalTensor* output =
      tflite::micro::GetEvalOutput(context, node, kOutputTensor);

  switch (input->type) {
    case kTfLiteFloat32: {
      Relu6Float(GetTensorShape(input), GetTensorData<float>(input),
      Relu6Float(tflite::micro::GetTensorShape(input),
                 GetTensorShape(output), GetTensorData<float>(output));
                 tflite::micro::GetTensorData<float>(input),
                 tflite::micro::GetTensorShape(output),
                 tflite::micro::GetTensorData<float>(output));

      return kTfLiteOk;
    }
    case kTfLiteInt8: {
      const int8_t six = FloatToAsymmetricQuantizedInt8(
      Relu6Quantized<int8_t>(data.zero_int8, data.six_int8,
          6.0f, input->params.scale, input->params.zero_point);
                             tflite::micro::GetTensorShape(input),
      const int8_t zero = input->params.zero_point;
                             tflite::micro::GetTensorData<int8_t>(input),
      Relu6Quantized<int8_t>(
                             tflite::micro::GetTensorShape(output),
          zero, six, GetTensorShape(input), GetTensorData<int8_t>(input),
                             tflite::micro::GetTensorData<int8_t>(output));
          GetTensorShape(output), GetTensorData<int8_t>(output));
      return kTfLiteOk;
    }
    case kTfLiteUInt8: {
      const uint8_t six = FloatToAsymmetricQuantizedUInt8(
      Relu6Quantized<uint8_t>(data.zero_uint8, data.six_uint8,
          6.0f, input->params.scale, input->params.zero_point);
                              tflite::micro::GetTensorShape(input),
      const uint8_t zero = input->params.zero_point;
                              tflite::micro::GetTensorData<uint8_t>(input),
      Relu6Quantized<uint8_t>(
                              tflite::micro::GetTensorShape(output),
          zero, six, GetTensorShape(input), GetTensorData<uint8_t>(input),
                              tflite::micro::GetTensorData<uint8_t>(output));
          GetTensorShape(output), GetTensorData<uint8_t>(output));
      return kTfLiteOk;
    }
    default: {
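A worked instance of the Relu6Prepare precomputation above, with assumed quantization parameters (scale 0.1, zero point -128; not taken from any real model):

#include <cstdint>

#include "tensorflow/lite/micro/micro_utils.h"

void Relu6RangeExample() {
  // six = round(6.0 / 0.1) + (-128) = 60 - 128 = -68
  const int8_t six = tflite::FloatToQuantizedType<int8_t>(6.0f, 0.1f, -128);
  const int8_t zero = -128;  // the input's zero point
  // Relu6Quantized then clamps every int8 element into [zero, six] = [-128, -68].
  (void)six;
  (void)zero;
}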
@@ -157,28 +261,26 @@ TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) {

}  // namespace activations

TfLiteRegistration* Register_RELU() {
TfLiteRegistration Register_RELU() {
  static TfLiteRegistration r = {/*init=*/nullptr,
  return {/*init=*/activations::ReluInit,
          /*free=*/nullptr,
          /*prepare=*/activations::ReluPrepare,
          /*invoke=*/activations::ReluEval,
          /*profiling_string=*/nullptr,
          /*builtin_code=*/0,
          /*custom_name=*/nullptr,
          /*version=*/0};
  return &r;
}

TfLiteRegistration* Register_RELU6() {
TfLiteRegistration Register_RELU6() {
  static TfLiteRegistration r = {/*init=*/nullptr,
  return {/*init=*/activations::Relu6Init,
          /*free=*/nullptr,
          /*prepare=*/activations::Relu6Prepare,
          /*invoke=*/activations::Relu6Eval,
          /*profiling_string=*/nullptr,
          /*builtin_code=*/0,
          /*custom_name=*/nullptr,
          /*version=*/0};
  return &r;
}

}  // namespace micro
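Register_RELU and Register_RELU6 now return the TfLiteRegistration by value, so callers copy the struct instead of holding a pointer to a function-local static. Hypothetical call site:

// The returned struct can simply be stored by value.
const TfLiteRegistration relu_registration = tflite::ops::micro::Register_RELU();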
@@ -23,6 +23,8 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/op_macros.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"
#include "tensorflow/lite/micro/memory_helpers.h"

namespace tflite {
namespace ops {
@@ -40,18 +42,22 @@ struct OpData {
  // and the special 16-bit -> 16bit quantized path
  int input1_shift;
  int input2_shift;
  int32 output_activation_min;
  int32_t output_activation_min;
  int32 output_activation_max;
  int32_t output_activation_max;

  // These fields are used only in the general 8-bit -> 8bit quantized path
  int32 input1_multiplier;
  int32_t input1_multiplier;
  int32 input2_multiplier;
  int32_t input2_multiplier;
  int32 output_multiplier;
  int32_t output_multiplier;
  int output_shift;
  int left_shift;
  int32 input1_offset;
  int32_t input1_offset;
  int32 input2_offset;
  int32_t input2_offset;
  int32 output_offset;
  int32_t output_offset;

  // Used only for float evals:
  float output_activation_min_f32;
  float output_activation_max_f32;
};

TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteAddParams* params,
@@ -89,37 +95,44 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteAddParams* params,
    TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
        context, params->activation, output, &data->output_activation_min,
        &data->output_activation_max));
  } else if (output->type == kTfLiteFloat32) {
    CalculateActivationRange(params->activation,
                             &data->output_activation_min_f32,
                             &data->output_activation_max_f32);
  }

  return kTfLiteOk;
}

void EvalAdd(TfLiteContext* context, TfLiteNode* node, TfLiteAddParams* params,
             const OpData* data, const TfLiteTensor* input1,
             const OpData* data, const TfLiteEvalTensor* input1,
             const TfLiteTensor* input2, TfLiteTensor* output) {
             const TfLiteEvalTensor* input2, TfLiteEvalTensor* output) {
  float output_activation_min, output_activation_max;
  CalculateActivationRange(params->activation, &output_activation_min,
                           &output_activation_max);
  tflite::ArithmeticParams op_params;
  SetActivationParams(output_activation_min, output_activation_max, &op_params);
  SetActivationParams(data->output_activation_min_f32,
#define TF_LITE_ADD(opname) \
                      data->output_activation_max_f32, &op_params);
  reference_ops::opname(op_params, GetTensorShape(input1), \
                        GetTensorData<float>(input1), GetTensorShape(input2), \
                        GetTensorData<float>(input2), GetTensorShape(output), \
                        GetTensorData<float>(output))
  if (data->requires_broadcast) {
    TF_LITE_ADD(BroadcastAdd4DSlow);
    reference_ops::BroadcastAdd4DSlow(
        op_params, tflite::micro::GetTensorShape(input1),
        tflite::micro::GetTensorData<float>(input1),
        tflite::micro::GetTensorShape(input2),
        tflite::micro::GetTensorData<float>(input2),
        tflite::micro::GetTensorShape(output),
        tflite::micro::GetTensorData<float>(output));
  } else {
    TF_LITE_ADD(Add);
    reference_ops::Add(op_params, tflite::micro::GetTensorShape(input1),
                       tflite::micro::GetTensorData<float>(input1),
                       tflite::micro::GetTensorShape(input2),
                       tflite::micro::GetTensorData<float>(input2),
                       tflite::micro::GetTensorShape(output),
                       tflite::micro::GetTensorData<float>(output));
  }
#undef TF_LITE_ADD
}

TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
                              TfLiteAddParams* params, const OpData* data,
                              const TfLiteTensor* input1,
                              const TfLiteEvalTensor* input1,
                              const TfLiteTensor* input2,
                              const TfLiteEvalTensor* input2,
                              TfLiteTensor* output) {
                              TfLiteEvalTensor* output) {
  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
    tflite::ArithmeticParams op_params;
    op_params.left_shift = data->left_shift;
@@ -135,46 +148,91 @@ TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
    SetActivationParams(data->output_activation_min,
                        data->output_activation_max, &op_params);
    bool need_broadcast = reference_ops::ProcessBroadcastShapes(
        GetTensorShape(input1), GetTensorShape(input2), &op_params);
        tflite::micro::GetTensorShape(input1),
#define TF_LITE_ADD(type, opname, dtype) \
        tflite::micro::GetTensorShape(input2), &op_params);
  type::opname(op_params, GetTensorShape(input1), \
               GetTensorData<dtype>(input1), GetTensorShape(input2), \
               GetTensorData<dtype>(input2), GetTensorShape(output), \
               GetTensorData<dtype>(output));
    if (output->type == kTfLiteInt8) {
      if (need_broadcast) {
        TF_LITE_ADD(reference_integer_ops, BroadcastAdd4DSlow, int8_t);
        reference_integer_ops::BroadcastAdd4DSlow(
            op_params, tflite::micro::GetTensorShape(input1),
            tflite::micro::GetTensorData<int8_t>(input1),
            tflite::micro::GetTensorShape(input2),
            tflite::micro::GetTensorData<int8_t>(input2),
            tflite::micro::GetTensorShape(output),
            tflite::micro::GetTensorData<int8_t>(output));
      } else {
        TF_LITE_ADD(reference_integer_ops, Add, int8_t);
        reference_integer_ops::Add(
            op_params, tflite::micro::GetTensorShape(input1),
            tflite::micro::GetTensorData<int8_t>(input1),
            tflite::micro::GetTensorShape(input2),
            tflite::micro::GetTensorData<int8_t>(input2),
            tflite::micro::GetTensorShape(output),
            tflite::micro::GetTensorData<int8_t>(output));
      }
    } else {
      if (need_broadcast) {
        TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow, uint8_t);
        reference_ops::BroadcastAdd4DSlow(
            op_params, tflite::micro::GetTensorShape(input1),
            tflite::micro::GetTensorData<uint8_t>(input1),
            tflite::micro::GetTensorShape(input2),
            tflite::micro::GetTensorData<uint8_t>(input2),
            tflite::micro::GetTensorShape(output),
            tflite::micro::GetTensorData<uint8_t>(output));
      } else {
        TF_LITE_ADD(reference_ops, Add, uint8_t);
        reference_ops::Add(op_params, tflite::micro::GetTensorShape(input1),
                           tflite::micro::GetTensorData<uint8_t>(input1),
                           tflite::micro::GetTensorShape(input2),
                           tflite::micro::GetTensorData<uint8_t>(input2),
                           tflite::micro::GetTensorShape(output),
                           tflite::micro::GetTensorData<uint8_t>(output));
      }
    }
#undef TF_LITE_ADD
  }

  return kTfLiteOk;
}

void* Init(TfLiteContext* context, const char* buffer, size_t length) {
  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
  return context->AllocatePersistentBuffer(context, sizeof(OpData));
}

TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
  TFLITE_DCHECK(node->user_data != nullptr);
  TFLITE_DCHECK(node->builtin_data != nullptr);

  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
  TF_LITE_ENSURE(context, input1 != nullptr);
  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
  TF_LITE_ENSURE(context, input2 != nullptr);
  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
  TF_LITE_ENSURE(context, output != nullptr);

  OpData* data = static_cast<OpData*>(node->user_data);
  auto* params = reinterpret_cast<TfLiteAddParams*>(node->builtin_data);

  TF_LITE_ENSURE_STATUS(
      CalculateOpData(context, params, input1, input2, output, data));

  return kTfLiteOk;
}

TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
  auto* params = reinterpret_cast<TfLiteAddParams*>(node->builtin_data);

  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
  TFLITE_DCHECK(node->user_data != nullptr);
  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
  const OpData* data = static_cast<const OpData*>(node->user_data);
  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);

  OpData data;
  const TfLiteEvalTensor* input1 =
  TF_LITE_ENSURE_STATUS(
      tflite::micro::GetEvalInput(context, node, kInputTensor1);
      CalculateOpData(context, params, input1, input2, output, &data));
  const TfLiteEvalTensor* input2 =
      tflite::micro::GetEvalInput(context, node, kInputTensor2);
  TfLiteEvalTensor* output =
      tflite::micro::GetEvalOutput(context, node, kOutputTensor);

  if (output->type == kTfLiteFloat32) {
    EvalAdd(context, node, params, &data, input1, input2, output);
    EvalAdd(context, node, params, data, input1, input2, output);
  } else if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
    TF_LITE_ENSURE_OK(context, EvalAddQuantized(context, node, params, &data,
    TF_LITE_ENSURE_OK(context, EvalAddQuantized(context, node, params, data,
                                                input1, input2, output));
  } else {
    TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
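Worth noting in the hunk above: the float activation bounds move out of Eval, where they were recomputed on every invocation, into CalculateOpData, which Prepare runs once. A sketch of the values involved for an assumed fused kTfLiteActRelu6 activation:

// During Prepare (once per graph initialization):
//   CalculateActivationRange(kTfLiteActRelu6,
//                            &data->output_activation_min_f32,   // -> 0.0f
//                            &data->output_activation_max_f32);  // -> 6.0f
// During Eval (every inference) only a cheap struct copy remains:
//   SetActivationParams(data->output_activation_min_f32,
//                       data->output_activation_max_f32, &op_params);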
@@ -187,16 +245,15 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {

}  // namespace add

TfLiteRegistration* Register_ADD() {
TfLiteRegistration Register_ADD() {
  static TfLiteRegistration r = {/*init=*/nullptr,
  return {/*init=*/add::Init,
          /*free=*/nullptr,
          /*prepare=*/nullptr,
          /*prepare=*/add::Prepare,
          /*invoke=*/add::Eval,
          /*profiling_string=*/nullptr,
          /*builtin_code=*/0,
          /*custom_name=*/nullptr,
          /*version=*/0};
  return &r;
}

}  // namespace micro
@@ -1,83 +0,0 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/micro/kernels/all_ops_resolver.h"

#include "tensorflow/lite/micro/kernels/micro_ops.h"

namespace tflite {
namespace ops {
namespace micro {

// Register each supported op with:
// AddBuiltin(<operator ID>, <registration>, [min version], [max version])
AllOpsResolver::AllOpsResolver() {
  AddBuiltin(BuiltinOperator_FULLY_CONNECTED, Register_FULLY_CONNECTED(), 1, 4);
  AddBuiltin(BuiltinOperator_MAX_POOL_2D, Register_MAX_POOL_2D(), 1, 2);
  AddBuiltin(BuiltinOperator_SOFTMAX, Register_SOFTMAX(), 1, 2);
  AddBuiltin(BuiltinOperator_LOGISTIC, Register_LOGISTIC(), 1, 2);
  AddBuiltin(BuiltinOperator_SVDF, Register_SVDF(), 1, 3);
  AddBuiltin(BuiltinOperator_CONV_2D, Register_CONV_2D(), 1, 3);
  AddBuiltin(BuiltinOperator_CONCATENATION, Register_CONCATENATION(), 1, 3);
  AddBuiltin(BuiltinOperator_DEPTHWISE_CONV_2D, Register_DEPTHWISE_CONV_2D(), 1,
             3);
  AddBuiltin(BuiltinOperator_AVERAGE_POOL_2D, Register_AVERAGE_POOL_2D(), 1, 2);
  AddBuiltin(BuiltinOperator_ABS, Register_ABS());
  AddBuiltin(BuiltinOperator_SIN, Register_SIN());
  AddBuiltin(BuiltinOperator_COS, Register_COS());
  AddBuiltin(BuiltinOperator_LOG, Register_LOG());
  AddBuiltin(BuiltinOperator_SQRT, Register_SQRT());
  AddBuiltin(BuiltinOperator_RSQRT, Register_RSQRT());
  AddBuiltin(BuiltinOperator_SQUARE, Register_SQUARE());
  AddBuiltin(BuiltinOperator_PRELU, Register_PRELU());
  AddBuiltin(BuiltinOperator_FLOOR, Register_FLOOR());
  AddBuiltin(BuiltinOperator_MAXIMUM, Register_MAXIMUM());
  AddBuiltin(BuiltinOperator_MINIMUM, Register_MINIMUM());
  AddBuiltin(BuiltinOperator_ARG_MAX, Register_ARG_MAX());
  AddBuiltin(BuiltinOperator_ARG_MIN, Register_ARG_MIN());
  AddBuiltin(BuiltinOperator_LOGICAL_OR, Register_LOGICAL_OR());
  AddBuiltin(BuiltinOperator_LOGICAL_AND, Register_LOGICAL_AND());
  AddBuiltin(BuiltinOperator_LOGICAL_NOT, Register_LOGICAL_NOT());
  AddBuiltin(BuiltinOperator_RESHAPE, Register_RESHAPE());
  AddBuiltin(BuiltinOperator_EQUAL, Register_EQUAL(), 1, 2);
  AddBuiltin(BuiltinOperator_NOT_EQUAL, Register_NOT_EQUAL(), 1, 2);
  AddBuiltin(BuiltinOperator_GREATER, Register_GREATER(), 1, 2);
  AddBuiltin(BuiltinOperator_GREATER_EQUAL, Register_GREATER_EQUAL(), 1, 2);
  AddBuiltin(BuiltinOperator_LESS, Register_LESS(), 1, 2);
  AddBuiltin(BuiltinOperator_LESS_EQUAL, Register_LESS_EQUAL(), 1, 2);
  AddBuiltin(BuiltinOperator_CEIL, Register_CEIL());
  AddBuiltin(BuiltinOperator_ROUND, Register_ROUND());
  AddBuiltin(BuiltinOperator_STRIDED_SLICE, Register_STRIDED_SLICE());
  AddBuiltin(BuiltinOperator_PACK, Register_PACK(), 1, 2);
  AddBuiltin(BuiltinOperator_PAD, Register_PAD(), 1, 2);
  AddBuiltin(BuiltinOperator_PADV2, Register_PADV2(), 1, 2);
  AddBuiltin(BuiltinOperator_SPLIT, Register_SPLIT(), 1, 3);
  AddBuiltin(BuiltinOperator_UNPACK, Register_UNPACK(), 1, 2);
  AddBuiltin(BuiltinOperator_NEG, Register_NEG());
  AddBuiltin(BuiltinOperator_ADD, Register_ADD(), 1, 2);
  AddBuiltin(BuiltinOperator_MUL, Register_MUL(), 1, 3);
  AddBuiltin(BuiltinOperator_SUB, Register_SUB(), 1, 2);
  AddBuiltin(BuiltinOperator_QUANTIZE, Register_QUANTIZE());
  AddBuiltin(BuiltinOperator_DEQUANTIZE, Register_DEQUANTIZE(), 1, 2);
  AddBuiltin(BuiltinOperator_RELU, Register_RELU());
  AddBuiltin(BuiltinOperator_RELU6, Register_RELU6());
  AddBuiltin(BuiltinOperator_MEAN, Register_MEAN());
  AddBuiltin(BuiltinOperator_RESIZE_NEAREST_NEIGHBOR,
             Register_RESIZE_NEAREST_NEIGHBOR(),
             /* min_version = */ 1,
             /* max_version = */ 2);
  AddBuiltin(BuiltinOperator_L2_NORMALIZATION, Register_L2_NORMALIZATION());
}

}  // namespace micro
}  // namespace ops
}  // namespace tflite
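The deletion above completes the resolver migration: the versioned AddBuiltin(<operator ID>, <registration>, min, max) calls give way to the typed Add*() wrappers in the new tensorflow/lite/micro/all_ops_resolver.cc. Side by side:

// Old (deleted here):
//   AddBuiltin(BuiltinOperator_ADD, Register_ADD(), 1, 2);
// New (tensorflow/lite/micro/all_ops_resolver.cc):
//   AddAdd();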
@@ -19,6 +19,7 @@ limitations under the License.
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"
#include "tensorflow/lite/micro/kernels/micro_utils.h"

namespace tflite {
@@ -45,14 +46,20 @@ inline void ArgMinMaxHelper(const RuntimeShape& input1_shape,
}

TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node, bool is_arg_max) {
  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
  const TfLiteEvalTensor* input =
  const TfLiteTensor* axis = GetInput(context, node, kAxis);
      tflite::micro::GetEvalInput(context, node, kInputTensor);
  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
  const TfLiteEvalTensor* axis =
      tflite::micro::GetEvalInput(context, node, kAxis);
  TfLiteEvalTensor* output =
      tflite::micro::GetEvalOutput(context, node, kOutputTensor);

#define TF_LITE_ARG_MIN_MAX(data_type, axis_type, output_type) \
  ArgMinMaxHelper(GetTensorShape(input), GetTensorData<data_type>(input), \
  ArgMinMaxHelper(tflite::micro::GetTensorShape(input), \
                  GetTensorData<axis_type>(axis), GetTensorShape(output), \
                  tflite::micro::GetTensorData<data_type>(input), \
                  GetTensorData<output_type>(output), is_arg_max)
                  tflite::micro::GetTensorData<axis_type>(axis), \
                  tflite::micro::GetTensorShape(output), \
                  tflite::micro::GetTensorData<output_type>(output), \
                  is_arg_max)
  if (axis->type == kTfLiteInt32) {
    if (output->type == kTfLiteInt32) {
      switch (input->type) {
@@ -67,18 +74,19 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node, bool is_arg_max) {
          break;
        default:
          TF_LITE_KERNEL_LOG(context,
                             "Only float32, uint8 and int8 are "
                             "Only float32, uint8_t and int8_t are "
                             "supported currently, got %s.",
                             TfLiteTypeGetName(input->type));
          return kTfLiteError;
      }
    } else {
      TF_LITE_KERNEL_LOG(context, "Only int32 are supported currently, got %s.",
      TF_LITE_KERNEL_LOG(context,
                         "Only int32_t are supported currently, got %s.",
                         TfLiteTypeGetName(output->type));
      return kTfLiteError;
    }
  } else {
    TF_LITE_KERNEL_LOG(context, "Only int32 are supported currently, got %s.",
    TF_LITE_KERNEL_LOG(context, "Only int32_t are supported currently, got %s.",
                       TfLiteTypeGetName(axis->type));
    return kTfLiteError;
  }
@@ -98,28 +106,26 @@ TfLiteStatus ArgMaxEval(TfLiteContext* context, TfLiteNode* node) {

}  // namespace arg_min_max

TfLiteRegistration* Register_ARG_MAX() {
TfLiteRegistration Register_ARG_MAX() {
  static TfLiteRegistration r = {/*init=*/nullptr,
  return {/*init=*/nullptr,
          /*free=*/nullptr,
          /*prepare=*/nullptr,
          /*invoke=*/arg_min_max::ArgMaxEval,
          /*profiling_string=*/nullptr,
          /*builtin_code=*/0,
          /*custom_name=*/nullptr,
          /*version=*/0};
  return &r;
}

TfLiteRegistration* Register_ARG_MIN() {
TfLiteRegistration Register_ARG_MIN() {
  static TfLiteRegistration r = {/*init=*/nullptr,
  return {/*init=*/nullptr,
          /*free=*/nullptr,
          /*prepare=*/nullptr,
          /*invoke=*/arg_min_max::ArgMinEval,
          /*profiling_string=*/nullptr,
          /*builtin_code=*/0,
          /*custom_name=*/nullptr,
          /*version=*/0};
  return &r;
}

}  // namespace micro
@@ -18,6 +18,7 @@ limitations under the License.
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"

namespace tflite {
namespace ops {
@@ -29,11 +30,13 @@ constexpr int kOutputTensor = 0;

TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
  TF_LITE_ENSURE(context, input != nullptr);
  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
  TF_LITE_ENSURE(context, output != nullptr);
  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
  TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
  TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteFloat32);
  TF_LITE_ENSURE_EQ(context, output->type, input->type);
  TF_LITE_ENSURE_TYPES_EQ(context, output->type, input->type);
  TF_LITE_ENSURE_EQ(context, output->bytes, input->bytes);
  TF_LITE_ENSURE_EQ(context, output->dims->size, input->dims->size);
  for (int i = 0; i < output->dims->size; ++i) {
@@ -43,26 +46,29 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
}

TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
  const TfLiteEvalTensor* input =
  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
      tflite::micro::GetEvalInput(context, node, kInputTensor);
  TfLiteEvalTensor* output =
      tflite::micro::GetEvalOutput(context, node, kOutputTensor);

  reference_ops::Ceil(GetTensorShape(input), GetTensorData<float>(input),
  reference_ops::Ceil(tflite::micro::GetTensorShape(input),
                      GetTensorShape(output), GetTensorData<float>(output));
                      tflite::micro::GetTensorData<float>(input),
                      tflite::micro::GetTensorShape(output),
                      tflite::micro::GetTensorData<float>(output));

  return kTfLiteOk;
}
}  // namespace ceil

TfLiteRegistration* Register_CEIL() {
TfLiteRegistration Register_CEIL() {
  static TfLiteRegistration r = {/*init=*/nullptr,
  return {/*init=*/nullptr,
          /*free=*/nullptr,
          /*prepare=*/ceil::Prepare,
          /*invoke=*/ceil::Eval,
          /*profiling_string=*/nullptr,
          /*builtin_code=*/0,
          /*custom_name=*/nullptr,
          /*version=*/0};
  return &r;
}

}  // namespace micro
@@ -17,11 +17,10 @@ limitations under the License.
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/add.h"
#include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/op_macros.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"

/*
 * The circular buffer custom operator is used to implement strided streaming
@@ -78,7 +77,9 @@ void Free(TfLiteContext* context, void* buffer) { op_data_counter = 0; }

TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
  TF_LITE_ENSURE(context, input != nullptr);
  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
  TF_LITE_ENSURE(context, output != nullptr);

  TF_LITE_ENSURE(context, input != nullptr);
  TF_LITE_ENSURE(context, output != nullptr);
@@ -89,10 +90,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
  TF_LITE_ENSURE_EQ(context, 1, input->dims->data[2]);
  TF_LITE_ENSURE_EQ(context, output->dims->data[3], input->dims->data[3]);

  TF_LITE_ENSURE_EQ(context, input->type, output->type);
  TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type);

  // The circular buffer custom operator currently only supports int8.
  // The circular buffer custom operator currently only supports int8_t.
  TF_LITE_ENSURE_EQ(context, input->type, kTfLiteInt8);
  TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteInt8);

  // TODO(b/132070898): Use statically slotted OpData structures until a
  // scratch memory API is ready.
@@ -121,8 +122,10 @@ void EvalInt8(const int8_t* input, int num_slots, int depth, int8_t* output) {
}

TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
  const TfLiteEvalTensor* input =
  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
      tflite::micro::GetEvalInput(context, node, kInputTensor);
  TfLiteEvalTensor* output =
      tflite::micro::GetEvalOutput(context, node, kOutputTensor);

  OpData* data = reinterpret_cast<OpData*>(node->user_data);

@@ -130,8 +133,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
  int depth = output->dims->data[3];

  if (input->type == kTfLiteInt8) {
    EvalInt8(GetTensorData<int8_t>(input), num_slots, depth,
    EvalInt8(tflite::micro::GetTensorData<int8_t>(input), num_slots, depth,
             GetTensorData<int8_t>(output));
             tflite::micro::GetTensorData<int8_t>(output));
  } else {
    TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
                       TfLiteTypeGetName(input->type), input->type);
@@ -18,6 +18,7 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"

namespace tflite {
namespace ops {
@@ -25,103 +26,109 @@ namespace micro {
namespace comparisons {
namespace {

struct OpData {
  ComparisonParams params;
};

constexpr int kInputTensor1 = 0;
constexpr int kInputTensor2 = 1;
constexpr int kOutputTensor = 0;

// TODO(ruic): optimize macros below to using template functions.
#define TF_LITE_QUANTIZE_COMPARISON(opname)                                    \
  template <typename input_dtype>                                              \
  void EvalQuantized##opname(TfLiteContext* context, TfLiteNode* node,         \
                             const TfLiteTensor* input1,                       \
                             const TfLiteTensor* input2, TfLiteTensor* output, \
                             bool requires_broadcast) {                        \
    if (input1->type == kTfLiteUInt8 || input1->type == kTfLiteInt8) {         \
      auto input1_offset = -input1->params.zero_point;                         \
      auto input2_offset = -input2->params.zero_point;                         \
      const int left_shift = 8;                                                \
                                                                               \
      int32 input1_multiplier;                                                 \
      int input1_shift;                                                        \
      QuantizeMultiplierSmallerThanOneExp(                                     \
          static_cast<double>(input1->params.scale), &input1_multiplier,       \
          &input1_shift);                                                      \
      int32 input2_multiplier;                                                 \
      int input2_shift;                                                        \
      QuantizeMultiplierSmallerThanOneExp(                                     \
          static_cast<double>(input2->params.scale), &input2_multiplier,       \
          &input2_shift);                                                      \
                                                                               \
      ComparisonParams op_params;                                              \
      op_params.left_shift = left_shift;                                       \
      op_params.input1_offset = input1_offset;                                 \
      op_params.input1_multiplier = input1_multiplier;                         \
      op_params.input1_shift = input1_shift;                                   \
      op_params.input2_offset = input2_offset;                                 \
      op_params.input2_multiplier = input2_multiplier;                         \
      op_params.input2_shift = input2_shift;                                   \
      if (requires_broadcast) {                                                \
        reference_ops::Broadcast4DSlow##opname##WithScaling(                   \
            op_params, GetTensorShape(input1),                                 \
            GetTensorData<input_dtype>(input1), GetTensorShape(input2),        \
            GetTensorData<input_dtype>(input2), GetTensorShape(output),        \
            GetTensorData<bool>(output));                                      \
      } else {                                                                 \
        reference_ops::opname##WithScaling(                                    \
            op_params, GetTensorShape(input1),                                 \
            GetTensorData<input_dtype>(input1), GetTensorShape(input2),        \
            GetTensorData<input_dtype>(input2), GetTensorShape(output),        \
            GetTensorData<bool>(output));                                      \
      }                                                                        \
    }                                                                          \
  }
TF_LITE_QUANTIZE_COMPARISON(Equal);
TF_LITE_QUANTIZE_COMPARISON(NotEqual);
TF_LITE_QUANTIZE_COMPARISON(Greater);
TF_LITE_QUANTIZE_COMPARISON(GreaterEqual);
TF_LITE_QUANTIZE_COMPARISON(Less);
TF_LITE_QUANTIZE_COMPARISON(LessEqual);
#undef TF_LITE_QUANTIZE_COMPARISON

#define TF_LITE_COMPARISON(type, opname, requires_broadcast)                  \
  {                                                                           \
    ComparisonParams op_params;                                               \
    requires_broadcast                                                        \
        ? reference_ops::Broadcast4DSlow##opname##NoScaling(                  \
              op_params, GetTensorShape(input1), GetTensorData<type>(input1), \
              GetTensorShape(input2), GetTensorData<type>(input2),            \
              GetTensorShape(output), GetTensorData<bool>(output))            \
        : reference_ops::opname##NoScaling(                                   \
              op_params, GetTensorShape(input1), GetTensorData<type>(input1), \
              GetTensorShape(input2), GetTensorData<type>(input2),            \
              GetTensorShape(output), GetTensorData<bool>(output));           \
  }

TfLiteStatus EqualEval(TfLiteContext* context, TfLiteNode* node) {
  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
  TFLITE_DCHECK(node->user_data != nullptr);
  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
  const OpData* data = static_cast<const OpData*>(node->user_data);
  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);

  bool requires_broadcast = !HaveSameShapes(input1, input2);
  const TfLiteEvalTensor* input1 =
      tflite::micro::GetEvalInput(context, node, kInputTensor1);
  const TfLiteEvalTensor* input2 =
      tflite::micro::GetEvalInput(context, node, kInputTensor2);
  TfLiteEvalTensor* output =
      tflite::micro::GetEvalOutput(context, node, kOutputTensor);

  RuntimeShape input1_shape = tflite::micro::GetTensorShape(input1);
  RuntimeShape input2_shape = tflite::micro::GetTensorShape(input2);
  RuntimeShape output_shape = tflite::micro::GetTensorShape(output);
  bool* output_data = tflite::micro::GetTensorData<bool>(output);

  bool requires_broadcast = !tflite::micro::HaveSameShapes(input1, input2);
  switch (input1->type) {
    case kTfLiteBool:
      TF_LITE_COMPARISON(bool, Equal, requires_broadcast);
      requires_broadcast
          ? reference_ops::Broadcast4DSlowEqualNoScaling(
                data->params, input1_shape,
                tflite::micro::GetTensorData<bool>(input1), input2_shape,
                tflite::micro::GetTensorData<bool>(input2), output_shape,
                output_data)
          : reference_ops::EqualNoScaling(
                data->params, input1_shape,
                tflite::micro::GetTensorData<bool>(input1), input2_shape,
                tflite::micro::GetTensorData<bool>(input2), output_shape,
                output_data);
      break;
    case kTfLiteFloat32:
      TF_LITE_COMPARISON(float, Equal, requires_broadcast);
      requires_broadcast
          ? reference_ops::Broadcast4DSlowEqualNoScaling(
                data->params, input1_shape,
                tflite::micro::GetTensorData<float>(input1), input2_shape,
                tflite::micro::GetTensorData<float>(input2), output_shape,
                output_data)
          : reference_ops::EqualNoScaling(
                data->params, input1_shape,
                tflite::micro::GetTensorData<float>(input1), input2_shape,
                tflite::micro::GetTensorData<float>(input2), output_shape,
                output_data);
      break;
    case kTfLiteInt32:
      TF_LITE_COMPARISON(int32_t, Equal, requires_broadcast);
      requires_broadcast
          ? reference_ops::Broadcast4DSlowEqualNoScaling(
                data->params, input1_shape,
                tflite::micro::GetTensorData<int32_t>(input1), input2_shape,
                tflite::micro::GetTensorData<int32_t>(input2), output_shape,
                output_data)
          : reference_ops::EqualNoScaling(
                data->params, input1_shape,
                tflite::micro::GetTensorData<int32_t>(input1), input2_shape,
                tflite::micro::GetTensorData<int32_t>(input2), output_shape,
                output_data);
      break;
    case kTfLiteInt64:
      TF_LITE_COMPARISON(int64_t, Equal, requires_broadcast);
      requires_broadcast
          ? reference_ops::Broadcast4DSlowEqualNoScaling(
                data->params, input1_shape,
                tflite::micro::GetTensorData<int64_t>(input1), input2_shape,
                tflite::micro::GetTensorData<int64_t>(input2), output_shape,
                output_data)
          : reference_ops::EqualNoScaling(
                data->params, input1_shape,
                tflite::micro::GetTensorData<int64_t>(input1), input2_shape,
                tflite::micro::GetTensorData<int64_t>(input2), output_shape,
                output_data);
      break;
    case kTfLiteUInt8:
      EvalQuantizedEqual<uint8_t>(context, node, input1, input2, output,
      requires_broadcast
                                  requires_broadcast);
          ? reference_ops::Broadcast4DSlowEqualWithScaling(
                data->params, input1_shape,
                tflite::micro::GetTensorData<uint8_t>(input1), input2_shape,
                tflite::micro::GetTensorData<uint8_t>(input2), output_shape,
                output_data)
          : reference_ops::EqualWithScaling(
                data->params, input1_shape,
                tflite::micro::GetTensorData<uint8_t>(input1), input2_shape,
                tflite::micro::GetTensorData<uint8_t>(input2), output_shape,
                output_data);
      break;
    case kTfLiteInt8:
      EvalQuantizedEqual<int8_t>(context, node, input1, input2, output,
      requires_broadcast
                                 requires_broadcast);
          ? reference_ops::Broadcast4DSlowEqualWithScaling(
                data->params, input1_shape,
                tflite::micro::GetTensorData<int8_t>(input1), input2_shape,
                tflite::micro::GetTensorData<int8_t>(input2), output_shape,
                output_data)
          : reference_ops::EqualWithScaling(
                data->params, input1_shape,
|
tflite::micro::GetTensorData<int8_t>(input1), input2_shape,
|
||||||
|
tflite::micro::GetTensorData<int8_t>(input2), output_shape,
|
||||||
|
output_data);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
|
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
|
||||||
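// ---------------------------------------------------------------------------
// Note on the pattern above: every comparison kernel in this change migrates
// the same way. At invoke time the kernel reads TfLiteEvalTensor handles via
// tflite::micro::GetEvalInput()/GetEvalOutput() instead of full TfLiteTensor
// structs, and the ComparisonParams are no longer rebuilt on every call but
// fetched from node->user_data. A minimal sketch of the resulting shape;
// MyOpEval and the single input index are illustrative, not part of the diff:

TfLiteStatus MyOpEval(TfLiteContext* context, TfLiteNode* node) {
  TFLITE_DCHECK(node->user_data != nullptr);
  const OpData* data = static_cast<const OpData*>(node->user_data);
  const TfLiteEvalTensor* input =
      tflite::micro::GetEvalInput(context, node, /*index=*/0);
  TfLiteEvalTensor* output =
      tflite::micro::GetEvalOutput(context, node, /*index=*/0);
  // Dispatch on input->type here, passing data->params plus the shapes and
  // data pointers of input/output to the matching reference op.
  (void)data;
  (void)input;
  (void)output;
  return kTfLiteOk;
}
// ---------------------------------------------------------------------------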
@@ -133,30 +140,100 @@ TfLiteStatus EqualEval(TfLiteContext* context, TfLiteNode* node) {
 
 // TODO(renjieliu): Refactor the logic to avoid duplications.
 TfLiteStatus NotEqualEval(TfLiteContext* context, TfLiteNode* node) {
-  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
-  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-  bool requires_broadcast = !HaveSameShapes(input1, input2);
+  TFLITE_DCHECK(node->user_data != nullptr);
+  const OpData* data = static_cast<const OpData*>(node->user_data);
+
+  const TfLiteEvalTensor* input1 =
+      tflite::micro::GetEvalInput(context, node, kInputTensor1);
+  const TfLiteEvalTensor* input2 =
+      tflite::micro::GetEvalInput(context, node, kInputTensor2);
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kOutputTensor);
+
+  RuntimeShape input1_shape = tflite::micro::GetTensorShape(input1);
+  RuntimeShape input2_shape = tflite::micro::GetTensorShape(input2);
+  RuntimeShape output_shape = tflite::micro::GetTensorShape(output);
+  bool* output_data = tflite::micro::GetTensorData<bool>(output);
+
+  bool requires_broadcast = !tflite::micro::HaveSameShapes(input1, input2);
   switch (input1->type) {
     case kTfLiteBool:
-      TF_LITE_COMPARISON(bool, NotEqual, requires_broadcast);
+      requires_broadcast
+          ? reference_ops::Broadcast4DSlowNotEqualNoScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<bool>(input1), input2_shape,
+                tflite::micro::GetTensorData<bool>(input2), output_shape,
+                output_data)
+          : reference_ops::NotEqualNoScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<bool>(input1), input2_shape,
+                tflite::micro::GetTensorData<bool>(input2), output_shape,
+                output_data);
       break;
     case kTfLiteFloat32:
-      TF_LITE_COMPARISON(float, NotEqual, requires_broadcast);
+      requires_broadcast
+          ? reference_ops::Broadcast4DSlowNotEqualNoScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<float>(input1), input2_shape,
+                tflite::micro::GetTensorData<float>(input2), output_shape,
+                output_data)
+          : reference_ops::NotEqualNoScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<float>(input1), input2_shape,
+                tflite::micro::GetTensorData<float>(input2), output_shape,
+                output_data);
       break;
     case kTfLiteInt32:
-      TF_LITE_COMPARISON(int32_t, NotEqual, requires_broadcast);
+      requires_broadcast
+          ? reference_ops::Broadcast4DSlowNotEqualNoScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<int32_t>(input1), input2_shape,
+                tflite::micro::GetTensorData<int32_t>(input2), output_shape,
+                output_data)
+          : reference_ops::NotEqualNoScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<int32_t>(input1), input2_shape,
+                tflite::micro::GetTensorData<int32_t>(input2), output_shape,
+                output_data);
       break;
     case kTfLiteInt64:
-      TF_LITE_COMPARISON(int64_t, NotEqual, requires_broadcast);
+      requires_broadcast
+          ? reference_ops::Broadcast4DSlowNotEqualNoScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<int64_t>(input1), input2_shape,
+                tflite::micro::GetTensorData<int64_t>(input2), output_shape,
+                output_data)
+          : reference_ops::NotEqualNoScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<int64_t>(input1), input2_shape,
+                tflite::micro::GetTensorData<int64_t>(input2), output_shape,
+                output_data);
       break;
     case kTfLiteUInt8:
-      EvalQuantizedNotEqual<uint8_t>(context, node, input1, input2, output,
-                                     requires_broadcast);
+      requires_broadcast
+          ? reference_ops::Broadcast4DSlowNotEqualWithScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<uint8_t>(input1), input2_shape,
+                tflite::micro::GetTensorData<uint8_t>(input2), output_shape,
+                output_data)
+          : reference_ops::NotEqualWithScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<uint8_t>(input1), input2_shape,
+                tflite::micro::GetTensorData<uint8_t>(input2), output_shape,
+                output_data);
       break;
     case kTfLiteInt8:
-      EvalQuantizedNotEqual<int8_t>(context, node, input1, input2, output,
-                                    requires_broadcast);
+      requires_broadcast
+          ? reference_ops::Broadcast4DSlowNotEqualWithScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<int8_t>(input1), input2_shape,
+                tflite::micro::GetTensorData<int8_t>(input2), output_shape,
+                output_data)
+          : reference_ops::NotEqualWithScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<int8_t>(input1), input2_shape,
+                tflite::micro::GetTensorData<int8_t>(input2), output_shape,
+                output_data);
       break;
     default:
       TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
@@ -167,27 +244,87 @@ TfLiteStatus NotEqualEval(TfLiteContext* context, TfLiteNode* node) {
 }
 
 TfLiteStatus GreaterEval(TfLiteContext* context, TfLiteNode* node) {
-  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
-  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-  bool requires_broadcast = !HaveSameShapes(input1, input2);
+  TFLITE_DCHECK(node->user_data != nullptr);
+  const OpData* data = static_cast<const OpData*>(node->user_data);
+
+  const TfLiteEvalTensor* input1 =
+      tflite::micro::GetEvalInput(context, node, kInputTensor1);
+  const TfLiteEvalTensor* input2 =
+      tflite::micro::GetEvalInput(context, node, kInputTensor2);
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kOutputTensor);
+
+  RuntimeShape input1_shape = tflite::micro::GetTensorShape(input1);
+  RuntimeShape input2_shape = tflite::micro::GetTensorShape(input2);
+  RuntimeShape output_shape = tflite::micro::GetTensorShape(output);
+  bool* output_data = tflite::micro::GetTensorData<bool>(output);
+
+  bool requires_broadcast = !tflite::micro::HaveSameShapes(input1, input2);
   switch (input1->type) {
     case kTfLiteFloat32:
-      TF_LITE_COMPARISON(float, Greater, requires_broadcast);
+      requires_broadcast
+          ? reference_ops::Broadcast4DSlowGreaterNoScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<float>(input1), input2_shape,
+                tflite::micro::GetTensorData<float>(input2), output_shape,
+                output_data)
+          : reference_ops::GreaterNoScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<float>(input1), input2_shape,
+                tflite::micro::GetTensorData<float>(input2), output_shape,
+                output_data);
       break;
     case kTfLiteInt32:
-      TF_LITE_COMPARISON(int32_t, Greater, requires_broadcast);
+      requires_broadcast
+          ? reference_ops::Broadcast4DSlowGreaterNoScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<int32_t>(input1), input2_shape,
+                tflite::micro::GetTensorData<int32_t>(input2), output_shape,
+                output_data)
+          : reference_ops::GreaterNoScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<int32_t>(input1), input2_shape,
+                tflite::micro::GetTensorData<int32_t>(input2), output_shape,
+                output_data);
       break;
     case kTfLiteInt64:
-      TF_LITE_COMPARISON(int64_t, Greater, requires_broadcast);
+      requires_broadcast
+          ? reference_ops::Broadcast4DSlowGreaterNoScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<int64_t>(input1), input2_shape,
+                tflite::micro::GetTensorData<int64_t>(input2), output_shape,
+                output_data)
+          : reference_ops::GreaterNoScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<int64_t>(input1), input2_shape,
+                tflite::micro::GetTensorData<int64_t>(input2), output_shape,
+                output_data);
       break;
     case kTfLiteUInt8:
-      EvalQuantizedGreater<uint8_t>(context, node, input1, input2, output,
-                                    requires_broadcast);
+      requires_broadcast
+          ? reference_ops::Broadcast4DSlowGreaterWithScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<uint8_t>(input1), input2_shape,
+                tflite::micro::GetTensorData<uint8_t>(input2), output_shape,
+                output_data)
+          : reference_ops::GreaterWithScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<uint8_t>(input1), input2_shape,
+                tflite::micro::GetTensorData<uint8_t>(input2), output_shape,
+                output_data);
       break;
     case kTfLiteInt8:
-      EvalQuantizedGreater<int8_t>(context, node, input1, input2, output,
-                                   requires_broadcast);
+      requires_broadcast
+          ? reference_ops::Broadcast4DSlowGreaterWithScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<int8_t>(input1), input2_shape,
+                tflite::micro::GetTensorData<int8_t>(input2), output_shape,
+                output_data)
+          : reference_ops::GreaterWithScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<int8_t>(input1), input2_shape,
+                tflite::micro::GetTensorData<int8_t>(input2), output_shape,
+                output_data);
       break;
     default:
       TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
@@ -198,27 +335,87 @@ TfLiteStatus GreaterEval(TfLiteContext* context, TfLiteNode* node) {
 }
 
 TfLiteStatus GreaterEqualEval(TfLiteContext* context, TfLiteNode* node) {
-  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
-  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-  bool requires_broadcast = !HaveSameShapes(input1, input2);
+  TFLITE_DCHECK(node->user_data != nullptr);
+  const OpData* data = static_cast<const OpData*>(node->user_data);
+
+  const TfLiteEvalTensor* input1 =
+      tflite::micro::GetEvalInput(context, node, kInputTensor1);
+  const TfLiteEvalTensor* input2 =
+      tflite::micro::GetEvalInput(context, node, kInputTensor2);
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kOutputTensor);
+
+  RuntimeShape input1_shape = tflite::micro::GetTensorShape(input1);
+  RuntimeShape input2_shape = tflite::micro::GetTensorShape(input2);
+  RuntimeShape output_shape = tflite::micro::GetTensorShape(output);
+  bool* output_data = tflite::micro::GetTensorData<bool>(output);
+
+  bool requires_broadcast = !tflite::micro::HaveSameShapes(input1, input2);
   switch (input1->type) {
     case kTfLiteFloat32:
-      TF_LITE_COMPARISON(float, GreaterEqual, requires_broadcast);
+      requires_broadcast
+          ? reference_ops::Broadcast4DSlowGreaterEqualNoScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<float>(input1), input2_shape,
+                tflite::micro::GetTensorData<float>(input2), output_shape,
+                output_data)
+          : reference_ops::GreaterEqualNoScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<float>(input1), input2_shape,
+                tflite::micro::GetTensorData<float>(input2), output_shape,
+                output_data);
       break;
     case kTfLiteInt32:
-      TF_LITE_COMPARISON(int32_t, GreaterEqual, requires_broadcast);
+      requires_broadcast
+          ? reference_ops::Broadcast4DSlowGreaterEqualNoScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<int32_t>(input1), input2_shape,
+                tflite::micro::GetTensorData<int32_t>(input2), output_shape,
+                output_data)
+          : reference_ops::GreaterEqualNoScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<int32_t>(input1), input2_shape,
+                tflite::micro::GetTensorData<int32_t>(input2), output_shape,
+                output_data);
       break;
     case kTfLiteInt64:
-      TF_LITE_COMPARISON(int64_t, GreaterEqual, requires_broadcast);
+      requires_broadcast
+          ? reference_ops::Broadcast4DSlowGreaterEqualNoScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<int64_t>(input1), input2_shape,
+                tflite::micro::GetTensorData<int64_t>(input2), output_shape,
+                output_data)
+          : reference_ops::GreaterEqualNoScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<int64_t>(input1), input2_shape,
+                tflite::micro::GetTensorData<int64_t>(input2), output_shape,
+                output_data);
       break;
     case kTfLiteUInt8:
-      EvalQuantizedGreaterEqual<uint8_t>(context, node, input1, input2, output,
-                                         requires_broadcast);
+      requires_broadcast
+          ? reference_ops::Broadcast4DSlowGreaterEqualWithScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<uint8_t>(input1), input2_shape,
+                tflite::micro::GetTensorData<uint8_t>(input2), output_shape,
+                output_data)
+          : reference_ops::GreaterEqualWithScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<uint8_t>(input1), input2_shape,
+                tflite::micro::GetTensorData<uint8_t>(input2), output_shape,
+                output_data);
       break;
     case kTfLiteInt8:
-      EvalQuantizedGreaterEqual<int8_t>(context, node, input1, input2, output,
-                                        requires_broadcast);
+      requires_broadcast
+          ? reference_ops::Broadcast4DSlowGreaterEqualWithScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<int8_t>(input1), input2_shape,
+                tflite::micro::GetTensorData<int8_t>(input2), output_shape,
+                output_data)
+          : reference_ops::GreaterEqualWithScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<int8_t>(input1), input2_shape,
+                tflite::micro::GetTensorData<int8_t>(input2), output_shape,
+                output_data);
       break;
     default:
       TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
@@ -229,27 +426,87 @@ TfLiteStatus GreaterEqualEval(TfLiteContext* context, TfLiteNode* node) {
 }
 
 TfLiteStatus LessEval(TfLiteContext* context, TfLiteNode* node) {
-  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
-  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-  bool requires_broadcast = !HaveSameShapes(input1, input2);
+  TFLITE_DCHECK(node->user_data != nullptr);
+  const OpData* data = static_cast<const OpData*>(node->user_data);
+
+  const TfLiteEvalTensor* input1 =
+      tflite::micro::GetEvalInput(context, node, kInputTensor1);
+  const TfLiteEvalTensor* input2 =
+      tflite::micro::GetEvalInput(context, node, kInputTensor2);
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kOutputTensor);
+
+  RuntimeShape input1_shape = tflite::micro::GetTensorShape(input1);
+  RuntimeShape input2_shape = tflite::micro::GetTensorShape(input2);
+  RuntimeShape output_shape = tflite::micro::GetTensorShape(output);
+  bool* output_data = tflite::micro::GetTensorData<bool>(output);
+
+  bool requires_broadcast = !tflite::micro::HaveSameShapes(input1, input2);
   switch (input1->type) {
     case kTfLiteFloat32:
-      TF_LITE_COMPARISON(float, Less, requires_broadcast);
+      requires_broadcast
+          ? reference_ops::Broadcast4DSlowLessNoScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<float>(input1), input2_shape,
+                tflite::micro::GetTensorData<float>(input2), output_shape,
+                output_data)
+          : reference_ops::LessNoScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<float>(input1), input2_shape,
+                tflite::micro::GetTensorData<float>(input2), output_shape,
+                output_data);
       break;
     case kTfLiteInt32:
-      TF_LITE_COMPARISON(int32_t, Less, requires_broadcast);
+      requires_broadcast
+          ? reference_ops::Broadcast4DSlowLessNoScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<int32_t>(input1), input2_shape,
+                tflite::micro::GetTensorData<int32_t>(input2), output_shape,
+                output_data)
+          : reference_ops::LessNoScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<int32_t>(input1), input2_shape,
+                tflite::micro::GetTensorData<int32_t>(input2), output_shape,
+                output_data);
       break;
     case kTfLiteInt64:
-      TF_LITE_COMPARISON(int64_t, Less, requires_broadcast);
+      requires_broadcast
+          ? reference_ops::Broadcast4DSlowLessNoScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<int64_t>(input1), input2_shape,
+                tflite::micro::GetTensorData<int64_t>(input2), output_shape,
+                output_data)
+          : reference_ops::LessNoScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<int64_t>(input1), input2_shape,
+                tflite::micro::GetTensorData<int64_t>(input2), output_shape,
+                output_data);
       break;
     case kTfLiteUInt8:
-      EvalQuantizedLess<uint8_t>(context, node, input1, input2, output,
-                                 requires_broadcast);
+      requires_broadcast
+          ? reference_ops::Broadcast4DSlowLessWithScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<uint8_t>(input1), input2_shape,
+                tflite::micro::GetTensorData<uint8_t>(input2), output_shape,
+                output_data)
+          : reference_ops::LessWithScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<uint8_t>(input1), input2_shape,
+                tflite::micro::GetTensorData<uint8_t>(input2), output_shape,
+                output_data);
       break;
     case kTfLiteInt8:
-      EvalQuantizedLess<int8_t>(context, node, input1, input2, output,
-                                requires_broadcast);
+      requires_broadcast
+          ? reference_ops::Broadcast4DSlowLessWithScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<int8_t>(input1), input2_shape,
+                tflite::micro::GetTensorData<int8_t>(input2), output_shape,
+                output_data)
+          : reference_ops::LessWithScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<int8_t>(input1), input2_shape,
+                tflite::micro::GetTensorData<int8_t>(input2), output_shape,
+                output_data);
       break;
     default:
       TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
@@ -260,27 +517,87 @@ TfLiteStatus LessEval(TfLiteContext* context, TfLiteNode* node) {
 }
 
 TfLiteStatus LessEqualEval(TfLiteContext* context, TfLiteNode* node) {
-  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
-  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-  bool requires_broadcast = !HaveSameShapes(input1, input2);
+  TFLITE_DCHECK(node->user_data != nullptr);
+  const OpData* data = static_cast<const OpData*>(node->user_data);
+
+  const TfLiteEvalTensor* input1 =
+      tflite::micro::GetEvalInput(context, node, kInputTensor1);
+  const TfLiteEvalTensor* input2 =
+      tflite::micro::GetEvalInput(context, node, kInputTensor2);
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kOutputTensor);
+
+  RuntimeShape input1_shape = tflite::micro::GetTensorShape(input1);
+  RuntimeShape input2_shape = tflite::micro::GetTensorShape(input2);
+  RuntimeShape output_shape = tflite::micro::GetTensorShape(output);
+  bool* output_data = tflite::micro::GetTensorData<bool>(output);
+
+  bool requires_broadcast = !tflite::micro::HaveSameShapes(input1, input2);
   switch (input1->type) {
     case kTfLiteFloat32:
-      TF_LITE_COMPARISON(float, LessEqual, requires_broadcast);
+      requires_broadcast
+          ? reference_ops::Broadcast4DSlowLessEqualNoScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<float>(input1), input2_shape,
+                tflite::micro::GetTensorData<float>(input2), output_shape,
+                output_data)
+          : reference_ops::LessEqualNoScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<float>(input1), input2_shape,
+                tflite::micro::GetTensorData<float>(input2), output_shape,
+                output_data);
       break;
     case kTfLiteInt32:
-      TF_LITE_COMPARISON(int32_t, LessEqual, requires_broadcast);
+      requires_broadcast
+          ? reference_ops::Broadcast4DSlowLessEqualNoScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<int32_t>(input1), input2_shape,
+                tflite::micro::GetTensorData<int32_t>(input2), output_shape,
+                output_data)
+          : reference_ops::LessEqualNoScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<int32_t>(input1), input2_shape,
+                tflite::micro::GetTensorData<int32_t>(input2), output_shape,
+                output_data);
       break;
     case kTfLiteInt64:
-      TF_LITE_COMPARISON(int64_t, LessEqual, requires_broadcast);
+      requires_broadcast
+          ? reference_ops::Broadcast4DSlowLessEqualNoScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<int64_t>(input1), input2_shape,
+                tflite::micro::GetTensorData<int64_t>(input2), output_shape,
+                output_data)
+          : reference_ops::LessEqualNoScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<int64_t>(input1), input2_shape,
+                tflite::micro::GetTensorData<int64_t>(input2), output_shape,
+                output_data);
       break;
     case kTfLiteUInt8:
-      EvalQuantizedLessEqual<uint8_t>(context, node, input1, input2, output,
-                                      requires_broadcast);
+      requires_broadcast
+          ? reference_ops::Broadcast4DSlowLessEqualWithScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<uint8_t>(input1), input2_shape,
+                tflite::micro::GetTensorData<uint8_t>(input2), output_shape,
+                output_data)
+          : reference_ops::LessEqualWithScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<uint8_t>(input1), input2_shape,
+                tflite::micro::GetTensorData<uint8_t>(input2), output_shape,
+                output_data);
       break;
     case kTfLiteInt8:
-      EvalQuantizedLessEqual<int8_t>(context, node, input1, input2, output,
-                                     requires_broadcast);
+      requires_broadcast
+          ? reference_ops::Broadcast4DSlowLessEqualWithScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<int8_t>(input1), input2_shape,
+                tflite::micro::GetTensorData<int8_t>(input2), output_shape,
+                output_data)
+          : reference_ops::LessEqualWithScaling(
+                data->params, input1_shape,
+                tflite::micro::GetTensorData<int8_t>(input1), input2_shape,
+                tflite::micro::GetTensorData<int8_t>(input2), output_shape,
+                output_data);
       break;
     default:
      TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
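// ---------------------------------------------------------------------------
// The NoScaling/WithScaling split used throughout these hunks: float and
// plain integer inputs share a scale by construction, so the reference ops
// compare the stored values directly (NoScaling). For uint8/int8 the two
// operands may carry different (scale, zero_point) pairs, so the WithScaling
// variants first bring both sides into a common fixed-point domain. A
// per-element sketch of that rescaling, written here from the expected
// behavior of the reference ops rather than copied from them:

inline bool EqualWithScalingSketch(uint8_t q1, uint8_t q2,
                                   const tflite::ComparisonParams& p) {
  // Widen, remove zero points, pre-shift by left_shift (8, set in Prepare()).
  const int32_t shifted1 = (q1 + p.input1_offset) * (1 << p.left_shift);
  const int32_t shifted2 = (q2 + p.input2_offset) * (1 << p.left_shift);
  // Rescale each side by its own multiplier/shift computed in Prepare().
  const int32_t x1 = tflite::MultiplyByQuantizedMultiplierSmallerThanOneExp(
      shifted1, p.input1_multiplier, p.input1_shift);
  const int32_t x2 = tflite::MultiplyByQuantizedMultiplierSmallerThanOneExp(
      shifted2, p.input2_multiplier, p.input2_shift);
  return x1 == x2;  // the other five kernels differ only in this comparator
}
// ---------------------------------------------------------------------------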
@@ -291,78 +608,115 @@ TfLiteStatus LessEqualEval(TfLiteContext* context, TfLiteNode* node) {
 }
 
 }  // namespace
 
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
+  return context->AllocatePersistentBuffer(context, sizeof(OpData));
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TFLITE_DCHECK(node->user_data != nullptr);
+  OpData* data = static_cast<OpData*>(node->user_data);
+
+  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  TF_LITE_ENSURE(context, input1 != nullptr);
+  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+  TF_LITE_ENSURE(context, input2 != nullptr);
+
+  if (input1->type == kTfLiteUInt8 || input1->type == kTfLiteInt8) {
+    auto input1_offset = -input1->params.zero_point;
+    auto input2_offset = -input2->params.zero_point;
+    const int kLeftShift = 8;
+
+    int32_t input1_multiplier;
+    int input1_shift;
+    QuantizeMultiplierSmallerThanOneExp(
+        static_cast<double>(input1->params.scale), &input1_multiplier,
+        &input1_shift);
+    int32_t input2_multiplier;
+    int input2_shift;
+    QuantizeMultiplierSmallerThanOneExp(
+        static_cast<double>(input2->params.scale), &input2_multiplier,
+        &input2_shift);
+
+    data->params.left_shift = kLeftShift;
+    data->params.input1_offset = input1_offset;
+    data->params.input1_multiplier = input1_multiplier;
+    data->params.input1_shift = input1_shift;
+    data->params.input2_offset = input2_offset;
+    data->params.input2_multiplier = input2_multiplier;
+    data->params.input2_shift = input2_shift;
+  }
+
+  return kTfLiteOk;
+}
+
 }  // namespace comparisons
 
-TfLiteRegistration* Register_EQUAL() {
-  static TfLiteRegistration r = {/*init=*/nullptr,
+TfLiteRegistration Register_EQUAL() {
+  return {/*init=*/comparisons::Init,
           /*free=*/nullptr,
-          /*prepare=*/nullptr,
+          /*prepare=*/comparisons::Prepare,
           /*invoke=*/comparisons::EqualEval,
           /*profiling_string=*/nullptr,
           /*builtin_code=*/0,
           /*custom_name=*/nullptr,
           /*version=*/0};
-  return &r;
 }
 
-TfLiteRegistration* Register_NOT_EQUAL() {
-  static TfLiteRegistration r = {/*init=*/nullptr,
+TfLiteRegistration Register_NOT_EQUAL() {
+  return {/*init=*/comparisons::Init,
          /*free=*/nullptr,
-         /*prepare=*/nullptr,
+         /*prepare=*/comparisons::Prepare,
          /*invoke=*/comparisons::NotEqualEval,
         /*profiling_string=*/nullptr,
         /*builtin_code=*/0,
         /*custom_name=*/nullptr,
         /*version=*/0};
-  return &r;
 }
 
-TfLiteRegistration* Register_GREATER() {
-  static TfLiteRegistration r = {/*init=*/nullptr,
+TfLiteRegistration Register_GREATER() {
+  return {/*init=*/comparisons::Init,
          /*free=*/nullptr,
-         /*prepare=*/nullptr,
+         /*prepare=*/comparisons::Prepare,
          /*invoke=*/comparisons::GreaterEval,
         /*profiling_string=*/nullptr,
         /*builtin_code=*/0,
         /*custom_name=*/nullptr,
         /*version=*/0};
-  return &r;
 }
 
-TfLiteRegistration* Register_GREATER_EQUAL() {
-  static TfLiteRegistration r = {/*init=*/nullptr,
+TfLiteRegistration Register_GREATER_EQUAL() {
+  return {/*init=*/comparisons::Init,
          /*free=*/nullptr,
-         /*prepare=*/nullptr,
+         /*prepare=*/comparisons::Prepare,
          /*invoke=*/comparisons::GreaterEqualEval,
         /*profiling_string=*/nullptr,
         /*builtin_code=*/0,
         /*custom_name=*/nullptr,
         /*version=*/0};
-  return &r;
 }
 
-TfLiteRegistration* Register_LESS() {
-  static TfLiteRegistration r = {/*init=*/nullptr,
+TfLiteRegistration Register_LESS() {
+  return {/*init=*/comparisons::Init,
          /*free=*/nullptr,
-         /*prepare=*/nullptr,
+         /*prepare=*/comparisons::Prepare,
          /*invoke=*/comparisons::LessEval,
         /*profiling_string=*/nullptr,
         /*builtin_code=*/0,
         /*custom_name=*/nullptr,
         /*version=*/0};
-  return &r;
 }
 
-TfLiteRegistration* Register_LESS_EQUAL() {
-  static TfLiteRegistration r = {/*init=*/nullptr,
+TfLiteRegistration Register_LESS_EQUAL() {
+  return {/*init=*/comparisons::Init,
          /*free=*/nullptr,
-         /*prepare=*/nullptr,
+         /*prepare=*/comparisons::Prepare,
          /*invoke=*/comparisons::LessEqualEval,
         /*profiling_string=*/nullptr,
         /*builtin_code=*/0,
         /*custom_name=*/nullptr,
         /*version=*/0};
-  return &r;
 }
 
 }  // namespace micro
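// ---------------------------------------------------------------------------
// QuantizeMultiplierSmallerThanOneExp(), called in Prepare() above,
// decomposes a real-valued scale below 1.0 into a 31-bit fixed-point
// multiplier plus a non-positive power-of-two exponent, so that Eval() needs
// only integer arithmetic. A worked example; the 0.5 input is illustrative,
// but the decomposition shown is exact for that value:

void MultiplierDecompositionExample() {
  int32_t multiplier;
  int shift;
  tflite::QuantizeMultiplierSmallerThanOneExp(/*double_multiplier=*/0.5,
                                              &multiplier, &shift);
  // multiplier == 1073741824 (2^30) and shift == 0, because
  // 0.5 == 1073741824 * 2^0 / 2^31.
}
// ---------------------------------------------------------------------------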
@@ -18,10 +18,11 @@ limitations under the License.
 
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/common.h"
-#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/portable_tensor.h"
 #include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
 #include "tensorflow/lite/kernels/internal/types.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
 
 namespace tflite {
 namespace ops {
@@ -31,14 +32,116 @@ namespace concatenation {
 constexpr int kMaxInputNum = 10;  // Maximum number of input tensors
 constexpr int kOutputTensor = 0;
 
+struct OpData {
+  ConcatenationParams params;
+};
+
+// Handles negative axis index, coerces to positive index value.
+inline int CalculatePositiveAxis(int axis, const TfLiteTensor* output_tensor) {
+  if (axis >= 0) {
+    return axis;
+  } else {
+    return NumDimensions(output_tensor) + axis;
+  }
+}
+
+// The following functions are helpers to get tensor data in the format that the
+// reference op implementation expects. They provide the same functionality as
+// class VectorOfTensors and class VectorOfQuantizedTensors in TFLite.
+
+// Gets shapes from a list of tensors.
+inline void GetAllInputTensorShapes(const TfLiteContext* context,
+                                    const TfLiteNode* node,
+                                    RuntimeShape all_shapes[kMaxInputNum]) {
+  TFLITE_DCHECK(context != nullptr);
+  TFLITE_DCHECK(node != nullptr);
+  for (int i = 0; i < node->inputs->size; ++i) {
+    const TfLiteEvalTensor* t = tflite::micro::GetEvalInput(context, node, i);
+    RuntimeShape shape = tflite::micro::GetTensorShape(t);
+    all_shapes[i].ReplaceWith(shape.DimensionsCount(), shape.DimsData());
+  }
+}
+
+// Get shape pointers from a list of shapes.
+inline void GetShapesPointers(const RuntimeShape* shapes, size_t num,
+                              const RuntimeShape* pointers[]) {
+  for (size_t i = 0; i < num; ++i) {
+    pointers[i] = &shapes[i];
+  }
+}
+
+// Gets data pointers from a list of tensors.
+template <typename T>
+inline void GetAllInputTensorData(const TfLiteContext* context,
+                                  const TfLiteNode* node,
+                                  T* all_data[kMaxInputNum]) {
+  TFLITE_DCHECK(context != nullptr);
+  TFLITE_DCHECK(node != nullptr);
+  for (int i = 0; i < node->inputs->size; ++i) {
+    const TfLiteEvalTensor* t = tflite::micro::GetEvalInput(context, node, i);
+    all_data[i] = tflite::micro::GetTensorData<T>(t);
+  }
+}
+
+template <typename data_type>
+void EvalUnquantized(TfLiteContext* context, TfLiteNode* node) {
+  // Collect the shapes and data pointer of input tensors
+  RuntimeShape inputs_shape[kMaxInputNum];
+  const RuntimeShape* inputs_shape_ptr[kMaxInputNum];
+  const data_type* inputs_data[kMaxInputNum];
+  GetAllInputTensorShapes(context, node, inputs_shape);
+  GetShapesPointers(inputs_shape, node->inputs->size, inputs_shape_ptr);
+  GetAllInputTensorData(context, node, inputs_data);
+
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kOutputTensor);
+
+  TFLITE_DCHECK(node->user_data != nullptr);
+  const OpData* data = static_cast<const OpData*>(node->user_data);
+
+  reference_ops::Concatenation(data->params, inputs_shape_ptr, inputs_data,
+                               tflite::micro::GetTensorShape(output),
+                               tflite::micro::GetTensorData<data_type>(output));
+}
+
+void EvalQuantizedUInt8(TfLiteContext* context, TfLiteNode* node) {
+  // Collect the shapes and data pointer of input tensors
+  RuntimeShape inputs_shape[kMaxInputNum];
+  const RuntimeShape* inputs_shape_ptr[kMaxInputNum];
+  const uint8_t* inputs_data[kMaxInputNum];
+  GetAllInputTensorShapes(context, node, inputs_shape);
+  GetShapesPointers(inputs_shape, node->inputs->size, inputs_shape_ptr);
+  GetAllInputTensorData(context, node, inputs_data);
+
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kOutputTensor);
+
+  TFLITE_DCHECK(node->user_data != nullptr);
+  const OpData* data = static_cast<const OpData*>(node->user_data);
+
+  reference_ops::ConcatenationWithScaling(
+      data->params, inputs_shape_ptr, inputs_data,
+      tflite::micro::GetTensorShape(output),
+      tflite::micro::GetTensorData<uint8_t>(output));
+}
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
+  return context->AllocatePersistentBuffer(context, sizeof(OpData));
+}
+
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   // This function only checks the types. Additional shape validations are
   // performed in the reference implementation called during Eval().
   const TfLiteConcatenationParams* params =
       reinterpret_cast<TfLiteConcatenationParams*>(node->builtin_data);
 
-  TfLiteType input_type = GetInput(context, node, 0)->type;
-  TfLiteType output_type = GetOutput(context, node, kOutputTensor)->type;
+  const TfLiteTensor* input_tensor = GetInput(context, node, 0);
+  TF_LITE_ENSURE(context, input_tensor != nullptr);
+  TfLiteType input_type = input_tensor->type;
+  const TfLiteTensor* output_tensor = GetOutput(context, node, kOutputTensor);
+  TF_LITE_ENSURE(context, output_tensor != nullptr);
+  TfLiteType output_type = output_tensor->type;
+
   // Check activation and input type
   TF_LITE_ENSURE_EQ(context, params->activation, kTfLiteActNone);
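// ---------------------------------------------------------------------------
// CalculatePositiveAxis() above folds Python-style negative axes into
// absolute ones: a non-negative axis passes through unchanged, a negative
// axis is counted from the back. Worked example (illustrative values, 4-D
// NHWC output tensor):
//
//   CalculatePositiveAxis(-1, output)  ==  NumDimensions(output) + (-1) == 3
//   CalculatePositiveAxis( 2, output)  ==  2
// ---------------------------------------------------------------------------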
@@ -57,133 +160,76 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   // Shapes with dimensions >4 are not yet supported with static allocation.
   for (int i = 0; i < num_inputs; ++i) {
     const TfLiteTensor* input = GetInput(context, node, i);
+    TF_LITE_ENSURE(context, input != nullptr);
     int num_dimensions = NumDimensions(input);
 
     if (num_dimensions > 4) {
       TF_LITE_KERNEL_LOG(
           context,
           "Op Concatenation does not currently support num dimensions >4 "
-          "Tensor '%s' has %d dimensions.",
-          input->name, num_dimensions);
+          "Tensor has %d dimensions.",
+          num_dimensions);
       return kTfLiteError;
     }
   }
 
+  // Calculate OpData.
+  TFLITE_DCHECK(node->user_data != nullptr);
+  OpData* data = static_cast<OpData*>(node->user_data);
+
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  TF_LITE_ENSURE(context, output != nullptr);
+
+  switch (output_type) {  // Already know in/outtypes are same.
+    case kTfLiteFloat32:
+    case kTfLiteInt32:
+    case kTfLiteInt64: {
+      data->params.axis = CalculatePositiveAxis(params->axis, output);
+      data->params.inputs_count = node->inputs->size;
+      break;
+    }
+    case kTfLiteUInt8:
+    case kTfLiteInt8: {
+      data->params.axis = CalculatePositiveAxis(params->axis, output);
+      data->params.inputs_count = node->inputs->size;
+
+      float* input_scales =
+          reinterpret_cast<float*>(context->AllocatePersistentBuffer(
+              context, node->inputs->size * sizeof(float)));
+
+      int32_t* input_zero_points =
+          reinterpret_cast<int32_t*>(context->AllocatePersistentBuffer(
+              context, node->inputs->size * sizeof(int32_t)));
+
+      // Allocate persistent scale and zeropoint buffers.
+      // Store input scale and zero point values in OpParams:
+      for (int i = 0; i < node->inputs->size; ++i) {
+        const TfLiteTensor* t = GetInput(context, node, i);
+        TF_LITE_ENSURE(context, t != nullptr);
+        input_scales[i] = t->params.scale;
+        input_zero_points[i] = t->params.zero_point;
+      }
+
+      data->params.input_scale = input_scales;
+      data->params.input_zeropoint = input_zero_points;
+      data->params.output_zeropoint = output->params.zero_point;
+      data->params.output_scale = output->params.scale;
+      break;
+    }
+    default:
+      TF_LITE_KERNEL_LOG(
+          context, "Op Concatenation does not currently support Type '%s'.",
+          TfLiteTypeGetName(output_type));
+      return kTfLiteError;
+  }
+
   return kTfLiteOk;
 }
 
-// Handles negative axis index, coerces to positive index value.
-inline int CalculatePositiveAxis(int axis, const TfLiteTensor* output_tensor) {
-  if (axis >= 0) {
-    return axis;
-  } else {
-    return NumDimensions(output_tensor) + axis;
-  }
-}
-
-// The following functions are helpers to get tensor data in the format that the
-// reference op implementation expects. They provide the same functionality as
-// class VectorOfTensors and class VectorOfQuantizedTensors in TFLite.
-
-// Gets shapes from a list of tensors.
-inline void GetAllTensorShapes(const TfLiteContext& context,
-                               const TfLiteIntArray& tensor_list,
-                               RuntimeShape all_shapes[kMaxInputNum]) {
-  for (int i = 0; i < tensor_list.size; ++i) {
-    const TfLiteTensor* t = &context.tensors[tensor_list.data[i]];
-    RuntimeShape shape = GetTensorShape(t);
-    all_shapes[i].ReplaceWith(shape.DimensionsCount(), shape.DimsData());
-  }
-}
-
-// Get shape pointers from a list of shapes.
-inline void GetShapesPointers(const RuntimeShape* shapes, size_t num,
-                              const RuntimeShape* pointers[]) {
-  for (size_t i = 0; i < num; ++i) {
-    pointers[i] = &shapes[i];
-  }
-}
-
-// Gets data pointers from a list of tensors.
-template <typename T>
-inline void GetAllTensorData(const TfLiteContext& context,
-                             const TfLiteIntArray& tensor_list,
-                             T* all_data[kMaxInputNum]) {
-  for (int i = 0; i < tensor_list.size; ++i) {
-    const TfLiteTensor* t = &context.tensors[tensor_list.data[i]];
-    all_data[i] = GetTensorData<T>(t);
-  }
-}
-
-// Gets scale and zero point from a list of tensors
-inline void GetAllQuantizationParam(const TfLiteContext& context,
-                                    const TfLiteIntArray& tensor_list,
-                                    float scales[kMaxInputNum],
-                                    int32 zero_points[kMaxInputNum]) {
-  for (int i = 0; i < tensor_list.size; ++i) {
-    const TfLiteTensor* t = &context.tensors[tensor_list.data[i]];
-    scales[i] = t->params.scale;
-    zero_points[i] = t->params.zero_point;
-  }
-}
-
-template <typename data_type>
-void EvalUnquantized(TfLiteContext* context, TfLiteNode* node) {
-  // Collect the shapes and data pointer of input tensors
-  RuntimeShape inputs_shape[kMaxInputNum];
-  const RuntimeShape* inputs_shape_ptr[kMaxInputNum];
-  const data_type* inputs_data[kMaxInputNum];
-  GetAllTensorShapes(*context, *node->inputs, inputs_shape);
-  GetShapesPointers(inputs_shape, node->inputs->size, inputs_shape_ptr);
-  GetAllTensorData(*context, *node->inputs, inputs_data);
-
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-
-  const TfLiteConcatenationParams* params =
-      reinterpret_cast<TfLiteConcatenationParams*>(node->builtin_data);
-
-  ConcatenationParams op_params;
-  op_params.axis = CalculatePositiveAxis(params->axis, output);
-  op_params.inputs_count = NumInputs(node);
-
-  reference_ops::Concatenation(op_params, inputs_shape_ptr, inputs_data,
-                               GetTensorShape(output),
-                               GetTensorData<data_type>(output));
-}
-
-void EvalQuantizedUInt8(TfLiteContext* context, TfLiteNode* node) {
-  // Collect the shapes and data pointer of input tensors
-  RuntimeShape inputs_shape[kMaxInputNum];
-  const RuntimeShape* inputs_shape_ptr[kMaxInputNum];
-  const uint8_t* inputs_data[kMaxInputNum];
-  float inputs_scale[kMaxInputNum];
-  int32 inputs_zero_point[kMaxInputNum];
-  GetAllTensorShapes(*context, *node->inputs, inputs_shape);
-  GetShapesPointers(inputs_shape, node->inputs->size, inputs_shape_ptr);
-  GetAllTensorData(*context, *node->inputs, inputs_data);
-  GetAllQuantizationParam(*context, *node->inputs, inputs_scale,
-                          inputs_zero_point);
-
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-
-  const TfLiteConcatenationParams* params =
-      reinterpret_cast<TfLiteConcatenationParams*>(node->builtin_data);
-
-  ConcatenationParams op_params;
-  op_params.axis = CalculatePositiveAxis(params->axis, output);
-  op_params.inputs_count = NumInputs(node);
-  op_params.input_zeropoint = inputs_zero_point;
-  op_params.input_scale = inputs_scale;
-  op_params.output_zeropoint = output->params.zero_point;
-  op_params.output_scale = output->params.scale;
-
-  reference_ops::ConcatenationWithScaling(op_params, inputs_shape_ptr,
-                                          inputs_data, GetTensorShape(output),
-                                          GetTensorData<uint8>(output));
-}
-
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  TfLiteType output_type = GetOutput(context, node, kOutputTensor)->type;
+  const TfLiteTensor* output_tensor = GetOutput(context, node, kOutputTensor);
+  TF_LITE_ENSURE(context, output_tensor != nullptr);
+  TfLiteType output_type = output_tensor->type;
+
   switch (output_type) {  // Already know in/outtypes are same.
     case kTfLiteFloat32:
@@ -214,16 +260,15 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 
 }  // namespace concatenation
 
-TfLiteRegistration* Register_CONCATENATION() {
-  static TfLiteRegistration r = {/*init=*/nullptr,
+TfLiteRegistration Register_CONCATENATION() {
+  return {/*init=*/concatenation::Init,
          /*free=*/nullptr,
          /*prepare=*/concatenation::Prepare,
          /*invoke=*/concatenation::Eval,
         /*profiling_string=*/nullptr,
        /*builtin_code=*/0,
        /*custom_name=*/nullptr,
        /*version=*/0};
-  return &r;
 }
 
 }  // namespace micro
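// ---------------------------------------------------------------------------
// In the quantized branch of Prepare() above, the per-input scales and zero
// points move into two persistent buffers so the Eval path never touches
// TfLiteTensor metadata again. Rough cost, assuming N concatenated inputs
// (N <= kMaxInputNum == 10):
//
//   scale buffer:      N * sizeof(float)   == 4 * N bytes
//   zero-point buffer: N * sizeof(int32_t) == 4 * N bytes
//
// Both allocations come from the interpreter's persistent arena via
// AllocatePersistentBuffer() and live until the interpreter itself is torn
// down; TFLM has no per-buffer free.
// ---------------------------------------------------------------------------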
@@ -1,279 +0,0 @@
-/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/lite/kernels/internal/reference/conv.h"
-
-#include "tensorflow/lite/c/builtin_op_data.h"
-#include "tensorflow/lite/c/common.h"
-#include "tensorflow/lite/kernels/internal/common.h"
-#include "tensorflow/lite/kernels/internal/quantization_util.h"
-#include "tensorflow/lite/kernels/internal/reference/integer_ops/conv.h"
-#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
-#include "tensorflow/lite/kernels/kernel_util.h"
-#include "tensorflow/lite/kernels/padding.h"
-
-namespace tflite {
-namespace ops {
-namespace micro {
-namespace conv {
-
-constexpr int kInputTensor = 0;
-constexpr int kFilterTensor = 1;
-constexpr int kBiasTensor = 2;
-constexpr int kOutputTensor = 0;
-// Adapted by jomjol 2020-06-05
-// constexpr int kMaxChannels = 1024;
-constexpr int kMaxChannels = 4096;
-
-// Conv is quantized along dimension 0:
-// https://www.tensorflow.org/lite/performance/quantization_spec
-constexpr int kConvQuantizedDimension = 0;
-
-// This file has 2 implementations of Conv.
-
-struct OpData {
-  TfLitePaddingValues padding;
-  // The scaling factor from input to output (aka the 'real multiplier') can
-  // be represented as a fixed point multiplier plus a left shift.
-  int32_t output_multiplier;
-  int output_shift;
-
-  // Per channel output multiplier and shift.
-  // TODO(b/141139247): Allocate these dynamically when possible.
-  int32_t per_channel_output_multiplier[kMaxChannels];
-  int32_t per_channel_output_shift[kMaxChannels];
-
-  // The range of the fused activation layer. For example for kNone and
-  // uint8_t these would be 0 and 255.
-  int32_t output_activation_min;
-  int32_t output_activation_max;
-};
-
-inline PaddingType RuntimePaddingType(TfLitePadding padding) {
-  switch (padding) {
-    case TfLitePadding::kTfLitePaddingSame:
-      return PaddingType::kSame;
-    case TfLitePadding::kTfLitePaddingValid:
-      return PaddingType::kValid;
-    case TfLitePadding::kTfLitePaddingUnknown:
-    default:
-      return PaddingType::kNone;
-  }
-}
-
-TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
-                             TfLiteConvParams* params, int width, int height,
-                             int filter_width, int filter_height, int out_width,
-                             int out_height, const TfLiteType data_type,
-                             OpData* data) {
-  bool has_bias = node->inputs->size == 3;
-  // Check number of inputs/outputs
-  TF_LITE_ENSURE(context, has_bias || node->inputs->size == 2);
-  TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
-
-  // Matching GetWindowedOutputSize in TensorFlow.
-  auto padding = params->padding;
-  data->padding = ComputePaddingHeightWidth(
-      params->stride_height, params->stride_width,
-      params->dilation_height_factor, params->dilation_width_factor, height,
-      width, filter_height, filter_width, padding, &out_height, &out_width);
-
-  // Note that quantized inference requires that all tensors have their
-  // parameters set. This is usually done during quantized training.
-  if (data_type != kTfLiteFloat32) {
-    const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-    const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
-    const TfLiteTensor* bias =
-        GetOptionalInputTensor(context, node, kBiasTensor);
-    TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-    int output_channels = filter->dims->data[kConvQuantizedDimension];
-
-    TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams(
-        context, input, filter, bias, output, params->activation,
-        &data->output_multiplier, &data->output_shift,
-        &data->output_activation_min, &data->output_activation_max,
-        data->per_channel_output_multiplier,
-        reinterpret_cast<int*>(data->per_channel_output_shift),
-        output_channels));
-  }
-  return kTfLiteOk;
-}
-
-void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
-                   TfLiteConvParams* params, OpData* data,
-                   const TfLiteTensor* input, const TfLiteTensor* filter,
-                   const TfLiteTensor* bias, TfLiteTensor* im2col,
-                   TfLiteTensor* hwcn_weights, TfLiteTensor* output) {
-  const int32_t input_offset = -input->params.zero_point;
-  const int32_t filter_offset = -filter->params.zero_point;
-  const int32_t output_offset = output->params.zero_point;
-
-  ConvParams op_params;
-  op_params.padding_type = RuntimePaddingType(params->padding);
-  op_params.padding_values.width = data->padding.width;
-  op_params.padding_values.height = data->padding.height;
-  op_params.stride_width = params->stride_width;
-  op_params.stride_height = params->stride_height;
-  op_params.dilation_width_factor = params->dilation_width_factor;
-  op_params.dilation_height_factor = params->dilation_height_factor;
-  op_params.input_offset = input_offset;
-  op_params.weights_offset = filter_offset;
-  op_params.output_offset = output_offset;
-  op_params.output_multiplier = data->output_multiplier;
-  op_params.output_shift = -data->output_shift;
-  op_params.quantized_activation_min = data->output_activation_min;
-  op_params.quantized_activation_max = data->output_activation_max;
-  reference_ops::Conv(op_params, GetTensorShape(input),
-                      GetTensorData<uint8_t>(input), GetTensorShape(filter),
-                      GetTensorData<uint8_t>(filter), GetTensorShape(bias),
-                      GetTensorData<int32_t>(bias), GetTensorShape(output),
-                      GetTensorData<uint8_t>(output), GetTensorShape(im2col),
-                      GetTensorData<uint8_t>(im2col), nullptr);
-}
-
-void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
-                             TfLiteConvParams* params, OpData* data,
-                             const TfLiteTensor* input,
-                             const TfLiteTensor* filter,
-                             const TfLiteTensor* bias, TfLiteTensor* output,
-                             TfLiteTensor* im2col) {
-  ConvParams op_params;
-  op_params.input_offset = -input->params.zero_point;
-  op_params.output_offset = output->params.zero_point;
-  op_params.stride_height = params->stride_height;
-  op_params.stride_width = params->stride_width;
-  op_params.dilation_height_factor = params->dilation_height_factor;
-  op_params.dilation_width_factor = params->dilation_width_factor;
-  op_params.padding_values.height = data->padding.height;
-  op_params.padding_values.width = data->padding.width;
-  op_params.quantized_activation_min = data->output_activation_min;
-  op_params.quantized_activation_max = data->output_activation_max;
-
-  reference_integer_ops::ConvPerChannel(
-      op_params, data->per_channel_output_multiplier,
-      data->per_channel_output_shift, GetTensorShape(input),
-      GetTensorData<int8>(input), GetTensorShape(filter),
-      GetTensorData<int8>(filter), GetTensorShape(bias),
-      GetTensorData<int32>(bias), GetTensorShape(output),
-      GetTensorData<int8>(output));
-}
-
-void EvalFloat(TfLiteContext* context, TfLiteNode* node,
-               TfLiteConvParams* params, OpData* data,
-               const TfLiteTensor* input, const TfLiteTensor* filter,
-               const TfLiteTensor* bias, TfLiteTensor* im2col,
-               TfLiteTensor* hwcn_weights, TfLiteTensor* output) {
-  float output_activation_min, output_activation_max;
-  CalculateActivationRange(params->activation, &output_activation_min,
-                           &output_activation_max);
-
-  ConvParams op_params;
-  op_params.padding_type = RuntimePaddingType(params->padding);
-  op_params.padding_values.width = data->padding.width;
-  op_params.padding_values.height = data->padding.height;
-  op_params.stride_width = params->stride_width;
-  op_params.stride_height = params->stride_height;
-  op_params.dilation_width_factor = params->dilation_width_factor;
-  op_params.dilation_height_factor = params->dilation_height_factor;
-  op_params.float_activation_min = output_activation_min;
-  op_params.float_activation_max = output_activation_max;
-
-  reference_ops::Conv(op_params, GetTensorShape(input),
-                      GetTensorData<float>(input), GetTensorShape(filter),
-                      GetTensorData<float>(filter), GetTensorShape(bias),
-                      GetTensorData<float>(bias), GetTensorShape(output),
-                      GetTensorData<float>(output), GetTensorShape(im2col),
-                      GetTensorData<float>(im2col));
-}
-
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
-
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
-  const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
-
-  int input_width = input->dims->data[2];
-  int input_height = input->dims->data[1];
-  int filter_width = filter->dims->data[2];
-  int filter_height = filter->dims->data[1];
-  int output_width = output->dims->data[2];
-  int output_height = output->dims->data[1];
-
-  OpData data;
-
-  // All per-channel quantized tensors need valid zero point and scale arrays.
-  if (input->type == kTfLiteInt8) {
-    TF_LITE_ENSURE_EQ(context, filter->quantization.type,
-                      kTfLiteAffineQuantization);
-
-    const auto* affine_quantization =
-        reinterpret_cast<TfLiteAffineQuantization*>(
-            filter->quantization.params);
-    TF_LITE_ENSURE(context, affine_quantization);
-    TF_LITE_ENSURE(context, affine_quantization->scale);
-    TF_LITE_ENSURE(context, affine_quantization->zero_point);
-
-    TF_LITE_ENSURE(context,
-                   affine_quantization->scale->size == 1 ||
-                       affine_quantization->scale->size ==
-                           filter->dims->data[kConvQuantizedDimension]);
-    TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size,
-                      affine_quantization->zero_point->size);
-  }
-
-  TF_LITE_ENSURE_STATUS(CalculateOpData(
-      context, node, params, input_width, input_height, filter_width,
-      filter_height, output_width, output_height, input->type, &data));
-
-  switch (input->type) {  // Already know in/out types are same.
-    case kTfLiteFloat32:
-      EvalFloat(context, node, params, &data, input, filter, bias, nullptr,
-                nullptr, output);
-      break;
-    case kTfLiteInt8:
-      EvalQuantizedPerChannel(context, node, params, &data, input, filter, bias,
-                              output, nullptr);
-      break;
-    case kTfLiteUInt8:
-      EvalQuantized(context, node, params, &data, input, filter, bias, nullptr,
-                    nullptr, output);
-      break;
-    default:
-      TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
-                         TfLiteTypeGetName(input->type), input->type);
-      return kTfLiteError;
-  }
-  return kTfLiteOk;
-}
-
-}  // namespace conv
-
-TfLiteRegistration* Register_CONV_2D() {
-  static TfLiteRegistration r = {/*init=*/nullptr,
-                                 /*free=*/nullptr,
-                                 /*prepare=*/nullptr,
-                                 /*invoke=*/conv::Eval,
-                                 /*profiling_string=*/nullptr,
-                                 /*builtin_code=*/0,
-                                 /*custom_name=*/nullptr,
-                                 /*version=*/0};
-  return &r;
-}
-
-}  // namespace micro
-}  // namespace ops
-}  // namespace tflite
@@ -23,19 +23,15 @@ limitations under the License.
 #include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/kernels/padding.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
 
 namespace tflite {
-namespace ops {
-namespace micro {
-namespace conv {
+namespace {
 
 constexpr int kInputTensor = 0;
 constexpr int kFilterTensor = 1;
 constexpr int kBiasTensor = 2;
 constexpr int kOutputTensor = 0;
-// Adapted by jomjol 2020-06-05
-// constexpr int kMaxChannels = 1024;
-constexpr int kMaxChannels = 32384;
 
 // Conv is quantized along dimension 0:
 // https://www.tensorflow.org/lite/performance/quantization_spec
@@ -45,15 +41,20 @@ constexpr int kConvQuantizedDimension = 0;
 
 struct OpData {
   TfLitePaddingValues padding;
 
+  // Cached tensor zero point values for quantized operations.
+  int32_t input_zero_point;
+  int32_t filter_zero_point;
+  int32_t output_zero_point;
+
   // The scaling factor from input to output (aka the 'real multiplier') can
   // be represented as a fixed point multiplier plus a left shift.
   int32_t output_multiplier;
   int output_shift;
 
   // Per channel output multiplier and shift.
-  // TODO(b/141139247): Allocate these dynamically when possible.
-  int32_t per_channel_output_multiplier[kMaxChannels];
-  int32_t per_channel_output_shift[kMaxChannels];
+  int32_t* per_channel_output_multiplier;
+  int32_t* per_channel_output_shift;
 
   // The range of the fused activation layer. For example for kNone and
   // uint8_t these would be 0 and 255.
@@ -74,10 +75,10 @@ inline PaddingType RuntimePaddingType(TfLitePadding padding) {
 }
 
 TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
-                             TfLiteConvParams* params, int width, int height,
-                             int filter_width, int filter_height, int out_width,
-                             int out_height, const TfLiteType data_type,
-                             OpData* data) {
+                             const TfLiteConvParams* params, int width,
+                             int height, int filter_width, int filter_height,
+                             int out_width, int out_height,
+                             const TfLiteType data_type, OpData* data) {
   bool has_bias = node->inputs->size == 3;
   // Check number of inputs/outputs
   TF_LITE_ENSURE(context, has_bias || node->inputs->size == 2);
@@ -94,10 +95,13 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
   // parameters set. This is usually done during quantized training.
   if (data_type != kTfLiteFloat32) {
     const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+    TF_LITE_ENSURE(context, input != nullptr);
     const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
+    TF_LITE_ENSURE(context, filter != nullptr);
     const TfLiteTensor* bias =
         GetOptionalInputTensor(context, node, kBiasTensor);
     TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+    TF_LITE_ENSURE(context, output != nullptr);
     int output_channels = filter->dims->data[kConvQuantizedDimension];
 
     TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams(
@@ -111,100 +115,24 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
   return kTfLiteOk;
 }
 
-void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
-                   TfLiteConvParams* params, OpData* data,
-                   const TfLiteTensor* input, const TfLiteTensor* filter,
-                   const TfLiteTensor* bias, TfLiteTensor* im2col,
-                   TfLiteTensor* hwcn_weights, TfLiteTensor* output) {
-  const int32_t input_offset = -input->params.zero_point;
-  const int32_t filter_offset = -filter->params.zero_point;
-  const int32_t output_offset = output->params.zero_point;
-
-  ConvParams op_params;
-  op_params.padding_type = RuntimePaddingType(params->padding);
-  op_params.padding_values.width = data->padding.width;
-  op_params.padding_values.height = data->padding.height;
-  op_params.stride_width = params->stride_width;
-  op_params.stride_height = params->stride_height;
-  op_params.dilation_width_factor = params->dilation_width_factor;
-  op_params.dilation_height_factor = params->dilation_height_factor;
-  op_params.input_offset = input_offset;
-  op_params.weights_offset = filter_offset;
-  op_params.output_offset = output_offset;
-  op_params.output_multiplier = data->output_multiplier;
-  op_params.output_shift = -data->output_shift;
-  op_params.quantized_activation_min = data->output_activation_min;
-  op_params.quantized_activation_max = data->output_activation_max;
-  reference_ops::Conv(op_params, GetTensorShape(input),
-                      GetTensorData<uint8_t>(input), GetTensorShape(filter),
-                      GetTensorData<uint8_t>(filter), GetTensorShape(bias),
-                      GetTensorData<int32_t>(bias), GetTensorShape(output),
-                      GetTensorData<uint8_t>(output), GetTensorShape(im2col),
-                      GetTensorData<uint8_t>(im2col), nullptr);
-}
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
+  return context->AllocatePersistentBuffer(context, sizeof(OpData));
+}
 
-void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
-                             TfLiteConvParams* params, OpData* data,
-                             const TfLiteTensor* input,
-                             const TfLiteTensor* filter,
-                             const TfLiteTensor* bias, TfLiteTensor* output,
-                             TfLiteTensor* im2col) {
-  ConvParams op_params;
-  op_params.input_offset = -input->params.zero_point;
-  op_params.output_offset = output->params.zero_point;
-  op_params.stride_height = params->stride_height;
-  op_params.stride_width = params->stride_width;
-  op_params.dilation_height_factor = params->dilation_height_factor;
-  op_params.dilation_width_factor = params->dilation_width_factor;
-  op_params.padding_values.height = data->padding.height;
-  op_params.padding_values.width = data->padding.width;
-  op_params.quantized_activation_min = data->output_activation_min;
-  op_params.quantized_activation_max = data->output_activation_max;
-
-  reference_integer_ops::ConvPerChannel(
-      op_params, data->per_channel_output_multiplier,
-      data->per_channel_output_shift, GetTensorShape(input),
-      GetTensorData<int8>(input), GetTensorShape(filter),
-      GetTensorData<int8>(filter), GetTensorShape(bias),
-      GetTensorData<int32>(bias), GetTensorShape(output),
-      GetTensorData<int8>(output));
-}
-
-void EvalFloat(TfLiteContext* context, TfLiteNode* node,
-               TfLiteConvParams* params, OpData* data,
-               const TfLiteTensor* input, const TfLiteTensor* filter,
-               const TfLiteTensor* bias, TfLiteTensor* im2col,
-               TfLiteTensor* hwcn_weights, TfLiteTensor* output) {
-  float output_activation_min, output_activation_max;
-  CalculateActivationRange(params->activation, &output_activation_min,
-                           &output_activation_max);
-
-  ConvParams op_params;
-  op_params.padding_type = RuntimePaddingType(params->padding);
-  op_params.padding_values.width = data->padding.width;
-  op_params.padding_values.height = data->padding.height;
-  op_params.stride_width = params->stride_width;
-  op_params.stride_height = params->stride_height;
-  op_params.dilation_width_factor = params->dilation_width_factor;
-  op_params.dilation_height_factor = params->dilation_height_factor;
-  op_params.float_activation_min = output_activation_min;
-  op_params.float_activation_max = output_activation_max;
-
-  reference_ops::Conv(op_params, GetTensorShape(input),
-                      GetTensorData<float>(input), GetTensorShape(filter),
-                      GetTensorData<float>(filter), GetTensorShape(bias),
-                      GetTensorData<float>(bias), GetTensorShape(output),
-                      GetTensorData<float>(output), GetTensorShape(im2col),
-                      GetTensorData<float>(im2col));
-}
-
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TFLITE_DCHECK(node->user_data != nullptr);
+  TFLITE_DCHECK(node->builtin_data != nullptr);
+
+  OpData* data = static_cast<OpData*>(node->user_data);
+  const auto params = static_cast<const TfLiteConvParams*>(node->builtin_data);
 
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  TF_LITE_ENSURE(context, output != nullptr);
   const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TF_LITE_ENSURE(context, input != nullptr);
   const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
-  const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
+  TF_LITE_ENSURE(context, filter != nullptr);
 
   int input_width = input->dims->data[2];
   int input_height = input->dims->data[1];
@@ -213,8 +141,14 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   int output_width = output->dims->data[2];
   int output_height = output->dims->data[1];
 
-  struct tflite::ops::micro::conv::OpData *data = (struct tflite::ops::micro::conv::OpData*) malloc(sizeof(struct tflite::ops::micro::conv::OpData));
+  // Dynamically allocate per-channel quantization parameters.
+  const int num_channels = filter->dims->data[kConvQuantizedDimension];
+  data->per_channel_output_multiplier =
+      static_cast<int32_t*>(context->AllocatePersistentBuffer(
+          context, num_channels * sizeof(int32_t)));
+  data->per_channel_output_shift =
+      static_cast<int32_t*>(context->AllocatePersistentBuffer(
+          context, num_channels * sizeof(int32_t)));
 
   // All per-channel quantized tensors need valid zero point and scale arrays.
   if (input->type == kTfLiteInt8) {
@@ -222,8 +156,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
                       kTfLiteAffineQuantization);
 
     const auto* affine_quantization =
-        reinterpret_cast<TfLiteAffineQuantization*>(
-            filter->quantization.params);
+        static_cast<TfLiteAffineQuantization*>(filter->quantization.params);
     TF_LITE_ENSURE(context, affine_quantization);
     TF_LITE_ENSURE(context, affine_quantization->scale);
     TF_LITE_ENSURE(context, affine_quantization->zero_point);
@@ -240,6 +173,136 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       context, node, params, input_width, input_height, filter_width,
       filter_height, output_width, output_height, input->type, data));
 
+  data->input_zero_point = input->params.zero_point;
+  data->filter_zero_point = filter->params.zero_point;
+  data->output_zero_point = output->params.zero_point;
+
+  return kTfLiteOk;
+}  // namespace conv
+
+void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
+                   TfLiteConvParams* params, const OpData& data,
+                   const TfLiteEvalTensor* input,
+                   const TfLiteEvalTensor* filter, const TfLiteEvalTensor* bias,
+                   TfLiteEvalTensor* im2col, TfLiteEvalTensor* hwcn_weights,
+                   TfLiteEvalTensor* output) {
+  const int32_t input_offset = -data.input_zero_point;
+  const int32_t filter_offset = -data.filter_zero_point;
+  const int32_t output_offset = data.output_zero_point;
+
+  // TODO(b/154032858): Investigate removing extra copies.
+  ConvParams op_params;
+  op_params.padding_type = RuntimePaddingType(params->padding);
+  op_params.padding_values.width = data.padding.width;
+  op_params.padding_values.height = data.padding.height;
+  op_params.stride_width = params->stride_width;
+  op_params.stride_height = params->stride_height;
+  op_params.dilation_width_factor = params->dilation_width_factor;
+  op_params.dilation_height_factor = params->dilation_height_factor;
+  op_params.input_offset = input_offset;
+  op_params.weights_offset = filter_offset;
+  op_params.output_offset = output_offset;
+  op_params.output_multiplier = data.output_multiplier;
+  op_params.output_shift = -data.output_shift;
+  op_params.quantized_activation_min = data.output_activation_min;
+  op_params.quantized_activation_max = data.output_activation_max;
+  reference_ops::Conv(op_params, tflite::micro::GetTensorShape(input),
+                      tflite::micro::GetTensorData<uint8_t>(input),
+                      tflite::micro::GetTensorShape(filter),
+                      tflite::micro::GetTensorData<uint8_t>(filter),
+                      tflite::micro::GetTensorShape(bias),
+                      tflite::micro::GetTensorData<int32_t>(bias),
+                      tflite::micro::GetTensorShape(output),
+                      tflite::micro::GetTensorData<uint8_t>(output),
+                      tflite::micro::GetTensorShape(im2col),
+                      tflite::micro::GetTensorData<uint8_t>(im2col), nullptr);
+}
+
+void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
+                             TfLiteConvParams* params, const OpData& data,
+                             const TfLiteEvalTensor* input,
+                             const TfLiteEvalTensor* filter,
+                             const TfLiteEvalTensor* bias,
+                             TfLiteEvalTensor* output,
+                             TfLiteEvalTensor* im2col) {
+  // TODO(b/154032858): Investigate removing extra copies.
+  ConvParams op_params;
+  op_params.input_offset = -data.input_zero_point;
+  op_params.output_offset = data.output_zero_point;
+  op_params.stride_height = params->stride_height;
+  op_params.stride_width = params->stride_width;
+  op_params.dilation_height_factor = params->dilation_height_factor;
+  op_params.dilation_width_factor = params->dilation_width_factor;
+  op_params.padding_values.height = data.padding.height;
+  op_params.padding_values.width = data.padding.width;
+  op_params.quantized_activation_min = data.output_activation_min;
+  op_params.quantized_activation_max = data.output_activation_max;
+
+  reference_integer_ops::ConvPerChannel(
+      op_params, data.per_channel_output_multiplier,
+      data.per_channel_output_shift, tflite::micro::GetTensorShape(input),
+      tflite::micro::GetTensorData<int8_t>(input),
+      tflite::micro::GetTensorShape(filter),
+      tflite::micro::GetTensorData<int8_t>(filter),
+      tflite::micro::GetTensorShape(bias),
+      tflite::micro::GetTensorData<int32_t>(bias),
+      tflite::micro::GetTensorShape(output),
+      tflite::micro::GetTensorData<int8_t>(output));
+}
+
+void EvalFloat(TfLiteContext* context, TfLiteNode* node,
+               TfLiteConvParams* params, const OpData& data,
+               const TfLiteEvalTensor* input, const TfLiteEvalTensor* filter,
+               const TfLiteEvalTensor* bias, TfLiteEvalTensor* im2col,
+               TfLiteEvalTensor* hwcn_weights, TfLiteEvalTensor* output) {
+  float output_activation_min, output_activation_max;
+  CalculateActivationRange(params->activation, &output_activation_min,
+                           &output_activation_max);
+  // TODO(b/154032858): Investigate removing extra copies.
+  ConvParams op_params;
+  op_params.padding_type = RuntimePaddingType(params->padding);
+  op_params.padding_values.width = data.padding.width;
+  op_params.padding_values.height = data.padding.height;
+  op_params.stride_width = params->stride_width;
+  op_params.stride_height = params->stride_height;
+  op_params.dilation_width_factor = params->dilation_width_factor;
+  op_params.dilation_height_factor = params->dilation_height_factor;
+  op_params.float_activation_min = output_activation_min;
+  op_params.float_activation_max = output_activation_max;
+
+  reference_ops::Conv(op_params, tflite::micro::GetTensorShape(input),
+                      tflite::micro::GetTensorData<float>(input),
+                      tflite::micro::GetTensorShape(filter),
+                      tflite::micro::GetTensorData<float>(filter),
+                      tflite::micro::GetTensorShape(bias),
+                      tflite::micro::GetTensorData<float>(bias),
+                      tflite::micro::GetTensorShape(output),
+                      tflite::micro::GetTensorData<float>(output),
+                      tflite::micro::GetTensorShape(im2col),
+                      tflite::micro::GetTensorData<float>(im2col));
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
+
+  const TfLiteEvalTensor* input =
+      tflite::micro::GetEvalInput(context, node, kInputTensor);
+  const TfLiteEvalTensor* filter =
+      tflite::micro::GetEvalInput(context, node, kFilterTensor);
+  const TfLiteEvalTensor* bias =
+      (NumInputs(node) == 3)
+          ? tflite::micro::GetEvalInput(context, node, kBiasTensor)
+          : nullptr;
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kOutputTensor);
+
+  TFLITE_DCHECK(node->user_data != nullptr);
+  const OpData& data = *(static_cast<const OpData*>(node->user_data));
+
+  TF_LITE_ENSURE_EQ(context, input->type, output->type);
+  TF_LITE_ENSURE_MSG(context, input->type == filter->type,
+                     "Hybrid models are not supported on TFLite Micro.");
+
   switch (input->type) {  // Already know in/out types are same.
     case kTfLiteFloat32:
       EvalFloat(context, node, params, data, input, filter, bias, nullptr,
@@ -256,27 +319,22 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     default:
       TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
                          TfLiteTypeGetName(input->type), input->type);
-      free(data);
      return kTfLiteError;
   }
-  free(data);
   return kTfLiteOk;
 }
 
-}  // namespace conv
+}  // namespace
 
-TfLiteRegistration* Register_CONV_2D() {
-  static TfLiteRegistration r = {/*init=*/nullptr,
-                                 /*free=*/nullptr,
-                                 /*prepare=*/nullptr,
-                                 /*invoke=*/conv::Eval,
-                                 /*profiling_string=*/nullptr,
-                                 /*builtin_code=*/0,
-                                 /*custom_name=*/nullptr,
-                                 /*version=*/0};
-  return &r;
+TfLiteRegistration Register_CONV_2D() {
+  return {/*init=*/Init,
+          /*free=*/nullptr,
+          /*prepare=*/Prepare,
+          /*invoke=*/Eval,
+          /*profiling_string=*/nullptr,
+          /*builtin_code=*/0,
+          /*custom_name=*/nullptr,
+          /*version=*/0};
 }
 
-}  // namespace micro
-}  // namespace ops
 }  // namespace tflite
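The conv.cc rewrite above follows the TFLite Micro kernel-lifecycle migration: per-op state leaves the fixed kMaxChannels arrays (and the malloc/free that the patched Eval needed) and moves into an OpData block that Init obtains from the interpreter arena via context->AllocatePersistentBuffer, Prepare fills once with padding, zero points, and per-channel multipliers, and Eval only reads. Below is a minimal sketch of that lifecycle for a hypothetical single-input op; MyOpData and Register_MY_OP are illustrative names, not part of this commit.

// Minimal sketch of the Init/Prepare/Eval lifecycle used by the updated
// kernels. MyOpData and Register_MY_OP are hypothetical; only the pattern
// (arena-backed state, cached quantization params) mirrors the diff above.
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/kernel_util.h"

namespace tflite {
namespace {

struct MyOpData {
  int32_t input_zero_point;   // cached once in Prepare
  int32_t output_zero_point;  // read (not recomputed) in every Eval
};

void* Init(TfLiteContext* context, const char* buffer, size_t length) {
  // State lives in the interpreter's tensor arena; no malloc/free in Eval.
  return context->AllocatePersistentBuffer(context, sizeof(MyOpData));
}

TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
  auto* data = static_cast<MyOpData*>(node->user_data);
  const TfLiteTensor* input = GetInput(context, node, 0);
  TfLiteTensor* output = GetOutput(context, node, 0);
  TF_LITE_ENSURE(context, input != nullptr);
  TF_LITE_ENSURE(context, output != nullptr);
  data->input_zero_point = input->params.zero_point;
  data->output_zero_point = output->params.zero_point;
  return kTfLiteOk;
}

TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
  const auto& data = *static_cast<const MyOpData*>(node->user_data);
  (void)data;  // a real kernel would run its reference op here
  return kTfLiteOk;
}

}  // namespace

TfLiteRegistration Register_MY_OP() {
  return {/*init=*/Init,
          /*free=*/nullptr,
          /*prepare=*/Prepare,
          /*invoke=*/Eval,
          /*profiling_string=*/nullptr,
          /*builtin_code=*/0,
          /*custom_name=*/nullptr,
          /*version=*/0};
}

}  // namespace tflite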
@@ -24,18 +24,15 @@ limitations under the License.
 #include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/kernels/padding.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
 
 namespace tflite {
-namespace ops {
-namespace micro {
-namespace depthwise_conv {
 namespace {
 
 constexpr int kInputTensor = 0;
 constexpr int kFilterTensor = 1;
 constexpr int kBiasTensor = 2;
 constexpr int kOutputTensor = 0;
-constexpr int kMaxChannels = 1024;
 
 // Depthwise conv is quantized along dimension 3:
 // https://www.tensorflow.org/lite/performance/quantization_spec
@@ -43,16 +40,20 @@ constexpr int kDepthwiseConvQuantizedDimension = 3;
 
 struct OpData {
   TfLitePaddingValues padding;
 
+  // Cached tensor zero point values for quantized operations.
+  int32_t input_zero_point;
+  int32_t filter_zero_point;
+  int32_t output_zero_point;
+
   // The scaling factor from input to output (aka the 'real multiplier') can
   // be represented as a fixed point multiplier plus a left shift.
   int32_t output_multiplier;
   int output_shift;
 
   // Per channel output multiplier and shift.
-  // TODO(b/141139247): Allocate these dynamically when possible.
-  int32_t per_channel_output_multiplier[kMaxChannels];
-  int32_t per_channel_output_shift[kMaxChannels];
+  int32_t* per_channel_output_multiplier;
+  int32_t* per_channel_output_shift;
 
   // The range of the fused activation layer. For example for kNone and
   // uint8_t these would be 0 and 255.
   int32_t output_activation_min;
@@ -78,125 +79,44 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
   // parameters set. This is usually done during quantized training.
   if (data_type != kTfLiteFloat32) {
     const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+    TF_LITE_ENSURE(context, input != nullptr);
     const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
+    TF_LITE_ENSURE(context, filter != nullptr);
     const TfLiteTensor* bias =
         GetOptionalInputTensor(context, node, kBiasTensor);
     TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+    TF_LITE_ENSURE(context, output != nullptr);
     int num_channels = filter->dims->data[kDepthwiseConvQuantizedDimension];
 
-    TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams(
+    return tflite::PopulateConvolutionQuantizationParams(
         context, input, filter, bias, output, params->activation,
         &data->output_multiplier, &data->output_shift,
         &data->output_activation_min, &data->output_activation_max,
         data->per_channel_output_multiplier,
-        reinterpret_cast<int*>(data->per_channel_output_shift), num_channels));
+        reinterpret_cast<int*>(data->per_channel_output_shift), num_channels);
   }
   return kTfLiteOk;
 }
 
-}  // namespace
-
-void EvalFloat(TfLiteContext* context, TfLiteNode* node,
-               TfLiteDepthwiseConvParams* params, OpData* data,
-               const TfLiteTensor* input, const TfLiteTensor* filter,
-               const TfLiteTensor* bias, TfLiteTensor* output) {
-  float output_activation_min, output_activation_max;
-  CalculateActivationRange(params->activation, &output_activation_min,
-                           &output_activation_max);
-
-  tflite::DepthwiseParams op_params;
-  // Padding type is ignored, but still set.
-  op_params.padding_type = PaddingType::kSame;
-  op_params.padding_values.width = data->padding.width;
-  op_params.padding_values.height = data->padding.height;
-  op_params.stride_width = params->stride_width;
-  op_params.stride_height = params->stride_height;
-  op_params.dilation_width_factor = params->dilation_width_factor;
-  op_params.dilation_height_factor = params->dilation_height_factor;
-  op_params.depth_multiplier = params->depth_multiplier;
-  op_params.float_activation_min = output_activation_min;
-  op_params.float_activation_max = output_activation_max;
-
-  tflite::reference_ops::DepthwiseConv(
-      op_params, GetTensorShape(input), GetTensorData<float>(input),
-      GetTensorShape(filter), GetTensorData<float>(filter),
-      GetTensorShape(bias), GetTensorData<float>(bias), GetTensorShape(output),
-      GetTensorData<float>(output));
-}
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
+  return context->AllocatePersistentBuffer(context, sizeof(OpData));
+}
 
-void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
-                             TfLiteDepthwiseConvParams* params, OpData* data,
-                             const TfLiteTensor* input,
-                             const TfLiteTensor* filter,
-                             const TfLiteTensor* bias, TfLiteTensor* output) {
-  DepthwiseParams op_params;
-  op_params.padding_type = PaddingType::kSame;
-  op_params.padding_values.width = data->padding.width;
-  op_params.padding_values.height = data->padding.height;
-  op_params.stride_width = params->stride_width;
-  op_params.stride_height = params->stride_height;
-  op_params.dilation_width_factor = params->dilation_width_factor;
-  op_params.dilation_height_factor = params->dilation_height_factor;
-  op_params.depth_multiplier = params->depth_multiplier;
-  op_params.input_offset = -input->params.zero_point;
-  op_params.weights_offset = 0;
-  op_params.output_offset = output->params.zero_point;
-  // TODO(b/130439627): Use calculated value for clamping.
-  op_params.quantized_activation_min = std::numeric_limits<int8_t>::min();
-  op_params.quantized_activation_max = std::numeric_limits<int8_t>::max();
-
-  reference_integer_ops::DepthwiseConvPerChannel(
-      op_params, data->per_channel_output_multiplier,
-      data->per_channel_output_shift, GetTensorShape(input),
-      GetTensorData<int8>(input), GetTensorShape(filter),
-      GetTensorData<int8>(filter), GetTensorShape(bias),
-      GetTensorData<int32>(bias), GetTensorShape(output),
-      GetTensorData<int8>(output));
-}
-
-void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
-                   TfLiteDepthwiseConvParams* params, OpData* data,
-                   const TfLiteTensor* input, const TfLiteTensor* filter,
-                   const TfLiteTensor* bias, TfLiteTensor* output) {
-  const int32_t input_offset = -input->params.zero_point;
-  const int32_t filter_offset = -filter->params.zero_point;
-  const int32_t output_offset = output->params.zero_point;
-
-  tflite::DepthwiseParams op_params;
-  // Padding type is ignored, but still set.
-  op_params.padding_type = PaddingType::kSame;
-  op_params.padding_values.width = data->padding.width;
-  op_params.padding_values.height = data->padding.height;
-  op_params.stride_width = params->stride_width;
-  op_params.stride_height = params->stride_height;
-  op_params.dilation_width_factor = params->dilation_width_factor;
-  op_params.dilation_height_factor = params->dilation_height_factor;
-  op_params.depth_multiplier = params->depth_multiplier;
-  op_params.quantized_activation_min = data->output_activation_min;
-  op_params.quantized_activation_max = data->output_activation_max;
-  op_params.input_offset = input_offset;
-  op_params.weights_offset = filter_offset;
-  op_params.output_offset = output_offset;
-  op_params.output_multiplier = data->output_multiplier;
-  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
-  op_params.output_shift = -data->output_shift;
-
-  tflite::reference_ops::DepthwiseConv(
-      op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
-      GetTensorShape(filter), GetTensorData<uint8_t>(filter),
-      GetTensorShape(bias), GetTensorData<int32_t>(bias),
-      GetTensorShape(output), GetTensorData<uint8_t>(output));
-}
-
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TFLITE_DCHECK(node->user_data != nullptr);
+  TFLITE_DCHECK(node->builtin_data != nullptr);
+
   auto* params =
       reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data);
+  OpData* data = static_cast<OpData*>(node->user_data);
 
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  TF_LITE_ENSURE(context, output != nullptr);
   const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TF_LITE_ENSURE(context, input != nullptr);
   const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
-  const TfLiteTensor* bias =
-      (NumInputs(node) == 3) ? GetInput(context, node, kBiasTensor) : nullptr;
+  TF_LITE_ENSURE(context, filter != nullptr);
 
   const TfLiteType data_type = input->type;
   int width = SizeOfDimension(input, 2);
@@ -204,7 +124,16 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   int filter_width = SizeOfDimension(filter, 2);
   int filter_height = SizeOfDimension(filter, 1);
 
-  OpData data;
+  // Per channel quantization is only needed for int8_t inference. For other
+  // quantized types, only a single scale and zero point is needed.
+  const int num_channels = filter->dims->data[kDepthwiseConvQuantizedDimension];
+  // Dynamically allocate per-channel quantization parameters.
+  data->per_channel_output_multiplier =
+      reinterpret_cast<int32_t*>(context->AllocatePersistentBuffer(
+          context, num_channels * sizeof(int32_t)));
+  data->per_channel_output_shift =
+      reinterpret_cast<int32_t*>(context->AllocatePersistentBuffer(
+          context, num_channels * sizeof(int32_t)));
 
   // All per-channel quantized tensors need valid zero point and scale arrays.
   if (input->type == kTfLiteInt8) {
@@ -227,20 +156,151 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 
   TF_LITE_ENSURE_STATUS(CalculateOpData(context, node, params, width, height,
                                         filter_width, filter_height, data_type,
-                                        &data));
+                                        data));
 
+  data->input_zero_point = input->params.zero_point;
+  data->filter_zero_point = filter->params.zero_point;
+  data->output_zero_point = output->params.zero_point;
+
+  return kTfLiteOk;
+}
+
+void EvalFloat(TfLiteContext* context, TfLiteNode* node,
+               TfLiteDepthwiseConvParams* params, const OpData& data,
+               const TfLiteEvalTensor* input, const TfLiteEvalTensor* filter,
+               const TfLiteEvalTensor* bias, TfLiteEvalTensor* output) {
+  float output_activation_min, output_activation_max;
+  CalculateActivationRange(params->activation, &output_activation_min,
+                           &output_activation_max);
+
+  tflite::DepthwiseParams op_params;
+  // Padding type is ignored, but still set.
+  op_params.padding_type = PaddingType::kSame;
+  op_params.padding_values.width = data.padding.width;
+  op_params.padding_values.height = data.padding.height;
+  op_params.stride_width = params->stride_width;
+  op_params.stride_height = params->stride_height;
+  op_params.dilation_width_factor = params->dilation_width_factor;
+  op_params.dilation_height_factor = params->dilation_height_factor;
+  op_params.depth_multiplier = params->depth_multiplier;
+  op_params.float_activation_min = output_activation_min;
+  op_params.float_activation_max = output_activation_max;
+
+  tflite::reference_ops::DepthwiseConv(
+      op_params, tflite::micro::GetTensorShape(input),
+      tflite::micro::GetTensorData<float>(input),
+      tflite::micro::GetTensorShape(filter),
+      tflite::micro::GetTensorData<float>(filter),
+      tflite::micro::GetTensorShape(bias),
+      tflite::micro::GetTensorData<float>(bias),
+      tflite::micro::GetTensorShape(output),
+      tflite::micro::GetTensorData<float>(output));
+}
+
+void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
+                             TfLiteDepthwiseConvParams* params,
+                             const OpData& data, const TfLiteEvalTensor* input,
+                             const TfLiteEvalTensor* filter,
+                             const TfLiteEvalTensor* bias,
+                             TfLiteEvalTensor* output) {
+  DepthwiseParams op_params;
+  op_params.padding_type = PaddingType::kSame;
+  op_params.padding_values.width = data.padding.width;
+  op_params.padding_values.height = data.padding.height;
+  op_params.stride_width = params->stride_width;
+  op_params.stride_height = params->stride_height;
+  op_params.dilation_width_factor = params->dilation_width_factor;
+  op_params.dilation_height_factor = params->dilation_height_factor;
+  op_params.depth_multiplier = params->depth_multiplier;
+  op_params.input_offset = -data.input_zero_point;
+  op_params.weights_offset = 0;
+  op_params.output_offset = data.output_zero_point;
+  // TODO(b/130439627): Use calculated value for clamping.
+  op_params.quantized_activation_min = std::numeric_limits<int8_t>::min();
+  op_params.quantized_activation_max = std::numeric_limits<int8_t>::max();
+
+  reference_integer_ops::DepthwiseConvPerChannel(
+      op_params, data.per_channel_output_multiplier,
+      data.per_channel_output_shift, tflite::micro::GetTensorShape(input),
+      tflite::micro::GetTensorData<int8_t>(input),
+      tflite::micro::GetTensorShape(filter),
+      tflite::micro::GetTensorData<int8_t>(filter),
+      tflite::micro::GetTensorShape(bias),
+      tflite::micro::GetTensorData<int32_t>(bias),
+      tflite::micro::GetTensorShape(output),
+      tflite::micro::GetTensorData<int8_t>(output));
+}
+
+void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
+                   TfLiteDepthwiseConvParams* params, const OpData& data,
+                   const TfLiteEvalTensor* input,
+                   const TfLiteEvalTensor* filter, const TfLiteEvalTensor* bias,
+                   TfLiteEvalTensor* output) {
+  const int32_t input_offset = -data.input_zero_point;
+  const int32_t filter_offset = -data.filter_zero_point;
+  const int32_t output_offset = data.output_zero_point;
+
+  tflite::DepthwiseParams op_params;
+  // Padding type is ignored, but still set.
+  op_params.padding_type = PaddingType::kSame;
+  op_params.padding_values.width = data.padding.width;
+  op_params.padding_values.height = data.padding.height;
+  op_params.stride_width = params->stride_width;
+  op_params.stride_height = params->stride_height;
+  op_params.dilation_width_factor = params->dilation_width_factor;
+  op_params.dilation_height_factor = params->dilation_height_factor;
+  op_params.depth_multiplier = params->depth_multiplier;
+  op_params.quantized_activation_min = data.output_activation_min;
+  op_params.quantized_activation_max = data.output_activation_max;
+  op_params.input_offset = input_offset;
+  op_params.weights_offset = filter_offset;
+  op_params.output_offset = output_offset;
+  op_params.output_multiplier = data.output_multiplier;
+  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
+  op_params.output_shift = -data.output_shift;
+
+  tflite::reference_ops::DepthwiseConv(
+      op_params, tflite::micro::GetTensorShape(input),
+      tflite::micro::GetTensorData<uint8_t>(input),
+      tflite::micro::GetTensorShape(filter),
+      tflite::micro::GetTensorData<uint8_t>(filter),
+      tflite::micro::GetTensorShape(bias),
+      tflite::micro::GetTensorData<int32_t>(bias),
+      tflite::micro::GetTensorShape(output),
+      tflite::micro::GetTensorData<uint8_t>(output));
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  TFLITE_DCHECK(node->user_data != nullptr);
+  TFLITE_DCHECK(node->builtin_data != nullptr);
+
+  auto* params =
+      reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data);
+  const OpData& data = *(static_cast<const OpData*>(node->user_data));
+
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kOutputTensor);
+  const TfLiteEvalTensor* input =
+      tflite::micro::GetEvalInput(context, node, kInputTensor);
+  const TfLiteEvalTensor* filter =
+      tflite::micro::GetEvalInput(context, node, kFilterTensor);
+  const TfLiteEvalTensor* bias =
+      (NumInputs(node) == 3)
+          ? tflite::micro::GetEvalInput(context, node, kBiasTensor)
+          : nullptr;
+
   // TODO(aselle): Consider whether float conv and quantized conv should be
   // separate ops to avoid dispatch overhead here.
   switch (input->type) {  // Already know in/out types are same.
     case kTfLiteFloat32:
-      EvalFloat(context, node, params, &data, input, filter, bias, output);
+      EvalFloat(context, node, params, data, input, filter, bias, output);
       break;
     case kTfLiteInt8:
-      EvalQuantizedPerChannel(context, node, params, &data, input, filter, bias,
+      EvalQuantizedPerChannel(context, node, params, data, input, filter, bias,
                               output);
       break;
     case kTfLiteUInt8:
-      EvalQuantized(context, node, params, &data, input, filter, bias, output);
+      EvalQuantized(context, node, params, data, input, filter, bias, output);
       break;
     default:
       TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
@@ -250,20 +310,17 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   return kTfLiteOk;
 }
 
-}  // namespace depthwise_conv
+}  // namespace
 
-TfLiteRegistration* Register_DEPTHWISE_CONV_2D() {
-  static TfLiteRegistration r = {/*init=*/nullptr,
-                                 /*free=*/nullptr,
-                                 /*prepare=*/nullptr,
-                                 /*invoke=*/depthwise_conv::Eval,
-                                 /*profiling_string=*/nullptr,
-                                 /*builtin_code=*/0,
-                                 /*custom_name=*/nullptr,
-                                 /*version=*/0};
-  return &r;
+TfLiteRegistration Register_DEPTHWISE_CONV_2D() {
+  return {/*init=*/Init,
+          /*free=*/nullptr,
+          /*prepare=*/Prepare,
+          /*invoke=*/Eval,
+          /*profiling_string=*/nullptr,
+          /*builtin_code=*/0,
+          /*custom_name=*/nullptr,
+          /*version=*/0};
 }
 
-}  // namespace micro
-}  // namespace ops
 }  // namespace tflite
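Both OpData structs above note that the real multiplier "can be represented as a fixed point multiplier plus a left shift", and the dequantize change below caches exactly that pair via QuantizeMultiplier (declared in tensorflow/lite/kernels/internal/quantization_util.h). The following small self-contained sketch shows what the pair means; the 0.75 scale ratio is an illustrative value, not taken from this commit.

// Sketch: turning a real-valued scale into (quantized_multiplier, shift),
// as Prepare does for the int32 requantize path below. The 0.75 input is
// an assumed example value.
#include <cstdint>
#include <cstdio>

#include "tensorflow/lite/kernels/internal/quantization_util.h"

int main() {
  // e.g. effective_output_scale = input_scale / output_scale in Prepare.
  const double real_multiplier = 0.75;
  int32_t quantized_multiplier = 0;
  int shift = 0;
  tflite::QuantizeMultiplier(real_multiplier, &quantized_multiplier, &shift);
  // Kernels then evaluate, in integer-only arithmetic:
  //   result ~= (x * quantized_multiplier) / 2^31, shifted left by `shift`
  //   (a negative shift means shift right).
  // For 0.75: quantized_multiplier = round(0.75 * 2^31) = 1610612736, shift = 0.
  std::printf("multiplier=%ld shift=%d\n",
              static_cast<long>(quantized_multiplier), shift);
  return 0;
}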
code/lib/tfmicro/tensorflow/lite/micro/kernels/dequantize.cc

@@ -22,19 +22,39 @@ limitations under the License.
 #include "tensorflow/lite/kernels/internal/reference/requantize.h"
 #include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
 
 namespace tflite {
 namespace ops {
 namespace micro {
 namespace dequantize {
 
+struct OpData {
+  tflite::DequantizationParams quantization_params;
+  // The scaling factor from input to output (aka the 'real multiplier') can
+  // be represented as a fixed point multiplier plus a left shift.
+  int32_t output_multiplier;
+  int output_shift;
+  int32_t output_zero_point;
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
+  return context->AllocatePersistentBuffer(context, sizeof(OpData));
+}
+
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TFLITE_DCHECK(node->user_data != nullptr);
+  OpData* data = static_cast<OpData*>(node->user_data);
+
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
   // TODO(b/140515557): Add cached dequant to improve hybrid model performance.
   const TfLiteTensor* input = GetInput(context, node, 0);
+  TF_LITE_ENSURE(context, input != nullptr);
   TfLiteTensor* output = GetOutput(context, node, 0);
+  TF_LITE_ENSURE(context, output != nullptr);
 
   TF_LITE_ENSURE(context, input->type == kTfLiteUInt8 ||
                               input->type == kTfLiteInt8 ||
@@ -42,32 +62,49 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE(
       context, output->type == kTfLiteFloat32 || output->type == kTfLiteInt32);
 
+  if (output->type == kTfLiteInt32) {
+    const double effective_output_scale =
+        static_cast<double>(input->params.scale) /
+        static_cast<double>(output->params.scale);
+    QuantizeMultiplier(effective_output_scale, &data->output_multiplier,
+                       &data->output_shift);
+  }
+
+  data->quantization_params.zero_point = input->params.zero_point;
+  data->quantization_params.scale = static_cast<double>(input->params.scale);
+  data->output_zero_point = output->params.zero_point;
   return kTfLiteOk;
 }
 
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  const TfLiteTensor* input = GetInput(context, node, 0);
-  TfLiteTensor* output = GetOutput(context, node, 0);
+  TFLITE_DCHECK(node->user_data != nullptr);
+  OpData* data = static_cast<OpData*>(node->user_data);
+
+  const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, 0);
+  TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0);
 
   if (output->type == kTfLiteFloat32) {
-    tflite::DequantizationParams op_params;
-    op_params.zero_point = input->params.zero_point;
-    op_params.scale = static_cast<double>(input->params.scale);
     switch (input->type) {
       case kTfLiteUInt8:
-        reference_ops::Dequantize(
-            op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
-            GetTensorShape(output), GetTensorData<float>(output));
+        reference_ops::Dequantize(data->quantization_params,
+                                  tflite::micro::GetTensorShape(input),
+                                  tflite::micro::GetTensorData<uint8_t>(input),
+                                  tflite::micro::GetTensorShape(output),
+                                  tflite::micro::GetTensorData<float>(output));
         break;
       case kTfLiteInt8:
-        reference_ops::Dequantize(
-            op_params, GetTensorShape(input), GetTensorData<int8_t>(input),
-            GetTensorShape(output), GetTensorData<float>(output));
+        reference_ops::Dequantize(data->quantization_params,
+                                  tflite::micro::GetTensorShape(input),
+                                  tflite::micro::GetTensorData<int8_t>(input),
+                                  tflite::micro::GetTensorShape(output),
+                                  tflite::micro::GetTensorData<float>(output));
         break;
      case kTfLiteInt16:
-        reference_ops::Dequantize(
-            op_params, GetTensorShape(input), GetTensorData<int16_t>(input),
-            GetTensorShape(output), GetTensorData<float>(output));
+        reference_ops::Dequantize(data->quantization_params,
+                                  tflite::micro::GetTensorShape(input),
+                                  tflite::micro::GetTensorData<int16_t>(input),
+                                  tflite::micro::GetTensorShape(output),
+                                  tflite::micro::GetTensorData<float>(output));
        break;
      default:
        TF_LITE_KERNEL_LOG(context, "Input %s, output %s not supported.",
@@ -76,28 +113,23 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
         return kTfLiteError;
     }
   } else if (output->type == kTfLiteInt32) {
-    int32_t output_multiplier;
-    int output_shift;
-    const double effective_output_scale =
-        static_cast<double>(input->params.scale) /
-        static_cast<double>(output->params.scale);
-    QuantizeMultiplier(effective_output_scale, &output_multiplier,
-                       &output_shift);
-    int flat_size =
-        MatchingFlatSize(GetTensorShape(input), GetTensorShape(output));
+    int flat_size = MatchingFlatSize(tflite::micro::GetTensorShape(input),
+                                     tflite::micro::GetTensorShape(output));
     switch (input->type) {
       case kTfLiteInt16: {
         reference_ops::Requantize(
-            GetTensorData<int16_t>(input), flat_size, output_multiplier,
-            output_shift, input->params.zero_point, output->params.zero_point,
-            GetTensorData<int32_t>(output));
+            tflite::micro::GetTensorData<int16_t>(input), flat_size,
+            data->output_multiplier, data->output_shift,
+            data->quantization_params.zero_point, data->output_zero_point,
+            tflite::micro::GetTensorData<int32_t>(output));
         break;
       }
       case kTfLiteInt8: {
         reference_ops::Requantize(
-            GetTensorData<int8_t>(input), flat_size, output_multiplier,
-            output_shift, input->params.zero_point, output->params.zero_point,
-            GetTensorData<int32_t>(output));
+            tflite::micro::GetTensorData<int8_t>(input), flat_size,
+            data->output_multiplier, data->output_shift,
+            data->quantization_params.zero_point, data->output_zero_point,
+            tflite::micro::GetTensorData<int32_t>(output));
         break;
       }
       default:
@@ -118,16 +150,15 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 
 }  // namespace dequantize
 
-TfLiteRegistration* Register_DEQUANTIZE() {
-  static TfLiteRegistration r = {/*init=*/nullptr,
+TfLiteRegistration Register_DEQUANTIZE() {
+  return {/*init=*/dequantize::Init,
           /*free=*/nullptr,
           /*prepare=*/dequantize::Prepare,
           /*invoke=*/dequantize::Eval,
           /*profiling_string=*/nullptr,
           /*builtin_code=*/0,
           /*custom_name=*/nullptr,
           /*version=*/0};
-  return &r;
 }
 
 }  // namespace micro
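The Prepare hunk above moves the requantization factor computation out of the per-invoke path: QuantizeMultiplier() decomposes the real scale ratio into a Q31 fixed-point multiplier plus a power-of-two shift, cached in OpData. A self-contained sketch of that decomposition (a standalone re-implementation for illustration, not the TFLite-internal routine, which additionally handles zero and clamps extreme shifts):

// Sketch: decompose a positive real multiplier into m * 2^shift, with m a
// Q31 fixed-point value in [0.5, 1.0), mirroring what QuantizeMultiplier()
// produces for data->output_multiplier / data->output_shift above.
#include <cmath>
#include <cstdint>
#include <cstdio>

void DecomposeMultiplier(double real_multiplier, int32_t* quantized,
                         int* shift) {
  const double q = std::frexp(real_multiplier, shift);  // q in [0.5, 1)
  int64_t q31 = static_cast<int64_t>(std::round(q * (1LL << 31)));
  if (q31 == (1LL << 31)) {  // rounding carried q up to exactly 1.0
    q31 /= 2;
    ++*shift;
  }
  *quantized = static_cast<int32_t>(q31);
}

int main() {
  int32_t m;
  int s;
  // e.g. input scale 0.05, output scale 0.4 -> effective scale 0.125
  DecomposeMultiplier(0.05 / 0.4, &m, &s);
  std::printf("multiplier=%ld shift=%d\n", static_cast<long>(m), s);  // shift=-2
  return 0;
}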
code/lib/tfmicro/tensorflow/lite/micro/kernels/elementwise.cc

@@ -18,6 +18,8 @@ limitations under the License.
 #include "tensorflow/lite/c/common.h"
 #include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/micro_utils.h"
 
 namespace tflite {
 namespace ops {
@@ -39,8 +41,10 @@ TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
   const TfLiteTensor* input = GetInput(context, node, 0);
+  TF_LITE_ENSURE(context, input != nullptr);
   TfLiteTensor* output = GetOutput(context, node, 0);
-  TF_LITE_ENSURE_EQ(context, input->type, output->type);
+  TF_LITE_ENSURE(context, output != nullptr);
+  TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type);
   if (!IsSupportedType(input->type)) {
     TF_LITE_KERNEL_LOG(context, "Input data type %s (%d) is not supported.",
                        TfLiteTypeGetName(input->type), input->type);
@@ -52,13 +56,13 @@ TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) {
 template <typename T>
 inline TfLiteStatus EvalImpl(TfLiteContext* context, TfLiteNode* node,
                              T func(T), TfLiteType expected_type) {
-  const TfLiteTensor* input = GetInput(context, node, 0);
-  TfLiteTensor* output = GetOutput(context, node, 0);
-  TF_LITE_ENSURE_EQ(context, input->type, expected_type);
-  const int64_t num_elements = NumElements(input);
-  const T* in_data = GetTensorData<T>(input);
-  T* out_data = GetTensorData<T>(output);
-  for (int64_t i = 0; i < num_elements; ++i) {
+  const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, 0);
+  TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0);
+  TF_LITE_ENSURE_TYPES_EQ(context, input->type, expected_type);
+  const size_t num_elements = ElementCount(*input->dims);
+  const T* in_data = tflite::micro::GetTensorData<T>(input);
+  T* out_data = tflite::micro::GetTensorData<T>(output);
+  for (size_t i = 0; i < num_elements; ++i) {
     out_data[i] = func(in_data[i]);
   }
   return kTfLiteOk;
@@ -109,116 +113,100 @@ TfLiteStatus LogicalNotEval(TfLiteContext* context, TfLiteNode* node) {
 }  // namespace
 }  // namespace elementwise
 
-TfLiteRegistration* Register_ABS() {
-  static TfLiteRegistration r = {
-      /*init=*/nullptr,
+TfLiteRegistration Register_ABS() {
+  return {/*init=*/nullptr,
           /*free=*/nullptr,
           /*prepare=*/
           elementwise::GenericPrepare<elementwise::IsNumericSupportedType>,
           /*invoke=*/elementwise::AbsEval,
           /*profiling_string=*/nullptr,
           /*builtin_code=*/0,
           /*custom_name=*/nullptr,
           /*version=*/0};
-  return &r;
 }
 
-TfLiteRegistration* Register_SIN() {
-  static TfLiteRegistration r = {
-      /*init=*/nullptr,
+TfLiteRegistration Register_SIN() {
+  return {/*init=*/nullptr,
           /*free=*/nullptr,
           /*prepare=*/
           elementwise::GenericPrepare<elementwise::IsNumericSupportedType>,
           /*invoke=*/elementwise::SinEval,
           /*profiling_string=*/nullptr,
           /*builtin_code=*/0,
           /*custom_name=*/nullptr,
           /*version=*/0};
-  return &r;
 }
 
-TfLiteRegistration* Register_COS() {
-  static TfLiteRegistration r = {
-      /*init=*/nullptr,
+TfLiteRegistration Register_COS() {
+  return {/*init=*/nullptr,
           /*free=*/nullptr,
           /*prepare=*/
           elementwise::GenericPrepare<elementwise::IsNumericSupportedType>,
           /*invoke=*/elementwise::CosEval,
           /*profiling_string=*/nullptr,
           /*builtin_code=*/0,
           /*custom_name=*/nullptr,
           /*version=*/0};
-  return &r;
 }
 
-TfLiteRegistration* Register_LOG() {
-  static TfLiteRegistration r = {
-      /*init=*/nullptr,
+TfLiteRegistration Register_LOG() {
  return {/*init=*/nullptr,
           /*free=*/nullptr,
           /*prepare=*/
           elementwise::GenericPrepare<elementwise::IsNumericSupportedType>,
           /*invoke=*/elementwise::LogEval,
           /*profiling_string=*/nullptr,
           /*builtin_code=*/0,
           /*custom_name=*/nullptr,
           /*version=*/0};
-  return &r;
 }
 
-TfLiteRegistration* Register_SQRT() {
-  static TfLiteRegistration r = {
-      /*init=*/nullptr,
+TfLiteRegistration Register_SQRT() {
+  return {/*init=*/nullptr,
           /*free=*/nullptr,
           /*prepare=*/
           elementwise::GenericPrepare<elementwise::IsNumericSupportedType>,
           /*invoke=*/elementwise::SqrtEval,
           /*profiling_string=*/nullptr,
           /*builtin_code=*/0,
           /*custom_name=*/nullptr,
           /*version=*/0};
-  return &r;
 }
 
-TfLiteRegistration* Register_RSQRT() {
-  static TfLiteRegistration r = {
-      /*init=*/nullptr,
+TfLiteRegistration Register_RSQRT() {
+  return {/*init=*/nullptr,
           /*free=*/nullptr,
           /*prepare=*/
           elementwise::GenericPrepare<elementwise::IsNumericSupportedType>,
           /*invoke=*/elementwise::RsqrtEval,
           /*profiling_string=*/nullptr,
           /*builtin_code=*/0,
           /*custom_name=*/nullptr,
           /*version=*/0};
-  return &r;
 }
 
-TfLiteRegistration* Register_SQUARE() {
-  static TfLiteRegistration r = {
-      /*init=*/nullptr,
+TfLiteRegistration Register_SQUARE() {
+  return {/*init=*/nullptr,
           /*free=*/nullptr,
           /*prepare=*/
           elementwise::GenericPrepare<elementwise::IsNumericSupportedType>,
           /*invoke=*/elementwise::SquareEval,
           /*profiling_string=*/nullptr,
           /*builtin_code=*/0,
           /*custom_name=*/nullptr,
           /*version=*/0};
-  return &r;
 }
 
-TfLiteRegistration* Register_LOGICAL_NOT() {
-  static TfLiteRegistration r = {
-      /*init=*/nullptr,
+TfLiteRegistration Register_LOGICAL_NOT() {
+  return {/*init=*/nullptr,
           /*free=*/nullptr,
           /*prepare=*/
           elementwise::GenericPrepare<elementwise::IsLogicalSupportedType>,
           /*invoke=*/elementwise::LogicalNotEval,
           /*profiling_string=*/nullptr,
           /*builtin_code=*/0,
           /*custom_name=*/nullptr,
           /*version=*/0};
-  return &r;
 }
 
 }  // namespace micro
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,16 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-// Abstract string. We don't want even absl at this level.
-#ifndef TENSORFLOW_LITE_STRING_TYPE_H_
-#define TENSORFLOW_LITE_STRING_TYPE_H_
-
-#include <string>
+//
+// This is a stub file for non-Ethos platforms
+//
+#include "tensorflow/lite/c/common.h"
 
 namespace tflite {
+namespace ops {
+namespace micro {
+namespace custom {
+TfLiteRegistration* Register_ETHOSU() { return nullptr; }
 
-using std::string;
+const char* GetString_ETHOSU() { return ""; }
 
+}  // namespace custom
+}  // namespace micro
+}  // namespace ops
 }  // namespace tflite
-
-#endif  // TENSORFLOW_LITE_STRING_TYPE_H_
code/lib/tfmicro/tensorflow/lite/micro/kernels/floor.cc

@@ -17,7 +17,7 @@ limitations under the License.
 
 #include "tensorflow/lite/c/common.h"
 #include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
-#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
 
 namespace tflite {
 namespace ops {
@@ -28,25 +28,28 @@ constexpr int kInputTensor = 0;
 constexpr int kOutputTensor = 0;
 
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-  reference_ops::Floor(GetTensorShape(input), GetTensorData<float>(input),
-                       GetTensorShape(output), GetTensorData<float>(output));
+  const TfLiteEvalTensor* input =
+      tflite::micro::GetEvalInput(context, node, kInputTensor);
+  TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteFloat32);
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kOutputTensor);
+  reference_ops::Floor(tflite::micro::GetTensorShape(input),
+                       tflite::micro::GetTensorData<float>(input),
+                       tflite::micro::GetTensorShape(output),
+                       tflite::micro::GetTensorData<float>(output));
   return kTfLiteOk;
 }
 }  // namespace floor
 
-TfLiteRegistration* Register_FLOOR() {
-  static TfLiteRegistration r = {/*init=*/nullptr,
+TfLiteRegistration Register_FLOOR() {
+  return {/*init=*/nullptr,
           /*free=*/nullptr,
           /*prepare=*/nullptr,
           /*invoke=*/floor::Eval,
           /*profiling_string=*/nullptr,
           /*builtin_code=*/0,
           /*custom_name=*/nullptr,
           /*version=*/0};
-  return &r;
 }
 
 }  // namespace micro
code/lib/tfmicro/tensorflow/lite/micro/kernels/fully_connected.cc

@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,20 +13,19 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/lite/kernels/internal/reference/fully_connected.h"
+#include "tensorflow/lite/micro/kernels/fully_connected.h"
 
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/common.h"
 #include "tensorflow/lite/kernels/internal/common.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/fully_connected.h"
 #include "tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h"
 #include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
 
 namespace tflite {
-namespace ops {
-namespace micro {
-namespace fully_connected {
 namespace {
 
 struct OpData {
@@ -40,6 +39,10 @@ struct OpData {
   int32_t output_activation_max;
   // The index of the temporary tensor where the quantized inputs are cached.
   int input_quantized_index;
+  // Cached zero point values of tensors.
+  int32_t input_zero_point;
+  int32_t filter_zero_point;
+  int32_t output_zero_point;
 };
 
 constexpr int kInputTensor = 0;
@@ -64,20 +67,17 @@ TfLiteStatus CalculateOpData(TfLiteContext* context,
     TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
         context, activation, output, &data->output_activation_min,
         &data->output_activation_max));
+
+    data->input_zero_point = input->params.zero_point;
+    data->filter_zero_point = filter->params.zero_point;
+    data->output_zero_point = output->params.zero_point;
   }
   return status;
 }
 
-}  // namespace
 
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
   TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
-  void* data = nullptr;
-  if (context->AllocatePersistentBuffer(context, sizeof(OpData), &data) ==
-      kTfLiteError) {
-    return nullptr;
-  }
-  return data;
+  return context->AllocatePersistentBuffer(context, sizeof(OpData));
 }
 
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
@@ -89,11 +89,14 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
       static_cast<const TfLiteFullyConnectedParams*>(node->builtin_data);
 
   const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TF_LITE_ENSURE(context, input != nullptr);
   const TfLiteTensor* filter = GetInput(context, node, kWeightsTensor);
+  TF_LITE_ENSURE(context, filter != nullptr);
   const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  TF_LITE_ENSURE(context, output != nullptr);
 
-  TF_LITE_ENSURE_EQ(context, input->type, output->type);
+  TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type);
   TF_LITE_ENSURE_MSG(context, input->type == filter->type,
                      "Hybrid models are not supported on TFLite Micro.");
 
@@ -102,13 +105,15 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 }
 
 TfLiteStatus EvalQuantizedInt8(TfLiteContext* context, TfLiteNode* node,
-                               const OpData& data, const TfLiteTensor* input,
-                               const TfLiteTensor* filter,
-                               const TfLiteTensor* bias, TfLiteTensor* output) {
+                               const OpData& data,
+                               const TfLiteEvalTensor* input,
+                               const TfLiteEvalTensor* filter,
+                               const TfLiteEvalTensor* bias,
+                               TfLiteEvalTensor* output) {
   tflite::FullyConnectedParams op_params;
-  op_params.input_offset = -input->params.zero_point;
-  op_params.weights_offset = -filter->params.zero_point;
-  op_params.output_offset = output->params.zero_point;
+  op_params.input_offset = -data.input_zero_point;
+  op_params.weights_offset = -data.filter_zero_point;
+  op_params.output_offset = data.output_zero_point;
   op_params.output_multiplier = data.output_multiplier;
   // TODO(b/138810107): Figure out whether output shift should be inverted
   op_params.output_shift = -data.output_shift;
@@ -116,20 +121,25 @@ TfLiteStatus EvalQuantizedInt8(TfLiteContext* context, TfLiteNode* node,
   op_params.quantized_activation_max = data.output_activation_max;
 
   reference_integer_ops::FullyConnected(
-      op_params, GetTensorShape(input), GetTensorData<int8_t>(input),
-      GetTensorShape(filter), GetTensorData<int8_t>(filter),
-      GetTensorShape(bias), GetTensorData<int32_t>(bias),
-      GetTensorShape(output), GetTensorData<int8_t>(output));
+      op_params, tflite::micro::GetTensorShape(input),
+      tflite::micro::GetTensorData<int8_t>(input),
+      tflite::micro::GetTensorShape(filter),
+      tflite::micro::GetTensorData<int8_t>(filter),
+      tflite::micro::GetTensorShape(bias),
+      tflite::micro::GetTensorData<int32_t>(bias),
+      tflite::micro::GetTensorShape(output),
+      tflite::micro::GetTensorData<int8_t>(output));
   return kTfLiteOk;
 }
 
 TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
-                           const OpData& data, const TfLiteTensor* input,
-                           const TfLiteTensor* filter, const TfLiteTensor* bias,
-                           TfLiteTensor* output) {
-  const int32_t input_offset = -input->params.zero_point;
-  const int32_t filter_offset = -filter->params.zero_point;
-  const int32_t output_offset = output->params.zero_point;
+                           const OpData& data, const TfLiteEvalTensor* input,
+                           const TfLiteEvalTensor* filter,
+                           const TfLiteEvalTensor* bias,
+                           TfLiteEvalTensor* output) {
+  const int32_t input_offset = -data.input_zero_point;
+  const int32_t filter_offset = -data.filter_zero_point;
+  const int32_t output_offset = data.output_zero_point;
 
   tflite::FullyConnectedParams op_params;
   op_params.input_offset = input_offset;
@@ -141,12 +151,16 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
   op_params.quantized_activation_min = data.output_activation_min;
   op_params.quantized_activation_max = data.output_activation_max;
 
-#define TF_LITE_FULLY_CONNECTED(output_data_type)                      \
-  reference_ops::FullyConnected(                                       \
-      op_params, GetTensorShape(input), GetTensorData<uint8_t>(input), \
-      GetTensorShape(filter), GetTensorData<uint8_t>(filter),          \
-      GetTensorShape(bias), GetTensorData<int32_t>(bias),              \
-      GetTensorShape(output), GetTensorData<output_data_type>(output))
+#define TF_LITE_FULLY_CONNECTED(output_data_type)          \
+  reference_ops::FullyConnected(                           \
+      op_params, tflite::micro::GetTensorShape(input),     \
+      tflite::micro::GetTensorData<uint8_t>(input),        \
+      tflite::micro::GetTensorShape(filter),               \
+      tflite::micro::GetTensorData<uint8_t>(filter),       \
+      tflite::micro::GetTensorShape(bias),                 \
+      tflite::micro::GetTensorData<int32_t>(bias),         \
+      tflite::micro::GetTensorShape(output),               \
+      tflite::micro::GetTensorData<output_data_type>(output))
   switch (output->type) {
     case kTfLiteUInt8:
       TF_LITE_FULLY_CONNECTED(uint8_t);
@@ -165,8 +179,9 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
 
 TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
                        TfLiteFusedActivation activation,
-                       const TfLiteTensor* input, const TfLiteTensor* filter,
-                       const TfLiteTensor* bias, TfLiteTensor* output) {
+                       const TfLiteEvalTensor* input,
+                       const TfLiteEvalTensor* filter,
+                       const TfLiteEvalTensor* bias, TfLiteEvalTensor* output) {
   float output_activation_min, output_activation_max;
   CalculateActivationRange(activation, &output_activation_min,
                            &output_activation_max);
@@ -174,10 +189,14 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
   op_params.float_activation_min = output_activation_min;
   op_params.float_activation_max = output_activation_max;
   tflite::reference_ops::FullyConnected(
-      op_params, GetTensorShape(input), GetTensorData<float>(input),
-      GetTensorShape(filter), GetTensorData<float>(filter),
-      GetTensorShape(bias), GetTensorData<float>(bias), GetTensorShape(output),
-      GetTensorData<float>(output));
+      op_params, tflite::micro::GetTensorShape(input),
+      tflite::micro::GetTensorData<float>(input),
+      tflite::micro::GetTensorShape(filter),
+      tflite::micro::GetTensorData<float>(filter),
+      tflite::micro::GetTensorShape(bias),
+      tflite::micro::GetTensorData<float>(bias),
+      tflite::micro::GetTensorShape(output),
+      tflite::micro::GetTensorData<float>(output));
   return kTfLiteOk;
 }
 
@@ -186,10 +205,14 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   const auto* params =
       static_cast<const TfLiteFullyConnectedParams*>(node->builtin_data);
 
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  const TfLiteTensor* filter = GetInput(context, node, kWeightsTensor);
-  const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  const TfLiteEvalTensor* input =
+      tflite::micro::GetEvalInput(context, node, kInputTensor);
+  const TfLiteEvalTensor* filter =
+      tflite::micro::GetEvalInput(context, node, kWeightsTensor);
+  const TfLiteEvalTensor* bias =
+      tflite::micro::GetEvalInput(context, node, kBiasTensor);
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kOutputTensor);
 
   TFLITE_DCHECK(node->user_data != nullptr);
   const OpData& data = *(static_cast<const OpData*>(node->user_data));
@@ -214,20 +237,17 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   return kTfLiteOk;
 }
 
-}  // namespace fully_connected
+}  // namespace
 
-TfLiteRegistration* Register_FULLY_CONNECTED() {
-  static TfLiteRegistration r = {/*init=*/fully_connected::Init,
+TfLiteRegistration Register_FULLY_CONNECTED() {
+  return {/*init=*/Init,
           /*free=*/nullptr,
-          /*prepare=*/fully_connected::Prepare,
+          /*prepare=*/Prepare,
-          /*invoke=*/fully_connected::Eval,
+          /*invoke=*/Eval,
           /*profiling_string=*/nullptr,
           /*builtin_code=*/0,
           /*custom_name=*/nullptr,
           /*version=*/0};
-  return &r;
 }
 
-}  // namespace micro
-}  // namespace ops
 }  // namespace tflite
code/lib/tfmicro/tensorflow/lite/micro/kernels/fully_connected.h (new file)

@@ -0,0 +1,50 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_MICRO_KERNELS_FULLY_CONNECTED_H_
+#define TENSORFLOW_LITE_MICRO_KERNELS_FULLY_CONNECTED_H_
+
+#include "tensorflow/lite/c/common.h"
+
+namespace tflite {
+
+// This is the most generic TfLiteRegistration. The actual supported types may
+// still be target dependent. The only requirement is that every implementation
+// (reference or optimized) must define this function.
+TfLiteRegistration Register_FULLY_CONNECTED();
+
+#if defined(CMSIS_NN) || defined(ARDUINO)
+// The Arduino is a special case where we use the CMSIS kernels, but because of
+// the current approach to building for Arduino, we do not support -DCMSIS_NN as
+// part of the build. As a result, we use defined(ARDUINO) as proxy for the
+// CMSIS kernels for this one special case.
+
+// Returns a TfLiteRegistration struct for the cmsis-nn kernel variant that only
+// supports int8.
+TfLiteRegistration Register_FULLY_CONNECTED_INT8();
+
+#else
+// Note that while this block gets used for both reference and optimized kernels
+// that do not have any specialized implementations, the only goal here is to
+// define a fallback implementation that allows reference kernels to still be
+// used from applications that call a more specific kernel variant.
+
+inline TfLiteRegistration Register_FULLY_CONNECTED_INT8() {
+  return Register_FULLY_CONNECTED();
+}
+
+#endif
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_MICRO_KERNELS_FULLY_CONNECTED_H_
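The header above lets applications request the int8-only variant unconditionally: on targets without a specialized kernel, the inline fallback simply forwards to the generic registration. A small caller sketch, assuming only the declarations in this header (the function and flag names below are illustrative):

// Sketch: an application can always ask for the int8 variant; with the
// reference kernels, the inline fallback makes this identical to calling
// Register_FULLY_CONNECTED().
#include "tensorflow/lite/micro/kernels/fully_connected.h"

TfLiteRegistration PickFullyConnected(bool int8_only_model) {
  return int8_only_model ? tflite::Register_FULLY_CONNECTED_INT8()
                         : tflite::Register_FULLY_CONNECTED();
}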
code/lib/tfmicro/tensorflow/lite/micro/kernels/hard_swish.cc (new file, 142 lines)
@@ -0,0 +1,142 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/kernels/internal/reference/hard_swish.h"
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/micro_utils.h"
+
+namespace tflite {
+namespace ops {
+namespace micro {
+namespace hard_swish {
+
+constexpr int kInputTensor = 0;
+constexpr int kOutputTensor = 0;
+
+void* HardSwishInit(TfLiteContext* context, const char* buffer, size_t length) {
+  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
+  return context->AllocatePersistentBuffer(context, sizeof(HardSwishParams));
+}
+
+TfLiteStatus HardSwishPrepare(TfLiteContext* context, TfLiteNode* node) {
+  TFLITE_DCHECK(node->user_data != nullptr);
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TF_LITE_ENSURE(context, input != nullptr);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  TF_LITE_ENSURE(context, output != nullptr);
+
+  if (input->type == kTfLiteUInt8 || input->type == kTfLiteInt8) {
+    HardSwishParams* params = static_cast<HardSwishParams*>(node->user_data);
+
+    params->input_zero_point = input->params.zero_point;
+    params->output_zero_point = output->params.zero_point;
+
+    const float input_scale = input->params.scale;
+    const float hires_input_scale = (1.0f / 128.0f) * input_scale;
+    const float reluish_scale = 3.0f / 32768.0f;
+    const float output_scale = output->params.scale;
+
+    const double output_multiplier =
+        static_cast<double>(hires_input_scale / output_scale);
+    int32_t output_multiplier_fixedpoint_int32;
+    QuantizeMultiplier(output_multiplier, &output_multiplier_fixedpoint_int32,
+                       &params->output_multiplier_exponent);
+    DownScaleInt32ToInt16Multiplier(
+        output_multiplier_fixedpoint_int32,
+        &params->output_multiplier_fixedpoint_int16);
+
+    TF_LITE_ENSURE(context, params->output_multiplier_exponent <= 0);
+
+    const double reluish_multiplier =
+        static_cast<double>(hires_input_scale / reluish_scale);
+    int32_t reluish_multiplier_fixedpoint_int32;
+    QuantizeMultiplier(reluish_multiplier, &reluish_multiplier_fixedpoint_int32,
+                       &params->reluish_multiplier_exponent);
+    DownScaleInt32ToInt16Multiplier(
+        reluish_multiplier_fixedpoint_int32,
+        &params->reluish_multiplier_fixedpoint_int16);
+  }
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus HardSwishEval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteEvalTensor* input =
+      tflite::micro::GetEvalInput(context, node, kInputTensor);
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kOutputTensor);
+  HardSwishParams* params = static_cast<HardSwishParams*>(node->user_data);
+
+  switch (input->type) {
+    case kTfLiteFloat32: {
+      tflite::reference_ops::HardSwish<float>(
+          tflite::micro::GetTensorShape(input),
+          tflite::micro::GetTensorData<float>(input),
+          tflite::micro::GetTensorShape(output),
+          tflite::micro::GetTensorData<float>(output));
+    } break;
+    case kTfLiteUInt8: {
+      tflite::reference_ops::HardSwish<uint8_t>(
+          *params, tflite::micro::GetTensorShape(input),
+          tflite::micro::GetTensorData<uint8_t>(input),
+          tflite::micro::GetTensorShape(output),
+          tflite::micro::GetTensorData<uint8_t>(output));
+    } break;
+    case kTfLiteInt8: {
+      tflite::reference_ops::HardSwish<int8_t>(
+          *params, tflite::micro::GetTensorShape(input),
+          tflite::micro::GetTensorData<int8_t>(input),
+          tflite::micro::GetTensorShape(output),
+          tflite::micro::GetTensorData<int8_t>(output));
+    } break;
+    default: {
+      TF_LITE_KERNEL_LOG(
+          context,
+          "Only float32/int8_t/uint8_t are supported currently, got %s",
+          TfLiteTypeGetName(input->type));
+      return kTfLiteError;
+    }
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace hard_swish
+
+TfLiteRegistration Register_HARD_SWISH() {
+  return {/*init=*/hard_swish::HardSwishInit,
+          /*free=*/nullptr,
+          /*prepare=*/hard_swish::HardSwishPrepare,
+          /*invoke=*/hard_swish::HardSwishEval,
+          /*profiling_string=*/nullptr,
+          /*builtin_code=*/0,
+          /*custom_name=*/nullptr,
+          /*version=*/0};
+}
+
+}  // namespace micro
+}  // namespace ops
+}  // namespace tflite
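For orientation: the op added above computes hard-swish, hard_swish(x) = x * relu6(x + 3) / 6; the uint8/int8 paths only re-express this with the fixed-point multipliers cached in HardSwishPrepare. A float reference sketch of the same function:

// Sketch: float semantics of hard-swish; the quantized kernels above compute
// the same function using the cached fixed-point multipliers.
#include <algorithm>

float HardSwishRef(float x) {
  const float relu6 = std::min(std::max(x + 3.0f, 0.0f), 6.0f);
  return x * (relu6 / 6.0f);
}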
code/lib/tfmicro/tensorflow/lite/micro/kernels/kernel_runner.cc (new file, 165 lines)
@@ -0,0 +1,165 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/micro/kernels/kernel_runner.h"
+
+namespace tflite {
+namespace micro {
+
+namespace {
+constexpr size_t kBufferAlignment = 16;
+}  // namespace
+
+// TODO(b/161841696): Consider moving away from global arena buffers:
+constexpr int KernelRunner::kNumScratchBuffers_;
+constexpr int KernelRunner::kKernelRunnerBufferSize_;
+uint8_t KernelRunner::kKernelRunnerBuffer_[];
+
+KernelRunner::KernelRunner(const TfLiteRegistration& registration,
+                           TfLiteTensor* tensors, int tensors_size,
+                           TfLiteIntArray* inputs, TfLiteIntArray* outputs,
+                           void* builtin_data, ErrorReporter* error_reporter)
+    : allocator_(SimpleMemoryAllocator::Create(
+          error_reporter, kKernelRunnerBuffer_, kKernelRunnerBufferSize_)),
+      registration_(registration),
+      tensors_(tensors),
+      error_reporter_(error_reporter) {
+  // Prepare TfLiteContext:
+  context_.impl_ = static_cast<void*>(this);
+  context_.ReportError = ReportOpError;
+  context_.recommended_num_threads = 1;
+  context_.GetTensor = GetTensor;
+  context_.GetEvalTensor = GetEvalTensor;
+  context_.AllocatePersistentBuffer = AllocatePersistentBuffer;
+  context_.RequestScratchBufferInArena = RequestScratchBufferInArena;
+  context_.GetScratchBuffer = GetScratchBuffer;
+
+  // Prepare TfLiteNode:
+  node_.inputs = inputs;
+  node_.outputs = outputs;
+  node_.builtin_data = builtin_data;
+}
+
+TfLiteStatus KernelRunner::InitAndPrepare(const char* init_data) {
+  if (registration_.init) {
+    node_.user_data = registration_.init(&context_, init_data, /*length=*/0);
+  }
+  if (registration_.prepare) {
+    TF_LITE_ENSURE_STATUS(registration_.prepare(&context_, &node_));
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus KernelRunner::Invoke() {
+  if (registration_.invoke == nullptr) {
+    TF_LITE_REPORT_ERROR(error_reporter_,
+                         "TfLiteRegistration missing invoke function pointer!");
+    return kTfLiteError;
+  }
+  return registration_.invoke(&context_, &node_);
+}
+
+TfLiteTensor* KernelRunner::GetTensor(const struct TfLiteContext* context,
+                                      int tensor_index) {
+  TFLITE_DCHECK(context != nullptr);
+  KernelRunner* runner = reinterpret_cast<KernelRunner*>(context->impl_);
+  TFLITE_DCHECK(runner != nullptr);
+
+  return &runner->tensors_[tensor_index];
+}
+
+TfLiteEvalTensor* KernelRunner::GetEvalTensor(
+    const struct TfLiteContext* context, int tensor_index) {
+  TFLITE_DCHECK(context != nullptr);
+  KernelRunner* runner = reinterpret_cast<KernelRunner*>(context->impl_);
+  TFLITE_DCHECK(runner != nullptr);
+
+  TfLiteEvalTensor* eval_tensor =
+      reinterpret_cast<TfLiteEvalTensor*>(runner->allocator_->AllocateTemp(
+          sizeof(TfLiteEvalTensor), alignof(TfLiteEvalTensor)));
+  TFLITE_DCHECK(eval_tensor != nullptr);
+
+  // In unit tests, the TfLiteTensor pointer contains the source of truth for
+  // buffers and values:
+  eval_tensor->data = runner->tensors_[tensor_index].data;
+  eval_tensor->dims = runner->tensors_[tensor_index].dims;
+  eval_tensor->type = runner->tensors_[tensor_index].type;
+  return eval_tensor;
+}
+
+void* KernelRunner::AllocatePersistentBuffer(TfLiteContext* context,
+                                             size_t bytes) {
+  TFLITE_DCHECK(context != nullptr);
+  KernelRunner* runner = reinterpret_cast<KernelRunner*>(context->impl_);
+  TFLITE_DCHECK(runner != nullptr);
+
+  return runner->allocator_->AllocateFromTail(bytes, kBufferAlignment);
+}
+
+TfLiteStatus KernelRunner::RequestScratchBufferInArena(TfLiteContext* context,
+                                                       size_t bytes,
+                                                       int* buffer_index) {
+  TFLITE_DCHECK(context != nullptr);
+  TFLITE_DCHECK(buffer_index != nullptr);
+
+  KernelRunner* runner = reinterpret_cast<KernelRunner*>(context->impl_);
+  TFLITE_DCHECK(runner != nullptr);
+
+  if (runner->scratch_buffer_count_ == kNumScratchBuffers_) {
+    TF_LITE_REPORT_ERROR(
+        runner->error_reporter_,
+        "Exceeded the maximum number of scratch tensors allowed (%d).",
+        kNumScratchBuffers_);
+    return kTfLiteError;
+  }
+
+  // For tests, we allocate scratch buffers from the tail and keep them around
+  // for the lifetime of model. This means that the arena size in the tests will
+  // be more than what we would have if the scratch buffers could share memory.
+  runner->scratch_buffers_[runner->scratch_buffer_count_] =
+      runner->allocator_->AllocateFromTail(bytes, kBufferAlignment);
+  TFLITE_DCHECK(runner->scratch_buffers_[runner->scratch_buffer_count_] !=
+                nullptr);
+
+  *buffer_index = runner->scratch_buffer_count_++;
+  return kTfLiteOk;
+}
+
+void* KernelRunner::GetScratchBuffer(TfLiteContext* context, int buffer_index) {
+  TFLITE_DCHECK(context != nullptr);
+  KernelRunner* runner = reinterpret_cast<KernelRunner*>(context->impl_);
+  TFLITE_DCHECK(runner != nullptr);
+
+  TFLITE_DCHECK(runner->scratch_buffer_count_ <= kNumScratchBuffers_);
+  if (buffer_index >= runner->scratch_buffer_count_) {
+    return nullptr;
+  }
+  return runner->scratch_buffers_[buffer_index];
+}
+
+void KernelRunner::ReportOpError(struct TfLiteContext* context,
+                                 const char* format, ...) {
+  TFLITE_DCHECK(context != nullptr);
+  KernelRunner* runner = reinterpret_cast<KernelRunner*>(context->impl_);
+  TFLITE_DCHECK(runner != nullptr);
+
+  va_list args;
+  va_start(args, format);
+  TF_LITE_REPORT_ERROR(runner->error_reporter_, format, args);
+  va_end(args);
+}
+
+}  // namespace micro
+}  // namespace tflite
@@ -0,0 +1,83 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_MICRO_KERNELS_KERNEL_RUNNER_H_
+#define TENSORFLOW_LITE_MICRO_KERNELS_KERNEL_RUNNER_H_
+
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/micro/simple_memory_allocator.h"
+
+namespace tflite {
+namespace micro {
+
+// Helper class to perform a simulated kernel (i.e. TfLiteRegistration)
+// lifecycle (init, prepare, invoke). All internal allocations are handled by
+// this class. Simply pass in the registration, list of required tensors,
+// inputs array, outputs array, and any pre-builtin data. Calling Invoke()
+// will automatically walk the kernel, and outputs will be ready on the
+// TfLiteTensor output provided during construction.
+class KernelRunner {
+ public:
+  KernelRunner(const TfLiteRegistration& registration, TfLiteTensor* tensors,
+               int tensors_size, TfLiteIntArray* inputs,
+               TfLiteIntArray* outputs, void* builtin_data,
+               ErrorReporter* error_reporter);
+
+  // Calls init and prepare on the kernel (i.e. TfLiteRegistration) struct.
+  // Any exceptions will be reported through the error_reporter and returned
+  // as a status code here.
+  TfLiteStatus InitAndPrepare(const char* init_data = nullptr);
+
+  // Calls init, prepare, and invoke on a given TfLiteRegistration pointer.
+  // After a successful invoke, results will be available in the output tensor
+  // as passed into the constructor of this class.
+  TfLiteStatus Invoke();
+
+ protected:
+  static TfLiteTensor* GetTensor(const struct TfLiteContext* context,
+                                 int tensor_index);
+  static TfLiteEvalTensor* GetEvalTensor(const struct TfLiteContext* context,
+                                         int tensor_index);
+  static void* AllocatePersistentBuffer(TfLiteContext* context, size_t bytes);
+  static TfLiteStatus RequestScratchBufferInArena(TfLiteContext* context,
+                                                  size_t bytes,
+                                                  int* buffer_index);
+  static void* GetScratchBuffer(TfLiteContext* context, int buffer_index);
+  static void ReportOpError(struct TfLiteContext* context, const char* format,
+                            ...);
+
+ private:
+  static constexpr int kNumScratchBuffers_ = 5;
+
+  static constexpr int kKernelRunnerBufferSize_ = 10000;
+  static uint8_t kKernelRunnerBuffer_[kKernelRunnerBufferSize_];
+
+  SimpleMemoryAllocator* allocator_ = nullptr;
+  const TfLiteRegistration& registration_;
+  TfLiteTensor* tensors_ = nullptr;
+  ErrorReporter* error_reporter_ = nullptr;
+
+  TfLiteContext context_ = {};
+  TfLiteNode node_ = {};
+
+  int scratch_buffer_count_ = 0;
+  uint8_t* scratch_buffers_[kNumScratchBuffers_];
+};
+
+}  // namespace micro
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_MICRO_KERNELS_KERNEL_RUNNER_H_
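A usage sketch (hypothetical test-harness code; the tensor setup and the IntArray casts are assumptions, not taken from this commit — the cast mirrors the IntArrayFromInts test helper):

    TfLiteTensor tensors[3];              // input1, input2, output; filled elsewhere
    int inputs_array_data[] = {2, 0, 1};  // TfLiteIntArray layout: size, then indices
    int outputs_array_data[] = {1, 2};
    tflite::MicroErrorReporter reporter;
    // Keep the registration alive past the constructor call: KernelRunner
    // holds it by reference.
    const TfLiteRegistration registration =
        tflite::ops::micro::Register_LOGICAL_AND();
    tflite::micro::KernelRunner runner(
        registration, tensors, /*tensors_size=*/3,
        reinterpret_cast<TfLiteIntArray*>(inputs_array_data),
        reinterpret_cast<TfLiteIntArray*>(outputs_array_data),
        /*builtin_data=*/nullptr, &reporter);
    if (runner.InitAndPrepare() == kTfLiteOk) {
      runner.Invoke();                    // results land in tensors[2]
    }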
@@ -0,0 +1,41 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+
+#include "tensorflow/lite/c/common.h"
+
+namespace tflite {
+namespace micro {
+
+bool HaveSameShapes(const TfLiteEvalTensor* input1,
+                    const TfLiteEvalTensor* input2) {
+  TFLITE_DCHECK(input1 != nullptr);
+  TFLITE_DCHECK(input2 != nullptr);
+  return TfLiteIntArrayEqual(input1->dims, input2->dims);
+}
+
+const RuntimeShape GetTensorShape(const TfLiteEvalTensor* tensor) {
+  if (tensor == nullptr || tensor->dims == nullptr) {
+    return RuntimeShape();
+  }
+  TfLiteIntArray* dims = tensor->dims;
+  const int dims_size = dims->size;
+  const int32_t* dims_data = reinterpret_cast<const int32_t*>(dims->data);
+  return RuntimeShape(dims_size, dims_data);
+}
+
+}  // namespace micro
+}  // namespace tflite
code/lib/tfmicro/tensorflow/lite/micro/kernels/kernel_util.h (new file, 75 lines)
@@ -0,0 +1,75 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_MICRO_KERNELS_KERNEL_UTIL_H_
+#define TENSORFLOW_LITE_MICRO_KERNELS_KERNEL_UTIL_H_
+
+#include <cstdint>
+
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace micro {
+
+// Returns a mutable tensor for a given input index. is_variable must be
+// checked during prepare when the full TfLiteTensor is available.
+inline TfLiteEvalTensor* GetMutableEvalInput(const TfLiteContext* context,
+                                             const TfLiteNode* node,
+                                             int index) {
+  TFLITE_DCHECK(context != nullptr);
+  TFLITE_DCHECK(node != nullptr);
+  return context->GetEvalTensor(context, node->inputs->data[index]);
+}
+
+// Returns the TfLiteEvalTensor struct for a given input index in a node.
+inline const TfLiteEvalTensor* GetEvalInput(const TfLiteContext* context,
+                                            const TfLiteNode* node,
+                                            int index) {
+  return GetMutableEvalInput(context, node, index);
+}
+
+// Returns the TfLiteEvalTensor struct for a given output index in a node.
+inline TfLiteEvalTensor* GetEvalOutput(const TfLiteContext* context,
+                                       const TfLiteNode* node, int index) {
+  TFLITE_DCHECK(context != nullptr);
+  TFLITE_DCHECK(node != nullptr);
+  return context->GetEvalTensor(context, node->outputs->data[index]);
+}
+
+// Returns data for a TfLiteEvalTensor struct.
+template <typename T>
+T* GetTensorData(TfLiteEvalTensor* tensor) {
+  return tensor != nullptr ? reinterpret_cast<T*>(tensor->data.raw) : nullptr;
+}
+
+// Returns const data for a TfLiteEvalTensor struct.
+template <typename T>
+const T* GetTensorData(const TfLiteEvalTensor* tensor) {
+  TFLITE_DCHECK(tensor != nullptr);
+  return reinterpret_cast<const T*>(tensor->data.raw);
+}
+
+// Returns the shape of a TfLiteEvalTensor struct.
+const RuntimeShape GetTensorShape(const TfLiteEvalTensor* tensor);
+
+// Return true if the given tensors have the same shape.
+bool HaveSameShapes(const TfLiteEvalTensor* input1,
+                    const TfLiteEvalTensor* input2);
+
+}  // namespace micro
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_MICRO_KERNELS_KERNEL_UTIL_H_
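These helpers are what ported kernels call from Eval; a minimal sketch of the pattern (an assumed identity op, not code from this commit — the l2norm and logical hunks below follow the same shape):

    TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
      const TfLiteEvalTensor* input =
          tflite::micro::GetEvalInput(context, node, /*index=*/0);
      TfLiteEvalTensor* output =
          tflite::micro::GetEvalOutput(context, node, /*index=*/0);
      const float* in = tflite::micro::GetTensorData<float>(input);
      float* out = tflite::micro::GetTensorData<float>(output);
      const int flat_size = tflite::micro::GetTensorShape(input).FlatSize();
      for (int i = 0; i < flat_size; ++i) {
        out[i] = in[i];  // identity copy stands in for the real op math
      }
      return kTfLiteOk;
    }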
@@ -14,16 +14,19 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/portable_tensor.h"
 #include "tensorflow/lite/kernels/internal/reference/integer_ops/l2normalization.h"
 #include "tensorflow/lite/kernels/internal/reference/l2normalization.h"
-#include "tensorflow/lite/kernels/internal/tensor.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
 
 namespace tflite {
 namespace ops {
 namespace micro {
 namespace l2norm {
 
+namespace {
+
 // This file has two implementation of L2Norm.
 enum KernelType {
   kReference,
@@ -33,44 +36,59 @@ enum KernelType {
 constexpr int kInputTensor = 0;
 constexpr int kOutputTensor = 0;
 
+}  // namespace
+
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-#if defined(DEBUG)
+  TFLITE_DCHECK(node->user_data != nullptr);
+  TFLITE_DCHECK(node->builtin_data != nullptr);
+
   auto* params = reinterpret_cast<TfLiteL2NormParams*>(node->builtin_data);
+  L2NormalizationParams* data =
+      static_cast<L2NormalizationParams*>(node->user_data);
 
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
   const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TF_LITE_ENSURE(context, input != nullptr);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  TF_LITE_ENSURE(context, output != nullptr);
 
   TF_LITE_ENSURE(context, NumDimensions(input) <= 4);
 
   TF_LITE_ENSURE(context, output->type == kTfLiteFloat32 ||
                               output->type == kTfLiteUInt8 ||
                               output->type == kTfLiteInt8);
-  TF_LITE_ENSURE_EQ(context, input->type, output->type);
+  TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type);
 
   if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
-    TF_LITE_ENSURE_EQ(context, output->params.scale, (1. / 128.));
-    if (output->type == kTfLiteUInt8) {
-      TF_LITE_ENSURE_EQ(context, output->params.zero_point, 128);
-    }
-    if (output->type == kTfLiteInt8) {
-      TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
-    }
+    data->input_zero_point = input->params.zero_point;
+  } else if (output->type == kTfLiteFloat32) {
+    data->input_zero_point = 0;
   }
 
   // TODO(ahentz): For some reason our implementations don't support
   // activations.
   TF_LITE_ENSURE_EQ(context, params->activation, kTfLiteActNone);
-#endif
 
   return kTfLiteOk;
 }
 
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
+  return context->AllocatePersistentBuffer(context,
+                                           sizeof(L2NormalizationParams));
+}
+
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  TFLITE_DCHECK(node->user_data != nullptr);
+  const L2NormalizationParams& data =
+      *(static_cast<const L2NormalizationParams*>(node->user_data));
+
+  const TfLiteEvalTensor* input =
+      tflite::micro::GetEvalInput(context, node, kInputTensor);
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kOutputTensor);
 
   // TODO(b/143912164): instead of hardcode the epsilon here, we should read it
   // from tensorflow, i.e., adding a params.
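The new Init/Prepare split is the standard TFLM pattern: Init makes one persistent allocation per node, Prepare fills it from the full TfLiteTensor metadata, and Eval reads only the cached struct. A minimal sketch for a hypothetical op (struct and function names are illustrative, not from this commit):

    struct FooOpData {
      int32_t input_zero_point;
    };

    void* FooInit(TfLiteContext* context, const char* buffer, size_t length) {
      // One persistent allocation per node; buffer/length are unused here.
      return context->AllocatePersistentBuffer(context, sizeof(FooOpData));
    }

    TfLiteStatus FooPrepare(TfLiteContext* context, TfLiteNode* node) {
      auto* data = static_cast<FooOpData*>(node->user_data);
      const TfLiteTensor* input = GetInput(context, node, 0);
      data->input_zero_point = input->params.zero_point;  // cached for Eval
      return kTfLiteOk;
    }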
@@ -87,39 +105,32 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   // So we don't even need to do handle the epsilon for quantized kernel case.
   const float epsilon = 1e-6f;
   if (output->type == kTfLiteFloat32) {
-#define TF_LITE_L2NORM(type)                                                 \
-  tflite::L2NormalizationParams op_params;                                   \
-  op_params.input_zero_point = 0;                                            \
-  type::L2Normalization(op_params, GetTensorShape(input),                    \
-                        GetTensorData<float>(input), GetTensorShape(output), \
-                        GetTensorData<float>(output), epsilon)
-
-    TF_LITE_L2NORM(reference_ops);
-#undef TF_LITE_L2NORM
+    reference_ops::L2Normalization(data, tflite::micro::GetTensorShape(input),
+                                   tflite::micro::GetTensorData<float>(input),
+                                   tflite::micro::GetTensorShape(output),
+                                   tflite::micro::GetTensorData<float>(output),
+                                   epsilon);
   } else if (output->type == kTfLiteUInt8) {
-#define TF_LITE_L2NORM(type)                                                  \
-  tflite::L2NormalizationParams op_params;                                    \
-  op_params.input_zero_point = input->params.zero_point;                      \
-  type::L2Normalization(op_params, GetTensorShape(input),                     \
-                        GetTensorData<uint8>(input), GetTensorShape(output),  \
-                        GetTensorData<uint8>(output))
-
-    TF_LITE_L2NORM(reference_ops);
-#undef TF_LITE_L2NORM
+    reference_ops::L2Normalization(
+        data, tflite::micro::GetTensorShape(input),
+        tflite::micro::GetTensorData<uint8_t>(input),
+        tflite::micro::GetTensorShape(output),
+        tflite::micro::GetTensorData<uint8_t>(output));
   } else if (output->type == kTfLiteInt8) {
-    const auto input_shape = GetTensorShape(input);
-    const auto output_shape = GetTensorShape(output);
+    const auto input_shape = tflite::micro::GetTensorShape(input);
+    const auto output_shape = tflite::micro::GetTensorShape(output);
     const int trailing_dim = input_shape.DimensionsCount() - 1;
     const int depth =
         MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
     const int outer_size =
         MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
-    reference_integer_ops::L2Normalization(input->params.zero_point, outer_size,
-                                           depth, GetTensorData<int8>(input),
-                                           GetTensorData<int8>(output));
+    reference_integer_ops::L2Normalization(
+        data.input_zero_point, outer_size, depth,
+        tflite::micro::GetTensorData<int8_t>(input),
+        tflite::micro::GetTensorData<int8_t>(output));
   } else {
-    TF_LITE_KERNEL_LOG(context, "Output type is %d, requires float.",
-                       output->type);
+    TF_LITE_KERNEL_LOG(context, "Output type is %s, requires float.",
+                       TfLiteTypeGetName(output->type));
     return kTfLiteError;
   }
@@ -128,22 +139,18 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 
 }  // namespace l2norm
 
-TfLiteRegistration* Register_L2NORM_REF() {
-  static TfLiteRegistration r = {/*init=*/nullptr,
-                                 /*free=*/nullptr,
-                                 /*prepare=*/l2norm::Prepare,
-                                 /*invoke=*/l2norm::Eval,
-                                 /*profiling_string=*/nullptr,
-                                 /*builtin_code=*/0,
-                                 /*custom_name=*/nullptr,
-                                 /*version=*/0};
-
-  return &r;
+TfLiteRegistration Register_L2NORM_REF() {
+  return {/*init=*/l2norm::Init,
+          /*free=*/nullptr,
+          /*prepare=*/l2norm::Prepare,
+          /*invoke=*/l2norm::Eval,
+          /*profiling_string=*/nullptr,
+          /*builtin_code=*/0,
+          /*custom_name=*/nullptr,
+          /*version=*/0};
 }
 
-TfLiteRegistration* Register_L2_NORMALIZATION() {
-  return Register_L2NORM_REF();
-}
+TfLiteRegistration Register_L2_NORMALIZATION() { return Register_L2NORM_REF(); }
 
 }  // namespace micro
 }  // namespace ops
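With this change, registrations are returned by value instead of as pointers to function-local statics, so each op no longer carries its own static TfLiteRegistration object; callers copy the small struct (function pointers only) into their own storage, e.g. a resolver's table. A caller-side sketch (assumed, not part of this diff):

    // The returned struct can simply be copied; its lifetime no longer
    // depends on a static hidden inside the register function.
    const TfLiteRegistration l2norm_registration =
        tflite::ops::micro::Register_L2_NORMALIZATION();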
@@ -15,8 +15,8 @@ limitations under the License.
 #include "tensorflow/lite/c/common.h"
 #include "tensorflow/lite/kernels/internal/reference/binary_function.h"
 #include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
-#include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/kernels/op_macros.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
 
 namespace tflite {
 namespace ops {
@@ -31,20 +31,29 @@ constexpr int kOutputTensor = 0;
 
 TfLiteStatus LogicalImpl(TfLiteContext* context, TfLiteNode* node,
                          bool (*func)(bool, bool)) {
-  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
-  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  const TfLiteEvalTensor* input1 =
+      tflite::micro::GetEvalInput(context, node, kInputTensor1);
+  const TfLiteEvalTensor* input2 =
+      tflite::micro::GetEvalInput(context, node, kInputTensor2);
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kOutputTensor);
 
-  if (HaveSameShapes(input1, input2)) {
+  if (tflite::micro::HaveSameShapes(input1, input2)) {
     reference_ops::BinaryFunction<bool, bool, bool>(
-        GetTensorShape(input1), GetTensorData<bool>(input1),
-        GetTensorShape(input2), GetTensorData<bool>(input2),
-        GetTensorShape(output), GetTensorData<bool>(output), func);
+        tflite::micro::GetTensorShape(input1),
+        tflite::micro::GetTensorData<bool>(input1),
+        tflite::micro::GetTensorShape(input2),
+        tflite::micro::GetTensorData<bool>(input2),
+        tflite::micro::GetTensorShape(output),
+        tflite::micro::GetTensorData<bool>(output), func);
   } else {
     reference_ops::BroadcastBinaryFunction4DSlow<bool, bool, bool>(
-        GetTensorShape(input1), GetTensorData<bool>(input1),
-        GetTensorShape(input2), GetTensorData<bool>(input2),
-        GetTensorShape(output), GetTensorData<bool>(output), func);
+        tflite::micro::GetTensorShape(input1),
+        tflite::micro::GetTensorData<bool>(input1),
+        tflite::micro::GetTensorShape(input2),
+        tflite::micro::GetTensorData<bool>(input2),
+        tflite::micro::GetTensorShape(output),
+        tflite::micro::GetTensorData<bool>(output), func);
   }
 
   return kTfLiteOk;
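LogicalOrEval and LogicalAndEval, referenced by the registrations in the next hunk, remain thin wrappers that bind the boolean operator into LogicalImpl; a sketch consistent with the upstream pattern (the exact bodies fall outside this excerpt):

    bool LogicalOr(bool x, bool y) { return x || y; }

    TfLiteStatus LogicalOrEval(TfLiteContext* context, TfLiteNode* node) {
      return LogicalImpl(context, node, LogicalOr);
    }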
@@ -65,32 +74,30 @@ TfLiteStatus LogicalAndEval(TfLiteContext* context, TfLiteNode* node) {
 }  // namespace
 }  // namespace logical
 
-TfLiteRegistration* Register_LOGICAL_OR() {
+TfLiteRegistration Register_LOGICAL_OR() {
   // Init, Free, Prepare, Eval are satisfying the Interface required by
   // TfLiteRegistration.
-  static TfLiteRegistration r = {/*init=*/nullptr,
-                                 /*free=*/nullptr,
-                                 /*prepare=*/nullptr,
-                                 /*invoke=*/logical::LogicalOrEval,
-                                 /*profiling_string=*/nullptr,
-                                 /*builtin_code=*/0,
-                                 /*custom_name=*/nullptr,
-                                 /*version=*/0};
-  return &r;
+  return {/*init=*/nullptr,
+          /*free=*/nullptr,
+          /*prepare=*/nullptr,
+          /*invoke=*/logical::LogicalOrEval,
+          /*profiling_string=*/nullptr,
+          /*builtin_code=*/0,
+          /*custom_name=*/nullptr,
+          /*version=*/0};
 }
 
-TfLiteRegistration* Register_LOGICAL_AND() {
+TfLiteRegistration Register_LOGICAL_AND() {
   // Init, Free, Prepare, Eval are satisfying the Interface required by
   // TfLiteRegistration.
-  static TfLiteRegistration r = {/*init=*/nullptr,
-                                 /*free=*/nullptr,
-                                 /*prepare=*/nullptr,
-                                 /*invoke=*/logical::LogicalAndEval,
-                                 /*profiling_string=*/nullptr,
-                                 /*builtin_code=*/0,
-                                 /*custom_name=*/nullptr,
-                                 /*version=*/0};
-  return &r;
+  return {/*init=*/nullptr,
+          /*free=*/nullptr,
+          /*prepare=*/nullptr,
+          /*invoke=*/logical::LogicalAndEval,
+          /*profiling_string=*/nullptr,
+          /*builtin_code=*/0,
+          /*custom_name=*/nullptr,
+          /*version=*/0};
 }
 
 }  // namespace micro
Some files were not shown because too many files have changed in this diff.