Rolling 20220526

jomjol
2022-05-26 20:31:26 +02:00
parent cce812ff11
commit 00028010ee
203 changed files with 12003 additions and 1226 deletions

View File

@@ -1,3 +1,5 @@
## TODO: GLOB is not a good way to collect files. Use explicit file list instead
cmake_minimum_required(VERSION 3.5)
set(tflite_dir "${CMAKE_CURRENT_SOURCE_DIR}/tensorflow/lite")
@@ -16,14 +18,27 @@ file(GLOB srcs_kernels
"${tfmicro_kernels_dir}/*.c"
"${tfmicro_kernels_dir}/*.cc")
# remove sources which will be provided by esp_nn
list(REMOVE_ITEM srcs_kernels
"${tfmicro_kernels_dir}/add.cc"
"${tfmicro_kernels_dir}/conv.cc"
"${tfmicro_kernels_dir}/depthwise_conv.cc"
"${tfmicro_kernels_dir}/fully_connected.cc"
"${tfmicro_kernels_dir}/mul.cc"
"${tfmicro_kernels_dir}/pooling.cc")
FILE(GLOB esp_nn_kernels
"${tfmicro_kernels_dir}/esp_nn/*.cc")
set(lib_srcs
"${srcs_micro}"
"${srcs_kernels}"
"${esp_nn_kernels}"
"${src_micro_frontend}"
"${tflite_dir}/kernels/kernel_util.cc"
"${tflite_dir}/micro/memory_planner/greedy_memory_planner.cc"
"${tflite_dir}/micro/memory_planner/linear_memory_planner.cc"
"${tflite_dir}/c/common.c"
"${tflite_dir}/c/common.cc"
"${tflite_dir}/core/api/error_reporter.cc"
"${tflite_dir}/core/api/flatbuffer_conversions.cc"
"${tflite_dir}/core/api/op_resolver.cc"
@@ -36,15 +51,17 @@ idf_component_register(
INCLUDE_DIRS "." "third_party/gemmlowp"
"third_party/flatbuffers/include"
"third_party/ruy"
"third_party/kissfft")
"third_party/kissfft"
REQUIRES "esp-nn")
# Reduce the level of paranoia to be able to compile TF sources
target_compile_options(${COMPONENT_LIB} PRIVATE
-Wno-maybe-uninitialized
-Wno-missing-field-initializers
-DESP_NN # enables ESP-NN optimizations by Espressif
-Wno-type-limits)
target_compile_options(${COMPONENT_LIB} PRIVATE -fno-unwind-tables -ffunction-sections -fdata-sections -fmessage-length=0 -DTF_LITE_STATIC_MEMORY -DTF_LITE_DISABLE_X86_NEON -O3 -Wsign-compare -Wdouble-promotion -Wshadow -Wunused-variable -Wmissing-field-initializers -Wunused-function -Wswitch -Wvla -Wall -Wextra -Wstrict-aliasing -Wno-unused-parameter -DESP -DESP_NN -Wno-nonnull)
target_compile_options(${COMPONENT_LIB} PRIVATE $<$<COMPILE_LANGUAGE:CXX>: -std=c++11 -fno-rtti -fno-exceptions -fno-threadsafe-statics -fno-unwind-tables -ffunction-sections -fdata-sections -fmessage-length=0 -DTF_LITE_STATIC_MEMORY -DTF_LITE_DISABLE_X86_NEON -O3 -Werror -Wsign-compare -Wdouble-promotion -Wshadow -Wunused-variable -Wmissing-field-initializers -Wunused-function -Wswitch -Wvla -Wall -Wextra -Wstrict-aliasing -Wno-unused-parameter -DESP -DESP_NN -Wno-return-type -Wno-strict-aliasing -std=gnu++14 >)
target_compile_options(${COMPONENT_LIB} PRIVATE -fno-unwind-tables -ffunction-sections -fdata-sections -fmessage-length=0 -DTF_LITE_STATIC_MEMORY -DTF_LITE_DISABLE_X86_NEON -O3 -Wsign-compare -Wdouble-promotion -Wshadow -Wunused-variable -Wmissing-field-initializers -Wunused-function -Wswitch -Wvla -Wall -Wextra -Wstrict-aliasing -Wno-unused-parameter -Wno-nonnull)
target_compile_options(${COMPONENT_LIB} PRIVATE $<$<COMPILE_LANGUAGE:CXX>: -std=c++11 -fno-rtti -fno-exceptions -fno-threadsafe-statics -fno-unwind-tables -ffunction-sections -fdata-sections -fmessage-length=0 -DTF_LITE_STATIC_MEMORY -DTF_LITE_DISABLE_X86_NEON -O3 -Werror -Wsign-compare -Wdouble-promotion -Wshadow -Wunused-variable -Wmissing-field-initializers -Wunused-function -Wswitch -Wvla -Wall -Wextra -Wstrict-aliasing -Wno-unused-parameter -Wno-return-type -Wno-strict-aliasing -std=gnu++14 >)
target_compile_options(${COMPONENT_LIB} INTERFACE $<$<IN_LIST:-DTF_LITE_STATIC_MEMORY,$<TARGET_PROPERTY:${COMPONENT_LIB},COMPILE_OPTIONS>>:-DTF_LITE_STATIC_MEMORY>)
target_link_libraries(${COMPONENT_LIB} PRIVATE -lm)
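Side note on the defines added above: the component swaps in the kernel sources under esp_nn/ and compiles everything with -DESP_NN (plus -DTF_LITE_STATIC_MEMORY), so a source file can select the optimized path with an ordinary preprocessor guard. A minimal illustrative sketch, not taken from the component's sources:

// Illustrative only: a -DESP_NN compile definition gating an optimized path.
#include <stdint.h>

static inline void add_vectors(const int8_t* a, const int8_t* b, int8_t* out,
                               int n) {
#if defined(ESP_NN)
  // stand-in for a call into an Espressif-optimized esp-nn routine
  for (int i = 0; i < n; ++i) out[i] = (int8_t)(a[i] + b[i]);
#else
  // portable reference fallback
  for (int i = 0; i < n; ++i) out[i] = (int8_t)(a[i] + b[i]);
#endif
}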

View File

@@ -0,0 +1,22 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Compatibility shim for new location of interface definitions.
#ifndef TENSORFLOW_LITE_BUILTIN_OP_DATA_H_
#define TENSORFLOW_LITE_BUILTIN_OP_DATA_H_
#include "tensorflow/lite/c/builtin_op_data.h"
#endif // TENSORFLOW_LITE_BUILTIN_OP_DATA_H_

View File

@@ -0,0 +1,187 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_BUILTIN_OPS_H_
#define TENSORFLOW_LITE_BUILTIN_OPS_H_
// DO NOT EDIT MANUALLY: This file is automatically generated by
// `schema/builtin_ops_header/generator.cc`.
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
// The enum for builtin operators.
// Note: CUSTOM, DELEGATE, and PLACEHOLDER_FOR_GREATER_OP_CODES are 3 special
// ops which are not real built-in ops.
typedef enum {
kTfLiteBuiltinAdd = 0,
kTfLiteBuiltinAveragePool2d = 1,
kTfLiteBuiltinConcatenation = 2,
kTfLiteBuiltinConv2d = 3,
kTfLiteBuiltinDepthwiseConv2d = 4,
kTfLiteBuiltinDepthToSpace = 5,
kTfLiteBuiltinDequantize = 6,
kTfLiteBuiltinEmbeddingLookup = 7,
kTfLiteBuiltinFloor = 8,
kTfLiteBuiltinFullyConnected = 9,
kTfLiteBuiltinHashtableLookup = 10,
kTfLiteBuiltinL2Normalization = 11,
kTfLiteBuiltinL2Pool2d = 12,
kTfLiteBuiltinLocalResponseNormalization = 13,
kTfLiteBuiltinLogistic = 14,
kTfLiteBuiltinLshProjection = 15,
kTfLiteBuiltinLstm = 16,
kTfLiteBuiltinMaxPool2d = 17,
kTfLiteBuiltinMul = 18,
kTfLiteBuiltinRelu = 19,
kTfLiteBuiltinReluN1To1 = 20,
kTfLiteBuiltinRelu6 = 21,
kTfLiteBuiltinReshape = 22,
kTfLiteBuiltinResizeBilinear = 23,
kTfLiteBuiltinRnn = 24,
kTfLiteBuiltinSoftmax = 25,
kTfLiteBuiltinSpaceToDepth = 26,
kTfLiteBuiltinSvdf = 27,
kTfLiteBuiltinTanh = 28,
kTfLiteBuiltinConcatEmbeddings = 29,
kTfLiteBuiltinSkipGram = 30,
kTfLiteBuiltinCall = 31,
kTfLiteBuiltinCustom = 32,
kTfLiteBuiltinEmbeddingLookupSparse = 33,
kTfLiteBuiltinPad = 34,
kTfLiteBuiltinUnidirectionalSequenceRnn = 35,
kTfLiteBuiltinGather = 36,
kTfLiteBuiltinBatchToSpaceNd = 37,
kTfLiteBuiltinSpaceToBatchNd = 38,
kTfLiteBuiltinTranspose = 39,
kTfLiteBuiltinMean = 40,
kTfLiteBuiltinSub = 41,
kTfLiteBuiltinDiv = 42,
kTfLiteBuiltinSqueeze = 43,
kTfLiteBuiltinUnidirectionalSequenceLstm = 44,
kTfLiteBuiltinStridedSlice = 45,
kTfLiteBuiltinBidirectionalSequenceRnn = 46,
kTfLiteBuiltinExp = 47,
kTfLiteBuiltinTopkV2 = 48,
kTfLiteBuiltinSplit = 49,
kTfLiteBuiltinLogSoftmax = 50,
kTfLiteBuiltinDelegate = 51,
kTfLiteBuiltinBidirectionalSequenceLstm = 52,
kTfLiteBuiltinCast = 53,
kTfLiteBuiltinPrelu = 54,
kTfLiteBuiltinMaximum = 55,
kTfLiteBuiltinArgMax = 56,
kTfLiteBuiltinMinimum = 57,
kTfLiteBuiltinLess = 58,
kTfLiteBuiltinNeg = 59,
kTfLiteBuiltinPadv2 = 60,
kTfLiteBuiltinGreater = 61,
kTfLiteBuiltinGreaterEqual = 62,
kTfLiteBuiltinLessEqual = 63,
kTfLiteBuiltinSelect = 64,
kTfLiteBuiltinSlice = 65,
kTfLiteBuiltinSin = 66,
kTfLiteBuiltinTransposeConv = 67,
kTfLiteBuiltinSparseToDense = 68,
kTfLiteBuiltinTile = 69,
kTfLiteBuiltinExpandDims = 70,
kTfLiteBuiltinEqual = 71,
kTfLiteBuiltinNotEqual = 72,
kTfLiteBuiltinLog = 73,
kTfLiteBuiltinSum = 74,
kTfLiteBuiltinSqrt = 75,
kTfLiteBuiltinRsqrt = 76,
kTfLiteBuiltinShape = 77,
kTfLiteBuiltinPow = 78,
kTfLiteBuiltinArgMin = 79,
kTfLiteBuiltinFakeQuant = 80,
kTfLiteBuiltinReduceProd = 81,
kTfLiteBuiltinReduceMax = 82,
kTfLiteBuiltinPack = 83,
kTfLiteBuiltinLogicalOr = 84,
kTfLiteBuiltinOneHot = 85,
kTfLiteBuiltinLogicalAnd = 86,
kTfLiteBuiltinLogicalNot = 87,
kTfLiteBuiltinUnpack = 88,
kTfLiteBuiltinReduceMin = 89,
kTfLiteBuiltinFloorDiv = 90,
kTfLiteBuiltinReduceAny = 91,
kTfLiteBuiltinSquare = 92,
kTfLiteBuiltinZerosLike = 93,
kTfLiteBuiltinFill = 94,
kTfLiteBuiltinFloorMod = 95,
kTfLiteBuiltinRange = 96,
kTfLiteBuiltinResizeNearestNeighbor = 97,
kTfLiteBuiltinLeakyRelu = 98,
kTfLiteBuiltinSquaredDifference = 99,
kTfLiteBuiltinMirrorPad = 100,
kTfLiteBuiltinAbs = 101,
kTfLiteBuiltinSplitV = 102,
kTfLiteBuiltinUnique = 103,
kTfLiteBuiltinCeil = 104,
kTfLiteBuiltinReverseV2 = 105,
kTfLiteBuiltinAddN = 106,
kTfLiteBuiltinGatherNd = 107,
kTfLiteBuiltinCos = 108,
kTfLiteBuiltinWhere = 109,
kTfLiteBuiltinRank = 110,
kTfLiteBuiltinElu = 111,
kTfLiteBuiltinReverseSequence = 112,
kTfLiteBuiltinMatrixDiag = 113,
kTfLiteBuiltinQuantize = 114,
kTfLiteBuiltinMatrixSetDiag = 115,
kTfLiteBuiltinRound = 116,
kTfLiteBuiltinHardSwish = 117,
kTfLiteBuiltinIf = 118,
kTfLiteBuiltinWhile = 119,
kTfLiteBuiltinNonMaxSuppressionV4 = 120,
kTfLiteBuiltinNonMaxSuppressionV5 = 121,
kTfLiteBuiltinScatterNd = 122,
kTfLiteBuiltinSelectV2 = 123,
kTfLiteBuiltinDensify = 124,
kTfLiteBuiltinSegmentSum = 125,
kTfLiteBuiltinBatchMatmul = 126,
kTfLiteBuiltinPlaceholderForGreaterOpCodes = 127,
kTfLiteBuiltinCumsum = 128,
kTfLiteBuiltinCallOnce = 129,
kTfLiteBuiltinBroadcastTo = 130,
kTfLiteBuiltinRfft2d = 131,
kTfLiteBuiltinConv3d = 132,
kTfLiteBuiltinImag = 133,
kTfLiteBuiltinReal = 134,
kTfLiteBuiltinComplexAbs = 135,
kTfLiteBuiltinHashtable = 136,
kTfLiteBuiltinHashtableFind = 137,
kTfLiteBuiltinHashtableImport = 138,
kTfLiteBuiltinHashtableSize = 139,
kTfLiteBuiltinReduceAll = 140,
kTfLiteBuiltinConv3dTranspose = 141,
kTfLiteBuiltinVarHandle = 142,
kTfLiteBuiltinReadVariable = 143,
kTfLiteBuiltinAssignVariable = 144,
kTfLiteBuiltinBroadcastArgs = 145,
kTfLiteBuiltinRandomStandardNormal = 146,
kTfLiteBuiltinBucketize = 147,
kTfLiteBuiltinRandomUniform = 148,
kTfLiteBuiltinMultinomial = 149,
kTfLiteBuiltinGelu = 150,
kTfLiteBuiltinDynamicUpdateSlice = 151,
} TfLiteBuiltinOperator;
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
#endif // TENSORFLOW_LITE_BUILTIN_OPS_H_
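As a quick usage note for the enum above: the header comment singles out CUSTOM, DELEGATE, and PLACEHOLDER_FOR_GREATER_OP_CODES as values that are not real built-in ops. A hypothetical helper (not part of the header) that filters them out:

#include "tensorflow/lite/builtin_ops.h"

// Returns 1 for genuine built-in operators, 0 for the three special values.
static int IsRealBuiltinOp(TfLiteBuiltinOperator op) {
  switch (op) {
    case kTfLiteBuiltinCustom:
    case kTfLiteBuiltinDelegate:
    case kTfLiteBuiltinPlaceholderForGreaterOpCodes:
      return 0;
    default:
      return 1;
  }
}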

View File

@@ -98,6 +98,7 @@ typedef enum {
kTfLiteResource = 14,
kTfLiteVariant = 15,
kTfLiteUInt32 = 16,
kTfLiteUInt16 = 17,
} TfLiteType;
// Legacy. Will be deprecated in favor of TfLiteAffineQuantization.
@@ -111,6 +112,12 @@ typedef struct TfLiteQuantizationParams {
int32_t zero_point;
} TfLiteQuantizationParams;
// --------------------------------------------------------------------------
// Opaque types used by c_api_opaque.h.
// TfLiteOpaqueTensor is an opaque version of TfLiteTensor;
typedef struct TfLiteOpaqueTensor TfLiteOpaqueTensor;
#ifdef __cplusplus
} // extern C
#endif
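The first hunk extends TfLiteType with kTfLiteUInt16, so element-size lookups gain one more case. A hedged sketch of such a mapping (the library's own helper, TfLiteTypeGetSize, is what the broadcast_to reference code later in this commit uses):

#include <stddef.h>
#include <stdint.h>
#include "tensorflow/lite/c/common.h"

// Hypothetical type-to-byte-width mapping; only a few cases shown.
static size_t ExampleTypeSize(TfLiteType type) {
  switch (type) {
    case kTfLiteUInt16:  return sizeof(uint16_t);  // new in this change
    case kTfLiteInt16:   return sizeof(int16_t);
    case kTfLiteFloat32: return sizeof(float);
    default:             return 0;  // remaining cases omitted
  }
}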

View File

@@ -21,6 +21,8 @@ limitations under the License.
#include <string.h>
#endif // TF_LITE_STATIC_MEMORY
extern "C" {
size_t TfLiteIntArrayGetSizeInBytes(int size) {
static TfLiteIntArray dummy;
@@ -34,13 +36,13 @@ size_t TfLiteIntArrayGetSizeInBytes(int size) {
int TfLiteIntArrayEqual(const TfLiteIntArray* a, const TfLiteIntArray* b) {
if (a == b) return 1;
if (a == NULL || b == NULL) return 0;
if (a == nullptr || b == nullptr) return 0;
return TfLiteIntArrayEqualsArray(a, b->size, b->data);
}
int TfLiteIntArrayEqualsArray(const TfLiteIntArray* a, int b_size,
const int b_data[]) {
if (a == NULL) return (b_size == 0);
if (a == nullptr) return (b_size == 0);
if (a->size != b_size) return 0;
int i = 0;
for (; i < a->size; i++)
@@ -52,7 +54,7 @@ int TfLiteIntArrayEqualsArray(const TfLiteIntArray* a, int b_size,
TfLiteIntArray* TfLiteIntArrayCreate(int size) {
size_t alloc_size = TfLiteIntArrayGetSizeInBytes(size);
if (alloc_size <= 0) return NULL;
if (alloc_size <= 0) return nullptr;
TfLiteIntArray* ret = (TfLiteIntArray*)malloc(alloc_size);
if (!ret) return ret;
ret->size = size;
@@ -60,7 +62,7 @@ TfLiteIntArray* TfLiteIntArrayCreate(int size) {
}
TfLiteIntArray* TfLiteIntArrayCopy(const TfLiteIntArray* src) {
if (!src) return NULL;
if (!src) return nullptr;
TfLiteIntArray* ret = TfLiteIntArrayCreate(src->size);
if (ret) {
memcpy(ret->data, src->data, src->size * sizeof(int));
@@ -99,7 +101,7 @@ void TfLiteTensorDataFree(TfLiteTensor* t) {
t->allocation_type == kTfLitePersistentRo) {
free(t->data.raw);
}
t->data.raw = NULL;
t->data.raw = nullptr;
}
void TfLiteQuantizationFree(TfLiteQuantization* quantization) {
@@ -108,31 +110,31 @@ void TfLiteQuantizationFree(TfLiteQuantization* quantization) {
(TfLiteAffineQuantization*)(quantization->params);
if (q_params->scale) {
TfLiteFloatArrayFree(q_params->scale);
q_params->scale = NULL;
q_params->scale = nullptr;
}
if (q_params->zero_point) {
TfLiteIntArrayFree(q_params->zero_point);
q_params->zero_point = NULL;
q_params->zero_point = nullptr;
}
free(q_params);
}
quantization->params = NULL;
quantization->params = nullptr;
quantization->type = kTfLiteNoQuantization;
}
void TfLiteSparsityFree(TfLiteSparsity* sparsity) {
if (sparsity == NULL) {
if (sparsity == nullptr) {
return;
}
if (sparsity->traversal_order) {
TfLiteIntArrayFree(sparsity->traversal_order);
sparsity->traversal_order = NULL;
sparsity->traversal_order = nullptr;
}
if (sparsity->block_map) {
TfLiteIntArrayFree(sparsity->block_map);
sparsity->block_map = NULL;
sparsity->block_map = nullptr;
}
if (sparsity->dim_metadata) {
@@ -141,13 +143,13 @@ void TfLiteSparsityFree(TfLiteSparsity* sparsity) {
TfLiteDimensionMetadata metadata = sparsity->dim_metadata[i];
if (metadata.format == kTfLiteDimSparseCSR) {
TfLiteIntArrayFree(metadata.array_segments);
metadata.array_segments = NULL;
metadata.array_segments = nullptr;
TfLiteIntArrayFree(metadata.array_indices);
metadata.array_indices = NULL;
metadata.array_indices = nullptr;
}
}
free(sparsity->dim_metadata);
sparsity->dim_metadata = NULL;
sparsity->dim_metadata = nullptr;
}
free(sparsity);
@@ -156,16 +158,16 @@ void TfLiteSparsityFree(TfLiteSparsity* sparsity) {
void TfLiteTensorFree(TfLiteTensor* t) {
TfLiteTensorDataFree(t);
if (t->dims) TfLiteIntArrayFree(t->dims);
t->dims = NULL;
t->dims = nullptr;
if (t->dims_signature) {
TfLiteIntArrayFree((TfLiteIntArray *) t->dims_signature);
}
t->dims_signature = NULL;
t->dims_signature = nullptr;
TfLiteQuantizationFree(&t->quantization);
TfLiteSparsityFree(t->sparsity);
t->sparsity = NULL;
t->sparsity = nullptr;
}
void TfLiteTensorReset(TfLiteType type, const char* name, TfLiteIntArray* dims,
@@ -185,7 +187,7 @@ void TfLiteTensorReset(TfLiteType type, const char* name, TfLiteIntArray* dims,
tensor->is_variable = is_variable;
tensor->quantization.type = kTfLiteNoQuantization;
tensor->quantization.params = NULL;
tensor->quantization.params = nullptr;
}
TfLiteStatus TfLiteTensorCopy(const TfLiteTensor* src, TfLiteTensor* dst) {
@@ -229,6 +231,8 @@ const char* TfLiteTypeGetName(TfLiteType type) {
return "NOTYPE";
case kTfLiteFloat32:
return "FLOAT32";
case kTfLiteUInt16:
return "UINT16";
case kTfLiteInt16:
return "INT16";
case kTfLiteInt32:
@@ -263,14 +267,6 @@ const char* TfLiteTypeGetName(TfLiteType type) {
return "Unknown type";
}
TfLiteDelegate TfLiteDelegateCreate(void) {
TfLiteDelegate d = {
.data_ = NULL,
.Prepare = NULL,
.CopyFromBufferHandle = NULL,
.CopyToBufferHandle = NULL,
.FreeBufferHandle = NULL,
.flags = kTfLiteDelegateFlagsNone,
};
return d;
}
TfLiteDelegate TfLiteDelegateCreate() { return TfLiteDelegate{}; }
} // extern "C"
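The TfLiteDelegateCreate rewrite above relies on value-initialization: TfLiteDelegate{} zero-initializes every member, which matches the removed designated-initializer body (all pointers NULL, flags set to kTfLiteDelegateFlagsNone, i.e. 0). A minimal illustration of the idiom, not library code:

// Value-initializing a plain struct with {} zeroes all of its members.
struct Callbacks {
  void* data_;
  void (*Prepare)(void*);
  int flags;
};

Callbacks MakeCallbacks() {
  return Callbacks{};  // data_ == nullptr, Prepare == nullptr, flags == 0
}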

View File

@@ -173,8 +173,9 @@ void TfLiteFloatArrayFree(TfLiteFloatArray* a);
} \
} while (false)
#else // TF_LITE_STRIP_ERROR_STRINGS
#define TF_LITE_KERNEL_LOG(context, ...)
#define TF_LITE_MAYBE_KERNEL_LOG(context, ...)
#define UNUSED(...) (void)sizeof(#__VA_ARGS__)
#define TF_LITE_KERNEL_LOG(context, ...) UNUSED(__VA_ARGS__)
#define TF_LITE_MAYBE_KERNEL_LOG(context, ...) UNUSED(__VA_ARGS__)
#endif // TF_LITE_STRIP_ERROR_STRINGS
// Check whether value is true, and if not return kTfLiteError from
@@ -316,6 +317,7 @@ typedef union TfLitePtrUnion {
uint8_t* uint8;
bool* b;
int16_t* i16;
uint16_t* ui16;
TfLiteComplex64* c64;
TfLiteComplex128* c128;
int8_t* int8;
@@ -459,7 +461,8 @@ typedef struct TfLiteTensor {
// Optional. Encodes shapes with unknown dimensions with -1. This field is
// only populated when unknown dimensions exist in a read-write tensor (i.e.
// an input or output tensor). (e.g. `dims` contains [1, 1, 1, 3] and
// `dims_signature` contains [1, -1, -1, 3]).
// `dims_signature` contains [1, -1, -1, 3]). Note that this field only
// exists when TF_LITE_STATIC_MEMORY is not defined.
const TfLiteIntArray* dims_signature;
} TfLiteTensor;
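Regarding the TF_LITE_STRIP_ERROR_STRINGS hunk above: the new UNUSED macro stringizes its whole argument list, so a stripped log call still expands to a well-formed, zero-cost expression and none of its arguments are evaluated. A standalone sketch of the expansion (macro names simplified, not the header itself):

#define UNUSED(...) (void)sizeof(#__VA_ARGS__)
#define LOG(context, ...) UNUSED(__VA_ARGS__)

int main() {
  int n = 3;
  // Expands to (void)sizeof("\"bad size %d\", n"); `n` is never evaluated here.
  LOG(nullptr, "bad size %d", n);
  (void)n;
  return 0;
}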

View File

@@ -0,0 +1,51 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// This provides a few C++ helpers that are useful for manipulating C structures
// in C++.
#ifndef TENSORFLOW_LITE_CONTEXT_UTIL_H_
#define TENSORFLOW_LITE_CONTEXT_UTIL_H_
#include <stddef.h>
#include "tensorflow/lite/c/common.h"
namespace tflite {
// Provide a range iterable wrapper for TfLiteIntArray* (C lists that the
// TfLite C API uses). Can't use the google array_view, since we can't depend
// on even absl for embedded device reasons.
class TfLiteIntArrayView {
public:
// Construct a view of a TfLiteIntArray*. Note, `int_array` should be non-null
// and this view does not take ownership of it.
explicit TfLiteIntArrayView(const TfLiteIntArray* int_array)
: int_array_(int_array) {}
TfLiteIntArrayView(const TfLiteIntArrayView&) = default;
TfLiteIntArrayView& operator=(const TfLiteIntArrayView& rhs) = default;
typedef const int* const_iterator;
const_iterator begin() const { return int_array_->data; }
const_iterator end() const { return &int_array_->data[int_array_->size]; }
size_t size() const { return end() - begin(); }
int operator[](size_t pos) const { return int_array_->data[pos]; }
private:
const TfLiteIntArray* int_array_;
};
} // namespace tflite
#endif // TENSORFLOW_LITE_CONTEXT_UTIL_H_
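A short usage sketch for the view above (hypothetical helper, assuming a tensor with non-null dims): it lets range-for iterate the C-style TfLiteIntArray directly.

#include "tensorflow/lite/context_util.h"

// Multiplies all dimensions of a tensor, iterating dims through the view.
inline int ExampleNumElements(const TfLiteTensor* tensor) {
  int count = 1;
  for (int d : tflite::TfLiteIntArrayView(tensor->dims)) {
    count *= d;
  }
  return count;
}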

View File

@@ -208,6 +208,14 @@ TfLiteStatus ParseOpDataTfLite(const Operator* op, BuiltinOperator op_type,
return ParseBatchToSpaceNd(op, error_reporter, allocator, builtin_data);
}
case BuiltinOperator_BROADCAST_ARGS: {
return ParseBroadcastArgs(op, error_reporter, allocator, builtin_data);
}
case BuiltinOperator_BROADCAST_TO: {
return ParseBroadcastTo(op, error_reporter, allocator, builtin_data);
}
case BuiltinOperator_CALL_ONCE: {
return ParseCallOnce(op, error_reporter, allocator, builtin_data);
}
@@ -336,6 +344,10 @@ TfLiteStatus ParseOpDataTfLite(const Operator* op, BuiltinOperator op_type,
return ParseLogSoftmax(op, error_reporter, allocator, builtin_data);
}
case BuiltinOperator_LSTM: {
return ParseLSTM(op, error_reporter, allocator, builtin_data);
}
case BuiltinOperator_MAXIMUM: {
return ParseMaximum(op, error_reporter, allocator, builtin_data);
}
@@ -605,37 +617,6 @@ TfLiteStatus ParseOpDataTfLite(const Operator* op, BuiltinOperator op_type,
*builtin_data = params.release();
return kTfLiteOk;
}
case BuiltinOperator_LSTM: {
auto params = safe_allocator.Allocate<TfLiteLSTMParams>();
TF_LITE_ENSURE(error_reporter, params != nullptr);
if (const auto* lstm_params = op->builtin_options_as_LSTMOptions()) {
params->activation =
ConvertActivation(lstm_params->fused_activation_function());
params->cell_clip = lstm_params->cell_clip();
params->proj_clip = lstm_params->proj_clip();
switch (lstm_params->kernel_type()) {
case LSTMKernelType_FULL:
params->kernel_type = kTfLiteLSTMFullKernel;
break;
case LSTMKernelType_BASIC:
params->kernel_type = kTfLiteLSTMBasicKernel;
break;
default:
TF_LITE_REPORT_ERROR(error_reporter,
"Unhandled LSTM kernel type: %d",
lstm_params->kernel_type());
return kTfLiteError;
}
params->asymmetric_quantize_inputs =
lstm_params->asymmetric_quantize_inputs();
} else {
TF_LITE_REPORT_ERROR(error_reporter,
"No valid LSTM builtin options exist");
return kTfLiteError;
}
*builtin_data = params.release();
return kTfLiteOk;
}
case BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM: {
return ParseUnidirectionalSequenceLSTM(op, error_reporter, allocator,
builtin_data);
@@ -883,7 +864,6 @@ TfLiteStatus ParseOpDataTfLite(const Operator* op, BuiltinOperator op_type,
case BuiltinOperator_SCATTER_ND:
case BuiltinOperator_DENSIFY:
case BuiltinOperator_SEGMENT_SUM:
case BuiltinOperator_BROADCAST_TO:
case BuiltinOperator_RFFT2D:
case BuiltinOperator_IMAG:
case BuiltinOperator_REAL:
@@ -891,7 +871,7 @@ TfLiteStatus ParseOpDataTfLite(const Operator* op, BuiltinOperator op_type,
case BuiltinOperator_HASHTABLE_FIND:
case BuiltinOperator_HASHTABLE_IMPORT:
case BuiltinOperator_HASHTABLE_SIZE:
case BuiltinOperator_BROADCAST_ARGS:
case BuiltinOperator_DYNAMIC_UPDATE_SLICE:
return kTfLiteOk;
case BuiltinOperator_PLACEHOLDER_FOR_GREATER_OP_CODES:
return kTfLiteError;
@@ -916,6 +896,9 @@ TfLiteStatus ConvertTensorType(TensorType tensor_type, TfLiteType* type,
case TensorType_INT16:
*type = kTfLiteInt16;
return kTfLiteOk;
case TensorType_UINT16:
*type = kTfLiteUInt16;
return kTfLiteOk;
case TensorType_INT32:
*type = kTfLiteInt32;
return kTfLiteOk;
@@ -1085,6 +1068,22 @@ TfLiteStatus ParseBatchToSpaceNd(const Operator*, ErrorReporter*,
return kTfLiteOk;
}
// We have this parse function instead of directly returning kTfLiteOk from the
// switch-case in ParseOpData because this function is used as part of the
// selective registration for the OpResolver implementation in micro.
TfLiteStatus ParseBroadcastArgs(const Operator*, ErrorReporter*,
BuiltinDataAllocator*, void**) {
return kTfLiteOk;
}
// We have this parse function instead of directly returning kTfLiteOk from the
// switch-case in ParseOpData because this function is used as part of the
// selective registration for the OpResolver implementation in micro.
TfLiteStatus ParseBroadcastTo(const Operator*, ErrorReporter*,
BuiltinDataAllocator*, void**) {
return kTfLiteOk;
}
TfLiteStatus ParseCallOnce(const Operator* op, ErrorReporter* error_reporter,
BuiltinDataAllocator* allocator,
void** builtin_data) {
@@ -1605,6 +1604,40 @@ TfLiteStatus ParseLogSoftmax(const Operator*, ErrorReporter*,
return kTfLiteOk;
}
TfLiteStatus ParseLSTM(const Operator* op, ErrorReporter* error_reporter,
BuiltinDataAllocator* allocator, void** builtin_data) {
CheckParsePointerParams(op, error_reporter, allocator, builtin_data);
SafeBuiltinDataAllocator safe_allocator(allocator);
auto params = safe_allocator.Allocate<TfLiteLSTMParams>();
TF_LITE_ENSURE(error_reporter, params != nullptr);
if (const auto* lstm_params = op->builtin_options_as_LSTMOptions()) {
params->activation =
ConvertActivation(lstm_params->fused_activation_function());
params->cell_clip = lstm_params->cell_clip();
params->proj_clip = lstm_params->proj_clip();
switch (lstm_params->kernel_type()) {
case LSTMKernelType_FULL:
params->kernel_type = kTfLiteLSTMFullKernel;
break;
case LSTMKernelType_BASIC:
params->kernel_type = kTfLiteLSTMBasicKernel;
break;
default:
TF_LITE_REPORT_ERROR(error_reporter, "Unhandled LSTM kernel type: %d",
lstm_params->kernel_type());
return kTfLiteError;
}
params->asymmetric_quantize_inputs =
lstm_params->asymmetric_quantize_inputs();
} else {
TF_LITE_REPORT_ERROR(error_reporter, "No valid LSTM builtin options exist");
return kTfLiteError;
}
*builtin_data = params.release();
return kTfLiteOk;
}
// We have this parse function instead of directly returning kTfLiteOk from the
// switch-case in ParseOpData because this function is used as part of the
// selective registration for the OpResolver implementation in micro.
@@ -2337,6 +2370,31 @@ TfLiteStatus ParseVarHandle(const Operator* op, ErrorReporter* error_reporter,
return kTfLiteOk;
}
TfLiteStatus ParseWhile(const Operator* op, ErrorReporter* error_reporter,
BuiltinDataAllocator* allocator, void** builtin_data) {
CheckParsePointerParams(op, error_reporter, allocator, builtin_data);
SafeBuiltinDataAllocator safe_allocator(allocator);
std::unique_ptr<TfLiteWhileParams,
SafeBuiltinDataAllocator::BuiltinDataDeleter>
params = safe_allocator.Allocate<TfLiteWhileParams>();
TF_LITE_ENSURE(error_reporter, params != nullptr);
const WhileOptions* schema_params = op->builtin_options_as_WhileOptions();
if (schema_params != nullptr) {
params->cond_subgraph_index = schema_params->cond_subgraph_index();
params->body_subgraph_index = schema_params->body_subgraph_index();
} else {
// TODO(b/157480169): We should either return kTfLiteError or fill in some
// reasonable defaults in the params struct. We are not doing so until we
// better understand the ramifications of changing the legacy behavior.
}
*builtin_data = params.release();
return kTfLiteOk;
}
// We have this parse function instead of directly returning kTfLiteOk from the
// switch-case in ParseOpData because this function is used as part of the
// selective registration for the OpResolver implementation in micro.
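Context for the "selective registration" comments: by giving BROADCAST_ARGS, BROADCAST_TO, LSTM, and WHILE their own parse functions, a resolver can reference exactly the parsers for the ops it registers and the linker drops the rest. A hedged sketch of that pattern (not the actual MicroMutableOpResolver code; the table wiring here is an assumption):

#include "tensorflow/lite/builtin_ops.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/core/api/flatbuffer_conversions.h"

namespace {
// One parse hook per registered op; parsers of unregistered ops never get
// pulled into the link.
using ParseFn = TfLiteStatus (*)(const tflite::Operator*,
                                 tflite::ErrorReporter*,
                                 tflite::BuiltinDataAllocator*, void**);
struct ParserEntry {
  int builtin_code;  // value from builtin_ops.h
  ParseFn parse;     // parse function declared in flatbuffer_conversions.h
};
const ParserEntry kExampleParsers[] = {
    {kTfLiteBuiltinBroadcastArgs, tflite::ParseBroadcastArgs},
    {kTfLiteBuiltinBroadcastTo, tflite::ParseBroadcastTo},
    {kTfLiteBuiltinLstm, tflite::ParseLSTM},
};
}  // namespace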

View File

@@ -98,6 +98,15 @@ TfLiteStatus ParseBatchToSpaceNd(const Operator* op,
BuiltinDataAllocator* allocator,
void** builtin_data);
TfLiteStatus ParseBroadcastArgs(const Operator* op,
ErrorReporter* error_reporter,
BuiltinDataAllocator* allocator,
void** builtin_data);
TfLiteStatus ParseBroadcastTo(const Operator* op, ErrorReporter* error_reporter,
BuiltinDataAllocator* allocator,
void** builtin_data);
TfLiteStatus ParseCallOnce(const Operator* op, ErrorReporter* error_reporter,
BuiltinDataAllocator* allocator,
void** builtin_data);
@@ -232,6 +241,9 @@ TfLiteStatus ParseLogSoftmax(const Operator* op, ErrorReporter* error_reporter,
BuiltinDataAllocator* allocator,
void** builtin_data);
TfLiteStatus ParseLSTM(const Operator* op, ErrorReporter* error_reporter,
BuiltinDataAllocator* allocator, void** builtin_data);
TfLiteStatus ParseMaximum(const Operator* op, ErrorReporter* error_reporter,
BuiltinDataAllocator* allocator, void** builtin_data);
@@ -379,6 +391,9 @@ TfLiteStatus ParseVarHandle(const Operator* op, ErrorReporter* error_reporter,
BuiltinDataAllocator* allocator,
void** builtin_data);
TfLiteStatus ParseWhile(const Operator* op, ErrorReporter* error_reporter,
BuiltinDataAllocator* allocator, void** builtin_data);
TfLiteStatus ParseZerosLike(const Operator* op, ErrorReporter* error_reporter,
BuiltinDataAllocator* allocator,
void** builtin_data);

View File

@@ -60,9 +60,8 @@ void VectorBatchVectorAdd(const T* vector, int v_size, int n_batch,
// Cwise product of two vectors.
template <typename T>
inline void VectorVectorCwiseProduct(const T* __restrict__ vector1,
const T* __restrict__ vector2, int v_size,
T* __restrict__ result) {
inline void VectorVectorCwiseProduct(const T* vector1, const T* vector2,
int v_size, T* result) {
for (int v = 0; v < v_size; v++) {
*result++ = *vector1++ * *vector2++;
}
@@ -117,6 +116,367 @@ void VectorBatchVectorAssign(const T* vector, int v_size, int n_batch,
}
}
// Checks if all entries of vector are zero for float.
bool IsZeroVector(const float* vector, int v_size);
// Checks if all entries of vector are zero for int8.
bool IsZeroVector(const int8_t* vector, int v_size);
// Quantizes a buffer of floating point values using a symmetric quantization
// (i.e. linear quantization without an offset) to 8-bit signed integers.
// It also outputs the range (min, max) of the floating point buffer, and the
// scaling factor used to quantize the values.
void SymmetricQuantizeFloats(const float* values, const int size,
int8_t* quantized_values, float* min_value,
float* max_value, float* scaling_factor);
// Quantizes a buffer of floating point values using a symmetric quantization
// (i.e. linear quantization without an offset) to 8-bit signed integers.
// It uses the range (min, max) provided to the function to calculate the
// appropriate scaling factor to quantize the values.
void SymmetricQuantizeFloats(const float* values, const int size,
int8_t* quantized_values, float min_value,
float max_value, float* scaling_factor);
void AsymmetricQuantizeFloats(const float* values, const int size,
int8_t* quantized_values, float* scaling_factor,
int32_t* offset);
// Helper function to quantize floats.
// float_data_ptr input float vectors
// n_batch number of input vectors
// n_data size of a single input vector
// quantized_data_ptr (out) vector with quantized data
// scaling_factors (out) scaling factors (one per vector)
// zero_points (out) zero points (one per vector)
// do_asymmetric controls if the quantization should be asymmetric.
inline void BatchQuantizeFloats(const float* float_data_ptr, int n_batch,
int n_data, int8_t* quantized_data_ptr,
float* scaling_factors, int32_t* zero_points,
bool do_asymmetric) {
for (int b = 0; b < n_batch; ++b) {
const int offset = b * n_data;
if (do_asymmetric) {
tensor_utils::AsymmetricQuantizeFloats(
float_data_ptr + offset, n_data, quantized_data_ptr + offset,
&scaling_factors[b], &zero_points[b]);
} else {
float unused_min, unused_max;
tensor_utils::SymmetricQuantizeFloats(
float_data_ptr + offset, n_data, quantized_data_ptr + offset,
&unused_min, &unused_max, &scaling_factors[b]);
}
}
}
// Multiplies a matrix by a "batched" vector (i.e. a matrix with a batch
// dimension composed by input vectors independent from each other). The result
// of the multiplication is accumulated to the passed result buffer.
// More specifically, for a matrix M of shape [n, i] and a batched-vector
// of shape [i, batch] it will first compute the product of shape [n, batch].
// This product will be accumulated to the result buffer.
void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
int m_cols, const float* vector,
int n_batch, float* result);
// Same as the function above, but the matrix is a sparse tensor with block
// pattern 1x4.
// This function assumes that m_cols is a multiple of the block size (4 in this
// case) so that there's no incomplete block.
void SparseMatrixBatchVectorMultiplyAccumulate1x4(
const float* __restrict__ matrix, const int32_t* __restrict__ segments,
const int32_t* __restrict__ indices, int m_rows, int m_cols,
const float* __restrict__ vector, int n_batch, float* __restrict__ result);
// Same as the function above, but the matrix is stored in block compressed
// sparse row format with block pattern 1x16 which consists of two arrays:
// 1. A matrix array stores non-zero blocks of the matrix in row major.
// 2. A ledger array stores nrows groups, one group per row. Each group starts
// with an integer representing the number of non-zero blocks for the
// corresponding row and follows with column indexes of the first element
// of each non-zero block.
// This function assumes that
// 1. m_cols is a multiple of 16 so that all blocks are full blocks.
// 2. m_cols < 254 * 16 so that block index can be represented by uint8.
void SparseMatrixBatchVectorMultiplyAccumulate(
const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
float* __restrict__ result);
// Same as the function above, but for values quantized using symmetric
// quantization (e.g. by calling SymmetricQuantizeFloats).
// The passed scaling factors is a buffer of the quantization scaling factors
// that will be used to dequantize the products into the final result buffer.
// These scaling factors are the multiplication of the matrix scaling factor
// by the vector's scaling factor, one per batch (i.e. this allows quantizing
// each batch in the batch-vector matrix independently).
void MatrixBatchVectorMultiplyAccumulate(
const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
const int8_t* __restrict__ vectors,
const float* __restrict__ scaling_factors, int n_batch,
float* __restrict__ result);
// Same as the function above except that vector values
// are quantized with asymmetric quantization per-batch and the matrix
// is quantized per row.
void MatrixBatchVectorMultiplyAccumulate(
const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
const int8_t* __restrict__ vectors,
const float* __restrict__ scaling_factors, int n_batch,
float* __restrict__ result, const float* __restrict__ per_channel_scale,
const int32_t* __restrict__ input_offset);
// Same as the function above, but the matrix is a sparse tensor with block
// pattern 1x16.
// This function assumes that m_cols is a multiple of the block size (16 in this
// case) so that there's no incomplete block. Also, it assumes all offsets of
// input, output and filter are zero.
void SparseMatrixBatchVectorMultiplyAccumulate1x16(
const int8_t* __restrict__ matrix, const int32_t* __restrict__ segments,
const int32_t* __restrict__ indices, int m_rows, int m_cols,
const int8_t* __restrict__ vector, const int32_t* __restrict__ bias_vector,
int n_batch, const int32_t input_offset, const int32_t output_multiplier,
const int32_t output_shift, const int32_t output_offset,
const int32_t output_activation_min, const int32_t output_activation_max,
int8_t* __restrict__ result);
// Same as the function above, but the matrix is stored in block compressed
// sparse row format with block pattern 1x16 which consists of two arrays:
// 1. A matrix array stores non-zero blocks of the matrix in row major.
// 2. A ledger array stores nrows groups, one group per row. Each group starts
// with an integer representing the number of non-zero blocks for the
// corresponding row followed by column index of the first element of
// each non-zero block.
// This function assumes that
// 1. m_cols is a multiple of 16 so that all blocks are full blocks.
// 2. m_cols < 254 * 16 so that block index can be represented by uint8.
void SparseMatrixBatchVectorMultiplyAccumulate(
const int8_t* __restrict__ matrix, const uint8_t* __restrict__ ledger,
const int m_rows, const int m_cols, const int8_t* __restrict__ vectors,
const float* __restrict__ scaling_factors, int n_batch,
float* __restrict__ result);
// Same as the above 8, 8, 8 integer matmul except that it takes a zero point
// and does not accumulate into the result.
// TODO(b/148688698): remove this function by folding zero point calculation in
// prepare() function.
void MatrixBatchVectorMultiply(const int8_t* input, int32_t input_zeropoint,
const int8_t* input_to_gate_weights,
int32_t input_to_gate_effective_scale_a,
int32_t input_to_gate_effective_scale_b,
int32_t n_batch, int32_t n_input, int32_t n_cell,
int8_t* gate_output, int8_t gate_output_zp);
// Same as above but has 16 bit and 8 bit input and 8 bit output.
// Used in projection when hidden is 16bit.
void MatrixBatchVectorMultiply(const int16_t* hidden,
const int8_t* hidden_to_output_weights,
int32_t proj_effective_scale_a,
int32_t proj_effective_scale_b,
const int32_t* gate_bias, int32_t n_batch,
int32_t n_hidden, int32_t n_output,
int32_t output_zp, int8_t* proj_output);
// Apply Layer Normalization (https://arxiv.org/abs/1607.06450) to a Quantized
// vector.
// Parameters:
// - input: batch vector of size n_batch * n_input; 16 bit.
// - layer_norm_weights: the quantized layer normalization weights.
// - bias: the bias for the layer normalization.
// - layer_norm_scale_a: multiplier for scale factor.
// - layer_norm_scale_b: shift for scale factor.
// - variance_limit: the guard to make sure the inverse does not overflow.
// - n_batch: the number of batches.
// - n_input: the size for input and output.
// - output: the 16 bit output
void ApplyLayerNorm(const int16_t* input, const int16_t* layer_norm_weights,
const int32_t* bias, int32_t layer_norm_scale_a,
int32_t layer_norm_scale_b, int32_t variance_limit,
int n_batch, int n_input, int16_t* output);
// Same as above but the internal calculation is done in float.
void ApplyLayerNormFloat(const int16_t* input,
const int16_t* layer_norm_weights,
int32_t layer_norm_scale_a, int32_t layer_norm_scale_b,
const int32_t* bias, int n_batch, int n_input,
int16_t* output);
// Apply Sigmoid to a quantized vector.
// Parameters:
// - input: batch vector of size n_batch * n_input; 16 bit.
// - n_batch: the number of batches.
// - n_input: the size for input and output.
// - output: the 16 bit output
// The input is in Q3.12 format and the output is in Q0.15 format.
void ApplySigmoid(const int16_t* input, int32_t n_batch, int32_t n_input,
int16_t* output);
// Same as above but the internal calculation is float.
void ApplySigmoidFloat(const int16_t* input, int32_t n_batch, int32_t n_input,
int16_t* output);
// Apply Tanh to a quantized vector.
// Parameters:
// - integer_bits: the integer bits of the input.
// Currently supports 0, 1, 2, 3, 4, 5, 6.
// - input: batch vector of size n_batch * n_input; 16 bit.
// - n_batch: the number of batches.
// - n_input: the size for input and output.
// - output: the 16 bit output
// The input is in Qm.15-m format and the output is in Q0.15 format.
void ApplyTanh(int32_t integer_bits, const int16_t* input, int32_t n_batch,
int32_t n_input, int16_t* output);
// Apply Tanh to a quantized vector. The internal calculation is in float.
// - Input has 2^(integer_bits) as scale.
// - Output has Q0.15 as scale.
void ApplyTanhFloat(const int16_t* input, int32_t n_batch, int32_t n_input,
int32_t integer_bits, int16_t* output);
// Element-wise multiplication of two quantized vectors.
// Parameters:
// - input_1: batch vector of size n_batch * n_input; 16 bit.
// - input_2: batch vector of size n_batch * n_input; 16 bit.
// - n_batch: the number of batches.
// - n_input: the size for input and output.
// - shift: the shift needed to produce the output.
// - output: the 16 bit output of size n_batch * n_input.
// Output does not need to be initialized.
void CwiseMul(const int16_t* input_1, const int16_t* input_2, int n_batch,
int n_input, int shift, int16_t* output);
// Element-wise multiplication of two quantized vectors.
// Parameters:
// - input_1: batch vector of size n_batch * n_input; 16 bit.
// - input_2: batch vector of size n_batch * n_input; 16 bit.
// - n_batch: the number of batches.
// - n_input: the size for input and output.
// - shift: the shift needed to produce the output.
// - output: the 8 bit output of size n_batch * n_input.
// Output does not need to be initialized.
void CwiseMul(const int16_t* input_1, const int16_t* input_2, int n_batch,
int n_input, int shift, int8_t* output);
// Element-wise multiplication of two quantized vectors with rescaling.
// Parameters:
// - input_1: batch vector of size n_batch * n_input; 16 bit.
// - input_2: batch vector of size n_batch * n_input; 16 bit.
// - multiplier: the multiplier part of scale.
// - shift: the shift part of scale.
// - n_batch: the number of batches.
// - n_input: the size for input and output.
// - output: the 8 bit output of size n_batch * n_input.
// - output_zp: the zero point of output.
// Output does not need to be initialized.
// Multiplier ("m") and shift ("s") are connected to the output scale by
// scale = m * 2^(s - 31).
void CwiseMul(const int16_t* input_1, const int16_t* input_2,
int32_t multiplier, int32_t shift, int32_t n_batch,
int32_t n_input, int32_t output_zp, int8_t* output);
// Element-wise saturating addition of two quantized vectors without rescaling.
// Parameters:
// - input_1: batch vector of size n_batch * n_input; 16 bit.
// - input_2: batch vector of size n_batch * n_input; 16 bit.
// - n_batch: the number of batches.
// - n_input: the size for input and output.
// - output: the 8 bit output of size n_batch * n_input.
// Output does not need to be initialized.
void CwiseAdd(const int16_t* input_1, const int16_t* input_2, int n_batch,
int n_input, int16_t* output);
// Element-wise in-place clipping of a vector. Overloaded for float, int16_t,
// int8_t. Parameters:
// - vector: vector of size v_size.
// - v_size: the size of the vector.
// - clipping_value: the value used for clipping.
void CwiseClipping(float* vector, const int v_size, const float clipping_value);
void CwiseClipping(int16_t* vector, const int v_size,
const int16_t clipping_value);
void CwiseClipping(int8_t* vector, const int v_size,
const int8_t clipping_value);
// Dot product of two vectors.
float VectorVectorDotProduct(const float* vector1, const float* vector2,
int v_size);
// Dot product of two batch vectors of size n_batch * v_size:
// vector1 = [x_1_1, x_1_2, ..., x_1_vsize,
// x_2_1, x_2_2, ..., x_2_vsize,
// ...
// x_nbatch_1,..., x_nbatch_vsize]
// vector2 = [y_1_1, y_1_2, ..., y_1_vsize,
// y_2_1, y_2_2, ..., y_2_vsize,
// ...
// y_nbatch_1,..., y_nbatch_vsize]
// Then result will be a vector of n_batch size starting from 'result':
// [x_1_1 * y_1_1 + x_1_2 * y_1_2 + ... + x_1_vsize * y_1_vsize,
// x_2_1 * y_2_1 + x_2_2 * y_2_2 + ... + x_2_vsize * y_2_vsize,
// ...
// x_nbatch_1 * y_nbatch_1 + ... + x_nbatch_vsize * y_nbatch_vsize]
template <typename T>
inline void BatchVectorBatchVectorDotProduct(const T* vector1, const T* vector2,
int v_size, int n_batch,
T* result) {
for (int b = 0; b < n_batch; b++) {
result[b] = VectorVectorDotProduct(vector1, vector2, v_size);
vector1 += v_size;
vector2 += v_size;
}
}
// Same as above but input is 16bit and output is 32bit.
void BatchVectorBatchVectorDotProduct(const int16_t* vector1,
const int16_t* vector2, int v_size,
int n_batch, int32_t* result);
// Same as above, but inputs are 16bit integer and output is 16bit integer.
void VectorBatchVectorCwiseProductAccumulate(const int16_t* vector, int v_size,
const int16_t* batch_vector,
int n_batch, int32_t multiplier,
int shift, int16_t* result);
// Compute "1.0f - elements of vector" (used in CIFG).
void Sub1Vector(const float* vector, int v_size, float* result);
// Compute "1.0f - elements of vector" (used in CIFG) for int16 input.
// "vector" has range [0, 32767] because it is the output of sigmoid function.
void Sub1Vector(const int16_t* vector, int v_size, int16_t* result);
// Multiply all elements of vector with a scalar.
void VectorScalarMultiply(const int8_t* vector, int v_size, float scale,
float* result);
// Reduce-sum on a float input vector:
// input_vector: float pointer to input vector.
// output_vector: float pointer to vector.
// output_size: output vector size.
// reduction_size: number of consecutive elements from input vector which are
// added to get one element of output.
void ReductionSumVector(const float* input_vector, float* output_vector,
int output_size, int reduction_size);
// Same as above but input/output is 32 bit integer.
void ReductionSumVector(const int32_t* input_vector, int32_t* output_vector,
int output_size, int reduction_size);
// Same as above but input is 8 bit integer.
void ReductionSumVector(const int8_t* input_vector, int32_t* output_vector,
int output_size, int reduction_size);
// Layer norm for each batch.
void MeanStddevNormalization(const float* input_vector, float* output_vector,
int v_size, int n_batch);
// Saturate Add with rescale on both inputs.
void TwoGateSaturatingAdd(const int8_t* input, int8_t input_zp,
const int8_t* recurrent, int8_t recurrent_zp,
int32_t input_effective_scale_a,
int32_t input_effective_scale_b,
int32_t recurrent_effective_scale_a,
int32_t recurrent_effective_scale_b, int32_t n_batch,
int32_t n_cell, int16_t* output);
} // namespace tensor_utils
} // namespace tflite
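A hedged usage sketch of the hybrid path documented above: quantize the float activations per batch, fold the per-batch scales together with the weight scale as the MatrixBatchVectorMultiplyAccumulate comment describes, then accumulate into a float result. Buffer handling and the single weight scale are illustrative assumptions.

#include <cstdint>
#include <vector>

// Assumption: the declarations above live in portable_tensor_utils.h, per the
// include swap in the next file of this commit.
#include "tensorflow/lite/kernels/internal/portable_tensor_utils.h"

// Sketch only: hybrid float-activation / int8-weight matmul. `result` holds
// m_rows * n_batch floats and is accumulated into, so pre-zero it if needed.
inline void ExampleHybridMatmul(const int8_t* weights, float weight_scale,
                                const float* input, int m_rows, int m_cols,
                                int n_batch, float* result) {
  std::vector<int8_t> quantized(static_cast<size_t>(n_batch) * m_cols);
  std::vector<float> scales(n_batch);
  std::vector<int32_t> zero_points(n_batch);
  // Per-batch symmetric quantization of the float activations.
  tflite::tensor_utils::BatchQuantizeFloats(
      input, n_batch, m_cols, quantized.data(), scales.data(),
      zero_points.data(), /*do_asymmetric=*/false);
  // Fold the weight scale into the per-batch factors, as documented above.
  for (int b = 0; b < n_batch; ++b) scales[b] *= weight_scale;
  tflite::tensor_utils::MatrixBatchVectorMultiplyAccumulate(
      weights, m_rows, m_cols, quantized.data(), scales.data(), n_batch,
      result);
}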

View File

@@ -20,7 +20,7 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/tensor_utils_common.h"
#include "tensorflow/lite/kernels/internal/portable_tensor_utils.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {

View File

@@ -0,0 +1,56 @@
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BROADCAST_ARGS_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BROADCAST_ARGS_H_
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
template <typename T>
void BroadcastArgs(const RuntimeShape& input1_shape, const T* input1_data,
const RuntimeShape& input2_shape, const T* input2_data,
const RuntimeShape& output_shape, T* output_data) {
// Gets data at the backward index i of the shape tensor. Returns 1 if the
// index is out of range.
auto get_shape_data = [](const RuntimeShape& shape, const T* data,
int backward_idx) -> T {
int forward_idx = shape.FlatSize() - 1 - backward_idx;
if (forward_idx < 0) return 1;
return data[forward_idx];
};
int output_num_elements = output_shape.FlatSize();
for (int i = 0; i < output_num_elements; ++i) {
int backward_i = output_num_elements - 1 - i;
int shape1_i = get_shape_data(input1_shape, input1_data, i);
int shape2_i = get_shape_data(input2_shape, input2_data, i);
if (shape1_i == 1) {
output_data[backward_i] = shape2_i;
} else if (shape2_i == 1) {
output_data[backward_i] = shape1_i;
} else {
TFLITE_CHECK_EQ(shape1_i, shape2_i);
output_data[backward_i] = shape1_i;
}
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BROADCAST_ARGS_H_
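A small worked example of the kernel above (illustrative): broadcasting the shape vectors {2, 1} and {1, 3} produces {2, 3}.

#include <cstdint>

#include "tensorflow/lite/kernels/internal/reference/broadcast_args.h"

inline void ExampleBroadcastArgs() {
  const int32_t shape1[] = {2, 1};
  const int32_t shape2[] = {1, 3};
  int32_t out[2] = {0, 0};
  // Each argument is a 1-D tensor of length 2 holding a shape.
  tflite::reference_ops::BroadcastArgs(tflite::RuntimeShape({2}), shape1,
                                       tflite::RuntimeShape({2}), shape2,
                                       tflite::RuntimeShape({2}), out);
  // out is now {2, 3}.
}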

View File

@@ -0,0 +1,97 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BROADCAST_TO_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BROADCAST_TO_H_
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/kernel_util.h"
namespace tflite {
namespace reference_ops {
template <int N>
void BroadcastImpl(const NdArrayDesc<N>& input_desc, const char* input_data,
const NdArrayDesc<N>& output_desc, char* output_data,
int indexes[N], int dim, const int last_broadcasting_dim,
const int type_size) {
// Copy data from input to output.
if (dim == last_broadcasting_dim) {
int copy_size = output_desc.strides[dim] * type_size;
const char* data_src =
input_data + SubscriptToIndex(input_desc, indexes) * type_size;
char* data_dst =
output_data + SubscriptToIndex(output_desc, indexes) * type_size;
for (int i = 0; i < output_desc.extents[dim]; ++i, data_dst += copy_size) {
memcpy(data_dst, data_src, copy_size);
}
return;
}
// Recursive call to find the next broadcasting.
for (indexes[dim] = 0; indexes[dim] < input_desc.extents[dim];
++indexes[dim]) {
BroadcastImpl<N>(input_desc, input_data, output_desc, output_data, indexes,
dim + 1, last_broadcasting_dim, type_size);
}
// Duplicate data in output tensor.
indexes[dim] = 0;
if (input_desc.extents[dim] != output_desc.extents[dim]) {
int copy_size = output_desc.strides[dim] * type_size;
char* data_src =
output_data + SubscriptToIndex(output_desc, indexes) * type_size;
char* data_dst = data_src + copy_size;
for (int i = 1; i < output_desc.extents[dim]; ++i, data_dst += copy_size) {
memcpy(data_dst, data_src, copy_size);
}
}
}
template <int N>
inline void BroadcastTo(const RuntimeShape& unextended_input_shape,
const char* input_data,
const RuntimeShape& unextended_output_shape,
char* output_data, TfLiteType data_type) {
NdArrayDesc<N> input_desc;
NdArrayDesc<N> output_desc;
CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_input_shape),
&input_desc);
CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape),
&output_desc);
// Get the last dimension that has broadcasting. At this dimension, the data
// is copied from the input tensor to the output tensor.
int last_broadcast_dim = -1;
for (int i = N - 1; i >= 0; --i) {
if (input_desc.extents[i] != output_desc.extents[i]) {
last_broadcast_dim = i;
break;
}
}
// If non-broadcasting, just copy data from input to output tensor.
if (last_broadcast_dim == -1) {
memcpy(output_data, input_data,
unextended_input_shape.FlatSize() * TfLiteTypeGetSize(data_type));
return;
}
// Broadcasting using memcpy.
int indexes[N] = {0};
BroadcastImpl<N>(input_desc, input_data, output_desc, output_data, indexes, 0,
last_broadcast_dim, TfLiteTypeGetSize(data_type));
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BROADCAST_TO_H_
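A matching sketch for BroadcastTo above. Data travels as raw bytes with the element size taken from the TfLiteType; fixing N at 8 here is an assumption about the maximum rank the calling kernel passes.

#include "tensorflow/lite/kernels/internal/reference/broadcast_to.h"

inline void ExampleBroadcastTo() {
  const float input[3] = {1.f, 2.f, 3.f};
  float output[6] = {};
  // Broadcast a [1, 3] tensor to [2, 3].
  tflite::reference_ops::BroadcastTo<8>(
      tflite::RuntimeShape({1, 3}), reinterpret_cast<const char*>(input),
      tflite::RuntimeShape({2, 3}), reinterpret_cast<char*>(output),
      kTfLiteFloat32);
  // output is now {1, 2, 3, 1, 2, 3}.
}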

View File

@@ -43,7 +43,7 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
(void)im2col_data; // only used in optimized code.
(void)im2col_shape; // only used in optimized code.
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
const int input_depth = input_shape.Dims(3);
const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
if (bias_data) {
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
@@ -52,14 +52,20 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
const int input_width = input_shape.Dims(2);
const int filter_height = filter_shape.Dims(1);
const int filter_width = filter_shape.Dims(2);
const int filter_input_depth = filter_shape.Dims(3);
const int groups = input_depth / filter_input_depth;
TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0);
const int filters_per_group = output_depth / groups;
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
for (int batch = 0; batch < batches; ++batch) {
for (int out_y = 0; out_y < output_height; ++out_y) {
const int in_y_origin = (out_y * stride_height) - pad_height;
for (int out_x = 0; out_x < output_width; ++out_x) {
const int in_x_origin = (out_x * stride_width) - pad_width;
for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
auto group = out_channel / filters_per_group;
float total = 0.f;
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
const int in_y = in_y_origin + dilation_height_factor * filter_y;
@@ -74,10 +80,11 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
if (!is_point_inside_image) {
continue;
}
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
float input_value = input_data[Offset(input_shape, batch, in_y,
in_x, in_channel)];
for (int in_channel = 0; in_channel < filter_input_depth;
++in_channel) {
float input_value =
input_data[Offset(input_shape, batch, in_y, in_x,
in_channel + group * filter_input_depth)];
float filter_value = filter_data[Offset(
filter_shape, out_channel, filter_y, filter_x, in_channel)];
total += (input_value * filter_value);
@@ -126,7 +133,7 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
const int input_depth = input_shape.Dims(3);
const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
if (bias_data) {
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
@@ -135,6 +142,10 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
const int input_width = input_shape.Dims(2);
const int filter_height = filter_shape.Dims(1);
const int filter_width = filter_shape.Dims(2);
const int filter_input_depth = filter_shape.Dims(3);
const int groups = input_depth / filter_input_depth;
TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0);
const int filters_per_group = output_depth / groups;
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
for (int batch = 0; batch < batches; ++batch) {
@@ -143,6 +154,7 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
for (int out_x = 0; out_x < output_width; ++out_x) {
const int in_x_origin = (out_x * stride_width) - pad_width;
for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
auto group = out_channel / filters_per_group;
int32_t acc = 0;
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
const int in_y = in_y_origin + dilation_height_factor * filter_y;
@@ -158,9 +170,11 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
continue;
}
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
int32_t input_val = input_data[Offset(input_shape, batch, in_y,
in_x, in_channel)];
for (int in_channel = 0; in_channel < filter_input_depth;
++in_channel) {
int32_t input_val =
input_data[Offset(input_shape, batch, in_y, in_x,
in_channel + group * filter_input_depth)];
int32_t filter_val = filter_data[Offset(
filter_shape, out_channel, filter_y, filter_x, in_channel)];
acc +=
@@ -206,7 +220,7 @@ inline void HybridConvPerChannel(
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
const int input_depth = input_shape.Dims(3);
const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
if (bias_data) {
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
@@ -215,18 +229,24 @@ inline void HybridConvPerChannel(
const int input_width = input_shape.Dims(2);
const int filter_height = filter_shape.Dims(1);
const int filter_width = filter_shape.Dims(2);
const int filter_input_depth = filter_shape.Dims(3);
const int groups = input_depth / filter_input_depth;
TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0);
const int filters_per_group = output_depth / groups;
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
for (int batch = 0; batch < batches; ++batch) {
for (int out_y = 0; out_y < output_height; ++out_y) {
for (int out_x = 0; out_x < output_width; ++out_x) {
for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
auto group = out_channel / filters_per_group;
const int in_x_origin = (out_x * stride_width) - pad_width;
const int in_y_origin = (out_y * stride_height) - pad_height;
int32_t acc = 0;
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
for (int in_channel = 0; in_channel < filter_input_depth;
++in_channel) {
const int in_x = in_x_origin + dilation_width_factor * filter_x;
const int in_y =
in_y_origin + dilation_height_factor * filter_y;
@@ -235,7 +255,8 @@ inline void HybridConvPerChannel(
if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
(in_y < input_height)) {
int32_t input_val = input_data[Offset(
input_shape, batch, in_y, in_x, in_channel)];
input_shape, batch, in_y, in_x,
in_channel + group * filter_input_depth)];
int32_t filter_val =
filter_data[Offset(filter_shape, out_channel, filter_y,
filter_x, in_channel)];
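The conv changes above add grouped-convolution support: the filter's innermost dimension (filter_input_depth) may be a divisor of the input depth, and each output channel reads only its group's slice of input channels. A standalone sketch of the index arithmetic shared by all three kernels (illustrative, not library code):

// Maps (out_channel, in_channel within the filter) to the input-tensor
// channel; this is the `in_channel + group * filter_input_depth` term above.
inline int GroupedInputChannel(int out_channel, int in_channel,
                               int output_depth, int input_depth,
                               int filter_input_depth) {
  const int groups = input_depth / filter_input_depth;   // e.g. 8 / 4 = 2
  const int filters_per_group = output_depth / groups;   // e.g. 16 / 2 = 8
  const int group = out_channel / filters_per_group;     // which input slice
  return in_channel + group * filter_input_depth;
}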

View File

@@ -48,7 +48,7 @@ inline void ConvPerChannel(
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
const int input_depth = input_shape.Dims(3);
const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
if (bias_data) {
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
@@ -59,6 +59,10 @@ inline void ConvPerChannel(
const int input_width = input_shape.Dims(2);
const int filter_height = filter_shape.Dims(1);
const int filter_width = filter_shape.Dims(2);
const int filter_input_depth = filter_shape.Dims(3);
const int groups = input_depth / filter_input_depth;
TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0);
const int filters_per_group = output_depth / groups;
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
for (int batch = 0; batch < batches; ++batch) {
@@ -67,6 +71,7 @@ inline void ConvPerChannel(
for (int out_x = 0; out_x < output_width; ++out_x) {
const int in_x_origin = (out_x * stride_width) - pad_width;
for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
auto group = out_channel / filters_per_group;
int32_t acc = 0;
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
const int in_y = in_y_origin + dilation_height_factor * filter_y;
@@ -82,9 +87,11 @@ inline void ConvPerChannel(
continue;
}
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
int32_t input_val = input_data[Offset(input_shape, batch, in_y,
in_x, in_channel)];
for (int in_channel = 0; in_channel < filter_input_depth;
++in_channel) {
int32_t input_val =
input_data[Offset(input_shape, batch, in_y, in_x,
in_channel + group * filter_input_depth)];
int32_t filter_val = filter_data[Offset(
filter_shape, out_channel, filter_y, filter_x, in_channel)];
// Accumulate with 32 bits accumulator.
@@ -126,12 +133,13 @@ inline void ConvPerChannel(
// Fixed-point per-channel-quantization convolution reference kernel.
// 16-bit data and 8-bit filter
template <typename AccumScalar>
inline void ConvPerChannel(
const ConvParams& params, const int32_t* output_multiplier,
const int32_t* output_shift, const RuntimeShape& input_shape,
const int16_t* input_data, const RuntimeShape& filter_shape,
const int8_t* filter_data, const RuntimeShape& bias_shape,
const std::int64_t* bias_data, const RuntimeShape& output_shape,
const AccumScalar* bias_data, const RuntimeShape& output_shape,
int16_t* output_data) {
// Get parameters.
const int stride_width = params.stride_width;
@@ -151,7 +159,7 @@ inline void ConvPerChannel(
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
const int input_depth = input_shape.Dims(3);
const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
if (bias_data) {
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
@@ -162,6 +170,10 @@ inline void ConvPerChannel(
const int input_width = input_shape.Dims(2);
const int filter_height = filter_shape.Dims(1);
const int filter_width = filter_shape.Dims(2);
const int filter_input_depth = filter_shape.Dims(3);
const int groups = input_depth / filter_input_depth;
TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0);
const int filters_per_group = output_depth / groups;
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
for (int batch = 0; batch < batches; ++batch) {
@@ -170,7 +182,8 @@ inline void ConvPerChannel(
for (int out_x = 0; out_x < output_width; ++out_x) {
const int in_x_origin = (out_x * stride_width) - pad_width;
for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
std::int64_t acc = 0;
auto group = out_channel / filters_per_group;
AccumScalar acc = 0;
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
const int in_y = in_y_origin + dilation_height_factor * filter_y;
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
@@ -185,9 +198,11 @@ inline void ConvPerChannel(
continue;
}
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
int32_t input_val = input_data[Offset(input_shape, batch, in_y,
in_x, in_channel)];
for (int in_channel = 0; in_channel < filter_input_depth;
++in_channel) {
int32_t input_val =
input_data[Offset(input_shape, batch, in_y, in_x,
in_channel + group * filter_input_depth)];
int32_t filter_val = filter_data[Offset(
filter_shape, out_channel, filter_y, filter_x, in_channel)];
// Accumulate with 64 bits accumulator.

View File

@@ -34,12 +34,13 @@ inline void FullyConnected(
const int32_t output_activation_min = params.quantized_activation_min;
const int32_t output_activation_max = params.quantized_activation_max;
TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 2);
TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
const int filter_dim_count = filter_shape.DimensionsCount();
const int batches = output_shape.Dims(0);
const int output_depth = output_shape.Dims(1);
const int output_dim_count = output_shape.DimensionsCount();
const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
const int output_depth = output_shape.Dims(output_dim_count - 1);
TFLITE_DCHECK_LE(output_depth, filter_shape.Dims(filter_dim_count - 2));
const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
for (int b = 0; b < batches; ++b) {
@@ -62,11 +63,12 @@ inline void FullyConnected(
}
}
template <typename AccumScalar>
inline void FullyConnected(
const FullyConnectedParams& params, const RuntimeShape& input_shape,
const int16_t* input_data, const RuntimeShape& filter_shape,
const int8_t* filter_data, const RuntimeShape& bias_shape,
const int64_t* bias_data, const RuntimeShape& output_shape,
const AccumScalar* bias_data, const RuntimeShape& output_shape,
int16_t* output_data) {
const int32_t filter_offset = params.weights_offset;
const int32_t output_multiplier = params.output_multiplier;
@@ -85,7 +87,7 @@ inline void FullyConnected(
const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
for (int b = 0; b < batches; ++b) {
for (int out_c = 0; out_c < output_depth; ++out_c) {
int64_t acc = 0;
AccumScalar acc = 0;
for (int d = 0; d < accum_depth; ++d) {
int32_t input_val = input_data[b * accum_depth + d];
int32_t filter_val = filter_data[out_c * accum_depth + d];
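These ConvPerChannel/FullyConnected changes replace the hard-coded int64 bias and accumulator with an AccumScalar template parameter, so the 16-bit kernels can accumulate in either 32 or 64 bits. A standalone sketch of the same pattern, with hypothetical names, showing how the accumulator width follows the bias type:

#include <cstdint>
#include <cstdio>

template <typename AccumScalar>
AccumScalar DotWithBias(const int16_t* input, const int8_t* filter, int depth,
                        AccumScalar bias) {
  AccumScalar acc = bias;  // accumulator width follows the bias type
  for (int d = 0; d < depth; ++d) {
    acc += static_cast<AccumScalar>(input[d]) * filter[d];
  }
  return acc;
}

int main() {
  const int16_t input[3] = {1000, -2000, 3000};
  const int8_t filter[3] = {10, 20, -30};
  // int32 accumulation, as enabled by the new template parameter...
  const int32_t acc32 = DotWithBias<int32_t>(input, filter, 3, 5);
  // ...and the original int64 path, using the same kernel body.
  const int64_t acc64 = DotWithBias<int64_t>(input, filter, 3, 5);
  std::printf("%d %lld\n", acc32, static_cast<long long>(acc64));
  return 0;
}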

View File

@@ -119,15 +119,16 @@ inline void TransposeConv(
}
}
// int16_t input (zero_point=0), int8_t filter, int64 accumulator
// int16_t input (zero_point=0), int8_t filter, int32 or int64 accumulator
template <typename Scalar>
inline void TransposeConv(
const ConvParams& params, const int32_t* output_multiplier,
const int32_t* output_shift, const RuntimeShape& input_shape,
const int16_t* input_data, const RuntimeShape& filter_shape,
const int8_t* filter_data, const RuntimeShape& bias_shape,
const std::int64_t* bias_data, const RuntimeShape& output_shape,
const Scalar* bias_data, const RuntimeShape& output_shape,
int16_t* output_data, const RuntimeShape& im2col_shape, int8_t* im2col_data,
std::int64_t* scratch_buffer) {
Scalar* scratch_buffer) {
const int stride_width = params.stride_width;
const int stride_height = params.stride_height;
const int pad_width = params.padding_values.width;
@@ -157,7 +158,7 @@ inline void TransposeConv(
const int num_elements = output_shape.FlatSize();
// We need to initialize scratch_buffer to all 0s, as we apply the same
// 'scatter' based trick as in the float version.
memset(scratch_buffer, 0, num_elements * sizeof(std::int64_t));
memset(scratch_buffer, 0, num_elements * sizeof(Scalar));
// Loop through input elements one at a time.
for (int batch = 0; batch < batches; ++batch) {
@@ -198,8 +199,8 @@ inline void TransposeConv(
for (int out_y = 0; out_y < output_height; ++out_y) {
for (int out_x = 0; out_x < output_width; ++out_x) {
for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
std::int64_t acc = scratch_buffer[Offset(output_shape, batch, out_y,
out_x, out_channel)];
Scalar acc = scratch_buffer[Offset(output_shape, batch, out_y, out_x,
out_channel)];
if (bias_data) {
acc += bias_data[out_channel];
}
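The TransposeConv change above types the scratch buffer by the same Scalar parameter, which is why the memset now clears num_elements * sizeof(Scalar) instead of a fixed sizeof(std::int64_t). A small illustrative sketch (hypothetical helper, not part of the diff):

#include <cstdint>
#include <cstring>

// The scratch buffer holds one accumulator per output element, so the number
// of bytes to clear must track the accumulator type.
template <typename Scalar>
void ClearScratch(Scalar* scratch_buffer, int num_elements) {
  std::memset(scratch_buffer, 0, num_elements * sizeof(Scalar));
}

int main() {
  std::int32_t scratch32[16];
  std::int64_t scratch64[16];
  ClearScratch(scratch32, 16);  // clears 64 bytes
  ClearScratch(scratch64, 16);  // clears 128 bytes
  return 0;
}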

View File

@@ -0,0 +1,422 @@
/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LSTM_CELL_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LSTM_CELL_H_
#include <algorithm>
#include <cmath>
#include <cstdint>
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/reference/concatenation.h"
#include "tensorflow/lite/kernels/internal/reference/fully_connected.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {
inline void LstmCell(
const LstmCellParams& params, const RuntimeShape& unextended_input_shape,
const float* input_data, const RuntimeShape& unextended_prev_activ_shape,
const float* prev_activ_data, const RuntimeShape& weights_shape,
const float* weights_data, const RuntimeShape& unextended_bias_shape,
const float* bias_data, const RuntimeShape& unextended_prev_state_shape,
const float* prev_state_data,
const RuntimeShape& unextended_output_state_shape, float* output_state_data,
const RuntimeShape& unextended_output_activ_shape, float* output_activ_data,
const RuntimeShape& unextended_concat_temp_shape, float* concat_temp_data,
const RuntimeShape& unextended_activ_temp_shape, float* activ_temp_data) {
TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_prev_activ_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_bias_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_prev_state_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_output_state_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_output_activ_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_concat_temp_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_activ_temp_shape.DimensionsCount(), 4);
const RuntimeShape input_shape =
RuntimeShape::ExtendedShape(4, unextended_input_shape);
const RuntimeShape prev_activ_shape =
RuntimeShape::ExtendedShape(4, unextended_prev_activ_shape);
const RuntimeShape bias_shape =
RuntimeShape::ExtendedShape(4, unextended_bias_shape);
const RuntimeShape prev_state_shape =
RuntimeShape::ExtendedShape(4, unextended_prev_state_shape);
const RuntimeShape output_state_shape =
RuntimeShape::ExtendedShape(4, unextended_output_state_shape);
const RuntimeShape output_activ_shape =
RuntimeShape::ExtendedShape(4, unextended_output_activ_shape);
const RuntimeShape concat_temp_shape =
RuntimeShape::ExtendedShape(4, unextended_concat_temp_shape);
const RuntimeShape activ_temp_shape =
RuntimeShape::ExtendedShape(4, unextended_activ_temp_shape);
TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
const int weights_dim_count = weights_shape.DimensionsCount();
const int batches =
MatchingDim(input_shape, 0, prev_activ_shape, 0, prev_state_shape, 0,
output_state_shape, 0, output_activ_shape, 0);
const int height =
MatchingDim(input_shape, 1, prev_activ_shape, 1, prev_state_shape, 1,
output_state_shape, 1, output_activ_shape, 1);
const int width =
MatchingDim(input_shape, 2, prev_activ_shape, 2, prev_state_shape, 2,
output_state_shape, 2, output_activ_shape, 2);
const int input_depth = input_shape.Dims(3);
const int prev_activ_depth = prev_activ_shape.Dims(3);
const int total_input_depth = prev_activ_depth + input_depth;
TFLITE_DCHECK_EQ(weights_shape.Dims(weights_dim_count - 1),
total_input_depth);
TFLITE_DCHECK_EQ(FlatSizeSkipDim(bias_shape, 3), 1);
const int intern_activ_depth =
MatchingDim(weights_shape, weights_dim_count - 2, bias_shape, 3);
TFLITE_DCHECK_EQ(weights_shape.FlatSize(),
intern_activ_depth * total_input_depth);
TFLITE_DCHECK_EQ(intern_activ_depth % 4, 0);
const int output_depth =
MatchingDim(prev_state_shape, 3, prev_activ_shape, 3, output_state_shape,
3, output_activ_shape, 3);
TFLITE_DCHECK_EQ(output_depth, intern_activ_depth / 4);
// Concatenate prev_activ and input data together
float const* concat_input_arrays_data[2] = {input_data, prev_activ_data};
const RuntimeShape* concat_input_arrays_shapes[2] = {&input_shape,
&prev_activ_shape};
tflite::ConcatenationParams concat_params;
concat_params.axis = 3;
concat_params.inputs_count = 2;
Concatenation(concat_params, concat_input_arrays_shapes,
concat_input_arrays_data, concat_temp_shape, concat_temp_data);
// Fully connected
tflite::FullyConnectedParams fc_params;
fc_params.float_activation_min = std::numeric_limits<float>::lowest();
fc_params.float_activation_max = std::numeric_limits<float>::max();
FullyConnected(fc_params, concat_temp_shape, concat_temp_data, weights_shape,
weights_data, bias_shape, bias_data, activ_temp_shape,
activ_temp_data);
// Memory state update (the LSTM "guts")
for (int b = 0; b < batches; ++b) {
for (int w = 0; w < width; ++w) {
for (int h = 0; h < height; ++h) {
for (int c = 0; c < output_depth; ++c) {
const float input_gate =
1.f /
(1.f + std::exp(-activ_temp_data[Offset(activ_temp_shape, b, h, w,
0 * output_depth + c)]));
const float new_input = std::tanh(activ_temp_data[Offset(
activ_temp_shape, b, h, w, 1 * output_depth + c)]);
const float forget_gate =
1.f /
(1.f + std::exp(-activ_temp_data[Offset(activ_temp_shape, b, h, w,
2 * output_depth + c)]));
const float output_gate =
1.f /
(1.f + std::exp(-activ_temp_data[Offset(activ_temp_shape, b, h, w,
3 * output_depth + c)]));
const float new_state =
input_gate * new_input +
forget_gate *
prev_state_data[Offset(prev_state_shape, b, h, w, c)];
output_state_data[Offset(output_state_shape, b, h, w, c)] = new_state;
output_activ_data[Offset(output_activ_shape, b, h, w, c)] =
output_gate * std::tanh(new_state);
}
}
}
}
}
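For reference, the per-element gate math of the float LstmCell above, pulled out into a standalone sketch (hypothetical names; sigmoid for the input/forget/output gates, tanh for the candidate input and the final activation):

#include <cmath>
#include <cstdio>

struct LstmGateResult {
  float new_state;  // value written to output_state_data
  float new_activ;  // value written to output_activ_data
};

inline LstmGateResult LstmElement(float input_gate_in, float new_input_in,
                                  float forget_gate_in, float output_gate_in,
                                  float prev_state) {
  const float input_gate = 1.f / (1.f + std::exp(-input_gate_in));
  const float new_input = std::tanh(new_input_in);
  const float forget_gate = 1.f / (1.f + std::exp(-forget_gate_in));
  const float output_gate = 1.f / (1.f + std::exp(-output_gate_in));
  const float new_state = input_gate * new_input + forget_gate * prev_state;
  return {new_state, output_gate * std::tanh(new_state)};
}

int main() {
  const LstmGateResult r = LstmElement(0.f, 0.5f, 0.f, 0.f, 0.2f);
  std::printf("state=%f activ=%f\n", r.new_state, r.new_activ);
  return 0;
}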
// Quantized LSTM cell implementation.
// The quantization of the input, output arrays is as follows:
// - The input activations are quantized as uint8 on the interval
// [-1, 127/128].
// The rationale for that is that this is the natural interval for output
// activations (see next point) and these need to be concatenated together.
// We could accommodate different ranges by re-scaling, but we empirically
// found that setting the input activations range to be [-1, 127/128] in the
// first place, removing the need for re-scaling, greatly improves accuracy.
// - The output activations are quantized as uint8 on the interval
// [-1, 127/128].
// The rationale for that is that the definition of an LSTM cell makes them
// intrinsically constrained in [-1, 1]; tweaking that to [-1, 127/128]
// makes for simpler, more accurate fixed-point arithmetic.
// - The output-at-previous-timestep state array is obviously quantized as
// the output activations.
// - The internal LSTM memory (not the output-at-previous-timestep, the other
// internal state array) is int16-quantized and may use any power-of-two,
// symmetric range i.e. [-2^N, 2^N * 32767/32768] for any N, which we call
// StateIntegerBits below, see the below discussion of that template
// parameter ("The StateIntegerBits template parameter").
// - The output of the internal fully-connected node is int16-quantized
// on the interval [-8, 8 * 32767/32768], the rationale for which is
// explained just below ("Why [-8, 8] for fully-connected output?").
//
//
// === The StateIntegerBits template parameter ===
//
// The StateIntegerBits template parameter controls the fixed-point format used
// to represent the internal memory of the LSTM cell (not the
// output-at-previous-timestep, the other internal state array). It's currently
// a template parameter so that the model can control that. The most typical
// value for StateIntegerBits is 4. Other plausible values are anywhere between
// 3 and 5. We might eventually standardize on a single supported value, e.g. 4,
// and drop that template parameter. The reason why it can't be a runtime
// parameter is that this controls the fixed-point format used, i.e. we need to
// generate actually different code based on it. In particular, we generate code
// for a fixed-point tanh() implementation for that format, which internally
// uses a fixed-point exp() implementation, which internally uses a
// barrel-shifter with a number of steps that depends on StateIntegerBits.
// Another consequence of that is that a higher value of StateIntegerBits
// results in a more expensive implementation (more barrel shifter steps
// needed).
//
//
// === Why [-8, 8] for fully-connected output? ===
//
// This array is only fed to Logistic and Tanh functions, for which
// the quantized implementation will want to use fixed-point arithmetic,
// requiring a power-of-two representation interval. Thus, we should right
// away quantize this array to a power-of-two interval; otherwise,
// implementation will need to rescale that, losing any benefit that a tighter
// representation interval might otherwise yield, while introducing some
// numerical error and computational overhead.
//
// Now, Logistic and Tanh
// are nearly constant (nearly equal to their horizontal asymptotes)
// outside of a small bounded interval around 0:
//
// Logistic(4) = 1 - 1.8e-2 Tanh(4) = 1 - 6.7e-4
// Logistic(8) = 1 - 3.4e-4 Tanh(8) = 1 - 2.3e-7
// Logistic(16) = 1 - 1.1e-7 Tanh(16) = 1 - 2.5e-14
//
// From this, we see that clamping to [-4, 4] would be too inaccurate
// (the error of 1.8e-2 on Logistic would be felt even in 8bit precision)
// while clamping to [-16, 16] would make no difference even in float32.
// However, for a fixed-point implementation in 16-bit integers, using 5
// integer bits to represent the [-16, 16] range would leave only 11
// fractional bits, giving an increment of 2^-11 = 4.9e-4 between consecutive
// representable values. Notice that this is higher than the
// worst-case clamping error with clamping to [-8, 8]: 3.4e-4 for Logistic.
// Using [-8, 8] thus seems like the better compromise overall, enjoying
// an increment of 2.4e-4 between representable values and a worst-case
// clamping error of 3.4e-4, both better than the increment of 4.9e-4 with
// [-16, 16].
//
// Moreover, all other things being equal, it is nice to choose the narrower
// representation range, as that makes the implementation of fixed-point
// math functions a little cheaper (each integer bit requires an additional
// barrel-shifter step in the implementation of exp(-x)). That is further
// reason to prefer [-8, 8] over [-16, 16]. The choice of [-16, 16] would make
// sense for 32-bit float or 32-bit fixed-point quantization, but we are
// aiming for 16-bit fixed-point quantization of these internal nodes here.
//
template <int StateIntegerBits>
inline void LstmCell(const LstmCellParams& params,
const RuntimeShape& unextended_input_shape,
const uint8_t* input_data_uint8,
const RuntimeShape& unextended_prev_activ_shape,
const uint8_t* prev_activ_data_uint8,
const RuntimeShape& weights_shape,
const uint8_t* weights_data_uint8,
const RuntimeShape& unextended_bias_shape,
const int32_t* bias_data_int32,
const RuntimeShape& unextended_prev_state_shape,
const int16_t* prev_state_data_int16,
const RuntimeShape& unextended_output_state_shape,
int16_t* output_state_data_int16,
const RuntimeShape& unextended_output_activ_shape,
uint8_t* output_activ_data_uint8,
const RuntimeShape& unextended_concat_temp_shape,
uint8_t* concat_temp_data_uint8,
const RuntimeShape& unextended_activ_temp_shape,
int16_t* activ_temp_data_int16, void* gemmlowp_context) {
(void)gemmlowp_context; // only used in optimized code.
int32_t weights_zero_point = params.weights_zero_point;
int32_t accum_multiplier = params.accum_multiplier;
int accum_shift = params.accum_shift;
TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_prev_activ_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_bias_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_prev_state_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_output_state_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_output_activ_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_concat_temp_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(unextended_activ_temp_shape.DimensionsCount(), 4);
const RuntimeShape input_shape =
RuntimeShape::ExtendedShape(4, unextended_input_shape);
const RuntimeShape prev_activ_shape =
RuntimeShape::ExtendedShape(4, unextended_prev_activ_shape);
const RuntimeShape bias_shape =
RuntimeShape::ExtendedShape(4, unextended_bias_shape);
const RuntimeShape prev_state_shape =
RuntimeShape::ExtendedShape(4, unextended_prev_state_shape);
const RuntimeShape output_state_shape =
RuntimeShape::ExtendedShape(4, unextended_output_state_shape);
const RuntimeShape output_activ_shape =
RuntimeShape::ExtendedShape(4, unextended_output_activ_shape);
const RuntimeShape concat_temp_shape =
RuntimeShape::ExtendedShape(4, unextended_concat_temp_shape);
const RuntimeShape activ_temp_shape =
RuntimeShape::ExtendedShape(4, unextended_activ_temp_shape);
TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
// Gather dimensions information, and perform consistency checks.
const int weights_dim_count = weights_shape.DimensionsCount();
const int outer_size = MatchingFlatSizeSkipDim(
input_shape, 3, prev_activ_shape, prev_state_shape, output_state_shape,
output_activ_shape);
const int input_depth = input_shape.Dims(3);
const int prev_activ_depth = prev_activ_shape.Dims(3);
const int total_input_depth = prev_activ_depth + input_depth;
TFLITE_DCHECK_EQ(weights_shape.Dims(weights_dim_count - 1),
total_input_depth);
const int intern_activ_depth =
MatchingDim(weights_shape, weights_dim_count - 2, bias_shape, 3);
TFLITE_DCHECK_EQ(weights_shape.FlatSize(),
intern_activ_depth * total_input_depth);
TFLITE_DCHECK_EQ(FlatSizeSkipDim(bias_shape, 3), 1);
TFLITE_DCHECK_EQ(intern_activ_depth % 4, 0);
const int output_depth =
MatchingDim(prev_state_shape, 3, prev_activ_shape, 3, output_state_shape,
3, output_activ_shape, 3);
TFLITE_DCHECK_EQ(output_depth, intern_activ_depth / 4);
const int fc_batches = FlatSizeSkipDim(activ_temp_shape, 3);
const int fc_output_depth =
MatchingDim(weights_shape, weights_dim_count - 2, activ_temp_shape, 3);
const int fc_accum_depth = total_input_depth;
TFLITE_DCHECK_EQ(fc_output_depth, 4 * output_depth);
// Depth-concatenate prev_activ and input data together.
uint8_t const* concat_input_arrays_data[2] = {input_data_uint8,
prev_activ_data_uint8};
const RuntimeShape* concat_input_arrays_shapes[2] = {&input_shape,
&prev_activ_shape};
tflite::ConcatenationParams concat_params;
concat_params.axis = 3;
concat_params.inputs_count = 2;
Concatenation(concat_params, concat_input_arrays_shapes,
concat_input_arrays_data, concat_temp_shape,
concat_temp_data_uint8);
// Implementation of the fully connected node inside the LSTM cell.
// The operands are 8-bit integers, the accumulators are internally 32bit
// integers, and the output is 16-bit fixed-point with 3 integer bits so
// the output range is [-2^3, 2^3] == [-8, 8]. The rationale for that
// is explained in the function comment above.
for (int b = 0; b < fc_batches; ++b) {
for (int out_c = 0; out_c < fc_output_depth; ++out_c) {
// Internal accumulation.
// Initialize accumulator with the bias-value.
int32_t accum = bias_data_int32[out_c];
// Accumulation loop.
for (int d = 0; d < fc_accum_depth; ++d) {
int16_t input_val =
concat_temp_data_uint8[b * fc_accum_depth + d] - 128;
int16_t weights_val =
weights_data_uint8[out_c * fc_accum_depth + d] - weights_zero_point;
accum += input_val * weights_val;
}
// Down-scale the final int32 accumulator to the scale used by our
// (16-bit, using 3 integer bits) fixed-point format. The quantized
// multiplier and shift here have been pre-computed offline
// (e.g. by toco).
accum =
MultiplyByQuantizedMultiplier(accum, accum_multiplier, accum_shift);
// Saturate, cast to int16, and store to the temporary activations array.
accum = std::max(-32768, std::min(32767, accum));
activ_temp_data_int16[out_c + fc_output_depth * b] = accum;
}
}
// Rest of the LSTM cell: tanh and logistic math functions, and some adds
// and muls, all done in 16-bit fixed-point.
for (int b = 0; b < outer_size; ++b) {
for (int c = 0; c < output_depth; ++c) {
// Define the fixed-point data types that we will use here. All use
// int16 as the underlying integer type i.e. all are 16-bit fixed-point.
// They only differ by the number of integral vs. fractional bits,
// determining the range of values that they can represent.
//
// F0 uses 0 integer bits, range [-1, 1].
// This is the return type of math functions such as tanh, logistic,
// whose range is in [-1, 1].
using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
// F3 uses 3 integer bits, range [-8, 8].
// This is the range of the previous fully-connected node's output,
// which is our input here.
using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
// FS uses StateIntegerBits integer bits, range [-2^StateIntegerBits,
// 2^StateIntegerBits]. It's used to represent the internal state, whose
// number of integer bits is currently dictated by the model. See comment
// on the StateIntegerBits template parameter above.
using FS = gemmlowp::FixedPoint<std::int16_t, StateIntegerBits>;
// Implementation of input gate, using fixed-point logistic function.
F3 input_gate_input = F3::FromRaw(
activ_temp_data_int16[b * fc_output_depth + 0 * output_depth + c]);
F0 input_gate_output = gemmlowp::logistic(input_gate_input);
// Implementation of input modulation gate, using fixed-point tanh
// function.
F3 input_modulation_gate_input = F3::FromRaw(
activ_temp_data_int16[b * fc_output_depth + 1 * output_depth + c]);
F0 input_modulation_gate_output =
gemmlowp::tanh(input_modulation_gate_input);
// Implementation of forget gate, using fixed-point logistic function.
F3 forget_gate_input = F3::FromRaw(
activ_temp_data_int16[b * fc_output_depth + 2 * output_depth + c]);
F0 forget_gate_output = gemmlowp::logistic(forget_gate_input);
// Implementation of output gate, using fixed-point logistic function.
F3 output_gate_input = F3::FromRaw(
activ_temp_data_int16[b * fc_output_depth + 3 * output_depth + c]);
F0 output_gate_output = gemmlowp::logistic(output_gate_input);
// Implementation of internal multiplication nodes, still in fixed-point.
F0 input_times_input_modulation =
input_gate_output * input_modulation_gate_output;
FS prev_state = FS::FromRaw(prev_state_data_int16[b * output_depth + c]);
FS prev_state_times_forget_state = forget_gate_output * prev_state;
// Implementation of internal addition node, saturating.
FS new_state = gemmlowp::SaturatingAdd(
gemmlowp::Rescale<StateIntegerBits>(input_times_input_modulation),
prev_state_times_forget_state);
// Implementation of last internal Tanh node, still in fixed-point.
// Since a Tanh fixed-point implementation is specialized for a given
// number of integer bits, and each specialization can have a substantial
// code size, and we already used above a Tanh on an input with 3 integer
// bits, and per the table in the above function comment there is no
// significant accuracy to be lost by clamping to [-8, +8] for a
// 3-integer-bits representation, let us just do that. This helps people
// porting this to targets where code footprint must be minimized.
F3 new_state_f3 = gemmlowp::Rescale<3>(new_state);
F0 output_activ_int16 = output_gate_output * gemmlowp::tanh(new_state_f3);
// Store the new internal state back to memory, as 16-bit integers.
// Note: here we store the original value with StateIntegerBits, not
// the rescaled 3-integer-bits value fed to tanh.
output_state_data_int16[b * output_depth + c] = new_state.raw();
// Down-scale the output activations to 8-bit integers, saturating,
// and store back to memory.
int16_t rescaled_output_activ =
gemmlowp::RoundingDivideByPOT(output_activ_int16.raw(), 8);
int16_t clamped_output_activ = std::max<int16_t>(
-128, std::min<int16_t>(127, rescaled_output_activ));
output_activ_data_uint8[b * output_depth + c] =
128 + clamped_output_activ;
}
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LSTM_CELL_H_
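A quick numeric check of the representation-interval argument in the comment above, assuming a 16-bit fixed-point format with 11 fractional bits for [-16, 16] and 12 for [-8, 8]:

#include <cmath>
#include <cstdio>

int main() {
  // [-16, 16]: 11 fractional bits -> step 2^-11 ~= 4.9e-4
  const double step_16 = std::ldexp(1.0, -11);
  // [-8, 8]: 12 fractional bits -> step 2^-12 ~= 2.4e-4
  const double step_8 = std::ldexp(1.0, -12);
  std::printf("step for [-16,16]: %.2e\nstep for [-8,8]:  %.2e\n", step_16,
              step_8);
  return 0;
}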

View File

@@ -227,6 +227,41 @@ void PortableSparseMatrixBatchVectorMultiplyAccumulate1x4(
}
}
void PortableSparseMatrixBatchVectorMultiplyAccumulate1x16(
const int8_t* __restrict__ matrix, const int32_t* __restrict__ segments,
const int32_t* __restrict__ indices, int m_rows, int m_cols,
const int8_t* __restrict__ vector, const int32_t* __restrict__ bias_vector,
int n_batch, const int32_t input_offset, const int32_t output_multiplier,
const int32_t output_shift, const int32_t output_offset,
const int32_t output_activation_min, const int32_t output_activation_max,
int8_t* __restrict__ result) {
const int kBlockSize = 16;
TFLITE_DCHECK_EQ(m_cols % kBlockSize, 0);
for (int batch = 0; batch < n_batch; ++batch) {
const int8_t* matrix_ptr = matrix;
for (int row = 0; row < m_rows; ++row) {
int32_t dot_prod = 0;
const int8_t* vector_in_batch = vector + batch * m_cols;
for (int i = segments[row]; i < segments[row + 1]; ++i) {
const int block_start_index = indices[i] * kBlockSize;
const int8_t* vector_block_in_batch_ptr =
vector_in_batch + block_start_index;
for (int c = 0; c < kBlockSize; c++) {
dot_prod += *matrix_ptr * *vector_block_in_batch_ptr++;
dot_prod += *matrix_ptr++ * input_offset;
}
}
const int32_t bias_value = bias_vector != nullptr ? bias_vector[row] : 0;
dot_prod = MultiplyByQuantizedMultiplier(dot_prod + bias_value,
output_multiplier, output_shift);
dot_prod += output_offset;
result[batch * m_rows + row] =
static_cast<int8_t>(ActivationFunctionWithMinMax(
dot_prod, output_activation_min, output_activation_max));
}
}
}
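As far as can be read from the loops above, the new 1x16 sparse kernel consumes a CSR-style layout over 16-wide blocks: segments gives each row's range of non-zero blocks, and indices[i] * kBlockSize is a block's starting column. A small illustrative reconstruction with made-up values:

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const int kBlockSize = 16;
  const int kRows = 2, kCols = 32;
  // Row 0 owns block 0 (columns 0..15); row 1 owns block 1 (columns 16..31).
  std::vector<int32_t> segments = {0, 1, 2};
  std::vector<int32_t> indices = {0, 1};
  std::vector<int8_t> blocks(indices.size() * kBlockSize, 1);  // stored values

  // Reconstruct the dense matrix to show what the layout encodes.
  std::vector<int8_t> dense(kRows * kCols, 0);
  const int8_t* block_ptr = blocks.data();
  for (int row = 0; row < kRows; ++row) {
    for (int32_t i = segments[row]; i < segments[row + 1]; ++i) {
      const int start_col = indices[i] * kBlockSize;
      for (int c = 0; c < kBlockSize; ++c) {
        dense[row * kCols + start_col + c] = *block_ptr++;
      }
    }
  }
  std::printf("dense[0][0]=%d dense[1][16]=%d\n", dense[0],
              dense[1 * kCols + 16]);
  return 0;
}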
void PortableSparseMatrixBatchVectorMultiplyAccumulate(
const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,

View File

@@ -0,0 +1,333 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_H_
#include "tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h"
#if defined(_MSC_VER)
#define __restrict__ __restrict
#endif
namespace tflite {
namespace tensor_utils {
// Check if all entries of a vector are zero for float.
bool IsZeroVector(const float* vector, int v_size) {
return PortableIsZeroVector(vector, v_size);
}
// Check if all entries of a vector are zero for int8_t.
bool IsZeroVector(const int8_t* vector, int v_size) {
return PortableIsZeroVector(vector, v_size);
}
void SymmetricQuantizeFloats(const float* values, const int size,
int8_t* quantized_values, float* min, float* max,
float* scaling_factor) {
PortableSymmetricQuantizeFloats(values, size, quantized_values, min, max,
scaling_factor);
}
void SymmetricQuantizeFloats(const float* values, const int size,
int8_t* quantized_values, float min_value,
float max_value, float* scaling_factor) {
PortableSymmetricQuantizeFloats(values, size, quantized_values, min_value,
max_value, scaling_factor);
}
void AsymmetricQuantizeFloats(const float* values, const int size,
int8_t* quantized_values, float* scaling_factor,
int32_t* offset) {
PortableAsymmetricQuantizeFloats(values, size, quantized_values,
scaling_factor, offset);
}
void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
int m_cols, const float* vector,
int n_batch, float* result) {
PortableMatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vector,
n_batch, result);
}
void MatrixBatchVectorMultiplyAccumulate(const int8_t* __restrict__ matrix,
const int m_rows, const int m_cols,
const int8_t* __restrict__ vector,
const float* scaling_factors,
int n_batch,
float* __restrict__ result) {
PortableMatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vector,
scaling_factors, n_batch, result);
}
void MatrixBatchVectorMultiplyAccumulate(
const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
const int8_t* __restrict__ vectors, const float* scaling_factors,
int n_batch, float* __restrict__ result, const float* per_channel_scale,
const int32_t* input_offset, int32_t* scratch, int32_t* row_sums,
bool* compute_row_sums, CpuBackendContext* context) {
PortableMatrixBatchVectorMultiplyAccumulate(
matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result,
per_channel_scale, input_offset, scratch, row_sums, compute_row_sums,
context);
}
void MatrixBatchVectorMultiplyAccumulate(const int8_t* __restrict__ matrix,
const int m_rows, const int m_cols,
const int8_t* __restrict__ vector,
const float* scaling_factors,
int n_batch, int32_t* scratch,
float* __restrict__ result,
CpuBackendContext* context) {
PortableMatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vector,
scaling_factors, n_batch, result);
}
void SparseMatrixBatchVectorMultiplyAccumulate1x4(
const float* __restrict__ matrix, const int32_t* __restrict__ segments,
const int32_t* __restrict__ indices, int m_rows, int m_cols,
const float* __restrict__ vector, int n_batch, float* __restrict__ result) {
PortableSparseMatrixBatchVectorMultiplyAccumulate1x4(
matrix, segments, indices, m_rows, m_cols, vector, n_batch, result);
}
void SparseMatrixBatchVectorMultiplyAccumulate(
const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
float* __restrict__ result) {
PortableSparseMatrixBatchVectorMultiplyAccumulate(
matrix, ledger, m_rows, m_cols, vector, n_batch, result);
}
void SparseMatrixBatchVectorMultiplyAccumulate1x16(
const int8_t* __restrict__ matrix, const int32_t* __restrict__ segments,
const int32_t* __restrict__ indices, int m_rows, int m_cols,
const int8_t* __restrict__ vector, const int32_t* __restrict__ bias_vector,
int n_batch, const int32_t input_offset, const int32_t output_multiplier,
const int32_t output_shift, const int32_t output_offset,
const int32_t output_activation_min, const int32_t output_activation_max,
int8_t* __restrict__ result) {
PortableSparseMatrixBatchVectorMultiplyAccumulate1x16(
matrix, segments, indices, m_rows, m_cols, vector, bias_vector, n_batch,
input_offset, output_multiplier, output_shift, output_offset,
output_activation_min, output_activation_max, result);
}
void SparseMatrixBatchVectorMultiplyAccumulate(
const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
const int m_cols, const int8_t* __restrict__ vectors,
const float* scaling_factors, int n_batch, float* __restrict__ result) {
PortableSparseMatrixBatchVectorMultiplyAccumulate(
matrix, ledger, m_rows, m_cols, vectors, scaling_factors, n_batch,
result);
}
void MatrixBatchVectorMultiplyAccumulate(
const int8_t* input, const int32_t* bias,
const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
int32_t* scratch, int16_t* output, CpuBackendContext* context) {
PortableMatrixBatchVectorMultiplyAccumulate(
input, bias, input_to_gate_weights, multiplier, shift, n_batch, n_input,
n_output, output_zp, scratch, output, context);
}
void MatrixBatchVectorMultiplyAccumulate(
const int8_t* input, const int32_t* bias,
const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
int32_t* scratch, int8_t* output, CpuBackendContext* context) {
PortableMatrixBatchVectorMultiplyAccumulate(
input, bias, input_to_gate_weights, multiplier, shift, n_batch, n_input,
n_output, output_zp, scratch, output, context);
}
void MatrixScalarMultiplyAccumulate(const int8_t* matrix, int32_t scalar,
int32_t n_row, int32_t n_col,
int32_t* output) {
PortableMatrixScalarMultiplyAccumulate(matrix, scalar, n_row, n_col, output);
}
void MatrixBatchVectorMultiply(const int8_t* input, int32_t input_zeropoint,
const int8_t* input_to_gate_weights,
int32_t input_to_gate_effective_scale_a,
int32_t input_to_gate_effective_scale_b,
int32_t n_batch, int32_t n_input, int32_t n_cell,
int8_t* gate_output, int8_t gate_output_zp) {
PortableMatrixBatchVectorMultiply(
input, input_zeropoint, input_to_gate_weights,
input_to_gate_effective_scale_a, input_to_gate_effective_scale_b, n_batch,
n_input, n_cell, gate_output, gate_output_zp);
}
void MatrixBatchVectorMultiply(const int16_t* hidden,
const int8_t* hidden_to_output_weights,
int32_t proj_effective_scale_a,
int32_t proj_effective_scale_b,
const int32_t* gate_bias, int32_t n_batch,
int32_t n_hidden, int32_t n_output,
int32_t output_zp, int8_t* proj_output) {
PortableMatrixBatchVectorMultiply(hidden, hidden_to_output_weights,
proj_effective_scale_a,
proj_effective_scale_b, gate_bias, n_batch,
n_hidden, n_output, output_zp, proj_output);
}
void ApplyLayerNorm(const int16_t* input, const int16_t* layer_norm_weights,
const int32_t* bias, int32_t layer_norm_scale_a,
int32_t layer_norm_scale_b, int32_t variance_limit,
int n_batch, int n_input, int16_t* output) {
PortableApplyLayerNorm(input, layer_norm_weights, bias, layer_norm_scale_a,
layer_norm_scale_b, variance_limit, n_batch, n_input,
output);
}
void ApplyLayerNormFloat(const int16_t* input,
const int16_t* layer_norm_weights,
int32_t layer_norm_scale_a, int32_t layer_norm_scale_b,
const int32_t* bias, int n_batch, int n_input,
int16_t* output) {
PortableApplyLayerNormFloat(input, layer_norm_weights, layer_norm_scale_a,
layer_norm_scale_b, bias, n_batch, n_input,
output);
}
void ApplySigmoid(const int16_t* input, int32_t n_batch, int32_t n_input,
int16_t* output) {
PortableApplySigmoid(input, n_batch, n_input, output);
}
void ApplySigmoidFloat(const int16_t* input, int32_t n_batch, int32_t n_input,
int16_t* output) {
PortableApplySigmoidFloat(input, n_batch, n_input, output);
}
void ApplyTanh(int32_t integer_bits, const int16_t* input, int32_t n_batch,
int32_t n_input, int16_t* output) {
PortableApplyTanh(integer_bits, input, n_batch, n_input, output);
}
void ApplyTanhFloat(const int16_t* input, int32_t n_batch, int32_t n_input,
int32_t integer_bits, int16_t* output) {
PortableApplyTanhFloat(input, n_batch, n_input, integer_bits, output);
}
void CwiseMul(const int16_t* input_1, const int16_t* input_2, int n_batch,
int n_input, int shift, int16_t* output) {
PortableCwiseMul(input_1, input_2, n_batch, n_input, shift, output);
}
void CwiseMul(const int16_t* input_1, const int16_t* input_2,
int32_t multiplier, int32_t shift, int32_t n_batch,
int32_t n_input, int32_t output_zp, int8_t* output) {
PortableCwiseMul(input_1, input_2, multiplier, shift, n_batch, n_input,
output_zp, output);
}
void CwiseAdd(const int16_t* input_1, const int16_t* input_2, int n_batch,
int n_input, int16_t* output) {
PortableCwiseAdd(input_1, input_2, n_batch, n_input, output);
}
void CwiseClipping(float* vector, const int v_size,
const float clipping_value) {
PortableCwiseClipping(vector, v_size, clipping_value);
}
void CwiseClipping(int16_t* vector, const int v_size,
const int16_t clipping_value) {
PortableCwiseClipping(vector, v_size, clipping_value);
}
void CwiseClipping(int8_t* vector, const int v_size,
const int8_t clipping_value) {
PortableCwiseClipping(vector, v_size, clipping_value);
}
void VectorBatchVectorCwiseProductAccumulate(const int16_t* vector, int v_size,
const int16_t* batch_vector,
int n_batch, int32_t multiplier,
int shift, int16_t* result) {
PortableVectorBatchVectorCwiseProductAccumulate(
vector, v_size, batch_vector, n_batch, multiplier, shift, result);
}
float VectorVectorDotProduct(const float* vector1, const float* vector2,
int v_size) {
return PortableVectorVectorDotProduct(vector1, vector2, v_size);
}
void BatchVectorBatchVectorDotProduct(const int16_t* vector1,
const int16_t* vector2, int v_size,
int n_batch, int32_t* result) {
PortableBatchVectorBatchVectorDotProduct(vector1, vector2, v_size, n_batch,
result);
}
void Sub1Vector(const float* vector, int v_size, float* result) {
PortableSub1Vector(vector, v_size, result);
}
void Sub1Vector(const int16_t* vector, int v_size, int16_t* result) {
PortableSub1Vector(vector, v_size, result);
}
// Multiply all elements of a vector by a scalar.
void VectorScalarMultiply(const int8_t* vector, int v_size, float scale,
float* result) {
PortableVectorScalarMultiply(vector, v_size, scale, result);
}
void ReductionSumVector(const float* input_vector, float* output_vector,
int output_size, int reduction_size) {
PortableReductionSumVector(input_vector, output_vector, output_size,
reduction_size);
}
void ReductionSumVector(const int32_t* input_vector, int32_t* output_vector,
int output_size, int reduction_size) {
PortableReductionSumVector(input_vector, output_vector, output_size,
reduction_size);
}
void ReductionSumVector(const int8_t* input_vector, int32_t* output_vector,
int output_size, int reduction_size) {
PortableReductionSumVector(input_vector, output_vector, output_size,
reduction_size);
}
void MeanStddevNormalization(const float* input_vector, float* output_vector,
int v_size, int n_batch) {
PortableMeanStddevNormalization(input_vector, output_vector, v_size, n_batch);
}
void TwoGateSaturatingAdd(const int8_t* input, int8_t input_zp,
const int8_t* recurrent, int8_t recurrent_zp,
int32_t input_effective_scale_a,
int32_t input_effective_scale_b,
int32_t recurrent_effective_scale_a,
int32_t recurrent_effective_scale_b, int32_t n_batch,
int32_t n_cell, int16_t* output) {
PortableTwoGateSaturatingAdd(
input, input_zp, recurrent, recurrent_zp, input_effective_scale_a,
input_effective_scale_b, recurrent_effective_scale_a,
recurrent_effective_scale_b, n_batch, n_cell, output);
}
} // namespace tensor_utils
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_H_

View File

@@ -87,6 +87,15 @@ void PortableSparseMatrixBatchVectorMultiplyAccumulate(
int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
float* __restrict__ result);
void PortableSparseMatrixBatchVectorMultiplyAccumulate1x16(
const int8_t* __restrict__ matrix, const int32_t* __restrict__ segments,
const int32_t* __restrict__ indices, int m_rows, int m_cols,
const int8_t* __restrict__ vector, const int32_t* __restrict__ bias_vector,
int n_batch, const int32_t input_offset, const int32_t output_multiplier,
const int32_t output_shift, const int32_t output_offset,
const int32_t output_activation_min, const int32_t output_activation_max,
int8_t* __restrict__ result);
void PortableSparseMatrixBatchVectorMultiplyAccumulate(
const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
const int m_cols, const int8_t* __restrict__ vectors,

View File

@@ -273,6 +273,9 @@ void BroadcastQuantSubSlow(const ArithmeticParams& params,
const T* input2_data,
const RuntimeShape& output_shape, T* output_data) {
ruy::profiler::ScopeLabel label("BroadcastQuantSubSlow/T");
TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
NdArrayDesc<N> desc1;
NdArrayDesc<N> desc2;
NdArrayDesc<N> output_desc;

View File

@@ -27,6 +27,7 @@ limitations under the License.
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/context_util.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
@@ -466,10 +467,10 @@ TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
const int d1 = i >= dims1 ? 1 : SizeOfDimension(input1, dims1 - i - 1);
const int d2 = i >= dims2 ? 1 : SizeOfDimension(input2, dims2 - i - 1);
if (!(d1 == d2 || d1 == 1 || d2 == 1)) {
context->ReportError(context,
"Given shapes, %s and %s, are not broadcastable.",
GetShapeDebugString(input1->dims).c_str(),
GetShapeDebugString(input2->dims).c_str());
TF_LITE_KERNEL_LOG(context,
"Given shapes, %s and %s, are not broadcastable.",
GetShapeDebugString(input1->dims).c_str(),
GetShapeDebugString(input2->dims).c_str());
return kTfLiteError;
}
@@ -504,11 +505,11 @@ TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
if (min_value == 0) max_value = 0;
if (!(d1 == 1 || d1 == max_value) || !(d2 == 1 || d2 == max_value) ||
!(d3 == 1 || d3 == max_value)) {
context->ReportError(
context, "Given shapes, %s, %s and %s, are not broadcastable.",
GetShapeDebugString(input1->dims).c_str(),
GetShapeDebugString(input2->dims).c_str(),
GetShapeDebugString(input3->dims).c_str());
TF_LITE_KERNEL_LOG(context,
"Given shapes, %s, %s and %s, are not broadcastable.",
GetShapeDebugString(input1->dims).c_str(),
GetShapeDebugString(input2->dims).c_str(),
GetShapeDebugString(input3->dims).c_str());
return kTfLiteError;
}
shape->data[out_dims - i - 1] = max_value;
@@ -529,6 +530,9 @@ int TfLiteTypeGetSize(TfLiteType type) {
return 1;
case kTfLiteBool:
return sizeof(bool);
case kTfLiteUInt16:
static_assert(sizeof(uint16_t) == 2, "");
return 2;
case kTfLiteInt16:
static_assert(sizeof(int16_t) == 2, "");
return 2;
@@ -575,4 +579,15 @@ bool IsMobilePlatform() {
return false;
}
bool HasUnspecifiedDimension(const TfLiteTensor* tensor) {
#ifndef TF_LITE_STATIC_MEMORY
if (tensor->dims_signature) {
for (int i : TfLiteIntArrayView(tensor->dims_signature)) {
if (i == -1) return true;
}
}
#endif // TF_LITE_STATIC_MEMORY
return false;
}
} // namespace tflite
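HasUnspecifiedDimension above treats a -1 entry in dims_signature as a dynamic dimension. A standalone stand-in with a plain array, purely to illustrate the check (not the real TfLiteTensor setup):

#include <cstdio>

// Stand-in for the dims_signature scan: -1 marks an unspecified dimension.
bool HasUnspecifiedDim(const int* dims_signature, int size) {
  for (int i = 0; i < size; ++i) {
    if (dims_signature[i] == -1) return true;
  }
  return false;
}

int main() {
  const int sig[4] = {-1, 224, 224, 3};  // dynamic batch, fixed H/W/C
  std::printf("%d\n", HasUnspecifiedDim(sig, 4));  // prints 1
  return 0;
}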

View File

@@ -314,6 +314,9 @@ int TfLiteTypeGetSize(TfLiteType type);
// Whether the current platform is mobile (Android or iOS).
bool IsMobilePlatform();
// Returns whether there is an unspecified dimension in the tensor's dims signature.
bool HasUnspecifiedDimension(const TfLiteTensor* tensor);
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_KERNEL_UTIL_H_

View File

@@ -29,8 +29,12 @@ AllOpsResolver::AllOpsResolver() {
AddAssignVariable();
AddAveragePool2D();
AddBatchToSpaceNd();
AddBroadcastArgs();
AddBroadcastTo();
AddCallOnce();
AddCast();
AddCeil();
AddCircularBuffer();
AddConcatenation();
AddConv2D();
AddCos();
@@ -49,9 +53,12 @@ AllOpsResolver::AllOpsResolver() {
AddFloorDiv();
AddFloorMod();
AddFullyConnected();
AddGather();
AddGatherNd();
AddGreater();
AddGreaterEqual();
AddHardSwish();
AddIf();
AddL2Normalization();
AddL2Pool2D();
AddLeakyRelu();
@@ -66,6 +73,7 @@ AllOpsResolver::AllOpsResolver() {
AddMaximum();
AddMean();
AddMinimum();
AddMirrorPad();
AddMul();
AddNeg();
AddNotEqual();
@@ -85,6 +93,7 @@ AllOpsResolver::AllOpsResolver() {
AddRsqrt();
AddShape();
AddSin();
AddSlice();
AddSoftmax();
AddSpaceToBatchNd();
AddSpaceToDepth();
@@ -101,6 +110,8 @@ AllOpsResolver::AllOpsResolver() {
AddTransposeConv();
AddUnpack();
AddVarHandle();
AddWhile();
AddZerosLike();
}
} // namespace tflite
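AllOpsResolver keeps growing with these additions, and it pulls every kernel into the binary. A sketch, assuming the MicroMutableOpResolver API from the same TFLM tree, of registering only the ops a given model actually uses:

#include "tensorflow/lite/micro/micro_mutable_op_resolver.h"

// Hypothetical model that only needs three ops; the template argument is the
// number of registered ops.
void RegisterOps(tflite::MicroMutableOpResolver<3>& resolver) {
  resolver.AddConv2D();
  resolver.AddFullyConnected();
  resolver.AddSoftmax();
}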

View File

@@ -0,0 +1,107 @@
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/micro/fake_micro_context.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/micro/micro_allocator.h"
#include "tensorflow/lite/micro/micro_arena_constants.h"
#include "tensorflow/lite/micro/micro_error_reporter.h"
#include "tensorflow/lite/micro/simple_memory_allocator.h"
namespace tflite {
namespace {
// Dummy static variables to allow creation of dummy MicroAllocator.
// All tests are guaranteed to run serially.
static constexpr int KDummyTensorArenaSize = 256;
static uint8_t dummy_tensor_arena[KDummyTensorArenaSize];
} // namespace
FakeMicroContext::FakeMicroContext(TfLiteTensor* tensors,
SimpleMemoryAllocator* allocator,
MicroGraph* micro_graph)
: MicroContext(
MicroAllocator::Create(dummy_tensor_arena, KDummyTensorArenaSize,
GetMicroErrorReporter()),
nullptr, micro_graph),
tensors_(tensors),
allocator_(allocator) {}
TfLiteTensor* FakeMicroContext::AllocateTempTfLiteTensor(int tensor_index) {
allocated_tensor_count_++;
return &tensors_[tensor_index];
}
void FakeMicroContext::DeallocateTempTfLiteTensor(TfLiteTensor* tensor) {
allocated_tensor_count_--;
}
bool FakeMicroContext::IsAllTempTfLiteTensorDeallocated() {
return !allocated_tensor_count_;
}
TfLiteEvalTensor* FakeMicroContext::GetEvalTensor(int tensor_index) {
TfLiteEvalTensor* eval_tensor =
reinterpret_cast<TfLiteEvalTensor*>(allocator_->AllocateTemp(
sizeof(TfLiteEvalTensor), alignof(TfLiteEvalTensor)));
TFLITE_DCHECK(eval_tensor != nullptr);
// In unit tests, the TfLiteTensor pointer contains the source of truth for
// buffers and values:
eval_tensor->data = tensors_[tensor_index].data;
eval_tensor->dims = tensors_[tensor_index].dims;
eval_tensor->type = tensors_[tensor_index].type;
return eval_tensor;
}
void* FakeMicroContext::AllocatePersistentBuffer(size_t bytes) {
// FakeMicroContext uses SimpleMemoryAllocator, which does not automatically
// apply the buffer alignment that MicroAllocator does.
// Applying the alignment here is potentially wasteful but allows the
// fake_micro_context to work correctly with optimized kernels.
return allocator_->AllocatePersistentBuffer(bytes,
MicroArenaBufferAlignment());
}
TfLiteStatus FakeMicroContext::RequestScratchBufferInArena(size_t bytes,
int* buffer_index) {
TFLITE_DCHECK(buffer_index != nullptr);
if (scratch_buffer_count_ == kNumScratchBuffers_) {
MicroPrintf("Exceeded the maximum number of scratch tensors allowed (%d).",
kNumScratchBuffers_);
return kTfLiteError;
}
// For tests, we allocate scratch buffers from the tail and keep them around
// for the lifetime of the model. This means that the arena size in the tests
// will be larger than it would be if the scratch buffers could share memory.
scratch_buffers_[scratch_buffer_count_] =
allocator_->AllocatePersistentBuffer(bytes, MicroArenaBufferAlignment());
TFLITE_DCHECK(scratch_buffers_[scratch_buffer_count_] != nullptr);
*buffer_index = scratch_buffer_count_++;
return kTfLiteOk;
}
void* FakeMicroContext::GetScratchBuffer(int buffer_index) {
TFLITE_DCHECK(scratch_buffer_count_ <= kNumScratchBuffers_);
if (buffer_index >= scratch_buffer_count_) {
return nullptr;
}
return scratch_buffers_[buffer_index];
}
} // namespace tflite
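FakeMicroContext fakes the two-phase scratch protocol used by kernels: Prepare-time code stores only an index, Eval-time code resolves it to a pointer. A standalone miniature of that protocol (hypothetical class; the fixed sizes are chosen arbitrarily):

#include <cstddef>
#include <cstdint>

class TinyScratchArena {
 public:
  // Returns an index, or -1 when the fixed pool is exhausted.
  int Request(size_t bytes) {
    if (count_ == kMaxBuffers || used_ + bytes > sizeof(pool_)) return -1;
    offsets_[count_] = used_;
    used_ += bytes;
    return count_++;
  }
  void* Get(int index) {
    return (index >= 0 && index < count_) ? pool_ + offsets_[index] : nullptr;
  }

 private:
  static constexpr int kMaxBuffers = 12;
  uint8_t pool_[256];
  size_t offsets_[kMaxBuffers] = {};
  size_t used_ = 0;
  int count_ = 0;
};

int main() {
  TinyScratchArena arena;
  const int idx = arena.Request(64);  // Prepare-time: remember only the index
  void* scratch = arena.Get(idx);     // Eval-time: resolve it to a pointer
  return scratch != nullptr ? 0 : 1;
}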

View File

@@ -0,0 +1,56 @@
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_MICRO_FAKE_MICRO_CONTEXT_H_
#define TENSORFLOW_LITE_MICRO_FAKE_MICRO_CONTEXT_H_
#include "tensorflow/lite/micro/micro_context.h"
#include "tensorflow/lite/micro/micro_graph.h"
namespace tflite {
// A fake of MicroContext for kernel util tests.
class FakeMicroContext : public MicroContext {
public:
FakeMicroContext(TfLiteTensor* tensors, SimpleMemoryAllocator* allocator,
MicroGraph* micro_graph);
void* AllocatePersistentBuffer(size_t bytes) override;
TfLiteStatus RequestScratchBufferInArena(size_t bytes,
int* buffer_index) override;
void* GetScratchBuffer(int buffer_index) override;
TfLiteTensor* AllocateTempTfLiteTensor(int tensor_index) override;
void DeallocateTempTfLiteTensor(TfLiteTensor* tensor) override;
bool IsAllTempTfLiteTensorDeallocated();
TfLiteEvalTensor* GetEvalTensor(int tensor_index) override;
private:
static constexpr int kNumScratchBuffers_ = 12;
int scratch_buffer_count_ = 0;
uint8_t* scratch_buffers_[kNumScratchBuffers_];
TfLiteTensor* tensors_;
int allocated_tensor_count_ = 0;
SimpleMemoryAllocator* allocator_;
TF_LITE_REMOVE_VIRTUAL_DELETE
};
} // namespace tflite
#endif // TENSORFLOW_LITE_MICRO_FAKE_MICRO_CONTEXT_H_

View File

@@ -0,0 +1,100 @@
/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_MICRO_IBUFFER_ALLOCATOR_H_
#define TENSORFLOW_LITE_MICRO_IBUFFER_ALLOCATOR_H_
#include <cstddef>
#include <cstdint>
#include "tensorflow/lite/c/c_api_types.h"
namespace tflite {
// Interface classes that the TFLM framework relies on to get buffers it needs.
// There are two types of buffers that the TFLM framework requires: persistent
// and non-persistent. Persistent buffers, once allocated, are never freed by
// the TFLM framework. Non-persistent buffers can be allocated and deallocated
// by the TFLM framework. This file defines the two interface classes that the
// TFLM framework relies on to manage these buffers.
// Interface class for managing persistent buffers.
class IPersistentBufferAllocator {
public:
IPersistentBufferAllocator() {}
virtual ~IPersistentBufferAllocator() {}
// Allocates persistent memory. The persistent buffer is never freed.
virtual uint8_t* AllocatePersistentBuffer(size_t size, size_t alignment) = 0;
// Returns the size of all persistent allocations in bytes.
virtual size_t GetPersistentUsedBytes() const = 0;
};
// Interface class for managing non-persistent buffers.
// The default non-persistent buffers are temp buffers that are not resizable.
// Support for at least one resizable buffer is required.
class INonPersistentBufferAllocator {
public:
INonPersistentBufferAllocator() {}
virtual ~INonPersistentBufferAllocator() {}
// Allocates a temporary buffer. This buffer is not resizable.
virtual uint8_t* AllocateTemp(size_t size, size_t alignment) = 0;
// Signals that a temporary buffer is no longer needed.
virtual void DeallocateTemp(uint8_t* buf) = 0;
// Returns true if all temporary buffers are already deallocated.
virtual bool IsAllTempDeallocated() = 0;
// Signals that all temporary allocations can be reclaimed. TFLM calls this
// API when it knows that all temporary buffers that it requested have been
// deallocated. The goal of this API is to let implementations of
// INonPersistentBufferAllocator reuse the buffer space with reasonable
// complexity.
virtual TfLiteStatus ResetTempAllocations() = 0;
// Returns a buffer that is resizable via ResizeBuffer().
virtual uint8_t* AllocateResizableBuffer(size_t size, size_t alignment) = 0;
// Resizes a buffer that was previously returned by
// AllocateResizableBuffer().
virtual TfLiteStatus ResizeBuffer(uint8_t* resizable_buf, size_t size,
size_t alignment) = 0;
// Frees up the memory occupied by the resizable buffer.
virtual TfLiteStatus DeallocateResizableBuffer(uint8_t* resizable_buf) = 0;
// Returns a pointer to the start of the overlay memory, which is used for
// activation tensors and scratch buffers by kernels at the Invoke stage.
virtual uint8_t* GetOverlayMemoryAddress() const = 0;
// Reserves the size of the overlay memory. This memory is reserved for the
// kernels at the Invoke stage. It is referred to as the overlay because,
// before the Invoke stage, the same memory can be used for temp buffers. The
// layout of the memory is planned separately by the memory planner at the
// Invoke stage.
virtual TfLiteStatus ReserveNonPersistentOverlayMemory(size_t size,
size_t alignment) = 0;
// Returns the size of the non-persistent buffer in use.
virtual size_t GetNonPersistentUsedBytes() const = 0;
// Returns the number of bytes available with a given alignment. This number
// takes into account any temporary allocations.
virtual size_t GetAvailableMemory(size_t alignment) const = 0;
};
} // namespace tflite
#endif // TENSORFLOW_LITE_MICRO_IBUFFER_ALLOCATOR_H_
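As a point of reference, here is a minimal sketch of how the persistent half of this interface could be implemented over a caller-provided arena. This is not part of the commit; the class name, the bump-down layout, and the include path (inferred from the include guard) are illustrative assumptions only.
```
#include <cstddef>
#include <cstdint>

#include "tensorflow/lite/micro/ibuffer_allocator.h"  // path inferred from the guard

// Hypothetical bump-down allocator over a caller-provided arena (sketch only).
class SimplePersistentAllocator : public tflite::IPersistentBufferAllocator {
 public:
  SimplePersistentAllocator(uint8_t* arena, size_t arena_size)
      : head_(arena), tail_(arena + arena_size), used_(0) {}

  uint8_t* AllocatePersistentBuffer(size_t size, size_t alignment) override {
    // Allocate from the tail downwards and round down to the alignment.
    uint8_t* candidate = tail_ - size;
    candidate -= reinterpret_cast<uintptr_t>(candidate) % alignment;
    if (candidate < head_) return nullptr;  // Out of arena space.
    used_ += static_cast<size_t>(tail_ - candidate);
    tail_ = candidate;
    return candidate;
  }

  size_t GetPersistentUsedBytes() const override { return used_; }

 private:
  uint8_t* head_;  // Lowest usable address of the arena.
  uint8_t* tail_;  // Current bottom of the persistent region (grows downward).
  size_t used_;
};
```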

View File

@@ -117,15 +117,21 @@ TfLiteStatus ReluPrepare(TfLiteContext* context, TfLiteNode* node) {
TFLITE_DCHECK(node->user_data != nullptr);
ReluOpData* data = static_cast<ReluOpData*>(node->user_data);
const TfLiteTensor* input = GetInput(context, node, kActivationsInputTensor);
MicroContext* micro_context = GetMicroContext(context);
TfLiteTensor* input =
micro_context->AllocateTempInputTensor(node, kActivationsInputTensor);
TF_LITE_ENSURE(context, input != nullptr);
TfLiteTensor* output = GetOutput(context, node, kActivationsOutputTensor);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kActivationsOutputTensor);
TF_LITE_ENSURE(context, output != nullptr);
if (input->type == kTfLiteInt8) {
CalculateReluOpData<int8_t>(input, output, data);
}
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}
@@ -133,7 +139,9 @@ TfLiteStatus Relu6Prepare(TfLiteContext* context, TfLiteNode* node) {
TFLITE_DCHECK(node->user_data != nullptr);
Relu6OpData* data = static_cast<Relu6OpData*>(node->user_data);
const TfLiteTensor* input = GetInput(context, node, kActivationsInputTensor);
MicroContext* micro_context = GetMicroContext(context);
TfLiteTensor* input =
micro_context->AllocateTempInputTensor(node, kActivationsInputTensor);
TF_LITE_ENSURE(context, input != nullptr);
if (input->type == kTfLiteInt8) {
@@ -142,6 +150,8 @@ TfLiteStatus Relu6Prepare(TfLiteContext* context, TfLiteNode* node) {
data->zero_int8 = input->params.zero_point;
}
micro_context->DeallocateTempTfLiteTensor(input);
return kTfLiteOk;
}
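The hunks above show the migration pattern that repeats through the rest of this commit: Prepare no longer reads full TfLiteTensor structs through GetInput/GetOutput, it allocates them as temporaries from the MicroContext and releases them before returning. Condensed into a sketch (the tensor indices are placeholders):
```
// Sketch of the Prepare-time pattern used throughout this commit.
TfLiteStatus ExamplePrepare(TfLiteContext* context, TfLiteNode* node) {
  tflite::MicroContext* micro_context = tflite::GetMicroContext(context);

  // Temp tensors are valid only until they are deallocated below.
  TfLiteTensor* input =
      micro_context->AllocateTempInputTensor(node, /*index=*/0);
  TF_LITE_ENSURE(context, input != nullptr);
  TfLiteTensor* output =
      micro_context->AllocateTempOutputTensor(node, /*index=*/0);
  TF_LITE_ENSURE(context, output != nullptr);

  // ... inspect types/shapes and fill the op's user_data here ...

  micro_context->DeallocateTempTfLiteTensor(input);
  micro_context->DeallocateTempTfLiteTensor(output);
  return kTfLiteOk;
}
```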

View File

@@ -80,11 +80,15 @@ TfLiteStatus AddPrepare(TfLiteContext* context, TfLiteNode* node) {
TFLITE_DCHECK(node->user_data != nullptr);
TFLITE_DCHECK(node->builtin_data != nullptr);
const TfLiteTensor* input1 = GetInput(context, node, kAddInputTensor1);
MicroContext* micro_context = GetMicroContext(context);
TfLiteTensor* input1 =
micro_context->AllocateTempInputTensor(node, kAddInputTensor1);
TF_LITE_ENSURE(context, input1 != nullptr);
const TfLiteTensor* input2 = GetInput(context, node, kAddInputTensor2);
TfLiteTensor* input2 =
micro_context->AllocateTempInputTensor(node, kAddInputTensor2);
TF_LITE_ENSURE(context, input2 != nullptr);
TfLiteTensor* output = GetOutput(context, node, kAddOutputTensor);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kAddOutputTensor);
TF_LITE_ENSURE(context, output != nullptr);
OpDataAdd* data = static_cast<OpDataAdd*>(node->user_data);
@@ -93,6 +97,9 @@ TfLiteStatus AddPrepare(TfLiteContext* context, TfLiteNode* node) {
TF_LITE_ENSURE_STATUS(
CalculateOpDataAdd(context, params, input1, input2, output, data));
micro_context->DeallocateTempTfLiteTensor(input1);
micro_context->DeallocateTempTfLiteTensor(input2);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}

View File

@@ -50,18 +50,19 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) {
TF_LITE_ENSURE(context, num_inputs >= 2);
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
const TfLiteTensor* input_tensor_first;
TF_LITE_ENSURE_OK(
context, GetInputSafe(context, node, kInputTensor0, &input_tensor_first));
TfLiteTensor* output;
TF_LITE_ENSURE_OK(context,
GetOutputSafe(context, node, kOutputTensor, &output));
MicroContext* micro_context = GetMicroContext(context);
TfLiteTensor* input_tensor_first =
micro_context->AllocateTempInputTensor(node, kInputTensor0);
TF_LITE_ENSURE(context, input_tensor_first != nullptr);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kOutputTensor);
TF_LITE_ENSURE(context, output != nullptr);
// Check that all tensors have the same shape and type.
TF_LITE_ENSURE_TYPES_EQ(context, output->type, input_tensor_first->type);
for (int i = kInputTensor0 + 1; i < num_inputs; ++i) {
const TfLiteTensor* input;
TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, i, &input));
TfLiteTensor* input = micro_context->AllocateTempInputTensor(node, i);
TF_LITE_ENSURE(context, input != nullptr);
TF_LITE_ENSURE(context, HaveSameShapes(input_tensor_first, input));
TF_LITE_ENSURE_TYPES_EQ(context, input_tensor_first->type, input->type);
@@ -72,6 +73,8 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) {
TF_LITE_ENSURE(context,
input_tensor_first->params.scale == input->params.scale);
}
micro_context->DeallocateTempTfLiteTensor(input);
}
if (output->type == kTfLiteFloat32) {
@@ -123,6 +126,9 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) {
return kTfLiteError;
}
micro_context->DeallocateTempTfLiteTensor(input_tensor_first);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}

View File

@@ -52,21 +52,19 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
input_resource_id_tensor->type == kTfLiteInt32));
TF_LITE_ENSURE_EQ(context, NumElements(input_resource_id_tensor->dims), 1);
const TfLiteTensor* input_value = GetInput(context, node, kInputValue);
tflite::MicroContext* micro_context = tflite::GetMicroContext(context);
TfLiteTensor* input_value =
micro_context->AllocateTempInputTensor(node, kInputValue);
TFLITE_DCHECK(input_value != nullptr);
// Casting to TfliteIntArray is required since we are re-using
// GetExecutionPlan from TfLiteContext. On TFLM this method returns a
// MicroGraph.
// TODO(b/188226309): Design a cleaner way to get a graph from kernel context.
MicroGraph* graph_info;
context->GetExecutionPlan(context,
reinterpret_cast<TfLiteIntArray**>(&graph_info));
MicroResourceVariables* resources = graph_info->GetResourceVariables();
MicroGraph& graph_info = micro_context->graph();
MicroResourceVariables* resources = graph_info.GetResourceVariables();
TF_LITE_ENSURE_OK(context,
resources->Allocate(input_resource_id_tensor->data.i32[0],
context, input_value));
micro_context->DeallocateTempTfLiteTensor(input_value);
return kTfLiteOk;
}
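This hunk, the Eval hunk below, and the CALL_ONCE kernel later in the commit all drop the GetExecutionPlan cast workaround in favour of reaching the graph through the micro context. Reduced to the new access path (namespaces written out for clarity):
```
// New access path to the graph and its resource variables from a kernel.
tflite::MicroContext* micro_context = tflite::GetMicroContext(context);
tflite::MicroGraph& graph = micro_context->graph();
tflite::MicroResourceVariables* resources = graph.GetResourceVariables();
```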
@@ -79,14 +77,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
tflite::micro::GetEvalInput(context, node, kInputValue);
TFLITE_DCHECK(input_value != nullptr);
// Casting to TfliteIntArray is required since we are re-using
// GetExecutionPlan from TfLiteContext. On TFLM this method returns a
// MicroGraph.
// TODO(b/188226309): Design a cleaner way to get a graph from kernel context.
MicroGraph* graph_info;
context->GetExecutionPlan(context,
reinterpret_cast<TfLiteIntArray**>(&graph_info));
MicroResourceVariables* resources = graph_info->GetResourceVariables();
tflite::MicroContext* micro_context = tflite::GetMicroContext(context);
MicroGraph& graph_info = micro_context->graph();
MicroResourceVariables* resources = graph_info.GetResourceVariables();
if (resources == nullptr) {
MicroPrintf(
"ASSIGN_VARIABLE requires resource variables. Please create "

View File

@@ -41,8 +41,12 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
TF_LITE_ENSURE_EQ(context, NumInputs(node), 3);
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
MicroContext* micro_context = GetMicroContext(context);
TfLiteTensor* input =
micro_context->AllocateTempInputTensor(node, kInputTensor);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kOutputTensor);
TF_LITE_ENSURE(context, input != nullptr && output != nullptr);
TF_LITE_ENSURE(context, NumDimensions(input) >= kInputOutputMinDimensionNum);
@@ -51,6 +55,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
TF_LITE_ENSURE(context, NumDimensions(output) <= kInputOutputMaxDimensionNum);
TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type);
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}

View File

@@ -0,0 +1,97 @@
/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/reference/broadcast_args.h"
#include <stdint.h>
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"
#include "tensorflow/lite/micro/micro_context.h"
namespace tflite {
namespace {
constexpr int kShape1Tensor = 0;
constexpr int kShape2Tensor = 1;
constexpr int kOutputTensor = 0;
TfLiteStatus BroadcastArgsPrepare(TfLiteContext* context, TfLiteNode* node) {
TF_LITE_ENSURE(context, NumInputs(node) == 2);
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
MicroContext* micro_context = GetMicroContext(context);
TfLiteTensor* shape1 =
micro_context->AllocateTempInputTensor(node, kShape1Tensor);
TfLiteTensor* shape2 =
micro_context->AllocateTempInputTensor(node, kShape2Tensor);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kOutputTensor);
TF_LITE_ENSURE(context,
shape1->type == kTfLiteInt32 || shape1->type == kTfLiteInt64);
TF_LITE_ENSURE_EQ(context, shape1->type, shape2->type);
TF_LITE_ENSURE_EQ(context, shape1->type, output->type);
// Ensures the shapes are 1-D tensors.
TF_LITE_ENSURE_EQ(context, NumDimensions(shape1), 1);
TF_LITE_ENSURE_EQ(context, NumDimensions(shape2), 1);
// Ensure the shape of the output tensor is compatible
TF_LITE_ENSURE_EQ(context, NumDimensions(output), 1);
micro_context->DeallocateTempTfLiteTensor(shape1);
micro_context->DeallocateTempTfLiteTensor(shape2);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}
TfLiteStatus BroadcastArgsEval(TfLiteContext* context, TfLiteNode* node) {
const TfLiteEvalTensor* shape1 =
micro::GetEvalInput(context, node, kShape1Tensor);
const TfLiteEvalTensor* shape2 =
micro::GetEvalInput(context, node, kShape2Tensor);
TfLiteEvalTensor* output = micro::GetEvalOutput(context, node, kOutputTensor);
if (output->type == kTfLiteInt32) {
reference_ops::BroadcastArgs(
micro::GetTensorShape(shape1), micro::GetTensorData<int32_t>(shape1),
micro::GetTensorShape(shape2), micro::GetTensorData<int32_t>(shape2),
micro::GetTensorShape(output), micro::GetTensorData<int32_t>(output));
} else {
reference_ops::BroadcastArgs(
micro::GetTensorShape(shape1), micro::GetTensorData<int64_t>(shape1),
micro::GetTensorShape(shape2), micro::GetTensorData<int64_t>(shape2),
micro::GetTensorShape(output), micro::GetTensorData<int64_t>(output));
}
return kTfLiteOk;
}
} // namespace
TfLiteRegistration Register_BROADCAST_ARGS() {
return {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/BroadcastArgsPrepare,
/*invoke=*/BroadcastArgsEval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
}
} // namespace tflite
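For a quick sense of what this new kernel computes, here is a worked example of the NumPy-style broadcasting of two shape vectors; the concrete values are illustrative only.
```
// shape1 = {2, 1, 3}, shape2 = {4, 3}
// The shapes are aligned from the right and the larger of each dimension pair
// is taken (a dimension of 1 broadcasts against anything):
//   output = {2, 4, 3}
```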

View File

@@ -0,0 +1,129 @@
/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/reference/broadcast_to.h"
#include <stdint.h>
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"
#include "tensorflow/lite/micro/micro_context.h"
namespace tflite {
namespace {
constexpr int kInputTensor = 0;
constexpr int kShapeTensor = 1;
constexpr int kOutputTensor = 0;
// Support a maximum of 5 dimensions in TFLM.
constexpr int kMaxDims = 5;
TfLiteStatus ValidateOutputTensor(TfLiteContext* context, TfLiteTensor* input,
TfLiteTensor* shape, TfLiteTensor* output) {
// Ensures the shape is a 1-D tensor.
TF_LITE_ENSURE_EQ(context, NumDimensions(shape), 1);
// Ensure output dims are not fewer than input dims.
int input_num_dims = NumDimensions(input);
int output_num_dims = NumDimensions(output);
int shape_num_dims = SizeOfDimension(shape, 0);
TF_LITE_ENSURE_MSG(context, output_num_dims == shape_num_dims,
"Output must match with the expected shape dimension.");
TF_LITE_ENSURE_MSG(context, input_num_dims <= output_num_dims,
"Output shape must be broadcastable from input shape.");
TF_LITE_ENSURE_MSG(context, output_num_dims <= kMaxDims,
"BroadcastTo only supports 1-5D tensor.");
// Check if output shape is broadcastable from input shape.
auto get_shape_data = [shape](int i) -> int32_t {
if (shape->type == kTfLiteInt32) {
return GetTensorData<int32_t>(shape)[i];
} else {
return GetTensorData<int64_t>(shape)[i];
}
};
int extending_dims = output_num_dims - input_num_dims;
for (int idx = 0; idx < input_num_dims; ++idx) {
TF_LITE_ENSURE_MSG(
context,
(SizeOfDimension(input, idx) == 1 ||
SizeOfDimension(input, idx) == get_shape_data(extending_dims + idx)),
"Output shape must be broadcastable from input shape.");
}
// Validating the shape of the output tensor.
tflite::RuntimeShape output_shape = tflite::GetTensorShape(output);
for (int idx = 0; idx < output_num_dims; ++idx) {
TF_LITE_ENSURE(context, output_shape.Dims(idx) == get_shape_data(idx));
}
return kTfLiteOk;
}
TfLiteStatus BroadcastToPrepare(TfLiteContext* context, TfLiteNode* node) {
TF_LITE_ENSURE(context, NumInputs(node) == 2);
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
MicroContext* micro_context = GetMicroContext(context);
TfLiteTensor* input =
micro_context->AllocateTempInputTensor(node, kInputTensor);
TfLiteTensor* shape =
micro_context->AllocateTempInputTensor(node, kShapeTensor);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kOutputTensor);
TF_LITE_ENSURE_MSG(context, (NumDimensions(input) <= kMaxDims),
"BroadcastTo only supports 1-5D tensor.");
TF_LITE_ENSURE(context,
shape->type == kTfLiteInt32 || shape->type == kTfLiteInt64);
TF_LITE_ENSURE_EQ(context, input->type, output->type);
// Does not support the String type due to its variable size. This limitation
// is the same as in TFLite.
TF_LITE_ENSURE(context, input->type != kTfLiteString);
TF_LITE_ENSURE_STATUS(ValidateOutputTensor(context, input, shape, output));
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(shape);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}
TfLiteStatus BroadcastToEval(TfLiteContext* context, TfLiteNode* node) {
const TfLiteEvalTensor* input =
micro::GetEvalInput(context, node, kInputTensor);
TfLiteEvalTensor* output = micro::GetEvalOutput(context, node, kOutputTensor);
// The BroadcastTo op supports up to 5 dims, unlike the 8 dims supported in TFLite.
reference_ops::BroadcastTo<kMaxDims>(
micro::GetTensorShape(input), input->data.raw,
micro::GetTensorShape(output), output->data.raw, input->type);
return kTfLiteOk;
}
} // namespace
TfLiteRegistration Register_BROADCAST_TO() {
return {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/BroadcastToPrepare,
/*invoke=*/BroadcastToEval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
}
} // namespace tflite
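And a small worked example for BROADCAST_TO itself (values illustrative only):
```
// input  shape = {1, 3}, data = {1, 2, 3}
// shape tensor = {2, 3}
// output shape = {2, 3}, data = {1, 2, 3,
//                                1, 2, 3}   // the single row is repeated
```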

View File

@@ -23,6 +23,7 @@ limitations under the License.
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"
#include "tensorflow/lite/micro/memory_helpers.h"
#include "tensorflow/lite/micro/micro_context.h"
#include "tensorflow/lite/micro/micro_graph.h"
#include "tensorflow/lite/schema/schema_generated.h"
@@ -50,16 +51,11 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
TF_LITE_ENSURE(context, NumInputs(node) == 0);
TF_LITE_ENSURE(context, NumOutputs(node) == 0);
// Casting to TfliteIntArray is required since we are re-using
// GetExecutionPlan from TfLiteContext. On TFLM this method returns a
// MicroGraph.
// TODO(b/188226309): Design a cleaner way to get a graph from kernel context.
MicroGraph* graph_info;
context->GetExecutionPlan(context,
reinterpret_cast<TfLiteIntArray**>(&graph_info));
tflite::MicroContext* micro_context = tflite::GetMicroContext(context);
MicroGraph& graph_info = micro_context->graph();
TF_LITE_ENSURE(context,
op_data->init_subgraph_index < graph_info->NumSubgraphs());
op_data->init_subgraph_index < graph_info.NumSubgraphs());
return kTfLiteOk;
}
@@ -72,16 +68,11 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
return kTfLiteOk;
}
// Casting to TfliteIntArray is required since we are re-using
// GetExecutionPlan from TfLiteContext. On TFLM this method returns a
// MicroGraph.
// TODO(b/188226309): Design a cleaner way to get a graph from kernel context.
MicroGraph* graph_info;
context->GetExecutionPlan(context,
reinterpret_cast<TfLiteIntArray**>(&graph_info));
tflite::MicroContext* micro_context = tflite::GetMicroContext(context);
MicroGraph& graph_info = micro_context->graph();
TF_LITE_ENSURE_OK(context,
graph_info->InvokeSubgraph(op_data->init_subgraph_index));
graph_info.InvokeSubgraph(op_data->init_subgraph_index));
op_data->has_run = true;

View File

@@ -28,11 +28,19 @@ constexpr int kOutputTensor = 0;
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
MicroContext* micro_context = GetMicroContext(context);
TfLiteTensor* input =
micro_context->AllocateTempInputTensor(node, kInputTensor);
TF_LITE_ENSURE(context, input != nullptr);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kOutputTensor);
TF_LITE_ENSURE(context, output != nullptr);
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}
@@ -83,6 +91,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
case kTfLiteInt32:
return copyToTensor(context, tflite::micro::GetTensorData<int32_t>(input),
output, num_elements);
case kTfLiteUInt32:
return copyToTensor(context,
tflite::micro::GetTensorData<uint32_t>(input), output,
num_elements);
case kTfLiteFloat32:
return copyToTensor(context, tflite::micro::GetTensorData<float>(input),
output, num_elements);

View File

@@ -29,9 +29,13 @@ constexpr int kInputTensor = 0;
constexpr int kOutputTensor = 0;
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
MicroContext* micro_context = GetMicroContext(context);
TfLiteTensor* input =
micro_context->AllocateTempInputTensor(node, kInputTensor);
TF_LITE_ENSURE(context, input != nullptr);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kOutputTensor);
TF_LITE_ENSURE(context, output != nullptr);
TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
@@ -42,6 +46,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
for (int i = 0; i < output->dims->size; ++i) {
TF_LITE_ENSURE_EQ(context, output->dims->data[i], input->dims->data[i]);
}
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}

View File

@@ -39,9 +39,13 @@ const int kCircularBufferCyclesMaxIndex = 0; // 'cycles_max'
const TfLiteStatus kTfLiteAbort = static_cast<TfLiteStatus>(-9);
TfLiteStatus CircularBufferPrepare(TfLiteContext* context, TfLiteNode* node) {
const TfLiteTensor* input =
GetInput(context, node, kCircularBufferInputTensor);
TfLiteTensor* output = GetOutput(context, node, kCircularBufferOutputTensor);
MicroContext* micro_context = GetMicroContext(context);
TfLiteTensor* input =
micro_context->AllocateTempInputTensor(node, kCircularBufferInputTensor);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kCircularBufferOutputTensor);
TFLITE_DCHECK(node->user_data != nullptr);
OpDataCircularBuffer* op_data =
@@ -85,6 +89,9 @@ TfLiteStatus CircularBufferPrepare(TfLiteContext* context, TfLiteNode* node) {
op_data->cycles_until_run = op_data->cycles_max;
node->user_data = op_data;
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}

View File

@@ -540,9 +540,13 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
TFLITE_DCHECK(node->user_data != nullptr);
OpData* data = static_cast<OpData*>(node->user_data);
const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
MicroContext* micro_context = GetMicroContext(context);
TfLiteTensor* input1 =
micro_context->AllocateTempInputTensor(node, kInputTensor1);
TF_LITE_ENSURE(context, input1 != nullptr);
const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
TfLiteTensor* input2 =
micro_context->AllocateTempInputTensor(node, kInputTensor2);
TF_LITE_ENSURE(context, input2 != nullptr);
if (input1->type == kTfLiteInt8) {
@@ -570,6 +574,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
data->params.input2_shift = input2_shift;
}
micro_context->DeallocateTempTfLiteTensor(input1);
micro_context->DeallocateTempTfLiteTensor(input2);
return kTfLiteOk;
}

View File

@@ -115,13 +115,19 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
const TfLiteConcatenationParams* params =
reinterpret_cast<TfLiteConcatenationParams*>(node->builtin_data);
const TfLiteTensor* input_tensor = GetInput(context, node, 0);
MicroContext* micro_context = GetMicroContext(context);
TfLiteTensor* input_tensor = micro_context->AllocateTempInputTensor(node, 0);
TF_LITE_ENSURE(context, input_tensor != nullptr);
TfLiteType input_type = input_tensor->type;
const TfLiteTensor* output_tensor = GetOutput(context, node, kOutputTensor);
TfLiteTensor* output_tensor =
micro_context->AllocateTempOutputTensor(node, kOutputTensor);
TF_LITE_ENSURE(context, output_tensor != nullptr);
TfLiteType output_type = output_tensor->type;
micro_context->DeallocateTempTfLiteTensor(input_tensor);
micro_context->DeallocateTempTfLiteTensor(output_tensor);
// Check activation and input type
TF_LITE_ENSURE_EQ(context, params->activation, kTfLiteActNone);
TF_LITE_ENSURE(context,
@@ -138,7 +144,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
// Shapes with dimensions >4 are not yet supported with static allocation.
for (int i = 0; i < num_inputs; ++i) {
const TfLiteTensor* input = GetInput(context, node, i);
TfLiteTensor* input = micro_context->AllocateTempInputTensor(node, i);
TF_LITE_ENSURE(context, input != nullptr);
int num_dimensions = NumDimensions(input);
@@ -150,13 +156,15 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
num_dimensions);
return kTfLiteError;
}
micro_context->DeallocateTempTfLiteTensor(input);
}
// Calculate OpData.
TFLITE_DCHECK(node->user_data != nullptr);
OpData* data = static_cast<OpData*>(node->user_data);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kOutputTensor);
TF_LITE_ENSURE(context, output != nullptr);
switch (output_type) { // Already know in/out types are the same.
@@ -183,10 +191,11 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
// Allocate persistent scale and zeropoint buffers.
// Store input scale and zero point values in OpParams:
for (int i = 0; i < node->inputs->size; ++i) {
const TfLiteTensor* t = GetInput(context, node, i);
TfLiteTensor* t = micro_context->AllocateTempInputTensor(node, i);
TF_LITE_ENSURE(context, t != nullptr);
input_scales[i] = t->params.scale;
input_zero_points[i] = t->params.zero_point;
micro_context->DeallocateTempTfLiteTensor(t);
}
data->params.input_scale = input_scales;
@@ -202,6 +211,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
return kTfLiteError;
}
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}

View File

@@ -1,4 +1,4 @@
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -79,7 +79,8 @@ TfLiteRegistration Register_CONV_2D();
#if defined(XTENSA)
// Returns a TfLiteRegistration struct for kernel variant that only supports
// int8 inputs and outputs.
// int8 activations and int8 weights and always calls the reference
// implementation.
TfLiteRegistration Register_CONV_2D_INT8REF();
#else
inline TfLiteRegistration Register_CONV_2D_INT8REF() {
@@ -87,6 +88,25 @@ inline TfLiteRegistration Register_CONV_2D_INT8REF() {
}
#endif
#if defined(CMSIS_NN)
// Returns a TfLiteRegistration struct for kernel variant that only supports
// int8 activations and int8 weights and uses the latency optimized
// implementations.
TfLiteRegistration Register_CONV_2D_INT8();
// Returns a TfLiteRegistration struct for kernel variant that only supports
// int16 activations and int8 weights and uses the latency optimized
// implementations.
TfLiteRegistration Register_CONV_2D_INT16();
#else
inline TfLiteRegistration Register_CONV_2D_INT8() { return Register_CONV_2D(); }
inline TfLiteRegistration Register_CONV_2D_INT16() {
return Register_CONV_2D();
}
#endif
} // namespace tflite
#endif // TENSORFLOW_LITE_MICRO_KERNELS_CONV_H_
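These specialized registrations let an application choose a narrower conv kernel when a backend is compiled in, while the inline fallbacks keep the call valid otherwise. A hedged sketch of how such a registration might be obtained (handing it to an op resolver is assumed usage and is not shown in this diff):
```
// Resolves to the CMSIS-NN int8-only kernel when CMSIS_NN is defined, and to
// the generic Register_CONV_2D() otherwise, per the inline fallbacks above.
const TfLiteRegistration conv_registration = tflite::Register_CONV_2D_INT8();
```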

View File

@@ -93,13 +93,18 @@ TfLiteStatus CalculateOpDataConv(TfLiteContext* context, TfLiteNode* node,
params.dilation_width_factor, height, width, filter_height, filter_width,
padding, &out_height, &out_width);
const TfLiteTensor* input = GetInput(context, node, kConvInputTensor);
MicroContext* micro_context = GetMicroContext(context);
TfLiteTensor* input =
micro_context->AllocateTempInputTensor(node, kConvInputTensor);
TF_LITE_ENSURE(context, input != nullptr);
const TfLiteTensor* filter = GetInput(context, node, kConvWeightsTensor);
TfLiteTensor* filter =
micro_context->AllocateTempInputTensor(node, kConvWeightsTensor);
TF_LITE_ENSURE(context, filter != nullptr);
const TfLiteTensor* bias =
GetOptionalInputTensor(context, node, kConvBiasTensor);
TfLiteTensor* output = GetOutput(context, node, kConvOutputTensor);
TfLiteTensor* bias =
micro_context->AllocateTempInputTensor(node, kConvBiasTensor);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kConvOutputTensor);
TF_LITE_ENSURE(context, output != nullptr);
// Note that quantized inference requires that all tensors have their
@@ -119,6 +124,11 @@ TfLiteStatus CalculateOpDataConv(TfLiteContext* context, TfLiteNode* node,
data->filter_zero_point = filter->params.zero_point;
data->output_zero_point = output->params.zero_point;
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(filter);
micro_context->DeallocateTempTfLiteTensor(output);
micro_context->DeallocateTempTfLiteTensor(bias);
return kTfLiteOk;
}
@@ -129,12 +139,16 @@ TfLiteStatus ConvPrepare(TfLiteContext* context, TfLiteNode* node) {
OpDataConv* data = static_cast<OpDataConv*>(node->user_data);
const auto& params =
*(static_cast<const TfLiteConvParams*>(node->builtin_data));
MicroContext* micro_context = GetMicroContext(context);
TfLiteTensor* output = GetOutput(context, node, kConvOutputTensor);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kConvOutputTensor);
TF_LITE_ENSURE(context, output != nullptr);
const TfLiteTensor* input = GetInput(context, node, kConvInputTensor);
TfLiteTensor* input =
micro_context->AllocateTempInputTensor(node, kConvInputTensor);
TF_LITE_ENSURE(context, input != nullptr);
const TfLiteTensor* filter = GetInput(context, node, kConvWeightsTensor);
TfLiteTensor* filter =
micro_context->AllocateTempInputTensor(node, kConvWeightsTensor);
TF_LITE_ENSURE(context, filter != nullptr);
const int input_width = input->dims->data[2];
@@ -174,6 +188,10 @@ TfLiteStatus ConvPrepare(TfLiteContext* context, TfLiteNode* node) {
context, node, params, input_width, input_height, filter_width,
filter_height, output_width, output_height, input->type, data));
micro_context->DeallocateTempTfLiteTensor(filter);
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}
} // namespace tflite

View File

@@ -47,8 +47,12 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) {
TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
const TfLiteTensor* axis = GetInput(context, node, kAxisTensor);
MicroContext* micro_context = GetMicroContext(context);
TfLiteTensor* input =
micro_context->AllocateTempInputTensor(node, kInputTensor);
TfLiteTensor* axis =
micro_context->AllocateTempInputTensor(node, kAxisTensor);
TF_LITE_ENSURE(context,
input->type == kTfLiteFloat32 || input->type == kTfLiteInt8);
@@ -58,7 +62,8 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) {
TF_LITE_ENSURE(context, NumDimensions(input) >= 1);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kOutputTensor);
TF_LITE_ENSURE_EQ(context, input->type, output->type);
TF_LITE_ENSURE(context, HaveSameShapes(input, output));
@@ -91,6 +96,10 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) {
&data->output_activation_max));
}
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(axis);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}

View File

@@ -40,11 +40,14 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) {
TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
const TfLiteTensor* input;
TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kInputTensor, &input));
TfLiteTensor* output;
TF_LITE_ENSURE_OK(context,
GetOutputSafe(context, node, kOutputTensor, &output));
MicroContext* micro_context = GetMicroContext(context);
TfLiteTensor* input =
micro_context->AllocateTempInputTensor(node, kInputTensor);
TF_LITE_ENSURE(context, input != nullptr);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kOutputTensor);
TF_LITE_ENSURE(context, output != nullptr);
TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4);
@@ -83,6 +86,9 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) {
output->dims->data[kWidthRank] = output_width;
output->dims->data[kDepthRank] = output_channels;
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}

View File

@@ -94,13 +94,18 @@ TfLiteStatus CalculateOpDataDepthwiseConv(
params.dilation_width_factor, height, width, filter_height, filter_width,
padding, &out_height, &out_width);
const TfLiteTensor* input = GetInput(context, node, kConvInputTensor);
MicroContext* micro_context = GetMicroContext(context);
TfLiteTensor* input =
micro_context->AllocateTempInputTensor(node, kConvInputTensor);
TF_LITE_ENSURE(context, input != nullptr);
const TfLiteTensor* filter = GetInput(context, node, kConvWeightsTensor);
TfLiteTensor* filter =
micro_context->AllocateTempInputTensor(node, kConvWeightsTensor);
TF_LITE_ENSURE(context, filter != nullptr);
const TfLiteTensor* bias =
GetOptionalInputTensor(context, node, kConvBiasTensor);
TfLiteTensor* output = GetOutput(context, node, kConvOutputTensor);
TfLiteTensor* bias =
micro_context->AllocateTempInputTensor(node, kConvBiasTensor);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kConvOutputTensor);
TF_LITE_ENSURE(context, output != nullptr);
// Note that quantized inference requires that all tensors have their
@@ -120,6 +125,11 @@ TfLiteStatus CalculateOpDataDepthwiseConv(
data->filter_zero_point = filter->params.zero_point;
data->output_zero_point = output->params.zero_point;
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(filter);
micro_context->DeallocateTempTfLiteTensor(bias);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}
@@ -130,14 +140,16 @@ TfLiteStatus DepthwiseConvPrepare(TfLiteContext* context, TfLiteNode* node) {
OpDataConv* data = static_cast<OpDataConv*>(node->user_data);
const auto& params =
*(static_cast<const TfLiteDepthwiseConvParams*>(node->builtin_data));
MicroContext* micro_context = GetMicroContext(context);
TfLiteTensor* output = GetOutput(context, node, kDepthwiseConvOutputTensor);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kDepthwiseConvOutputTensor);
TF_LITE_ENSURE(context, output != nullptr);
const TfLiteTensor* input =
GetInput(context, node, kDepthwiseConvInputTensor);
TfLiteTensor* input =
micro_context->AllocateTempInputTensor(node, kDepthwiseConvInputTensor);
TF_LITE_ENSURE(context, input != nullptr);
const TfLiteTensor* filter =
GetInput(context, node, kDepthwiseConvWeightsTensor);
TfLiteTensor* filter =
micro_context->AllocateTempInputTensor(node, kDepthwiseConvWeightsTensor);
TF_LITE_ENSURE(context, filter != nullptr);
const int input_width = input->dims->data[2];
@@ -180,6 +192,10 @@ TfLiteStatus DepthwiseConvPrepare(TfLiteContext* context, TfLiteNode* node) {
context, node, params, input_width, input_height, filter_width,
filter_height, output_width, output_height, input->type, data));
micro_context->DeallocateTempTfLiteTensor(output);
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(filter);
return kTfLiteOk;
}

View File

@@ -33,10 +33,12 @@ TfLiteStatus DequantizePrepare(TfLiteContext* context, TfLiteNode* node) {
TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
MicroContext* micro_context = GetMicroContext(context);
// TODO(b/140515557): Add cached dequant to improve hybrid model performance.
const TfLiteTensor* input = GetInput(context, node, 0);
TfLiteTensor* input = micro_context->AllocateTempInputTensor(node, 0);
TF_LITE_ENSURE(context, input != nullptr);
TfLiteTensor* output = GetOutput(context, node, 0);
TfLiteTensor* output = micro_context->AllocateTempOutputTensor(node, 0);
TF_LITE_ENSURE(context, output != nullptr);
TF_LITE_ENSURE(context,
@@ -54,6 +56,10 @@ TfLiteStatus DequantizePrepare(TfLiteContext* context, TfLiteNode* node) {
data->quantization_params.zero_point = input->params.zero_point;
data->quantization_params.scale = static_cast<double>(input->params.scale);
data->output_zero_point = output->params.zero_point;
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}

View File

@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <algorithm>
#include <numeric>
#include <tuple>
#include "flatbuffers/flexbuffers.h"
#include "tensorflow/lite/c/builtin_op_data.h"
@@ -152,14 +154,17 @@ void Free(TfLiteContext* context, void* buffer) {}
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
auto* op_data = static_cast<OpData*>(node->user_data);
MicroContext* micro_context = GetMicroContext(context);
// Inputs: box_encodings, scores, anchors
TF_LITE_ENSURE_EQ(context, NumInputs(node), 3);
const TfLiteTensor* input_box_encodings =
GetInput(context, node, kInputTensorBoxEncodings);
const TfLiteTensor* input_class_predictions =
GetInput(context, node, kInputTensorClassPredictions);
const TfLiteTensor* input_anchors =
GetInput(context, node, kInputTensorAnchors);
TfLiteTensor* input_box_encodings =
micro_context->AllocateTempInputTensor(node, kInputTensorBoxEncodings);
TfLiteTensor* input_class_predictions =
micro_context->AllocateTempInputTensor(node,
kInputTensorClassPredictions);
TfLiteTensor* input_anchors =
micro_context->AllocateTempInputTensor(node, kInputTensorAnchors);
TF_LITE_ENSURE_EQ(context, NumDimensions(input_box_encodings), 3);
TF_LITE_ENSURE_EQ(context, NumDimensions(input_class_predictions), 3);
TF_LITE_ENSURE_EQ(context, NumDimensions(input_anchors), 2);
@@ -217,6 +222,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
// num_detections
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 4);
micro_context->DeallocateTempTfLiteTensor(input_box_encodings);
micro_context->DeallocateTempTfLiteTensor(input_class_predictions);
micro_context->DeallocateTempTfLiteTensor(input_anchors);
return kTfLiteOk;
}
@@ -313,9 +322,10 @@ TfLiteStatus DecodeCenterSizeBoxes(TfLiteContext* context, TfLiteNode* node,
void DecreasingPartialArgSort(const float* values, int num_values,
int num_to_sort, int* indices) {
std::iota(indices, indices + num_values, 0);
std::partial_sort(
indices, indices + num_to_sort, indices + num_values,
[&values](const int i, const int j) { return values[i] > values[j]; });
std::partial_sort(indices, indices + num_to_sort, indices + num_values,
[&values](const int i, const int j) {
return std::tie(values[i], j) > std::tie(values[j], i);
});
}
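The comparator change above makes the partial sort deterministic on ties: std::tie compares the scores first and, when they are equal, falls back to the indices, so equal scores keep the lower index first. A small illustration with placeholder values:
```
// values = {0.9f, 0.7f, 0.9f}, num_values = num_to_sort = 3
// old comparator: the relative order of the two 0.9 entries was unspecified
// new comparator: indices come out as {0, 2, 1} -- ties keep the lower index
```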
template <typename Compare>

View File

@@ -38,11 +38,13 @@ bool IsLogicalSupportedType(const TfLiteType type) {
typedef bool (*IsSupportedType)(TfLiteType);
template <IsSupportedType>
TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) {
MicroContext* micro_context = GetMicroContext(context);
TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
const TfLiteTensor* input = GetInput(context, node, 0);
TfLiteTensor* input = micro_context->AllocateTempInputTensor(node, 0);
TF_LITE_ENSURE(context, input != nullptr);
TfLiteTensor* output = GetOutput(context, node, 0);
TfLiteTensor* output = micro_context->AllocateTempOutputTensor(node, 0);
TF_LITE_ENSURE(context, output != nullptr);
TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type);
if (!IsSupportedType(input->type)) {
@@ -50,6 +52,9 @@ TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) {
TfLiteTypeGetName(input->type), input->type);
return kTfLiteError;
}
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}

View File

@@ -80,13 +80,16 @@ void EvalUsingLookupTable(const OpData* data, const TfLiteEvalTensor* input,
}
TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) {
MicroContext* micro_context = GetMicroContext(context);
TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
const TfLiteTensor* input;
TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kInputTensor, &input));
TfLiteTensor* output;
TF_LITE_ENSURE_OK(context,
GetOutputSafe(context, node, kOutputTensor, &output));
TfLiteTensor* input =
micro_context->AllocateTempInputTensor(node, kInputTensor);
TF_LITE_ENSURE(context, input != nullptr);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kOutputTensor);
TF_LITE_ENSURE(context, output != nullptr);
TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type);
// Use LUT to handle quantized elu path.
@@ -97,7 +100,8 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) {
};
PopulateLookupTable<int8_t>(input, output, transform, data);
}
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}

View File

@@ -0,0 +1,11 @@
# Info
These are the Espressif chipset-specific replacement kernels.
The kernels call either optimized routines or reference routines, depending on the optimization option selected.
By default, optimizations are enabled when available.
To change this behaviour, make the appropriate `ESP-NN` menu selection after running:
```
idf.py menuconfig
```
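The replacement kernels that follow all use the same compile-time dispatch: when ESP_NN is defined (see the -DESP_NN compile option added for this component), the esp-nn routine is called, otherwise the stock reference implementation runs. Reduced to its skeleton (arguments elided):
```
#if ESP_NN
  // Optimized path provided by the esp-nn component.
  esp_nn_add_elementwise_s8(/* tensor data and quantization params */);
#else
  // Stock TFLM reference implementation.
  reference_integer_ops::Add(/* same operands */);
#endif
```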

View File

@@ -0,0 +1,209 @@
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/reference/add.h"
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/add.h"
#include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/op_macros.h"
#include "tensorflow/lite/micro/kernels/add.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"
#include "tensorflow/lite/micro/memory_helpers.h"
#include "tensorflow/lite/micro/micro_error_reporter.h"
#include <esp_timer.h>
#if ESP_NN
#include <esp_nn.h>
#endif
long long add_total_time = 0;
namespace tflite {
void EvalAdd(TfLiteContext* context, TfLiteNode* node, TfLiteAddParams* params,
const OpDataAdd* data, const TfLiteEvalTensor* input1,
const TfLiteEvalTensor* input2, TfLiteEvalTensor* output) {
tflite::ArithmeticParams op_params;
SetActivationParams(data->output_activation_min_f32,
data->output_activation_max_f32, &op_params);
if (data->requires_broadcast) {
reference_ops::BroadcastAdd4DSlow(
op_params, tflite::micro::GetTensorShape(input1),
tflite::micro::GetTensorData<float>(input1),
tflite::micro::GetTensorShape(input2),
tflite::micro::GetTensorData<float>(input2),
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<float>(output));
} else {
reference_ops::Add(op_params, tflite::micro::GetTensorShape(input1),
tflite::micro::GetTensorData<float>(input1),
tflite::micro::GetTensorShape(input2),
tflite::micro::GetTensorData<float>(input2),
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<float>(output));
}
}
TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
TfLiteAddParams* params, const OpDataAdd* data,
const TfLiteEvalTensor* input1,
const TfLiteEvalTensor* input2,
TfLiteEvalTensor* output) {
tflite::ArithmeticParams op_params;
op_params.left_shift = data->left_shift;
op_params.input1_offset = data->input1_offset;
op_params.input1_multiplier = data->input1_multiplier;
op_params.input1_shift = data->input1_shift;
op_params.input2_offset = data->input2_offset;
op_params.input2_multiplier = data->input2_multiplier;
op_params.input2_shift = data->input2_shift;
op_params.output_offset = data->output_offset;
op_params.output_multiplier = data->output_multiplier;
op_params.output_shift = data->output_shift;
SetActivationParams(data->output_activation_min, data->output_activation_max,
&op_params);
bool need_broadcast = reference_ops::ProcessBroadcastShapes(
tflite::micro::GetTensorShape(input1),
tflite::micro::GetTensorShape(input2), &op_params);
switch (output->type) {
case kTfLiteInt8: {
if (need_broadcast) {
reference_integer_ops::BroadcastAdd4DSlow(
op_params, tflite::micro::GetTensorShape(input1),
tflite::micro::GetTensorData<int8_t>(input1),
tflite::micro::GetTensorShape(input2),
tflite::micro::GetTensorData<int8_t>(input2),
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<int8_t>(output));
} else {
#if ESP_NN
const int8_t *input1_data = tflite::micro::GetTensorData<int8_t>(input1);
const int8_t *input2_data = tflite::micro::GetTensorData<int8_t>(input2);
int8_t *out_data = tflite::micro::GetTensorData<int8_t>(output);
esp_nn_add_elementwise_s8(input1_data,
input2_data,
data->input1_offset,
data->input2_offset,
data->input1_multiplier,
data->input2_multiplier,
data->input1_shift,
data->input2_shift,
data->left_shift,
out_data,
data->output_offset,
data->output_multiplier,
data->output_shift,
data->output_activation_min,
data->output_activation_max,
MatchingElementsSize(tflite::micro::GetTensorShape(input1),
tflite::micro::GetTensorShape(input2),
tflite::micro::GetTensorShape(output))
);
#else
reference_integer_ops::Add(
op_params, tflite::micro::GetTensorShape(input1),
tflite::micro::GetTensorData<int8_t>(input1),
tflite::micro::GetTensorShape(input2),
tflite::micro::GetTensorData<int8_t>(input2),
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<int8_t>(output));
#endif
}
break;
}
case kTfLiteInt16: {
if (need_broadcast) {
reference_ops::BroadcastAdd4DSlow(
op_params, tflite::micro::GetTensorShape(input1),
tflite::micro::GetTensorData<int16_t>(input1),
tflite::micro::GetTensorShape(input2),
tflite::micro::GetTensorData<int16_t>(input2),
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<int16_t>(output));
} else {
reference_ops::Add(op_params, tflite::micro::GetTensorShape(input1),
tflite::micro::GetTensorData<int16_t>(input1),
tflite::micro::GetTensorShape(input2),
tflite::micro::GetTensorData<int16_t>(input2),
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<int16_t>(output),
false);
}
break;
}
default:
MicroPrintf("Type %s (%d) not supported.",
TfLiteTypeGetName(output->type), output->type);
return kTfLiteError;
}
return kTfLiteOk;
}
void* AddInit(TfLiteContext* context, const char* buffer, size_t length) {
TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
return context->AllocatePersistentBuffer(context, sizeof(OpDataAdd));
}
TfLiteStatus AddEval(TfLiteContext* context, TfLiteNode* node) {
auto* params = reinterpret_cast<TfLiteAddParams*>(node->builtin_data);
TFLITE_DCHECK(node->user_data != nullptr);
const OpDataAdd* data = static_cast<const OpDataAdd*>(node->user_data);
const TfLiteEvalTensor* input1 =
tflite::micro::GetEvalInput(context, node, kAddInputTensor1);
const TfLiteEvalTensor* input2 =
tflite::micro::GetEvalInput(context, node, kAddInputTensor2);
TfLiteEvalTensor* output =
tflite::micro::GetEvalOutput(context, node, kAddOutputTensor);
long long start_time = esp_timer_get_time();
if (output->type == kTfLiteFloat32) {
EvalAdd(context, node, params, data, input1, input2, output);
} else if (output->type == kTfLiteInt8 || output->type == kTfLiteInt16) {
TF_LITE_ENSURE_OK(context, EvalAddQuantized(context, node, params, data,
input1, input2, output));
} else {
MicroPrintf("Type %s (%d) not supported.", TfLiteTypeGetName(output->type),
output->type);
return kTfLiteError;
}
add_total_time += esp_timer_get_time() - start_time;
return kTfLiteOk;
}
TfLiteRegistration Register_ADD() {
return {/*init=*/AddInit,
/*free=*/nullptr,
/*prepare=*/AddPrepare,
/*invoke=*/AddEval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
}
} // namespace tflite
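Note that this port also adds global profiling accumulators (add_total_time here, conv_total_time and others in the kernels below) driven by esp_timer_get_time(). A hypothetical way application code could surface them, assuming the globals are linked in as defined above:
```
#include <cstdio>

// Hypothetical helper: the extern declarations mirror the definitions in the
// esp-nn kernel sources; the counters are in microseconds.
extern long long add_total_time;
extern long long conv_total_time;

void PrintKernelProfile() {
  printf("ADD kernels:  %lld us\n", add_total_time);
  printf("CONV kernels: %lld us\n", conv_total_time);
}
```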

View File

@@ -0,0 +1,319 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/micro/kernels/conv.h"
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/conv.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/conv.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/padding.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"
#include "freertos/FreeRTOS.h"
#include <esp_timer.h>
#if ESP_NN
#include <esp_nn.h>
#endif
long long conv_total_time = 0;
namespace tflite {
namespace {
struct NodeData {
OpDataConv op_data;
#if ESP_NN
int buffer_idx;
#endif
};
void* Init(TfLiteContext* context, const char* buffer, size_t length) {
TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
return context->AllocatePersistentBuffer(context, sizeof(NodeData));
}
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
TFLITE_DCHECK(node->user_data != nullptr);
TFLITE_DCHECK(node->builtin_data != nullptr);
NodeData* data = static_cast<NodeData*>(node->user_data);
const auto& params =
*(static_cast<const TfLiteConvParams*>(node->builtin_data));
MicroContext* micro_context = GetMicroContext(context);
TfLiteTensor* input =
micro_context->AllocateTempInputTensor(node, kConvInputTensor);
TF_LITE_ENSURE(context, input != nullptr);
TfLiteTensor* filter =
micro_context->AllocateTempInputTensor(node, kConvWeightsTensor);
TF_LITE_ENSURE(context, filter != nullptr);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kConvOutputTensor);
TF_LITE_ENSURE(context, output != nullptr);
const int input_width = input->dims->data[2];
const int input_height = input->dims->data[1];
const int filter_width = filter->dims->data[2];
const int filter_height = filter->dims->data[1];
const int output_width = output->dims->data[2];
const int output_height = output->dims->data[1];
// Dynamically allocate per-channel quantization parameters.
const int num_channels = filter->dims->data[kConvQuantizedDimension];
data->op_data.per_channel_output_multiplier =
static_cast<int32_t*>(context->AllocatePersistentBuffer(
context, num_channels * sizeof(int32_t)));
data->op_data.per_channel_output_shift =
static_cast<int32_t*>(context->AllocatePersistentBuffer(
context, num_channels * sizeof(int32_t)));
// All per-channel quantized tensors need valid zero point and scale arrays.
if (input->type == kTfLiteInt8) {
TF_LITE_ENSURE_EQ(context, filter->quantization.type,
kTfLiteAffineQuantization);
const auto* affine_quantization =
static_cast<TfLiteAffineQuantization*>(filter->quantization.params);
TFLITE_DCHECK(affine_quantization != nullptr);
TFLITE_DCHECK(affine_quantization->scale != nullptr);
TFLITE_DCHECK(affine_quantization->zero_point != nullptr);
TF_LITE_ENSURE(context,
affine_quantization->scale->size == 1 ||
affine_quantization->scale->size ==
filter->dims->data[kConvQuantizedDimension]);
TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size,
affine_quantization->zero_point->size);
}
TF_LITE_ENSURE_STATUS(CalculateOpDataConv(
context, node, params, input_width, input_height, filter_width,
filter_height, output_width, output_height, input->type, &data->op_data));
#if ESP_NN
if (input->type == kTfLiteInt8) {
int scratch_buf_size = esp_nn_get_conv_scratch_size(
input_width, input_height, input->dims->data[3],
output->dims->data[3], filter_width, filter_height);
if (scratch_buf_size > 0) {
TF_LITE_ENSURE_STATUS(context->RequestScratchBufferInArena(
context, scratch_buf_size, &data->buffer_idx));
} else {
data->buffer_idx = -1;
}
}
#endif
micro_context->DeallocateTempTfLiteTensor(output);
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(filter);
return kTfLiteOk;
}
#if ESP_NN
// Fixed-point per-channel-quantization convolution Int8 function wrapper.
inline void EvalQuantizedPerChannel(
TfLiteContext* context, TfLiteNode* node, const TfLiteConvParams& params,
const NodeData& data, const TfLiteEvalTensor* input,
const TfLiteEvalTensor* filter, const TfLiteEvalTensor* bias,
TfLiteEvalTensor* output) {
const int dilation_width_factor = params.dilation_width_factor;
const int dilation_height_factor = params.dilation_height_factor;
if (dilation_width_factor == 1 && dilation_height_factor == 1) {
// Get parameters.
RuntimeShape filter_shape = tflite::micro::GetTensorShape(filter);
RuntimeShape input_shape = tflite::micro::GetTensorShape(input);
RuntimeShape output_shape = tflite::micro::GetTensorShape(output);
RuntimeShape bias_shape = tflite::micro::GetTensorShape(bias);
const int8_t *input_data = tflite::micro::GetTensorData<int8_t>(input);
int8_t *output_data = tflite::micro::GetTensorData<int8_t>(output);
const int32_t input_offset = -data.op_data.input_zero_point;
const int32_t output_offset = data.op_data.output_zero_point;
const int stride_width = params.stride_width;
const int stride_height = params.stride_height;
const int pad_width = data.op_data.padding.width;
const int pad_height = data.op_data.padding.height;
const int input_height = input_shape.Dims(1);
const int input_width = input_shape.Dims(2);
const int filter_height = filter_shape.Dims(1);
const int filter_width = filter_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
// Set min and max value of the output.
const int32_t activation_min = data.op_data.output_activation_min;
const int32_t activation_max = data.op_data.output_activation_max;
// Consistency check.
TFLITE_DCHECK_LE(activation_min, activation_max);
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
const int batch_size = MatchingDim(input_shape, 0, output_shape, 0);
const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
if (tflite::micro::GetTensorData<int8_t>(bias)) {
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
}
void *scratch_buf = NULL;
if (data.buffer_idx > -1) {
scratch_buf = context->GetScratchBuffer(context, data.buffer_idx);
}
esp_nn_set_conv_scratch_buf(scratch_buf);
const int input_size = input_width * input_height * input_depth;
const int output_size = output_width * output_height * output_depth;
for (int i_batch = 0; i_batch < batch_size; i_batch++) {
esp_nn_conv_s8(input_data + i_batch * input_size,
input_width, input_height, input_depth, input_offset,
pad_width, pad_height, stride_width, stride_height,
tflite::micro::GetTensorData<int8_t>(filter),
filter_width, filter_height,
tflite::micro::GetTensorData<int32_t>(bias),
output_data + i_batch * output_size,
output_width, output_height, output_depth, output_offset,
data.op_data.per_channel_output_shift,
data.op_data.per_channel_output_multiplier,
activation_min, activation_max);
}
} else {
reference_integer_ops::ConvPerChannel(
ConvParamsQuantized(params, data.op_data),
data.op_data.per_channel_output_multiplier,
data.op_data.per_channel_output_shift,
tflite::micro::GetTensorShape(input),
tflite::micro::GetTensorData<int8_t>(input),
tflite::micro::GetTensorShape(filter),
tflite::micro::GetTensorData<int8_t>(filter),
tflite::micro::GetTensorShape(bias),
tflite::micro::GetTensorData<int32_t>(bias),
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<int8_t>(output));
}
}
#endif
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
const TfLiteEvalTensor* input =
tflite::micro::GetEvalInput(context, node, kConvInputTensor);
const TfLiteEvalTensor* filter =
tflite::micro::GetEvalInput(context, node, kConvWeightsTensor);
const TfLiteEvalTensor* bias =
(NumInputs(node) == 3)
? tflite::micro::GetEvalInput(context, node, kConvBiasTensor)
: nullptr;
TfLiteEvalTensor* output =
tflite::micro::GetEvalOutput(context, node, kConvOutputTensor);
TFLITE_DCHECK(node->builtin_data != nullptr);
const auto& params =
*(reinterpret_cast<TfLiteConvParams*>(node->builtin_data));
TFLITE_DCHECK(node->user_data != nullptr);
const auto& data = *(static_cast<const NodeData*>(node->user_data));
TF_LITE_ENSURE_EQ(context, input->type, output->type);
TF_LITE_ENSURE_MSG(context, input->type == filter->type,
"Hybrid models are not supported on TFLite Micro.");
long long start_time = esp_timer_get_time();
switch (input->type) { // Already know in/out types are same.
case kTfLiteFloat32: {
tflite::reference_ops::Conv(
ConvParamsFloat(params, data.op_data),
tflite::micro::GetTensorShape(input),
tflite::micro::GetTensorData<float>(input),
tflite::micro::GetTensorShape(filter),
tflite::micro::GetTensorData<float>(filter),
tflite::micro::GetTensorShape(bias),
tflite::micro::GetTensorData<float>(bias),
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<float>(output),
tflite::micro::GetTensorShape(nullptr), nullptr);
break;
}
case kTfLiteInt8: {
#if ESP_NN
EvalQuantizedPerChannel(context, node, params, data, input, filter,
bias, output);
#else
reference_integer_ops::ConvPerChannel(
ConvParamsQuantized(params, data.op_data),
data.op_data.per_channel_output_multiplier,
data.op_data.per_channel_output_shift,
tflite::micro::GetTensorShape(input),
tflite::micro::GetTensorData<int8_t>(input),
tflite::micro::GetTensorShape(filter),
tflite::micro::GetTensorData<int8_t>(filter),
tflite::micro::GetTensorShape(bias),
tflite::micro::GetTensorData<int32_t>(bias),
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<int8_t>(output));
#endif
break;
}
case kTfLiteUInt8: {
      // Legacy uint8 path: handled by the reference quantized Conv kernel.
reference_ops::Conv(ConvParamsQuantized(params, data.op_data),
tflite::micro::GetTensorShape(input),
tflite::micro::GetTensorData<uint8_t>(input),
tflite::micro::GetTensorShape(filter),
tflite::micro::GetTensorData<uint8_t>(filter),
tflite::micro::GetTensorShape(bias),
tflite::micro::GetTensorData<int32_t>(bias),
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<uint8_t>(output),
tflite::micro::GetTensorShape(nullptr), nullptr,
nullptr);
break;
}
default:
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
TfLiteTypeGetName(input->type), input->type);
return kTfLiteError;
}
conv_total_time += esp_timer_get_time() - start_time;
return kTfLiteOk;
}
} // namespace
TfLiteRegistration Register_CONV_2D() {
return {/*init=*/Init,
/*free=*/nullptr,
/*prepare=*/Prepare,
/*invoke=*/Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
}
} // namespace tflite
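The int8 fast path above advances input_data/output_data by a whole NHWC plane per batch. Below is a minimal standalone sketch of the layout rule that arithmetic relies on; the helper names are illustrative and not part of this commit.

#include <cstddef>

// Flat offset of element (n, h, w, c) in a contiguous NHWC buffer.
inline size_t NhwcOffset(int h_dim, int w_dim, int c_dim,
                         int n, int h, int w, int c) {
  return ((static_cast<size_t>(n) * h_dim + h) * w_dim + w) * c_dim + c;
}

// Stepping to the next batch is a fixed stride of H * W * C elements,
// which is exactly the input_size / output_size step used in the loop above.
inline size_t NhwcBatchStride(int h_dim, int w_dim, int c_dim) {
  return static_cast<size_t>(h_dim) * w_dim * c_dim;
}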

View File

@@ -0,0 +1,319 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/micro/kernels/depthwise_conv.h"
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h"
#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/padding.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"
#include "freertos/FreeRTOS.h"
#include <esp_timer.h>
#if ESP_NN
#include <esp_nn.h>
#endif
long long dc_total_time = 0;
namespace tflite {
namespace {
struct NodeData {
OpDataConv op_data;
#if ESP_NN
int buffer_idx;
#endif
};
void* Init(TfLiteContext* context, const char* buffer, size_t length) {
TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
return context->AllocatePersistentBuffer(context, sizeof(NodeData));
}
#if ESP_NN
inline void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
const TfLiteDepthwiseConvParams& params,
const NodeData& data,
const TfLiteEvalTensor* input,
const TfLiteEvalTensor* filter,
const TfLiteEvalTensor* bias,
TfLiteEvalTensor* output) {
const int dilation_width_factor = params.dilation_width_factor;
const int dilation_height_factor = params.dilation_height_factor;
if (dilation_width_factor == 1 && dilation_height_factor == 1) {
// Get parameters.
RuntimeShape input_shape = tflite::micro::GetTensorShape(input);
RuntimeShape filter_shape = tflite::micro::GetTensorShape(filter);
RuntimeShape output_shape = tflite::micro::GetTensorShape(output);
RuntimeShape bias_shape = tflite::micro::GetTensorShape(bias);
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
const int8_t *input_data = tflite::micro::GetTensorData<int8_t>(input);
int8_t *output_data = tflite::micro::GetTensorData<int8_t>(output);
const int depth_multiplier = params.depth_multiplier;
const int32_t input_offset = -data.op_data.input_zero_point;
const int32_t output_offset = data.op_data.output_zero_point;
const int stride_width = params.stride_width;
const int stride_height = params.stride_height;
const int pad_width = data.op_data.padding.width;
const int pad_height = data.op_data.padding.height;
const int input_height = input_shape.Dims(1);
const int input_width = input_shape.Dims(2);
const int input_depth = input_shape.Dims(3);
const int filter_height = filter_shape.Dims(1);
const int filter_width = filter_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
// Set min and max value of the output.
const int32_t activation_min = data.op_data.output_activation_min;
const int32_t activation_max = data.op_data.output_activation_max;
// Consistency check.
TFLITE_DCHECK_LE(activation_min, activation_max);
const int batch_size = MatchingDim(input_shape, 0, output_shape, 0);
const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
if (tflite::micro::GetTensorData<int8_t>(bias)) {
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
}
const int input_size = input_width * input_height * input_depth;
const int output_size = output_width * output_height * output_depth;
void *scratch_buf = NULL;
if (data.buffer_idx > -1) {
scratch_buf = context->GetScratchBuffer(context, data.buffer_idx);
}
esp_nn_set_depthwise_conv_scratch_buf(scratch_buf);
for (int i_batch = 0; i_batch < batch_size; i_batch++) {
esp_nn_depthwise_conv_s8(input_data + i_batch * input_size, input_width,
input_height, input_depth, input_offset,
pad_width, pad_height,
stride_width, stride_height, depth_multiplier,
tflite::micro::GetTensorData<int8_t>(filter),
filter_width, filter_height,
tflite::micro::GetTensorData<int32_t>(bias),
output_data + i_batch * output_size,
output_width, output_height, output_offset,
data.op_data.per_channel_output_shift,
data.op_data.per_channel_output_multiplier,
activation_min, activation_max);
}
} else {
reference_integer_ops::DepthwiseConvPerChannel(
DepthwiseConvParamsQuantized(params, data.op_data),
data.op_data.per_channel_output_multiplier,
data.op_data.per_channel_output_shift,
tflite::micro::GetTensorShape(input),
tflite::micro::GetTensorData<int8_t>(input),
tflite::micro::GetTensorShape(filter),
tflite::micro::GetTensorData<int8_t>(filter),
tflite::micro::GetTensorShape(bias),
tflite::micro::GetTensorData<int32_t>(bias),
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<int8_t>(output));
}
}
#endif
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
TFLITE_DCHECK(node->user_data != nullptr);
TFLITE_DCHECK(node->builtin_data != nullptr);
NodeData* data = static_cast<NodeData*>(node->user_data);
const TfLiteDepthwiseConvParams& params =
*(static_cast<const TfLiteDepthwiseConvParams*>(node->builtin_data));
MicroContext* micro_context = GetMicroContext(context);
  TfLiteTensor* input =
      micro_context->AllocateTempInputTensor(node, kDepthwiseConvInputTensor);
  TF_LITE_ENSURE(context, input != nullptr);
  TfLiteTensor* filter =
      micro_context->AllocateTempInputTensor(node, kDepthwiseConvWeightsTensor);
  TF_LITE_ENSURE(context, filter != nullptr);
  TfLiteTensor* bias =
      micro_context->AllocateTempInputTensor(node, kDepthwiseConvBiasTensor);
  TfLiteTensor* output =
      micro_context->AllocateTempOutputTensor(node, kDepthwiseConvOutputTensor);
TF_LITE_ENSURE(context, output != nullptr);
const int input_width = input->dims->data[2];
const int input_height = input->dims->data[1];
const int filter_width = filter->dims->data[2];
const int filter_height = filter->dims->data[1];
const int output_width = output->dims->data[2];
const int output_height = output->dims->data[1];
// Dynamically allocate per-channel quantization parameters.
const int num_channels = filter->dims->data[kDepthwiseConvQuantizedDimension];
data->op_data.per_channel_output_multiplier =
static_cast<int32_t*>(context->AllocatePersistentBuffer(
context, num_channels * sizeof(int32_t)));
data->op_data.per_channel_output_shift =
static_cast<int32_t*>(context->AllocatePersistentBuffer(
context, num_channels * sizeof(int32_t)));
// All per-channel quantized tensors need valid zero point and scale arrays.
if (input->type == kTfLiteInt8) {
TF_LITE_ENSURE_EQ(context, filter->quantization.type,
kTfLiteAffineQuantization);
const auto* affine_quantization =
static_cast<TfLiteAffineQuantization*>(filter->quantization.params);
TFLITE_DCHECK(affine_quantization != nullptr);
TFLITE_DCHECK(affine_quantization->scale != nullptr);
TFLITE_DCHECK(affine_quantization->zero_point != nullptr);
TF_LITE_ENSURE(
context, affine_quantization->scale->size == 1 ||
affine_quantization->scale->size ==
filter->dims->data[kDepthwiseConvQuantizedDimension]);
TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size,
affine_quantization->zero_point->size);
}
TF_LITE_ENSURE_STATUS(CalculateOpDataDepthwiseConv(
context, node, params, input_width, input_height, filter_width,
filter_height, output_width, output_height, input->type, &data->op_data));
#if ESP_NN
if (input->type == kTfLiteInt8) {
int scratch_buf_size = esp_nn_get_depthwise_conv_scratch_size(
input_width, input_height, input->dims->data[3],
params.depth_multiplier, filter_width, filter_height);
if (scratch_buf_size > 0) {
TF_LITE_ENSURE_STATUS(context->RequestScratchBufferInArena(
context, scratch_buf_size, &data->buffer_idx));
} else {
data->buffer_idx = -1;
}
}
#endif
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(filter);
micro_context->DeallocateTempTfLiteTensor(bias);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
TFLITE_DCHECK(node->user_data != nullptr);
TFLITE_DCHECK(node->builtin_data != nullptr);
auto& params =
*(reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data));
const NodeData& data = *(static_cast<const NodeData*>(node->user_data));
TfLiteEvalTensor* output =
tflite::micro::GetEvalOutput(context, node, kDepthwiseConvOutputTensor);
const TfLiteEvalTensor* input =
tflite::micro::GetEvalInput(context, node, kDepthwiseConvInputTensor);
const TfLiteEvalTensor* filter =
tflite::micro::GetEvalInput(context, node, kDepthwiseConvWeightsTensor);
const TfLiteEvalTensor* bias =
(NumInputs(node) == 3)
? tflite::micro::GetEvalInput(context, node, kDepthwiseConvBiasTensor)
: nullptr;
long long start_time = esp_timer_get_time();
switch (input->type) { // Already know in/out types are same.
case kTfLiteFloat32:
tflite::reference_ops::DepthwiseConv(
DepthwiseConvParamsFloat(params, data.op_data),
tflite::micro::GetTensorShape(input),
tflite::micro::GetTensorData<float>(input),
tflite::micro::GetTensorShape(filter),
tflite::micro::GetTensorData<float>(filter),
tflite::micro::GetTensorShape(bias),
tflite::micro::GetTensorData<float>(bias),
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<float>(output));
break;
case kTfLiteInt8:
#if ESP_NN
EvalQuantizedPerChannel(context, node, params, data, input, filter, bias,
output);
#else
reference_integer_ops::DepthwiseConvPerChannel(
DepthwiseConvParamsQuantized(params, data.op_data),
data.op_data.per_channel_output_multiplier,
data.op_data.per_channel_output_shift,
tflite::micro::GetTensorShape(input),
tflite::micro::GetTensorData<int8_t>(input),
tflite::micro::GetTensorShape(filter),
tflite::micro::GetTensorData<int8_t>(filter),
tflite::micro::GetTensorShape(bias),
tflite::micro::GetTensorData<int32_t>(bias),
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<int8_t>(output));
#endif
break;
case kTfLiteUInt8:
      // Legacy uint8 path: handled by the reference DepthwiseConv kernel.
reference_ops::DepthwiseConv(
DepthwiseConvParamsQuantized(params, data.op_data),
tflite::micro::GetTensorShape(input),
tflite::micro::GetTensorData<uint8_t>(input),
tflite::micro::GetTensorShape(filter),
tflite::micro::GetTensorData<uint8_t>(filter),
tflite::micro::GetTensorShape(bias),
tflite::micro::GetTensorData<int32_t>(bias),
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<uint8_t>(output));
break;
default:
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
TfLiteTypeGetName(input->type), input->type);
return kTfLiteError;
}
dc_total_time += esp_timer_get_time() - start_time;
return kTfLiteOk;
}
} // namespace
TfLiteRegistration Register_DEPTHWISE_CONV_2D() {
return {/*init=*/Init,
/*free=*/nullptr,
/*prepare=*/Prepare,
/*invoke=*/Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
}
} // namespace tflite
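Both convolution kernels above use the same two-phase scratch-buffer handshake: Prepare asks the arena planner for space and stores an index, and Eval resolves that index into a pointer that is only valid for the current invocation. A condensed sketch of that pattern follows, using the TfLiteContext callbacks that appear in the code; ScratchInfo is an illustrative stand-in for the NodeData field.

#include "tensorflow/lite/c/common.h"

struct ScratchInfo {
  int buffer_idx = -1;  // -1 means no scratch buffer was requested
};

// Prepare-time: request arena space and remember the handle.
TfLiteStatus RequestScratch(TfLiteContext* context, size_t bytes,
                            ScratchInfo* info) {
  if (bytes == 0) {
    info->buffer_idx = -1;
    return kTfLiteOk;
  }
  return context->RequestScratchBufferInArena(context, bytes,
                                              &info->buffer_idx);
}

// Eval-time: resolve the handle; the pointer must not be cached across calls.
void* ResolveScratch(TfLiteContext* context, const ScratchInfo& info) {
  return (info.buffer_idx > -1)
             ? context->GetScratchBuffer(context, info.buffer_idx)
             : nullptr;
}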

View File

@@ -0,0 +1,198 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/micro/kernels/fully_connected.h"
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/fully_connected.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"
#if ESP_NN
#include <esp_nn.h>
#endif
#include <esp_timer.h>
long long fc_total_time = 0;
namespace tflite {
namespace {
void* Init(TfLiteContext* context, const char* buffer, size_t length) {
TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
return context->AllocatePersistentBuffer(context,
sizeof(OpDataFullyConnected));
}
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
MicroContext* micro_context = GetMicroContext(context);
TFLITE_DCHECK(node->user_data != nullptr);
TFLITE_DCHECK(node->builtin_data != nullptr);
auto* data = static_cast<OpDataFullyConnected*>(node->user_data);
const auto params =
static_cast<const TfLiteFullyConnectedParams*>(node->builtin_data);
TfLiteTensor* input =
micro_context->AllocateTempInputTensor(node, kFullyConnectedInputTensor);
TF_LITE_ENSURE(context, input != nullptr);
TfLiteTensor* filter = micro_context->AllocateTempInputTensor(
node, kFullyConnectedWeightsTensor);
TF_LITE_ENSURE(context, filter != nullptr);
TfLiteTensor* bias =
micro_context->AllocateTempInputTensor(node, kFullyConnectedBiasTensor);
TfLiteTensor* output = micro_context->AllocateTempOutputTensor(
node, kFullyConnectedOutputTensor);
TF_LITE_ENSURE(context, output != nullptr);
TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type);
TF_LITE_ENSURE_MSG(context, input->type == filter->type,
"Hybrid models are not supported on TFLite Micro.");
TF_LITE_ENSURE_OK(context, CalculateOpDataFullyConnected(
context, params->activation, input->type,
input, filter, bias, output, data));
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(filter);
if (bias != nullptr) {
micro_context->DeallocateTempTfLiteTensor(bias);
}
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
TFLITE_DCHECK(node->builtin_data != nullptr);
const auto* params =
static_cast<const TfLiteFullyConnectedParams*>(node->builtin_data);
const TfLiteEvalTensor* input =
tflite::micro::GetEvalInput(context, node, kFullyConnectedInputTensor);
const TfLiteEvalTensor* filter =
tflite::micro::GetEvalInput(context, node, kFullyConnectedWeightsTensor);
const TfLiteEvalTensor* bias =
tflite::micro::GetEvalInput(context, node, kFullyConnectedBiasTensor);
TfLiteEvalTensor* output =
tflite::micro::GetEvalOutput(context, node, kFullyConnectedOutputTensor);
TFLITE_DCHECK(node->user_data != nullptr);
const auto& data =
*(static_cast<const OpDataFullyConnected*>(node->user_data));
long long start_time = esp_timer_get_time();
// Checks in Prepare ensure input, output and filter types are all the same.
switch (input->type) {
case kTfLiteFloat32: {
tflite::reference_ops::FullyConnected(
FullyConnectedParamsFloat(params->activation),
tflite::micro::GetTensorShape(input),
tflite::micro::GetTensorData<float>(input),
tflite::micro::GetTensorShape(filter),
tflite::micro::GetTensorData<float>(filter),
tflite::micro::GetTensorShape(bias),
tflite::micro::GetTensorData<float>(bias),
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<float>(output));
break;
}
case kTfLiteInt8: {
const int32_t* bias_data =
nullptr != bias ? tflite::micro::GetTensorData<int32_t>(bias)
: nullptr;
#if ESP_NN
const RuntimeShape& filter_shape = tflite::micro::GetTensorShape(filter);
const RuntimeShape& output_shape = tflite::micro::GetTensorShape(output);
const int filter_dim_count = filter_shape.DimensionsCount();
const int batches = output_shape.Dims(0);
const int output_depth = output_shape.Dims(1);
TFLITE_DCHECK_LE(output_depth, filter_shape.Dims(filter_dim_count - 2));
const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
const int8_t *input_data = tflite::micro::GetTensorData<int8_t>(input);
int8_t *output_data = tflite::micro::GetTensorData<int8_t>(output);
const int8_t *filter_data = tflite::micro::GetTensorData<int8_t>(filter);
for (int b = 0; b < batches; ++b) {
esp_nn_fully_connected_s8(input_data, -data.input_zero_point,
accum_depth,
filter_data, -data.filter_zero_point,
bias_data, output_data, output_depth,
data.output_zero_point,
data.output_shift, data.output_multiplier,
data.output_activation_min,
data.output_activation_max);
input_data += accum_depth;
output_data += output_depth;
}
#else
tflite::reference_integer_ops::FullyConnected(
FullyConnectedParamsQuantized(data),
tflite::micro::GetTensorShape(input),
tflite::micro::GetTensorData<int8_t>(input),
tflite::micro::GetTensorShape(filter),
tflite::micro::GetTensorData<int8_t>(filter),
tflite::micro::GetTensorShape(bias), bias_data,
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<int8_t>(output));
#endif
break;
}
case kTfLiteUInt8: {
tflite::reference_ops::FullyConnected(
FullyConnectedParamsQuantized(data),
tflite::micro::GetTensorShape(input),
tflite::micro::GetTensorData<uint8_t>(input),
tflite::micro::GetTensorShape(filter),
tflite::micro::GetTensorData<uint8_t>(filter),
tflite::micro::GetTensorShape(bias),
tflite::micro::GetTensorData<int32_t>(bias),
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<uint8_t>(output));
break;
}
default: {
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
TfLiteTypeGetName(input->type), input->type);
return kTfLiteError;
}
}
fc_total_time += esp_timer_get_time() - start_time;
return kTfLiteOk;
}
} // namespace
TfLiteRegistration Register_FULLY_CONNECTED() {
return {/*init=*/Init,
/*free=*/nullptr,
/*prepare=*/Prepare,
/*invoke=*/Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
}
} // namespace tflite
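Each esp_nn_fully_connected_s8 call above handles one batch: a dot product of the offset-corrected input row against every filter row, plus bias. The sketch below is illustrative only and shows just the int32 accumulation stage; requantization to int8 and activation clamping, which the library call also performs, are omitted, and the filter zero-point is assumed to be zero as is usual for int8 weights.

#include <cstdint>

// One batch of a quantized fully-connected layer:
// accum[o] = bias[o] + sum_a (input[a] + input_offset) * filter[o][a].
void FullyConnectedAccumulate(const int8_t* input, int32_t input_offset,
                              const int8_t* filter,  // [output_depth][accum_depth]
                              const int32_t* bias,   // may be nullptr
                              int32_t* accum,
                              int accum_depth, int output_depth) {
  for (int o = 0; o < output_depth; ++o) {
    int32_t acc = (bias != nullptr) ? bias[o] : 0;
    const int8_t* row = filter + o * accum_depth;
    for (int a = 0; a < accum_depth; ++a) {
      acc += (static_cast<int32_t>(input[a]) + input_offset) *
             static_cast<int32_t>(row[a]);
    }
    accum[o] = acc;
  }
}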

View File

@@ -0,0 +1,131 @@
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/micro/kernels/mul.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/mul.h"
#include "tensorflow/lite/kernels/internal/reference/mul.h"
#include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"
#include "tensorflow/lite/micro/memory_helpers.h"
#include "tensorflow/lite/micro/micro_error_reporter.h"
#if ESP_NN
#include <esp_nn.h>
#endif
#include <esp_timer.h>
long long mul_total_time = 0;
namespace tflite {
#if ESP_NN
void MulEvalQuantized(TfLiteContext* context, TfLiteNode* node,
const OpDataMul* data, const TfLiteEvalTensor* input1,
const TfLiteEvalTensor* input2,
TfLiteEvalTensor* output) {
tflite::ArithmeticParams op_params = {};
op_params.quantized_activation_min = data->output_activation_min;
op_params.quantized_activation_max = data->output_activation_max;
op_params.float_activation_max = data->output_activation_max_f32;
op_params.input1_offset = -data->input1_zero_point;
op_params.input2_offset = -data->input2_zero_point;
op_params.output_offset = data->output_zero_point;
op_params.output_multiplier = data->output_multiplier;
op_params.output_shift = data->output_shift;
bool need_broadcast = reference_ops::ProcessBroadcastShapes(
tflite::micro::GetTensorShape(input1),
tflite::micro::GetTensorShape(input2), &op_params);
if (need_broadcast) {
reference_integer_ops::BroadcastMul4DSlow(
op_params, tflite::micro::GetTensorShape(input1),
tflite::micro::GetTensorData<int8_t>(input1),
tflite::micro::GetTensorShape(input2),
tflite::micro::GetTensorData<int8_t>(input2),
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<int8_t>(output));
} else {
const int8_t *input1_data = tflite::micro::GetTensorData<int8_t>(input1);
const int8_t *input2_data = tflite::micro::GetTensorData<int8_t>(input2);
int8_t *out_data = tflite::micro::GetTensorData<int8_t>(output);
esp_nn_mul_elementwise_s8(input1_data, input2_data, op_params.input1_offset,
op_params.input2_offset, out_data, op_params.output_offset,
op_params.output_multiplier, op_params.output_shift,
op_params.quantized_activation_min, op_params.quantized_activation_max,
MatchingElementsSize(tflite::micro::GetTensorShape(input1),
tflite::micro::GetTensorShape(input2),
tflite::micro::GetTensorShape(output)));
}
}
#endif
TfLiteStatus MulEval(TfLiteContext* context, TfLiteNode* node) {
TFLITE_DCHECK(node->builtin_data != nullptr);
auto* params = reinterpret_cast<TfLiteMulParams*>(node->builtin_data);
TFLITE_DCHECK(node->user_data != nullptr);
const OpDataMul* data = static_cast<const OpDataMul*>(node->user_data);
const TfLiteEvalTensor* input1 =
tflite::micro::GetEvalInput(context, node, kMulInput1Tensor);
const TfLiteEvalTensor* input2 =
tflite::micro::GetEvalInput(context, node, kMulInput2Tensor);
TfLiteEvalTensor* output =
tflite::micro::GetEvalOutput(context, node, kMulOutputTensor);
long long start_time = esp_timer_get_time();
switch (input1->type) {
case kTfLiteInt8:
#if ESP_NN
MulEvalQuantized(context, node, data, input1, input2, output);
#else
EvalMulQuantizedReference(context, node, data, input1, input2, output);
#endif
break;
case kTfLiteInt32:
EvalMulQuantizedReference(context, node, data, input1, input2, output);
break;
case kTfLiteFloat32:
EvalMulFloatReference(context, node, params, data, input1, input2,
output);
break;
default:
MicroPrintf("Type %s (%d) not supported.",
TfLiteTypeGetName(input1->type), input1->type);
return kTfLiteError;
}
mul_total_time += esp_timer_get_time() - start_time;
return kTfLiteOk;
}
TfLiteRegistration Register_MUL() {
return {/*init=*/MulInit,
/*free=*/nullptr,
/*prepare=*/MulPrepare,
/*invoke=*/MulEval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
}
} // namespace tflite
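When no broadcasting is required, esp_nn_mul_elementwise_s8 above multiplies the offset-corrected inputs element by element and requantizes the result. The sketch below mirrors that per-element math with a floating-point effective_scale standing in for the fixed-point multiplier/shift pair the real kernel uses; it is illustrative only.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Illustrative elementwise quantized multiply: offsets are added back, the
// products are rescaled, then the result is clamped to the output range.
void MulElementwiseS8(const int8_t* in1, const int8_t* in2, int8_t* out,
                      int size, int32_t in1_offset, int32_t in2_offset,
                      int32_t out_offset, float effective_scale,
                      int32_t act_min, int32_t act_max) {
  for (int i = 0; i < size; ++i) {
    const int32_t a = static_cast<int32_t>(in1[i]) + in1_offset;
    const int32_t b = static_cast<int32_t>(in2[i]) + in2_offset;
    int32_t v = static_cast<int32_t>(std::lround(a * b * effective_scale)) +
                out_offset;
    v = std::min(std::max(v, act_min), act_max);
    out[i] = static_cast<int8_t>(v);
  }
}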

View File

@@ -0,0 +1,245 @@
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/reference/pooling.h"
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"
#include "tensorflow/lite/micro/kernels/pooling.h"
#if ESP_NN
#include <esp_nn.h>
#endif
#include <esp_timer.h>
long long pooling_total_time = 0;
namespace tflite {
namespace {
#if ESP_NN
void AverageEvalQuantized(TfLiteContext* context, const TfLiteNode* node,
const TfLitePoolParams* params, const OpDataPooling* data,
const TfLiteEvalTensor* input,
TfLiteEvalTensor* output) {
const int stride_height = params->stride_height;
const int stride_width = params->stride_width;
const int filter_height = params->filter_height;
const int filter_width = params->filter_width;
const int activation_min = data->activation_min;
const int activation_max = data->activation_max;
const int pad_height = data->padding.height;
const int pad_width = data->padding.width;
const RuntimeShape& input_shape = tflite::micro::GetTensorShape(input);
const RuntimeShape& output_shape = tflite::micro::GetTensorShape(output);
TFLITE_DCHECK_LE(activation_min, activation_max);
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
const int depth = MatchingDim(input_shape, 3, output_shape, 3);
const int input_height = input_shape.Dims(1);
const int input_width = input_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
const int8_t *input_data = tflite::micro::GetTensorData<int8_t>(input);
int8_t *output_data = tflite::micro::GetTensorData<int8_t>(output);
const int input_size = input_width * input_height * depth;
const int output_size = output_width * output_height * depth;
  if (depth % 4 == 0) { // the ESP32-S3 optimized kernel only supports channel counts that are a multiple of 4
for (int batch = 0; batch < batches; ++batch) {
esp_nn_avg_pool_s8(input_data, input_width, input_height,
output_data, output_width, output_height,
stride_width, stride_height,
filter_width, filter_height,
pad_width, pad_height,
activation_min, activation_max, depth);
input_data += input_size;
output_data += output_size;
}
} else {
for (int batch = 0; batch < batches; ++batch) {
esp_nn_avg_pool_s8_ansi(input_data, input_width, input_height,
output_data, output_width, output_height,
stride_width, stride_height,
filter_width, filter_height,
pad_width, pad_height,
activation_min, activation_max, depth);
input_data += input_size;
output_data += output_size;
}
}
}
void MaxEvalQuantized(TfLiteContext* context, TfLiteNode* node,
TfLitePoolParams* params, const OpDataPooling* data,
const TfLiteEvalTensor* input, TfLiteEvalTensor* output) {
const int stride_height = params->stride_height;
const int stride_width = params->stride_width;
const int filter_height = params->filter_height;
const int filter_width = params->filter_width;
const int activation_min = data->activation_min;
const int activation_max = data->activation_max;
const int pad_height = data->padding.height;
const int pad_width = data->padding.width;
const RuntimeShape& input_shape = tflite::micro::GetTensorShape(input);
const RuntimeShape& output_shape = tflite::micro::GetTensorShape(output);
TFLITE_DCHECK_LE(activation_min, activation_max);
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
const int depth = MatchingDim(input_shape, 3, output_shape, 3);
const int input_height = input_shape.Dims(1);
const int input_width = input_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
const int8_t *input_data = tflite::micro::GetTensorData<int8_t>(input);
int8_t *output_data = tflite::micro::GetTensorData<int8_t>(output);
const int input_size = input_width * input_height * depth;
const int output_size = output_width * output_height * depth;
  if (depth % 4 == 0) { // the ESP32-S3 optimized kernel only supports channel counts that are a multiple of 4
for (int batch = 0; batch < batches; ++batch) {
esp_nn_max_pool_s8(input_data, input_width, input_height,
output_data, output_width, output_height,
stride_width, stride_height,
filter_width, filter_height,
pad_width, pad_height,
activation_min, activation_max, depth);
input_data += input_size;
output_data += output_size;
}
} else {
for (int batch = 0; batch < batches; ++batch) {
esp_nn_max_pool_s8_ansi(input_data, input_width, input_height,
output_data, output_width, output_height,
stride_width, stride_height,
filter_width, filter_height,
pad_width, pad_height,
activation_min, activation_max, depth);
input_data += input_size;
output_data += output_size;
}
}
}
#endif
TfLiteStatus AverageEval(TfLiteContext* context, TfLiteNode* node) {
TFLITE_DCHECK(node->builtin_data != nullptr);
auto* params = reinterpret_cast<TfLitePoolParams*>(node->builtin_data);
TFLITE_DCHECK(node->user_data != nullptr);
const OpDataPooling* data =
static_cast<const OpDataPooling*>(node->user_data);
const TfLiteEvalTensor* input =
micro::GetEvalInput(context, node, kPoolingInputTensor);
TfLiteEvalTensor* output =
micro::GetEvalOutput(context, node, kPoolingOutputTensor);
long long start_time = esp_timer_get_time();
// Inputs and outputs share the same type, guaranteed by the converter.
switch (input->type) {
case kTfLiteFloat32:
AveragePoolingEvalFloat(context, node, params, data, input, output);
break;
case kTfLiteInt8:
#if ESP_NN
AverageEvalQuantized(context, node, params, data, input, output);
#else
AveragePoolingEvalQuantized(context, node, params, data, input, output);
#endif
break;
default:
TF_LITE_KERNEL_LOG(context, "Input type %s is not currently supported",
TfLiteTypeGetName(input->type));
return kTfLiteError;
}
pooling_total_time += esp_timer_get_time() - start_time;
return kTfLiteOk;
}
TfLiteStatus MaxEval(TfLiteContext* context, TfLiteNode* node) {
TFLITE_DCHECK(node->builtin_data != nullptr);
auto* params = reinterpret_cast<TfLitePoolParams*>(node->builtin_data);
TFLITE_DCHECK(node->user_data != nullptr);
const OpDataPooling* data =
static_cast<const OpDataPooling*>(node->user_data);
const TfLiteEvalTensor* input =
micro::GetEvalInput(context, node, kPoolingInputTensor);
TfLiteEvalTensor* output =
micro::GetEvalOutput(context, node, kPoolingOutputTensor);
long long start_time = esp_timer_get_time();
switch (input->type) {
case kTfLiteFloat32:
MaxPoolingEvalFloat(context, node, params, data, input, output);
break;
case kTfLiteInt8:
#if ESP_NN
MaxEvalQuantized(context, node, params, data, input, output);
#else
MaxPoolingEvalQuantized(context, node, params, data, input, output);
#endif
break;
default:
TF_LITE_KERNEL_LOG(context, "Type %s not currently supported.",
TfLiteTypeGetName(input->type));
return kTfLiteError;
}
pooling_total_time += esp_timer_get_time() - start_time;
return kTfLiteOk;
}
void* Init(TfLiteContext* context, const char* buffer, size_t length) {
TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
return context->AllocatePersistentBuffer(context, sizeof(OpDataPooling));
}
} // namespace
TfLiteRegistration Register_AVERAGE_POOL_2D() {
return {/*init=*/Init,
/*free=*/nullptr,
/*prepare=*/PoolingPrepare,
/*invoke=*/AverageEval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
}
TfLiteRegistration Register_MAX_POOL_2D() {
return {/*init=*/Init,
/*free=*/nullptr,
/*prepare=*/PoolingPrepare,
/*invoke=*/MaxEval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
}
} // namespace tflite
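Both pooling paths above choose between the vectorized kernel and the portable _ansi fallback purely on the channel count. A stripped-down sketch of that dispatch follows; the function-pointer type is a placeholder, not the ESP-NN API.

#include <cstdint>

// Signature is illustrative; the real ESP-NN pooling functions take the full
// geometry (widths, heights, strides, padding, activation range) as well.
using PoolKernel = void (*)(const int8_t* input, int8_t* output, int depth);

// The ESP32-S3 assembly kernels require the channel count to be a multiple
// of 4; anything else goes through the portable C implementation.
inline PoolKernel SelectPoolKernel(int depth, PoolKernel optimized,
                                   PoolKernel portable) {
  return (depth % 4 == 0) ? optimized : portable;
}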

View File

@@ -27,11 +27,15 @@ constexpr int kInputTensor = 0;
constexpr int kOutputTensor = 0;
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
MicroContext* micro_context = GetMicroContext(context);
TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
TfLiteTensor* input =
micro_context->AllocateTempInputTensor(node, kInputTensor);
TF_LITE_ENSURE(context, input != nullptr);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kOutputTensor);
TF_LITE_ENSURE(context, output != nullptr);
TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteFloat32);
TF_LITE_ENSURE_TYPES_EQ(context, output->type, input->type);
@@ -40,6 +44,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
for (int i = 0; i < output->dims->size; ++i) {
TF_LITE_ENSURE_EQ(context, output->dims->data[i], input->dims->data[i]);
}
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}
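This hunk and the ones that follow all apply the same refactor: tensors that were previously fetched with GetInput/GetOutput are now allocated as temporaries from the MicroContext and explicitly released before Prepare returns. Below is a schematic of the pattern, built from the MicroContext calls shown in the diffs; the wrapper function name is illustrative.

#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/micro/micro_context.h"

// Every temp tensor taken from the MicroContext must be deallocated before
// returning, otherwise the kernel runner's ValidateTempBufferDeallocated()
// check (see the kernel_runner diff further below) fails in tests.
TfLiteStatus CheckInOutTypesMatch(TfLiteContext* context, TfLiteNode* node) {
  tflite::MicroContext* micro_context = tflite::GetMicroContext(context);
  TfLiteTensor* input = micro_context->AllocateTempInputTensor(node, 0);
  TF_LITE_ENSURE(context, input != nullptr);
  TfLiteTensor* output = micro_context->AllocateTempOutputTensor(node, 0);
  TF_LITE_ENSURE(context, output != nullptr);
  TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type);
  micro_context->DeallocateTempTfLiteTensor(input);
  micro_context->DeallocateTempTfLiteTensor(output);
  return kTfLiteOk;
}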

View File

@@ -84,22 +84,31 @@ TfLiteStatus VerifyTensorDim(TfLiteContext* context, const TfLiteTensor* input,
}
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
MicroContext* micro_context = GetMicroContext(context);
TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
const TfLiteTensor* input;
TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kInputTensor, &input));
const TfLiteTensor* axis;
TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kAxisTensor, &axis));
TfLiteTensor* output;
TF_LITE_ENSURE_OK(context,
GetOutputSafe(context, node, kOutputTensor, &output));
TfLiteTensor* input =
micro_context->AllocateTempInputTensor(node, kInputTensor);
TF_LITE_ENSURE(context, input != nullptr);
TfLiteTensor* axis =
micro_context->AllocateTempInputTensor(node, kAxisTensor);
TF_LITE_ENSURE(context, axis != nullptr);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kOutputTensor);
TF_LITE_ENSURE(context, output != nullptr);
output->type = input->type;
if (IsDynamicTensor(axis)) {
TF_LITE_KERNEL_LOG(context,
"DynamicTensor is not yet supported by Expand_Dims.");
return kTfLiteError;
}
return VerifyTensorDim(context, input, axis, output);
TF_LITE_ENSURE_OK(context, VerifyTensorDim(context, input, axis, output));
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(axis);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}
template <typename T>

View File

@@ -65,14 +65,18 @@ constexpr int kValueTensor = 1;
constexpr int kOutputTensor = 0;
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
MicroContext* micro_context = GetMicroContext(context);
// Ensure inputs and outputs exist.
const TfLiteTensor* dims;
TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kDimsTensor, &dims));
const TfLiteTensor* value;
TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kValueTensor, &value));
TfLiteTensor* output;
TF_LITE_ENSURE_OK(context,
GetOutputSafe(context, node, kOutputTensor, &output));
TfLiteTensor* dims =
micro_context->AllocateTempInputTensor(node, kDimsTensor);
TF_LITE_ENSURE(context, dims != nullptr);
TfLiteTensor* value =
micro_context->AllocateTempInputTensor(node, kValueTensor);
TF_LITE_ENSURE(context, value != nullptr);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kOutputTensor);
TF_LITE_ENSURE(context, output != nullptr);
// The value tensor must be a scalar.
TF_LITE_ENSURE_EQ(context, NumDimensions(value), 0);
@@ -90,6 +94,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
TF_LITE_ENSURE_OK(context, EnsureEq(context, output->dims, dims));
}
micro_context->DeallocateTempTfLiteTensor(dims);
micro_context->DeallocateTempTfLiteTensor(value);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}

View File

@@ -31,22 +31,28 @@ constexpr int kInputTensor2 = 1;
constexpr int kOutputTensor = 0;
TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) {
MicroContext* micro_context = GetMicroContext(context);
TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
const TfLiteTensor* input1;
TF_LITE_ENSURE_OK(context,
GetInputSafe(context, node, kInputTensor1, &input1));
const TfLiteTensor* input2;
TF_LITE_ENSURE_OK(context,
GetInputSafe(context, node, kInputTensor2, &input2));
TfLiteTensor* output;
TF_LITE_ENSURE_OK(context,
GetOutputSafe(context, node, kOutputTensor, &output));
TfLiteTensor* input1 =
micro_context->AllocateTempInputTensor(node, kInputTensor1);
TF_LITE_ENSURE(context, input1 != nullptr);
TfLiteTensor* input2 =
micro_context->AllocateTempInputTensor(node, kInputTensor2);
TF_LITE_ENSURE(context, input2 != nullptr);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kOutputTensor);
TF_LITE_ENSURE(context, output != nullptr);
TF_LITE_ENSURE_TYPES_EQ(context, input1->type, input2->type);
TF_LITE_ENSURE_TYPES_EQ(context, input1->type, output->type);
micro_context->DeallocateTempTfLiteTensor(input1);
micro_context->DeallocateTempTfLiteTensor(input2);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}

View File

@@ -36,22 +36,28 @@ constexpr int kOutputTensor = 0;
// OLD-TODO(b/117912880): Support quantization.
TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) {
MicroContext* micro_context = GetMicroContext(context);
TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
const TfLiteTensor* input1;
TF_LITE_ENSURE_OK(context,
GetInputSafe(context, node, kInputTensor1, &input1));
const TfLiteTensor* input2;
TF_LITE_ENSURE_OK(context,
GetInputSafe(context, node, kInputTensor2, &input2));
TfLiteTensor* output;
TF_LITE_ENSURE_OK(context,
GetOutputSafe(context, node, kOutputTensor, &output));
TfLiteTensor* input1 =
micro_context->AllocateTempInputTensor(node, kInputTensor1);
TF_LITE_ENSURE(context, input1 != nullptr);
TfLiteTensor* input2 =
micro_context->AllocateTempInputTensor(node, kInputTensor2);
TF_LITE_ENSURE(context, input2 != nullptr);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kOutputTensor);
TF_LITE_ENSURE(context, output != nullptr);
TF_LITE_ENSURE_TYPES_EQ(context, input1->type, input2->type);
TF_LITE_ENSURE_TYPES_EQ(context, input1->type, output->type);
micro_context->DeallocateTempTfLiteTensor(input1);
micro_context->DeallocateTempTfLiteTensor(input2);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}

View File

@@ -35,6 +35,8 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) {
}
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
MicroContext* micro_context = GetMicroContext(context);
TFLITE_DCHECK(node->user_data != nullptr);
TFLITE_DCHECK(node->builtin_data != nullptr);
@@ -42,23 +44,33 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
const auto params =
static_cast<const TfLiteFullyConnectedParams*>(node->builtin_data);
const TfLiteTensor* input =
GetInput(context, node, kFullyConnectedInputTensor);
TfLiteTensor* input =
micro_context->AllocateTempInputTensor(node, kFullyConnectedInputTensor);
TF_LITE_ENSURE(context, input != nullptr);
const TfLiteTensor* filter =
GetInput(context, node, kFullyConnectedWeightsTensor);
TfLiteTensor* filter = micro_context->AllocateTempInputTensor(
node, kFullyConnectedWeightsTensor);
TF_LITE_ENSURE(context, filter != nullptr);
const TfLiteTensor* bias =
GetOptionalInputTensor(context, node, kFullyConnectedBiasTensor);
TfLiteTensor* output = GetOutput(context, node, kFullyConnectedOutputTensor);
TfLiteTensor* bias =
micro_context->AllocateTempInputTensor(node, kFullyConnectedBiasTensor);
TfLiteTensor* output = micro_context->AllocateTempOutputTensor(
node, kFullyConnectedOutputTensor);
TF_LITE_ENSURE(context, output != nullptr);
TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type);
TF_LITE_ENSURE_MSG(context, input->type == filter->type,
"Hybrid models are not supported on TFLite Micro.");
return CalculateOpDataFullyConnected(context, params->activation, input->type,
input, filter, bias, output, data);
TF_LITE_ENSURE_OK(context, CalculateOpDataFullyConnected(
context, params->activation, input->type,
input, filter, bias, output, data));
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(filter);
if (bias != nullptr) {
micro_context->DeallocateTempTfLiteTensor(bias);
}
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {

View File

@@ -97,19 +97,23 @@ TfLiteStatus Gather(const TfLiteGatherParams* params,
}
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
MicroContext* micro_context = GetMicroContext(context);
TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
const auto* params =
reinterpret_cast<const TfLiteGatherParams*>(node->builtin_data);
const TfLiteTensor* input;
TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kInputTensor, &input));
const TfLiteTensor* coords;
TF_LITE_ENSURE_OK(context,
GetInputSafe(context, node, kInputPositions, &coords));
TfLiteTensor* output;
TF_LITE_ENSURE_OK(context,
GetOutputSafe(context, node, kOutputTensor, &output));
TfLiteTensor* input =
micro_context->AllocateTempInputTensor(node, kInputTensor);
TF_LITE_ENSURE(context, input != nullptr);
TfLiteTensor* coords =
micro_context->AllocateTempInputTensor(node, kInputPositions);
TF_LITE_ENSURE(context, coords != nullptr);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kOutputTensor);
TF_LITE_ENSURE(context, output != nullptr);
switch (coords->type) {
case kTfLiteInt32:
break;
@@ -176,6 +180,11 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
for (int i = axis + 1; i < input->dims->size; ++i) {
output_shape->data[output_index++] = input->dims->data[i];
}
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(coords);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}

View File

@@ -28,16 +28,19 @@ constexpr int kOutputTensor = 0;
constexpr int MAX_INDICES_ND = 5;
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
MicroContext* micro_context = GetMicroContext(context);
TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
const TfLiteTensor* params;
TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kParams, &params));
const TfLiteTensor* indices;
TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kIndices, &indices));
TfLiteTensor* output;
TF_LITE_ENSURE_OK(context,
GetOutputSafe(context, node, kOutputTensor, &output));
TfLiteTensor* params = micro_context->AllocateTempInputTensor(node, kParams);
TF_LITE_ENSURE(context, params != nullptr);
TfLiteTensor* indices =
micro_context->AllocateTempInputTensor(node, kIndices);
TF_LITE_ENSURE(context, indices != nullptr);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kOutputTensor);
TF_LITE_ENSURE(context, output != nullptr);
switch (params->type) {
case kTfLiteFloat32:
@@ -98,6 +101,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
output_shape->data[output_index++] = params->dims->data[i];
}
output_shape->size = output_index;
micro_context->DeallocateTempTfLiteTensor(params);
micro_context->DeallocateTempTfLiteTensor(indices);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}

View File

@@ -32,13 +32,17 @@ const int kHardSwishInputTensor = 0;
const int kHardSwishOutputTensor = 0;
TfLiteStatus HardSwishPrepare(TfLiteContext* context, TfLiteNode* node) {
MicroContext* micro_context = GetMicroContext(context);
TFLITE_DCHECK(node->user_data != nullptr);
TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
const TfLiteTensor* input = GetInput(context, node, kHardSwishInputTensor);
TfLiteTensor* input =
micro_context->AllocateTempInputTensor(node, kHardSwishInputTensor);
TF_LITE_ENSURE(context, input != nullptr);
TfLiteTensor* output = GetOutput(context, node, kHardSwishOutputTensor);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kHardSwishOutputTensor);
TF_LITE_ENSURE(context, output != nullptr);
if (input->type == kTfLiteInt8) {
@@ -73,6 +77,9 @@ TfLiteStatus HardSwishPrepare(TfLiteContext* context, TfLiteNode* node) {
&params->reluish_multiplier_fixedpoint_int16);
}
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}

View File

@@ -23,6 +23,7 @@ limitations under the License.
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"
#include "tensorflow/lite/micro/memory_helpers.h"
#include "tensorflow/lite/micro/micro_context.h"
#include "tensorflow/lite/micro/micro_graph.h"
#include "tensorflow/lite/schema/schema_generated.h"
@@ -50,36 +51,33 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
TF_LITE_ENSURE(context, node->inputs->size > 0);
// The first input is the condition.
const TfLiteTensor* cond;
TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 0, &cond));
tflite::MicroContext* micro_context = tflite::GetMicroContext(context);
TfLiteTensor* cond = micro_context->AllocateTempInputTensor(node, 0);
TF_LITE_ENSURE(context, cond != nullptr);
TF_LITE_ENSURE_EQ(context, cond->type, kTfLiteBool);
TF_LITE_ENSURE_EQ(context, NumElements(cond), 1);
micro_context->DeallocateTempTfLiteTensor(cond);
// The first input of the node is the condition. The rest of inputs are
// passed to the branch subgraphs. Therefore, the number of subgraph inputs
// will be the number of node inputs - 1.
size_t num_inputs = node->inputs->size - 1;
size_t num_outputs = node->outputs->size;
// Casting to TfliteIntArray is required since we are re-using
// GetExecutionPlan from TfLiteContext. On TFLM this method returns a
// MicroGraph.
// TODO(b/188226309): Design a cleaner way to get a graph from kernel context.
MicroGraph* graph_info;
context->GetExecutionPlan(context,
reinterpret_cast<TfLiteIntArray**>(&graph_info));
MicroGraph& graph_info = micro_context->graph();
TF_LITE_ENSURE(context,
op_data->then_subgraph_index < graph_info->NumSubgraphs());
op_data->then_subgraph_index < graph_info.NumSubgraphs());
TF_LITE_ENSURE(context,
op_data->else_subgraph_index < graph_info->NumSubgraphs());
op_data->else_subgraph_index < graph_info.NumSubgraphs());
TF_LITE_ENSURE_EQ(
context, num_inputs,
graph_info->NumSubgraphInputs(op_data->then_subgraph_index));
TF_LITE_ENSURE_EQ(context, num_inputs,
graph_info.NumSubgraphInputs(op_data->then_subgraph_index));
TF_LITE_ENSURE_EQ(
context, num_outputs,
graph_info->NumSubgraphOutputs(op_data->then_subgraph_index));
graph_info.NumSubgraphOutputs(op_data->then_subgraph_index));
return kTfLiteOk;
}
@@ -87,66 +85,30 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
const OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
const TfLiteTensor* cond;
TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 0, &cond));
tflite::MicroContext* micro_context = tflite::GetMicroContext(context);
TfLiteTensor* cond = micro_context->AllocateTempInputTensor(node, 0);
TF_LITE_ENSURE(context, cond != nullptr);
bool cond_value = cond->data.b[0];
micro_context->DeallocateTempTfLiteTensor(cond);
// Casting to TfliteIntArray is required since we are re-using
// GetExecutionPlan from TfLiteContext. On TFLM this method returns a
// MicroGraph.
// TODO(b/188226309): Design a cleaner way to get a graph from kernel context.
MicroGraph* graph_info;
context->GetExecutionPlan(context,
reinterpret_cast<TfLiteIntArray**>(&graph_info));
// Currently we copy the input / output between the subgraphs. This isn't
// optimized yet.
MicroGraph* graph_info = &micro_context->graph();
// Currently we copy the input / output between the subgraphs.
int active_branch_subgraph_index =
cond_value ? op_data->then_subgraph_index : op_data->else_subgraph_index;
for (size_t i = 0;
i < graph_info->NumSubgraphInputs(active_branch_subgraph_index); ++i) {
const TfLiteEvalTensor* input =
tflite::micro::GetEvalInput(context, node, i + 1);
TfLiteEvalTensor* subgraph_input =
graph_info->GetSubgraphInput(active_branch_subgraph_index, i);
// These checks must occur in Eval since TfLiteEvalTensors are not available
// during Prepare.
size_t input_bytes;
size_t subgraph_input_bytes;
TF_LITE_ENSURE_OK(context, TfLiteEvalTensorByteLength(input, &input_bytes));
TF_LITE_ENSURE_OK(context, TfLiteEvalTensorByteLength(
subgraph_input, &subgraph_input_bytes));
TF_LITE_ENSURE_TYPES_EQ(context, input->type, subgraph_input->type);
TF_LITE_ENSURE_EQ(context, input_bytes, subgraph_input_bytes);
memcpy(subgraph_input->data.raw, input->data.raw, input_bytes);
}
TF_LITE_ENSURE_OK(context,
tflite::micro::CopyOpInputsToSubgraphInputs(
context, node, graph_info, active_branch_subgraph_index,
/*first_tensor_idx=*/1));
TF_LITE_ENSURE_OK(context,
graph_info->InvokeSubgraph(active_branch_subgraph_index));
for (size_t i = 0;
i < graph_info->NumSubgraphOutputs(active_branch_subgraph_index); ++i) {
const TfLiteEvalTensor* output =
tflite::micro::GetEvalOutput(context, node, i);
TF_LITE_ENSURE_OK(
context, tflite::micro::CopySubgraphOutputsToOpOutputs(
context, node, graph_info, active_branch_subgraph_index));
TfLiteEvalTensor* subgraph_output =
graph_info->GetSubgraphOutput(active_branch_subgraph_index, i);
// These checks must occur in Eval since TfLiteEvalTensors are not available
// during Prepare.
size_t output_bytes;
size_t subgraph_output_bytes;
TF_LITE_ENSURE_OK(context,
TfLiteEvalTensorByteLength(output, &output_bytes));
TF_LITE_ENSURE_OK(context, TfLiteEvalTensorByteLength(
subgraph_output, &subgraph_output_bytes));
TF_LITE_ENSURE_TYPES_EQ(context, output->type, subgraph_output->type);
TF_LITE_ENSURE_EQ(context, output_bytes, subgraph_output_bytes);
memcpy(output->data.raw, subgraph_output->data.raw, output_bytes);
}
return kTfLiteOk;
}
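The two helpers introduced here, CopyOpInputsToSubgraphInputs and CopySubgraphOutputsToOpOutputs, bundle up the checks-plus-memcpy that the removed loops performed per tensor. A condensed sketch of that per-tensor copy, using the same TFLM calls visible in the removed code:

#include <cstring>

#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/micro/memory_helpers.h"

// Copies one eval tensor into another after checking that the destination has
// the same type and byte length -- the invariant the removed loops enforced
// before each memcpy.
TfLiteStatus CopyEvalTensor(TfLiteContext* context,
                            const TfLiteEvalTensor* src,
                            TfLiteEvalTensor* dst) {
  size_t src_bytes = 0;
  size_t dst_bytes = 0;
  TF_LITE_ENSURE_OK(context,
                    tflite::TfLiteEvalTensorByteLength(src, &src_bytes));
  TF_LITE_ENSURE_OK(context,
                    tflite::TfLiteEvalTensorByteLength(dst, &dst_bytes));
  TF_LITE_ENSURE_TYPES_EQ(context, src->type, dst->type);
  TF_LITE_ENSURE_EQ(context, src_bytes, dst_bytes);
  std::memcpy(dst->data.raw, src->data.raw, src_bytes);
  return kTfLiteOk;
}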

View File

@@ -24,7 +24,6 @@ namespace tflite {
namespace micro {
// TODO(b/161841696): Consider moving away from global arena buffers:
constexpr int KernelRunner::kNumScratchBuffers_;
constexpr int KernelRunner::kKernelRunnerBufferSize_;
uint8_t KernelRunner::kKernelRunnerBuffer_[];
@@ -32,22 +31,23 @@ KernelRunner::KernelRunner(const TfLiteRegistration& registration,
TfLiteTensor* tensors, int tensors_size,
TfLiteIntArray* inputs, TfLiteIntArray* outputs,
void* builtin_data)
: allocator_(SimpleMemoryAllocator::Create(GetMicroErrorReporter(),
: registration_(registration),
allocator_(SimpleMemoryAllocator::Create(GetMicroErrorReporter(),
kKernelRunnerBuffer_,
kKernelRunnerBufferSize_)),
registration_(registration),
tensors_(tensors),
mock_micro_graph_(allocator_) {
mock_micro_graph_(allocator_),
fake_micro_context_(tensors, allocator_, &mock_micro_graph_) {
// Prepare TfLiteContext:
context_.impl_ = static_cast<void*>(this);
context_.ReportError = ReportOpError;
context_.impl_ = static_cast<void*>(&fake_micro_context_);
context_.ReportError = MicroContextReportOpError;
context_.recommended_num_threads = 1;
context_.GetTensor = GetTensor;
context_.GetEvalTensor = GetEvalTensor;
context_.AllocatePersistentBuffer = AllocatePersistentBuffer;
context_.RequestScratchBufferInArena = RequestScratchBufferInArena;
context_.GetScratchBuffer = GetScratchBuffer;
context_.GetExecutionPlan = GetGraph;
context_.GetTensor = MicroContextGetTensor;
context_.GetEvalTensor = MicroContextGetEvalTensor;
context_.AllocatePersistentBuffer = MicroContextAllocatePersistentBuffer;
context_.RequestScratchBufferInArena =
MicroContextRequestScratchBufferInArena;
context_.GetScratchBuffer = MicroContextGetScratchBuffer;
context_.recommended_num_threads = 0;
// Prepare TfLiteNode:
@@ -56,14 +56,24 @@ KernelRunner::KernelRunner(const TfLiteRegistration& registration,
node_.builtin_data = builtin_data;
}
bool KernelRunner::ValidateTempBufferDeallocated() {
return fake_micro_context_.IsAllTempTfLiteTensorDeallocated();
}
TfLiteStatus KernelRunner::InitAndPrepare(const char* init_data,
size_t length) {
if (registration_.init) {
node_.user_data = registration_.init(&context_, init_data, length);
}
TF_LITE_ENSURE(&context_, ValidateTempBufferDeallocated());
if (registration_.prepare) {
TF_LITE_ENSURE_STATUS(registration_.prepare(&context_, &node_));
}
TF_LITE_ENSURE(&context_, ValidateTempBufferDeallocated());
return kTfLiteOk;
}
@@ -72,101 +82,11 @@ TfLiteStatus KernelRunner::Invoke() {
MicroPrintf("TfLiteRegistration missing invoke function pointer!");
return kTfLiteError;
}
return registration_.invoke(&context_, &node_);
}
TfLiteTensor* KernelRunner::GetTensor(const struct TfLiteContext* context,
int tensor_index) {
TFLITE_DCHECK(context != nullptr);
KernelRunner* runner = reinterpret_cast<KernelRunner*>(context->impl_);
TFLITE_DCHECK(runner != nullptr);
TF_LITE_ENSURE_STATUS(registration_.invoke(&context_, &node_));
return &runner->tensors_[tensor_index];
}
TF_LITE_ENSURE(&context_, ValidateTempBufferDeallocated());
TfLiteEvalTensor* KernelRunner::GetEvalTensor(
const struct TfLiteContext* context, int tensor_index) {
TFLITE_DCHECK(context != nullptr);
KernelRunner* runner = reinterpret_cast<KernelRunner*>(context->impl_);
TFLITE_DCHECK(runner != nullptr);
TfLiteEvalTensor* eval_tensor =
reinterpret_cast<TfLiteEvalTensor*>(runner->allocator_->AllocateTemp(
sizeof(TfLiteEvalTensor), alignof(TfLiteEvalTensor)));
TFLITE_DCHECK(eval_tensor != nullptr);
// In unit tests, the TfLiteTensor pointer contains the source of truth for
// buffers and values:
eval_tensor->data = runner->tensors_[tensor_index].data;
eval_tensor->dims = runner->tensors_[tensor_index].dims;
eval_tensor->type = runner->tensors_[tensor_index].type;
return eval_tensor;
}
void* KernelRunner::AllocatePersistentBuffer(TfLiteContext* context,
size_t bytes) {
TFLITE_DCHECK(context != nullptr);
KernelRunner* runner = reinterpret_cast<KernelRunner*>(context->impl_);
TFLITE_DCHECK(runner != nullptr);
return runner->allocator_->AllocateFromTail(bytes,
MicroArenaBufferAlignment());
}
TfLiteStatus KernelRunner::RequestScratchBufferInArena(TfLiteContext* context,
size_t bytes,
int* buffer_index) {
TFLITE_DCHECK(context != nullptr);
TFLITE_DCHECK(buffer_index != nullptr);
KernelRunner* runner = reinterpret_cast<KernelRunner*>(context->impl_);
TFLITE_DCHECK(runner != nullptr);
if (runner->scratch_buffer_count_ == kNumScratchBuffers_) {
MicroPrintf("Exceeded the maximum number of scratch tensors allowed (%d).",
kNumScratchBuffers_);
return kTfLiteError;
}
// For tests, we allocate scratch buffers from the tail and keep them around
// for the lifetime of the model. This means that the arena size in the tests will
// be more than what we would have if the scratch buffers could share memory.
runner->scratch_buffers_[runner->scratch_buffer_count_] =
runner->allocator_->AllocateFromTail(bytes, MicroArenaBufferAlignment());
TFLITE_DCHECK(runner->scratch_buffers_[runner->scratch_buffer_count_] !=
nullptr);
*buffer_index = runner->scratch_buffer_count_++;
return kTfLiteOk;
}
void* KernelRunner::GetScratchBuffer(TfLiteContext* context, int buffer_index) {
TFLITE_DCHECK(context != nullptr);
KernelRunner* runner = reinterpret_cast<KernelRunner*>(context->impl_);
TFLITE_DCHECK(runner != nullptr);
TFLITE_DCHECK(runner->scratch_buffer_count_ <= kNumScratchBuffers_);
if (buffer_index >= runner->scratch_buffer_count_) {
return nullptr;
}
return runner->scratch_buffers_[buffer_index];
}
void KernelRunner::ReportOpError(struct TfLiteContext* context,
const char* format, ...) {
va_list args;
va_start(args, format);
GetMicroErrorReporter()->Report(format, args);
va_end(args);
}
TfLiteStatus KernelRunner::GetGraph(struct TfLiteContext* context,
TfLiteIntArray** args) {
TFLITE_DCHECK(context != nullptr);
KernelRunner* runner = reinterpret_cast<KernelRunner*>(context->impl_);
TFLITE_DCHECK(runner != nullptr);
// TODO(b/188226309): Design a cleaner way to get a graph from kernel context.
*args = reinterpret_cast<TfLiteIntArray*>(runner->GetMockGraph());
return kTfLiteOk;
}

View File

@@ -18,6 +18,7 @@ limitations under the License.
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/micro/fake_micro_context.h"
#include "tensorflow/lite/micro/mock_micro_graph.h"
#include "tensorflow/lite/micro/simple_memory_allocator.h"
@@ -50,40 +51,22 @@ class KernelRunner {
// to stub out MicroGraph methods and track invocations on each subgraph.
MockMicroGraph* GetMockGraph() { return &mock_micro_graph_; }
protected:
static TfLiteTensor* GetTensor(const struct TfLiteContext* context,
int tensor_index);
static TfLiteEvalTensor* GetEvalTensor(const struct TfLiteContext* context,
int tensor_index);
static void* AllocatePersistentBuffer(TfLiteContext* context, size_t bytes);
static TfLiteStatus RequestScratchBufferInArena(TfLiteContext* context,
size_t bytes,
int* buffer_index);
static void* GetScratchBuffer(TfLiteContext* context, int buffer_index);
static void ReportOpError(struct TfLiteContext* context, const char* format,
...);
// This method matches GetExecutionPlan from TfLiteContext since TFLM reuses
// this method to get the MicroGraph from an operator context.
// TODO(b/188226309): Design a cleaner way to get a graph from kernel context.
static TfLiteStatus GetGraph(struct TfLiteContext* context,
TfLiteIntArray** args);
// Returns true if all temp buffers in tests are deallocated.
// TODO(b/209453859): move this function to private after deallocation checks
// are enabled for all kernel tests.
bool ValidateTempBufferDeallocated();
private:
static constexpr int kNumScratchBuffers_ = 12;
static constexpr int kKernelRunnerBufferSize_ = 10000;
static uint8_t kKernelRunnerBuffer_[kKernelRunnerBufferSize_];
SimpleMemoryAllocator* allocator_ = nullptr;
const TfLiteRegistration& registration_;
TfLiteTensor* tensors_ = nullptr;
MockMicroGraph mock_micro_graph_;
TfLiteContext context_ = {};
TfLiteNode node_ = {};
const TfLiteRegistration& registration_;
int scratch_buffer_count_ = 0;
uint8_t* scratch_buffers_[kNumScratchBuffers_];
SimpleMemoryAllocator* allocator_;
MockMicroGraph mock_micro_graph_;
FakeMicroContext fake_micro_context_;
};
} // namespace micro

View File

@@ -16,6 +16,7 @@ limitations under the License.
#include "tensorflow/lite/micro/kernels/kernel_util.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/micro/memory_helpers.h"
namespace tflite {
namespace micro {
@@ -119,13 +120,83 @@ TfLiteStatus CreateWritableTensorDimsWithCopy(TfLiteContext* context,
return kTfLiteOk;
}
// Returns a blob of payload data. The payload is subjected to interpretation by
// the OP. This is the recommended API for an OP to get an external context. OP
// should use this instead of directly calling GetExternalContext function in
// context.
void* GetExternalContext(TfLiteContext* context) {
return reinterpret_cast<void*>(
context->GetExternalContext(context, kTfLiteMaxExternalContexts));
// Verify that both tensors have the same type and size, then return the size
// of both tensors in bytes if they are the same, or -1 if they are different.
size_t ValidateAndGetTensorSizes(const TfLiteEvalTensor* tensor1,
const TfLiteEvalTensor* tensor2) {
TFLITE_DCHECK(tensor1->type == tensor2->type);
size_t tensor1_size = 0;
size_t tensor2_size = 0;
TfLiteEvalTensorByteLength(tensor1, &tensor1_size);
TfLiteEvalTensorByteLength(tensor2, &tensor2_size);
return (tensor1_size == tensor2_size) ? tensor1_size : -1;
}
TfLiteStatus CopyOpInputsToOpOutputs(TfLiteContext* context, TfLiteNode* node) {
TF_LITE_ENSURE(context, node->inputs->size == node->outputs->size);
for (int i = 0; i < node->inputs->size; i++) {
const TfLiteEvalTensor* input =
tflite::micro::GetEvalInput(context, node, i);
TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, i);
int bytes = ValidateAndGetTensorSizes(input, output);
TF_LITE_ENSURE(context, bytes >= 0);
memcpy(output->data.raw, input->data.raw, bytes);
}
return kTfLiteOk;
}
TfLiteStatus CopyOpInputsToSubgraphInputs(TfLiteContext* context,
TfLiteNode* node,
MicroGraph* graph_info,
int subgraph_idx,
int first_tensor_idx) {
TF_LITE_ENSURE(context,
static_cast<size_t>(node->inputs->size - first_tensor_idx) ==
graph_info->NumSubgraphInputs(subgraph_idx));
for (int i = 0; i < node->inputs->size - first_tensor_idx; i++) {
const TfLiteEvalTensor* input =
tflite::micro::GetEvalInput(context, node, i + first_tensor_idx);
TfLiteEvalTensor* subgraph_input =
graph_info->GetSubgraphInput(subgraph_idx, i);
int bytes = ValidateAndGetTensorSizes(input, subgraph_input);
TF_LITE_ENSURE(context, bytes >= 0);
memcpy(subgraph_input->data.raw, input->data.raw, bytes);
}
return kTfLiteOk;
}
TfLiteStatus CopyOpOutputsToSubgraphInputs(TfLiteContext* context,
TfLiteNode* node,
MicroGraph* graph_info,
int subgraph_idx) {
TF_LITE_ENSURE(context, static_cast<size_t>(node->outputs->size) ==
graph_info->NumSubgraphInputs(subgraph_idx));
for (int i = 0; i < node->outputs->size; i++) {
TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, i);
TfLiteEvalTensor* subgraph_input =
graph_info->GetSubgraphInput(subgraph_idx, i);
int bytes = ValidateAndGetTensorSizes(output, subgraph_input);
TF_LITE_ENSURE(context, bytes >= 0);
memcpy(subgraph_input->data.raw, output->data.raw, bytes);
}
return kTfLiteOk;
}
TfLiteStatus CopySubgraphOutputsToOpOutputs(TfLiteContext* context,
TfLiteNode* node,
MicroGraph* graph_info,
int subgraph_idx) {
TF_LITE_ENSURE(context, static_cast<size_t>(node->outputs->size) ==
graph_info->NumSubgraphOutputs(subgraph_idx));
for (int i = 0; i < node->outputs->size; i++) {
TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, i);
TfLiteEvalTensor* subgraph_output =
graph_info->GetSubgraphOutput(subgraph_idx, i);
int bytes = ValidateAndGetTensorSizes(output, subgraph_output);
TF_LITE_ENSURE(context, bytes >= 0);
memcpy(output->data.raw, subgraph_output->data.raw, bytes);
}
return kTfLiteOk;
}
} // namespace micro

View File

@@ -22,6 +22,7 @@ limitations under the License.
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/types.h"
#include "tensorflow/lite/micro/micro_context.h"
namespace tflite {
namespace micro {
@@ -69,23 +70,33 @@ TfLiteStatus CreateWritableTensorDimsWithCopy(TfLiteContext* context,
TfLiteTensor* tensor,
TfLiteEvalTensor* eval_tensor);
// Returns a blob of payload data. The payload is subjected to interpretation by
// the OP. This is the recommended API for an OP to get an external context. OP
// should use this instead of directly calling GetExternalContext function in
// context. Example usage:
//
// An application can set an external context through interpreter as below
// interpreter->SetMicroExternalContext(pointer_to_your_payload);
//
// Inside an OP that needs this payload, it gets the payload pointer by:
// Prepare(TfLiteContext* context) {
// ...
// payload_ptr =
// reinterpret_cast<your_data_type>(GetMicroExternalContext(context))
// ...
// }
//
void* GetMicroExternalContext(TfLiteContext* context);
// Copy all op input tensors to op output tensors. Requires all op input tensor
// shapes and types to be identical to op output tensor shapes and types.
TfLiteStatus CopyOpInputsToOpOutputs(TfLiteContext* context, TfLiteNode* node);
// Copy all op input tensors to subgraph input tensors. Requires all op input
// tensor shapes and types to be identical to subgraph input tensor shapes and
// types.
TfLiteStatus CopyOpInputsToSubgraphInputs(TfLiteContext* context,
TfLiteNode* node,
MicroGraph* graph_info,
int subgraph_idx,
int first_tensor_idx);
// Copy all op output tensors to subgraph input tensors. Requires all op output
// tensor shapes and types to be identical to subgraph input tensor shapes and
// types.
TfLiteStatus CopyOpOutputsToSubgraphInputs(TfLiteContext* context,
TfLiteNode* node,
MicroGraph* graph_info,
int subgraph_idx);
// Copy all subgraph output tensors to op outputs. Requires all subgraph output
// tensor shapes and types to be identical to op output tensor shapes and types.
TfLiteStatus CopySubgraphOutputsToOpOutputs(TfLiteContext* context,
TfLiteNode* node,
MicroGraph* graph_info,
int subgraph_idx);
} // namespace micro
} // namespace tflite
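These helpers exist so that control-flow kernels (CALL_ONCE, IF, WHILE) can shuttle data between op tensors and subgraph tensors. A rough sketch of the intended call pattern is below; it assumes MicroGraph::InvokeSubgraph() as the subgraph entry point and is only an illustration (the real WHILE kernel also loops over a condition subgraph):
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"
#include "tensorflow/lite/micro/micro_graph.h"
TfLiteStatus EvalWhileBodyOnce(TfLiteContext* context, TfLiteNode* node,
                               tflite::MicroGraph* graph,
                               int body_subgraph_index) {
  // Feed the op inputs into the body subgraph, run it once, then copy the
  // subgraph outputs back into the op outputs.
  TF_LITE_ENSURE_OK(context, tflite::micro::CopyOpInputsToSubgraphInputs(
                                 context, node, graph, body_subgraph_index,
                                 /*first_tensor_idx=*/0));
  TF_LITE_ENSURE_OK(context, graph->InvokeSubgraph(body_subgraph_index));
  TF_LITE_ENSURE_OK(context, tflite::micro::CopySubgraphOutputsToOpOutputs(
                                 context, node, graph, body_subgraph_index));
  return kTfLiteOk;
}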

View File

@@ -36,15 +36,18 @@ constexpr int kTensorShapeRank = 4;
enum { kBatchRank = 0, kHeightRank, kWidthRank, kChannelRank };
TfLiteStatus L2Prepare(TfLiteContext* context, TfLiteNode* node) {
MicroContext* micro_context = GetMicroContext(context);
auto* params = static_cast<TfLitePoolParams*>(node->builtin_data);
TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
TfLiteTensor* output;
TF_LITE_ENSURE_OK(context,
GetOutputSafe(context, node, kOutputTensor, &output));
const TfLiteTensor* input;
TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kInputTensor, &input));
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kOutputTensor);
TF_LITE_ENSURE(context, output != nullptr);
TfLiteTensor* input =
micro_context->AllocateTempInputTensor(node, kInputTensor);
TF_LITE_ENSURE(context, input != nullptr);
TF_LITE_ENSURE_EQ(context, NumDimensions(input), kTensorShapeRank);
TF_LITE_ENSURE_EQ(context, NumDimensions(output), kTensorShapeRank);
TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type);
@@ -82,6 +85,9 @@ TfLiteStatus L2Prepare(TfLiteContext* context, TfLiteNode* node) {
output->dims->data[kWidthRank] = out_width;
output->dims->data[kChannelRank] = channels_out;
micro_context->DeallocateTempTfLiteTensor(output);
micro_context->DeallocateTempTfLiteTensor(input);
return kTfLiteOk;
}
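The same migration is applied to every Prepare function below: temporary TfLiteTensor views are fetched from MicroContext and must be released before returning. A condensed sketch of that pattern, with placeholder tensor indices rather than any specific kernel:
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/micro/micro_context.h"
TfLiteStatus ExamplePrepare(TfLiteContext* context, TfLiteNode* node) {
  tflite::MicroContext* micro_context = tflite::GetMicroContext(context);
  TfLiteTensor* input =
      micro_context->AllocateTempInputTensor(node, /*index=*/0);
  TF_LITE_ENSURE(context, input != nullptr);
  TfLiteTensor* output =
      micro_context->AllocateTempOutputTensor(node, /*index=*/0);
  TF_LITE_ENSURE(context, output != nullptr);
  // ... shape/type checks and op-data setup use the temp views here ...
  // The temp TfLiteTensor structs only exist for Prepare; release them so the
  // allocator can reuse the space. Eval works on TfLiteEvalTensor instead.
  micro_context->DeallocateTempTfLiteTensor(input);
  micro_context->DeallocateTempTfLiteTensor(output);
  return kTfLiteOk;
}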

View File

@@ -49,11 +49,14 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
TF_LITE_ENSURE(context, input != nullptr);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TF_LITE_ENSURE(context, output != nullptr);
MicroContext* micro_context = GetMicroContext(context);
TfLiteTensor* input =
micro_context->AllocateTempInputTensor(node, kInputTensor);
TF_LITE_ENSURE(context, input != nullptr);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kOutputTensor);
TF_LITE_ENSURE(context, output != nullptr);
TF_LITE_ENSURE(context, NumDimensions(input) <= 4);
TF_LITE_ENSURE(context,
@@ -69,6 +72,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
// Our implementations don't currently support activations.
TF_LITE_ENSURE_EQ(context, params->activation, kTfLiteActNone);
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}

View File

@@ -30,13 +30,16 @@ const int kOutputTensor = 0;
TfLiteStatus CalculateOpDataLeakyRelu(TfLiteContext* context,
TfLiteNode* node) {
MicroContext* micro_context = GetMicroContext(context);
TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
const TfLiteTensor* input;
TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kInputTensor, &input));
TfLiteTensor* output;
TF_LITE_ENSURE_OK(context,
GetOutputSafe(context, node, kOutputTensor, &output));
TfLiteTensor* input =
micro_context->AllocateTempInputTensor(node, kInputTensor);
TF_LITE_ENSURE(context, input != nullptr);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kOutputTensor);
TF_LITE_ENSURE(context, output != nullptr);
TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type);
if (output->type == kTfLiteInt8 || output->type == kTfLiteInt16) {
@@ -62,6 +65,9 @@ TfLiteStatus CalculateOpDataLeakyRelu(TfLiteContext* context,
data->output_shift_identity = static_cast<int32_t>(output_shift_identity);
}
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}

View File

@@ -43,13 +43,16 @@ constexpr int kInputTensor = 0;
constexpr int kOutputTensor = 0;
TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) {
MicroContext* micro_context = GetMicroContext(context);
TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
const TfLiteTensor* input;
TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kInputTensor, &input));
TfLiteTensor* output;
TF_LITE_ENSURE_OK(context,
GetOutputSafe(context, node, kOutputTensor, &output));
TfLiteTensor* input =
micro_context->AllocateTempInputTensor(node, kInputTensor);
TF_LITE_ENSURE(context, input != nullptr);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kOutputTensor);
TF_LITE_ENSURE(context, output != nullptr);
TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type);
TF_LITE_ENSURE(context, HaveSameShapes(input, output));
@@ -89,6 +92,8 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) {
data->depth = static_cast<size_t>(input_shape.Dims(trailing_dim));
}
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}

View File

@@ -32,9 +32,13 @@ const int kLogisticOutputTensor = 0;
TfLiteStatus CalculateArithmeticOpDataLogistic(TfLiteContext* context,
TfLiteNode* node,
OpDataLogistic* data) {
const TfLiteTensor* input = GetInput(context, node, kLogisticInputTensor);
MicroContext* micro_context = GetMicroContext(context);
TfLiteTensor* input =
micro_context->AllocateTempInputTensor(node, kLogisticInputTensor);
TF_LITE_ENSURE(context, input != nullptr);
TfLiteTensor* output = GetOutput(context, node, kLogisticOutputTensor);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kLogisticOutputTensor);
TF_LITE_ENSURE(context, output != nullptr);
TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type);
@@ -55,6 +59,53 @@ TfLiteStatus CalculateArithmeticOpDataLogistic(TfLiteContext* context,
data->input_range_radius =
CalculateInputRadius(kInputIntegerBits, data->input_left_shift, 31);
}
if (input->type == kTfLiteInt16) {
static constexpr int kInputIntegerBits = 3;
static constexpr int kOutputFractionalBits = 15;
// See comments in TanhPrepare about requiring zero_point==0
// and a power-of-two ("POT") scale.
TF_LITE_ENSURE_EQ(context, input->params.zero_point, 0);
TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
int input_scale_log2_rounded;
bool param_scale_pot =
CheckedLog2(input->params.scale, &input_scale_log2_rounded);
data->input_left_shift =
(15 - kInputIntegerBits) + input_scale_log2_rounded;
param_scale_pot &= (data->input_left_shift == 0);
if (param_scale_pot) {
data->input_multiplier = 0;
} else {
// Calculate multiplier to change input scale to 1/(3*4096)
// as required by the table lookup.
// In this scaling +/-2^17 represents +/-10.7
double multiplier =
static_cast<double>(input->params.scale) * 4096.0 * 3.0;
data->input_left_shift = 0;
while (multiplier <= 32767.0 / 2.0 && data->input_left_shift <= 30) {
data->input_left_shift++;
multiplier = multiplier * 2.0;
}
data->input_multiplier = static_cast<int32_t>(multiplier);
}
int output_scale_log2_rounded;
TF_LITE_ENSURE(
context, CheckedLog2(output->params.scale, &output_scale_log2_rounded));
TF_LITE_ENSURE_EQ(context, output_scale_log2_rounded,
-kOutputFractionalBits);
}
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}

View File

@@ -36,6 +36,8 @@ TfLiteRegistration Register_ADD_N();
TfLiteRegistration Register_ASSIGN_VARIABLE();
TfLiteRegistration Register_AVERAGE_POOL_2D();
TfLiteRegistration Register_BATCH_TO_SPACE_ND();
TfLiteRegistration Register_BROADCAST_ARGS();
TfLiteRegistration Register_BROADCAST_TO();
TfLiteRegistration Register_CALL_ONCE();
TfLiteRegistration Register_CAST();
// TODO(b/160234179): Change custom OPs to also return by value.
@@ -62,6 +64,7 @@ TfLiteRegistration Register_LOGICAL_AND();
TfLiteRegistration Register_LOGICAL_OR();
TfLiteRegistration Register_LOGISTIC();
TfLiteRegistration Register_MAX_POOL_2D();
TfLiteRegistration Register_MIRROR_PAD();
TfLiteRegistration Register_PRELU();
TfLiteRegistration Register_MUL();
TfLiteRegistration Register_QUANTIZE();
@@ -79,6 +82,7 @@ TfLiteRegistration Register_SVDF();
TfLiteRegistration Register_TRANSPOSE();
TfLiteRegistration Register_TRANSPOSE_CONV();
TfLiteRegistration Register_VAR_HANDLE();
TfLiteRegistration Register_WHILE();
TfLiteRegistration Register_ZEROS_LIKE();
namespace ops {

View File

@@ -0,0 +1,222 @@
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"
namespace tflite {
namespace {
struct OpDataMirrorPad {
int input_dims;
int output_size;
int offset;
int output_dims_num_elements_buffer_index;
int input_dims_num_elements_buffer_index;
};
// Helper method that fills the left and right pads.
template <typename T>
inline void GetPadding(const T* data, int offset, int64_t* left_pad,
int64_t* right_pad) {
*left_pad = static_cast<int64_t>(*(data + offset * 2));
*right_pad = static_cast<int64_t>(*(data + offset * 2 + 1));
}
// Given a padded dimension index and the left/right padding, returns the
// corresponding dimension index in the input array.
inline int GetInputDimension(int padded_dimension, int left_pad, int right_pad,
int input_dim_size, int offset) {
if (padded_dimension < left_pad) {
const int original_ind = left_pad + offset - 1;
return original_ind - (std::min(padded_dimension, original_ind - offset));
}
padded_dimension -= left_pad;
if (padded_dimension >= input_dim_size) {
padded_dimension -= input_dim_size;
const int original_ind = input_dim_size - (1 + offset);
return original_ind - std::min(padded_dimension, original_ind);
}
return padded_dimension;
}
// Given an index in the output array, returns the index of the
// corresponding value in the input array.
int GetFlatIndex(int index, int num_dims,
const TfLiteEvalTensor* padding_matrix,
const TfLiteIntArray* input_dims,
int* output_dims_num_elements, int* input_dims_num_elements,
const int offset) {
int flat_index = 0;
int64_t left_pad = 0, right_pad = 0, dimension_index, index_in_input;
for (int i = 0; i < num_dims; ++i) {
switch (padding_matrix->type) {
case kTfLiteInt32:
GetPadding(padding_matrix->data.i32, i, &left_pad, &right_pad);
break;
case kTfLiteInt64:
GetPadding(padding_matrix->data.i64, i, &left_pad, &right_pad);
break;
default:
break;
}
dimension_index = index / output_dims_num_elements[i];
index_in_input = GetInputDimension(dimension_index, left_pad, right_pad,
input_dims->data[i], offset);
flat_index += index_in_input * (input_dims_num_elements)[i];
index %= output_dims_num_elements[i];
}
return flat_index;
}
template <typename T>
void MirrorPad(const TfLiteEvalTensor* padding_matrix,
const TfLiteIntArray* input_dims, int* output_dims_num_elements,
int* input_dims_num_elements, const T* input_data,
T* output_data, const int offset, const int num_dims,
const int output_size) {
for (int i = 0; i < output_size; ++i) {
output_data[i] = input_data[GetFlatIndex(
i, num_dims, padding_matrix, input_dims, output_dims_num_elements,
input_dims_num_elements, offset)];
}
}
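To make the index arithmetic above concrete, here is a standalone illustration (plain C++, not part of the kernel) of GetInputDimension for REFLECT mode (offset == 1) on a 1-D input {1, 2, 3} padded by 2 on each side; it prints 3 2 1 2 3 2 1, i.e. the reflected sequence:
#include <algorithm>
#include <cstdio>
int GetInputDimension(int padded_dimension, int left_pad, int right_pad,
                      int input_dim_size, int offset) {
  (void)right_pad;  // unused here, kept to mirror the kernel's signature
  if (padded_dimension < left_pad) {
    const int original_ind = left_pad + offset - 1;
    return original_ind - std::min(padded_dimension, original_ind - offset);
  }
  padded_dimension -= left_pad;
  if (padded_dimension >= input_dim_size) {
    padded_dimension -= input_dim_size;
    const int original_ind = input_dim_size - (1 + offset);
    return original_ind - std::min(padded_dimension, original_ind);
  }
  return padded_dimension;
}
int main() {
  const int input[3] = {1, 2, 3};
  // 7 output positions: 2 left pads + 3 input elements + 2 right pads.
  for (int i = 0; i < 7; ++i) {
    printf("%d ", input[GetInputDimension(i, /*left_pad=*/2, /*right_pad=*/2,
                                          /*input_dim_size=*/3, /*offset=*/1)]);
  }
  printf("\n");
  return 0;
}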
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
TFLITE_DCHECK(node->user_data != nullptr);
TfLiteStatus status = kTfLiteOk;
const OpDataMirrorPad* data =
static_cast<const OpDataMirrorPad*>(node->user_data);
const TfLiteEvalTensor* input_tensor =
tflite::micro::GetEvalInput(context, node, 0);
const TfLiteEvalTensor* padding_matrix =
tflite::micro::GetEvalInput(context, node, 1);
TfLiteEvalTensor* output_tensor =
tflite::micro::GetEvalOutput(context, node, 0);
const int input_dims = data->input_dims;
const int output_size = data->output_size;
int* input_dims_num_elements = (int*)context->GetScratchBuffer(
context, data->input_dims_num_elements_buffer_index);
int* output_dims_num_elements = (int*)context->GetScratchBuffer(
context, data->output_dims_num_elements_buffer_index);
for (int i = 0; i < input_dims; i++) {
output_dims_num_elements[i] = 1;
input_dims_num_elements[i] = 1;
}
for (int i = input_dims - 2; i >= 0; i--) {
output_dims_num_elements[i] =
output_dims_num_elements[i + 1] * output_tensor->dims->data[i + 1];
input_dims_num_elements[i] =
input_dims_num_elements[i + 1] * input_tensor->dims->data[i + 1];
}
switch (output_tensor->type) {
case kTfLiteFloat32: {
MirrorPad(padding_matrix, input_tensor->dims, output_dims_num_elements,
input_dims_num_elements,
tflite::micro::GetTensorData<float>(input_tensor),
tflite::micro::GetTensorData<float>(output_tensor),
data->offset, input_dims, output_size);
break;
}
case kTfLiteInt8: {
MirrorPad(padding_matrix, input_tensor->dims, output_dims_num_elements,
input_dims_num_elements,
tflite::micro::GetTensorData<int8_t>(input_tensor),
tflite::micro::GetTensorData<int8_t>(output_tensor),
data->offset, input_dims, output_size);
break;
}
default:
status = kTfLiteError;
break;
}
#undef TF_LITE_MIRROR_PAD
return status;
}
void* Init(TfLiteContext* context, const char* buffer, size_t length) {
TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
return context->AllocatePersistentBuffer(context, sizeof(OpDataMirrorPad));
}
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
MicroContext* micro_context = GetMicroContext(context);
TFLITE_DCHECK(node->user_data != nullptr);
OpDataMirrorPad* data = static_cast<OpDataMirrorPad*>(node->user_data);
TfLiteTensor* input_tensor = micro_context->AllocateTempInputTensor(node, 0);
TfLiteTensor* padding_matrix =
micro_context->AllocateTempInputTensor(node, 1);
TfLiteTensor* output_tensor =
micro_context->AllocateTempOutputTensor(node, 0);
TF_LITE_ENSURE_EQ(context, NumDimensions(padding_matrix), 2);
TF_LITE_ENSURE_EQ(context, SizeOfDimension(padding_matrix, 0),
NumDimensions(input_tensor));
auto* params =
reinterpret_cast<TfLiteMirrorPaddingParams*>(node->builtin_data);
if (params == nullptr) {
return kTfLiteError;
}
data->offset =
params->mode != TfLiteMirrorPaddingMode::kTfLiteMirrorPaddingReflect ? 0
: 1;
data->input_dims = NumDimensions(input_tensor);
data->output_size = NumElements(output_tensor);
TF_LITE_ENSURE_STATUS(context->RequestScratchBufferInArena(
context, data->input_dims * sizeof(int),
&data->output_dims_num_elements_buffer_index));
TF_LITE_ENSURE_STATUS(context->RequestScratchBufferInArena(
context, data->input_dims * sizeof(int),
&data->input_dims_num_elements_buffer_index));
micro_context->DeallocateTempTfLiteTensor(input_tensor);
micro_context->DeallocateTempTfLiteTensor(padding_matrix);
micro_context->DeallocateTempTfLiteTensor(output_tensor);
return kTfLiteOk;
}
} // namespace
TfLiteRegistration Register_MIRROR_PAD() {
return {/*init=*/Init,
/*free=*/nullptr,
/*prepare=*/Prepare,
/*invoke=*/Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
}
} // namespace tflite

View File

@@ -37,11 +37,16 @@ void* MulInit(TfLiteContext* context, const char* buffer, size_t length) {
TfLiteStatus CalculateOpDataMul(TfLiteContext* context, TfLiteNode* node,
TfLiteMulParams* params, OpDataMul* data) {
const TfLiteTensor* input1 = GetInput(context, node, kMulInput1Tensor);
MicroContext* micro_context = GetMicroContext(context);
TfLiteTensor* input1 =
micro_context->AllocateTempInputTensor(node, kMulInput1Tensor);
TF_LITE_ENSURE(context, input1 != nullptr);
const TfLiteTensor* input2 = GetInput(context, node, kMulInput2Tensor);
TfLiteTensor* input2 =
micro_context->AllocateTempInputTensor(node, kMulInput2Tensor);
TF_LITE_ENSURE(context, input2 != nullptr);
TfLiteTensor* output = GetOutput(context, node, kMulOutputTensor);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kMulOutputTensor);
TF_LITE_ENSURE(context, output != nullptr);
TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
@@ -72,6 +77,9 @@ TfLiteStatus CalculateOpDataMul(TfLiteContext* context, TfLiteNode* node,
&data->output_activation_max_f32);
}
micro_context->DeallocateTempTfLiteTensor(input1);
micro_context->DeallocateTempTfLiteTensor(input2);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}

View File

@@ -43,19 +43,26 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) {
}
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
MicroContext* micro_context = GetMicroContext(context);
TFLITE_DCHECK(node->user_data != nullptr);
OpData* data = static_cast<OpData*>(node->user_data);
TF_LITE_ENSURE(context, NumInputs(node) == 2 || NumInputs(node) == 3);
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
const TfLiteTensor* input = GetInput(context, node, /*index=*/0);
TfLiteTensor* input =
micro_context->AllocateTempInputTensor(node, /*index=*/0);
TF_LITE_ENSURE(context, input != nullptr);
const TfLiteTensor* paddings = GetInput(context, node, /*index=*/1);
TfLiteTensor* paddings =
micro_context->AllocateTempInputTensor(node, /*index=*/1);
TF_LITE_ENSURE(context, paddings != nullptr);
const TfLiteTensor* constant_values =
NumInputs(node) == 3 ? GetInput(context, node, /*index=*/2) : nullptr;
TfLiteTensor* output = GetOutput(context, node, /*index=*/0);
TfLiteTensor* constant_values =
NumInputs(node) == 3
? micro_context->AllocateTempInputTensor(node, /*index=*/2)
: nullptr;
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, /*index=*/0);
TF_LITE_ENSURE(context, output != nullptr);
TF_LITE_ENSURE_EQ(context, input->type, output->type);
@@ -122,6 +129,13 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
data->output_zero_point = output->params.zero_point;
}
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(paddings);
if (constant_values != nullptr) {
micro_context->DeallocateTempTfLiteTensor(constant_values);
}
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}

View File

@@ -54,9 +54,13 @@ TfLiteStatus PoolingPrepare(TfLiteContext* context, TfLiteNode* node) {
TFLITE_DCHECK(node->user_data != nullptr);
OpDataPooling* data = static_cast<OpDataPooling*>(node->user_data);
const TfLiteTensor* input = GetInput(context, node, kPoolingInputTensor);
MicroContext* micro_context = GetMicroContext(context);
TfLiteTensor* input =
micro_context->AllocateTempInputTensor(node, kPoolingInputTensor);
TF_LITE_ENSURE(context, input != nullptr);
TfLiteTensor* output = GetOutput(context, node, kPoolingOutputTensor);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kPoolingOutputTensor);
TF_LITE_ENSURE(context, output != nullptr);
TF_LITE_ENSURE_STATUS(
@@ -71,6 +75,9 @@ TfLiteStatus PoolingPrepare(TfLiteContext* context, TfLiteNode* node) {
&data->activation_max);
}
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}

View File

@@ -84,14 +84,22 @@ TfLiteStatus PreluPrepare(TfLiteContext* context, TfLiteNode* node) {
TFLITE_DCHECK(node->user_data != nullptr);
PreluParams* params = static_cast<PreluParams*>(node->user_data);
const TfLiteTensor* input = GetInput(context, node, 0);
MicroContext* micro_context = GetMicroContext(context);
TfLiteTensor* input = micro_context->AllocateTempInputTensor(node, 0);
TF_LITE_ENSURE(context, input != nullptr);
const TfLiteTensor* alpha = GetInput(context, node, 1);
TfLiteTensor* alpha = micro_context->AllocateTempInputTensor(node, 1);
TF_LITE_ENSURE(context, alpha != nullptr);
TfLiteTensor* output = GetOutput(context, node, 0);
TfLiteTensor* output = micro_context->AllocateTempOutputTensor(node, 0);
TF_LITE_ENSURE(context, output != nullptr);
return CalculatePreluParams(input, alpha, output, params);
TF_LITE_ENSURE_OK(context,
CalculatePreluParams(input, alpha, output, params));
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(alpha);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}
} // namespace tflite

View File

@@ -36,9 +36,11 @@ TfLiteStatus PrepareQuantizeReference(TfLiteContext* context,
TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
const TfLiteTensor* input = GetInput(context, node, 0);
MicroContext* micro_context = GetMicroContext(context);
TfLiteTensor* input = micro_context->AllocateTempInputTensor(node, 0);
TF_LITE_ENSURE(context, input != nullptr);
TfLiteTensor* output = GetOutput(context, node, 0);
TfLiteTensor* output = micro_context->AllocateTempOutputTensor(node, 0);
TF_LITE_ENSURE(context, output != nullptr);
// TODO(b/128934713): Add support for fixed-point per-channel quantization.
@@ -77,6 +79,9 @@ TfLiteStatus PrepareQuantizeReference(TfLiteContext* context,
data->quantization_params.scale = static_cast<double>(output->params.scale);
data->input_zero_point = input->params.zero_point;
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}

View File

@@ -39,13 +39,17 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
TFLITE_DCHECK(NumInputs(node) == 1);
TFLITE_DCHECK(NumOutputs(node) == 1);
const TfLiteTensor* input_resource_id_tensor =
GetInput(context, node, kInputVariableId);
MicroContext* micro_context = GetMicroContext(context);
TfLiteTensor* input_resource_id_tensor =
micro_context->AllocateTempInputTensor(node, kInputVariableId);
TFLITE_DCHECK(input_resource_id_tensor != nullptr);
TFLITE_DCHECK(input_resource_id_tensor->type == kTfLiteResource);
TFLITE_DCHECK(NumElements(input_resource_id_tensor) == 1);
micro_context->DeallocateTempTfLiteTensor(input_resource_id_tensor);
return kTfLiteOk;
}
@@ -58,14 +62,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
tflite::micro::GetEvalOutput(context, node, kOutputValue);
TFLITE_DCHECK(output_value != nullptr);
// Casting to TfliteIntArray is required since we are re-using
// GetExecutionPlan from TfLiteContext. On TFLM this method returns a
// MicroGraph.
// TODO(b/188226309): Design a cleaner way to get a graph from kernel context.
MicroGraph* graph_info;
context->GetExecutionPlan(context,
reinterpret_cast<TfLiteIntArray**>(&graph_info));
MicroResourceVariables* resources = graph_info->GetResourceVariables();
tflite::MicroContext* micro_context = tflite::GetMicroContext(context);
MicroGraph& graph_info = micro_context->graph();
MicroResourceVariables* resources = graph_info.GetResourceVariables();
if (resources == nullptr) {
MicroPrintf(
"READ_VARIABLE requires resource variables. Please create "

View File

@@ -50,10 +50,12 @@ void* InitReduce(TfLiteContext* context, const char* buffer, size_t length) {
}
TfLiteStatus PrepareSimple(TfLiteContext* context, TfLiteNode* node) {
MicroContext* micro_context = GetMicroContext(context);
// Inputs Tensor (dtype depends on quantization):
// [0] = Input
// [1] = Axis
const TfLiteTensor* input = GetInput(context, node, 0);
TfLiteTensor* input = micro_context->AllocateTempInputTensor(node, 0);
// Outputs Tensor (dtype depends on quantization):
// [0] = Output
@@ -63,28 +65,31 @@ TfLiteStatus PrepareSimple(TfLiteContext* context, TfLiteNode* node) {
TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
// Validate axis type
const TfLiteTensor* axis = GetInput(context, node, 1);
TfLiteTensor* axis = micro_context->AllocateTempInputTensor(node, 1);
TF_LITE_ENSURE(context, axis != nullptr);
TF_LITE_ENSURE_TYPES_EQ(context, axis->type, kTfLiteInt32);
if (input->type == kTfLiteInt8) {
OpData* data = static_cast<OpData*>(node->user_data);
const TfLiteTensor* output = GetOutput(context, node, 0);
TfLiteTensor* output = micro_context->AllocateTempOutputTensor(node, 0);
const double real_multiplier = static_cast<double>(input->params.scale) /
static_cast<double>(output->params.scale);
QuantizeMultiplier(real_multiplier, &data->multiplier, &data->shift);
micro_context->DeallocateTempTfLiteTensor(output);
}
micro_context->DeallocateTempTfLiteTensor(axis);
micro_context->DeallocateTempTfLiteTensor(input);
return kTfLiteOk;
}
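The QuantizeMultiplier() call above splits real_multiplier into a Q31 fixed-point multiplier and a power-of-two shift, so that real_multiplier = multiplier / 2^31 * 2^shift. A standalone illustration of that decomposition for hypothetical scales, mirroring the library routine with frexp and ignoring its rounding edge case:
#include <cmath>
#include <cstdint>
#include <cstdio>
int main() {
  const double input_scale = 0.05;   // hypothetical
  const double output_scale = 0.1;   // hypothetical
  const double real_multiplier = input_scale / output_scale;  // 0.5
  int shift = 0;
  const double q = std::frexp(real_multiplier, &shift);  // q in [0.5, 1)
  const int32_t quantized = static_cast<int32_t>(std::round(q * (1ll << 31)));
  // Prints multiplier=1073741824 shift=0, i.e. 2^30 / 2^31 * 2^0 == 0.5.
  printf("multiplier=%ld shift=%d\n", static_cast<long>(quantized), shift);
  return 0;
}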
TfLiteStatus PrepareMax(TfLiteContext* context, TfLiteNode* node) {
TF_LITE_ENSURE_OK(context, PrepareSimple(context, node));
MicroContext* micro_context = GetMicroContext(context);
OpData* op_data = static_cast<OpData*>(node->user_data);
const TfLiteTensor* input = GetInput(context, node, 0);
const TfLiteTensor* output = GetOutput(context, node, 0);
const TfLiteTensor* axis = GetInput(context, node, 1);
TfLiteTensor* input = micro_context->AllocateTempInputTensor(node, 0);
TfLiteTensor* output = micro_context->AllocateTempOutputTensor(node, 0);
TfLiteTensor* axis = micro_context->AllocateTempInputTensor(node, 1);
op_data->input_scale = input->params.scale;
op_data->output_scale = output->params.scale;
@@ -96,13 +101,17 @@ TfLiteStatus PrepareMax(TfLiteContext* context, TfLiteNode* node) {
context, sizeof(int) * static_cast<int>(ElementCount(*axis->dims)),
&op_data->resolved_axis_idx);
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(output);
micro_context->DeallocateTempTfLiteTensor(axis);
return kTfLiteOk;
}
TfLiteStatus PrepareMeanOrSum(TfLiteContext* context, TfLiteNode* node) {
const TfLiteTensor* input = GetInput(context, node, 0);
MicroContext* micro_context = GetMicroContext(context);
TfLiteTensor* input = micro_context->AllocateTempInputTensor(node, 0);
OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
const TfLiteTensor* output = GetOutput(context, node, 0);
TfLiteTensor* output = micro_context->AllocateTempOutputTensor(node, 0);
if (input->type == kTfLiteInt8 || input->type == kTfLiteInt16) {
const double real_multiplier = static_cast<double>(input->params.scale) /
static_cast<double>(output->params.scale);
@@ -121,6 +130,8 @@ TfLiteStatus PrepareMeanOrSum(TfLiteContext* context, TfLiteNode* node) {
TF_LITE_ENSURE_OK(context, PrepareSimple(context, node));
// TODO(b/144955155): Support uint8_t(b/144955155) and int8_t(b/144955018)
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}

View File

@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <cstring>
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
@@ -31,9 +33,13 @@ constexpr int kInputTensor = 0;
constexpr int kOutputTensor = 0;
TfLiteStatus ReshapeOutput(TfLiteContext* context, TfLiteNode* node) {
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
MicroContext* micro_context = GetMicroContext(context);
TfLiteTensor* input =
micro_context->AllocateTempInputTensor(node, kInputTensor);
TF_LITE_ENSURE(context, input != nullptr);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kOutputTensor);
TF_LITE_ENSURE(context, output != nullptr);
// Tensorflow's Reshape allows one of the shape components to have the
// special -1 value, meaning it will be calculated automatically based on the
@@ -68,6 +74,9 @@ TfLiteStatus ReshapeOutput(TfLiteContext* context, TfLiteNode* node) {
TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type);
TF_LITE_ENSURE_EQ(context, num_input_elements, num_output_elements);
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}
@@ -93,9 +102,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
// Do nothing for in-place reshape.
if (input->data.raw != output->data.raw) {
// Otherwise perform reshape with copy.
for (size_t i = 0; i < input_bytes; ++i) {
output->data.raw[i] = input->data.raw[i];
}
memcpy(output->data.raw, input->data.raw, input_bytes);
}
return kTfLiteOk;
}

View File

@@ -30,12 +30,17 @@ constexpr int kSizeTensor = 1;
constexpr int kOutputTensor = 0;
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
MicroContext* micro_context = GetMicroContext(context);
TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
const TfLiteTensor* size = GetInput(context, node, kSizeTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TfLiteTensor* input =
micro_context->AllocateTempInputTensor(node, kInputTensor);
TfLiteTensor* size =
micro_context->AllocateTempInputTensor(node, kSizeTensor);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kOutputTensor);
TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4);
TF_LITE_ENSURE_EQ(context, NumDimensions(size), 1);
@@ -55,6 +60,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
return kTfLiteError;
}
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(size);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}

View File

@@ -33,12 +33,17 @@ constexpr int kSizeTensor = 1;
constexpr int kOutputTensor = 0;
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
MicroContext* micro_context = GetMicroContext(context);
TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
const TfLiteTensor* size = GetInput(context, node, kSizeTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TfLiteTensor* input =
micro_context->AllocateTempInputTensor(node, kInputTensor);
TfLiteTensor* size =
micro_context->AllocateTempInputTensor(node, kSizeTensor);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kOutputTensor);
// Our current implementations rely on the input being 4D,
// and the size being 1D tensor with exactly 2 elements.
@@ -53,6 +58,11 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
TF_LITE_KERNEL_LOG(context, "Dynamic tensors are unsupported in tfmicro.");
return kTfLiteError;
}
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(size);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}

View File

@@ -29,9 +29,13 @@ constexpr int kInputTensor = 0;
constexpr int kOutputTensor = 0;
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
MicroContext* micro_context = GetMicroContext(context);
TfLiteTensor* input =
micro_context->AllocateTempInputTensor(node, kInputTensor);
TF_LITE_ENSURE(context, input != nullptr);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kOutputTensor);
TF_LITE_ENSURE(context, output != nullptr);
TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
@@ -42,6 +46,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
for (int i = 0; i < output->dims->size; ++i) {
TF_LITE_ENSURE_EQ(context, output->dims->data[i], input->dims->data[i]);
}
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}

View File

@@ -45,16 +45,22 @@ void GetBeginAndSizeVectors(int dimensions, const TfLiteEvalTensor* begin,
}
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
MicroContext* micro_context = GetMicroContext(context);
TF_LITE_ENSURE_EQ(context, NumInputs(node), 3);
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
TfLiteTensor* input =
micro_context->AllocateTempInputTensor(node, kInputTensor);
TFLITE_DCHECK(input != nullptr);
const TfLiteTensor* begin = GetInput(context, node, kBeginTensor);
TfLiteTensor* begin =
micro_context->AllocateTempInputTensor(node, kBeginTensor);
TFLITE_DCHECK(begin != nullptr);
const TfLiteTensor* size = GetInput(context, node, kSizeTensor);
TfLiteTensor* size =
micro_context->AllocateTempInputTensor(node, kSizeTensor);
TFLITE_DCHECK(size != nullptr);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kOutputTensor);
TFLITE_DCHECK(output != nullptr);
// Ensure validity of input tensor and its dimension.
@@ -66,6 +72,12 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
TFLITE_DCHECK(NumDimensions(size) == 1);
TFLITE_DCHECK(NumElements(begin) == NumElements(size));
TFLITE_DCHECK(NumDimensions(input) <= kMaxDim);
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(begin);
micro_context->DeallocateTempTfLiteTensor(size);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}

View File

@@ -20,6 +20,7 @@ limitations under the License.
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/op_macros.h"
#include "tensorflow/lite/micro/kernels/softmax.h"
#include "tensorflow/lite/micro/micro_context.h"
namespace tflite {
@@ -90,12 +91,14 @@ void* SoftmaxInit(TfLiteContext* context, const char* buffer, size_t length) {
}
TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) {
MicroContext* micro_context = GetMicroContext(context);
TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
const TfLiteTensor* input = GetInput(context, node, 0);
TfLiteTensor* input = micro_context->AllocateTempInputTensor(node, 0);
TF_LITE_ENSURE(context, input != nullptr);
TF_LITE_ENSURE(context, NumDimensions(input) >= 1);
TfLiteTensor* output = GetOutput(context, node, 0);
TfLiteTensor* output = micro_context->AllocateTempOutputTensor(node, 0);
TF_LITE_ENSURE(context, output != nullptr);
TF_LITE_ENSURE(context, node->user_data != nullptr);
@@ -136,7 +139,12 @@ TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) {
}
auto* params = static_cast<TfLiteSoftmaxParams*>(node->builtin_data);
return CalculateSoftmaxParams(context, input, output, params, op_data);
auto ret_val =
CalculateSoftmaxParams(context, input, output, params, op_data);
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(output);
return ret_val;
}
} // namespace tflite

View File

@@ -44,11 +44,15 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) {
}
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
MicroContext* micro_context = GetMicroContext(context);
TF_LITE_ENSURE_EQ(context, NumInputs(node), 3);
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TfLiteTensor* input =
micro_context->AllocateTempInputTensor(node, kInputTensor);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kOutputTensor);
TF_LITE_ENSURE(context, input != nullptr && output != nullptr);
TF_LITE_ENSURE(context, NumDimensions(input) >= kInputOutputMinDimensionNum);
@@ -57,6 +61,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
TF_LITE_ENSURE(context, NumDimensions(output) <= kInputOutputMaxDimensionNum);
TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type);
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}

View File

@@ -39,11 +39,14 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
const TfLiteTensor* input;
TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kInputTensor, &input));
TfLiteTensor* output;
TF_LITE_ENSURE_OK(context,
GetOutputSafe(context, node, kOutputTensor, &output));
MicroContext* micro_context = GetMicroContext(context);
TfLiteTensor* input =
micro_context->AllocateTempInputTensor(node, kInputTensor);
TF_LITE_ENSURE(context, input != nullptr);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kOutputTensor);
TF_LITE_ENSURE(context, output != nullptr);
TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4);
@@ -75,6 +78,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
output->dims->data[kDepthRank] =
input->dims->data[kDepthRank] * block_size * block_size;
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}

View File

@@ -69,7 +69,8 @@ TfLiteStatus SplitImpl(TfLiteContext* context, TfLiteNode* node,
}
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
const TfLiteTensor* axis = GetInput(context, node, 0);
MicroContext* micro_context = GetMicroContext(context);
TfLiteTensor* axis = micro_context->AllocateTempInputTensor(node, 0);
TF_LITE_ENSURE(context, axis != nullptr);
// Dynamic output tensors are needed if axis tensor is not constant.
@@ -77,6 +78,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
// constant axis tensor for now.
TF_LITE_ENSURE_MSG(context, IsConstantTensor(axis),
"Non constant axis tensor not supported");
micro_context->DeallocateTempTfLiteTensor(axis);
return kTfLiteOk;
}

View File

@@ -74,13 +74,14 @@ TfLiteStatus SplitImpl(TfLiteContext* context, TfLiteNode* node,
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
TF_LITE_ENSURE_EQ(context, NumInputs(node), 3);
MicroContext* micro_context = GetMicroContext(context);
// Dynamic output tensors are needed if axis tensor is not constant.
// But Micro doesn't support dynamic memory allocation, so we only support
// constant axis tensor for now.
const TfLiteTensor* axis = GetInput(context, node, 2);
TfLiteTensor* axis = micro_context->AllocateTempInputTensor(node, 2);
TF_LITE_ENSURE_MSG(context, IsConstantTensor(axis),
"Non constant axis tensor not supported");
micro_context->DeallocateTempTfLiteTensor(axis);
return kTfLiteOk;
}

View File

@@ -27,12 +27,19 @@ namespace tflite {
namespace {
struct SqueezeContext {
SqueezeContext(TfLiteContext* context, TfLiteNode* node)
: params(reinterpret_cast<TfLiteSqueezeParams*>(node->builtin_data)),
input(GetInput(context, node, 0)),
output(GetOutput(context, node, 0)) {}
SqueezeContext(TfLiteContext* context, TfLiteNode* node) {
params = reinterpret_cast<TfLiteSqueezeParams*>(node->builtin_data);
micro_context = GetMicroContext(context);
input = micro_context->AllocateTempInputTensor(node, 0);
output = micro_context->AllocateTempOutputTensor(node, 0);
}
~SqueezeContext() {
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(output);
}
MicroContext* micro_context;
TfLiteSqueezeParams* params;
const TfLiteTensor* const input;
TfLiteTensor* input;
TfLiteTensor* output;
};
@@ -80,18 +87,24 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
}
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
SqueezeContext op_context(context, node);
const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, 0);
if (op_context.input->type == kTfLiteString) {
if (input->type == kTfLiteString) {
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
TfLiteTypeGetName(op_context.input->type),
op_context.input->type);
TfLiteTypeGetName(input->type), input->type);
return kTfLiteError;
}
TF_LITE_ENSURE_EQ(context, op_context.input->bytes, op_context.output->bytes);
memcpy(op_context.output->data.raw, op_context.input->data.raw,
op_context.input->bytes);
TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0);
size_t input_byte_size;
size_t output_byte_size;
TF_LITE_ENSURE_OK(context,
TfLiteEvalTensorByteLength(input, &input_byte_size));
TF_LITE_ENSURE_OK(context,
TfLiteEvalTensorByteLength(output, &output_byte_size));
TF_LITE_ENSURE_EQ(context, input_byte_size, output_byte_size);
memcpy(output->data.raw, input->data.raw, input_byte_size);
return kTfLiteOk;
}

View File

@@ -38,18 +38,27 @@ constexpr int kOutputTensor = 0;
struct StridedSliceContext {
StridedSliceContext(TfLiteContext* context, TfLiteNode* node) {
params = reinterpret_cast<TfLiteStridedSliceParams*>(node->builtin_data);
input = GetInput(context, node, kInputTensor);
begin = GetInput(context, node, kBeginTensor);
end = GetInput(context, node, kEndTensor);
strides = GetInput(context, node, kStridesTensor);
output = GetOutput(context, node, kOutputTensor);
micro_context = GetMicroContext(context);
input = micro_context->AllocateTempInputTensor(node, kInputTensor);
begin = micro_context->AllocateTempInputTensor(node, kBeginTensor);
end = micro_context->AllocateTempInputTensor(node, kEndTensor);
strides = micro_context->AllocateTempInputTensor(node, kStridesTensor);
output = micro_context->AllocateTempOutputTensor(node, kOutputTensor);
dims = NumDimensions(input);
}
~StridedSliceContext() {
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(begin);
micro_context->DeallocateTempTfLiteTensor(end);
micro_context->DeallocateTempTfLiteTensor(strides);
micro_context->DeallocateTempTfLiteTensor(output);
}
const TfLiteStridedSliceParams* params;
const TfLiteTensor* input;
const TfLiteTensor* begin;
const TfLiteTensor* end;
const TfLiteTensor* strides;
MicroContext* micro_context;
TfLiteTensor* input;
TfLiteTensor* begin;
TfLiteTensor* end;
TfLiteTensor* strides;
TfLiteTensor* output;
int dims;
};
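SqueezeContext and StridedSliceContext above use the constructor/destructor pair so that every early return from Prepare/Eval still releases the temp tensors. A minimal sketch of that RAII idea for a single input tensor; the wrapper name is illustrative, not TFLM API:
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/micro/micro_context.h"
struct ScopedTempInput {
  ScopedTempInput(TfLiteContext* context, TfLiteNode* node, int index)
      : micro_context(tflite::GetMicroContext(context)),
        tensor(micro_context->AllocateTempInputTensor(node, index)) {}
  ~ScopedTempInput() {
    // Release the temp view when the scope ends, even on early returns.
    if (tensor != nullptr) {
      micro_context->DeallocateTempTfLiteTensor(tensor);
    }
  }
  tflite::MicroContext* micro_context;
  TfLiteTensor* tensor;
};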

View File

@@ -83,15 +83,24 @@ TfLiteStatus SubPrepare(TfLiteContext* context, TfLiteNode* node) {
OpDataSub* data = static_cast<OpDataSub*>(node->user_data);
auto* params = reinterpret_cast<TfLiteSubParams*>(node->builtin_data);
const TfLiteTensor* input1 = GetInput(context, node, kSubInputTensor1);
MicroContext* micro_context = GetMicroContext(context);
TfLiteTensor* input1 =
micro_context->AllocateTempInputTensor(node, kSubInputTensor1);
TF_LITE_ENSURE(context, input1 != nullptr);
const TfLiteTensor* input2 = GetInput(context, node, kSubInputTensor2);
TfLiteTensor* input2 =
micro_context->AllocateTempInputTensor(node, kSubInputTensor2);
TF_LITE_ENSURE(context, input2 != nullptr);
TfLiteTensor* output = GetOutput(context, node, kSubOutputTensor);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kSubOutputTensor);
TF_LITE_ENSURE(context, output != nullptr);
TF_LITE_ENSURE_STATUS(
CalculateOpDataSub(context, params, input1, input2, output, data));
micro_context->DeallocateTempTfLiteTensor(input1);
micro_context->DeallocateTempTfLiteTensor(input2);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}

View File

@@ -364,6 +364,8 @@ TfLiteStatus PrepareSvdf(TfLiteContext* context, TfLiteNode* node) {
const auto* params = static_cast<const TfLiteSVDFParams*>(node->builtin_data);
MicroContext* micro_context = GetMicroContext(context);
// Validate Tensor Inputs (dtype depends on quantization):
// [0] = Input, {2, batch_size, input_size}
// [1] = Weights Feature, {2, num_filters, input_size}
@@ -371,18 +373,19 @@ TfLiteStatus PrepareSvdf(TfLiteContext* context, TfLiteNode* node) {
// [3] = Bias (optional), {1, num_units}
// [4] = Activation State (variable),
// {2, batch_size, memory_size * num_filters}
const TfLiteTensor* input = GetInput(context, node, kSvdfInputTensor);
TfLiteTensor* input =
micro_context->AllocateTempInputTensor(node, kSvdfInputTensor);
TF_LITE_ENSURE(context, input != nullptr);
const TfLiteTensor* weights_feature =
GetInput(context, node, kSvdfWeightsFeatureTensor);
TfLiteTensor* weights_feature =
micro_context->AllocateTempInputTensor(node, kSvdfWeightsFeatureTensor);
TF_LITE_ENSURE(context, weights_feature != nullptr);
const TfLiteTensor* weights_time =
GetInput(context, node, kSvdfWeightsTimeTensor);
TfLiteTensor* weights_time =
micro_context->AllocateTempInputTensor(node, kSvdfWeightsTimeTensor);
TF_LITE_ENSURE(context, weights_time != nullptr);
const TfLiteTensor* bias =
GetOptionalInputTensor(context, node, kSvdfBiasTensor);
const TfLiteTensor* activation_state =
GetInput(context, node, kSvdfInputActivationStateTensor);
TfLiteTensor* bias =
micro_context->AllocateTempInputTensor(node, kSvdfBiasTensor);
TfLiteTensor* activation_state = micro_context->AllocateTempInputTensor(
node, kSvdfInputActivationStateTensor);
TF_LITE_ENSURE(context, activation_state != nullptr);
// Define input constants based on input tensor definition above:
@@ -402,7 +405,8 @@ TfLiteStatus PrepareSvdf(TfLiteContext* context, TfLiteNode* node) {
// Validate Tensor Output:
// [0] = float/int8_t, {2, batch_size, num_units}
TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
TfLiteTensor* output = GetOutput(context, node, kSvdfOutputTensor);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kSvdfOutputTensor);
TF_LITE_ENSURE(context, output != nullptr);
TF_LITE_ENSURE_EQ(context, NumDimensions(output), 2);
TF_LITE_ENSURE_EQ(context, output->dims->data[0], batch_size);
@@ -498,6 +502,12 @@ TfLiteStatus PrepareSvdf(TfLiteContext* context, TfLiteNode* node) {
TF_LITE_ENSURE_OK(context, scratch_status);
}
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(weights_feature);
micro_context->DeallocateTempTfLiteTensor(weights_time);
micro_context->DeallocateTempTfLiteTensor(activation_state);
micro_context->DeallocateTempTfLiteTensor(output);
micro_context->DeallocateTempTfLiteTensor(bias);
return kTfLiteOk;
}

View File

@@ -48,11 +48,14 @@ void* TanhInit(TfLiteContext* context, const char* buffer, size_t length) {
TfLiteStatus CalculateArithmeticOpData(TfLiteContext* context, TfLiteNode* node,
OpData* data) {
MicroContext* micro_context = GetMicroContext(context);
TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
TfLiteTensor* input =
micro_context->AllocateTempInputTensor(node, kInputTensor);
TF_LITE_ENSURE(context, input != nullptr);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TfLiteTensor* output =
micro_context->AllocateTempOutputTensor(node, kOutputTensor);
TF_LITE_ENSURE(context, output != nullptr);
TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type);
@@ -69,6 +72,62 @@ TfLiteStatus CalculateArithmeticOpData(TfLiteContext* context, TfLiteNode* node,
data->input_range_radius =
CalculateInputRadius(kInputIntegerBits, data->input_left_shift, 31);
}
if (input->type == kTfLiteInt16) {
static constexpr int kInputIntegerBits = 3;
static constexpr int kOutputFractionalBits = 15;
// These operators are implemented in fixed-point arithmetic,
// which intrinsically wants symmetric ranges (zero_point==0)
// and power-of-two scales (power-of-two is abbreviated below as POT).
// While more general support would be possible by means of rescaling,
// that would add some overhead and some loss of accuracy and wouldn't
// be used at the moment as current quantized LSTM applications are
// happy with symmetric, power-of-two-scales quantization. So we just
// implement that narrow case only for now.
TF_LITE_ENSURE_EQ(context, input->params.zero_point, 0);
TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
int input_scale_log2_rounded;
bool param_scale_pot =
CheckedLog2(input->params.scale, &input_scale_log2_rounded);
data->input_left_shift =
(15 - kInputIntegerBits) + input_scale_log2_rounded;
param_scale_pot &=
(data->input_left_shift == 0 || data->input_left_shift == 1);
if (param_scale_pot) {
data->input_multiplier = 0;
} else {
// Calculate multiplier to change input scale to 1/(3*4096)
// as required by the table lookup.
// The number 3.0 in the multiplier comes from here,
// because the interval is [-10.7, 10.7] instead of [-8, 8].
// So, in this scaling +/-2^17 represents +/-10.7.
double multiplier =
static_cast<double>(input->params.scale) * 4096.0 * 3.0;
data->input_left_shift = 0;
while (multiplier <= 32767.0 / 2.0 && data->input_left_shift <= 30) {
data->input_left_shift++;
multiplier = multiplier * 2.0;
}
data->input_multiplier = static_cast<int32_t>(multiplier);
}
int output_scale_log2_rounded;
TF_LITE_ENSURE(
context, CheckedLog2(output->params.scale, &output_scale_log2_rounded));
TF_LITE_ENSURE_EQ(context, output_scale_log2_rounded,
-kOutputFractionalBits);
}
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(output);
return kTfLiteOk;
}
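As a worked example of the multiplier search above, for a hypothetical non-power-of-two int16 input scale of 0.0003 the loop produces input_multiplier = 30198 and input_left_shift = 13. A standalone reproduction of just that loop (not TFLM code):
#include <cstdint>
#include <cstdio>
int main() {
  const double input_scale = 0.0003;               // hypothetical, not POT
  double multiplier = input_scale * 4096.0 * 3.0;  // rescale toward 1/(3*4096)
  int input_left_shift = 0;
  while (multiplier <= 32767.0 / 2.0 && input_left_shift <= 30) {
    input_left_shift++;
    multiplier = multiplier * 2.0;
  }
  // Prints input_multiplier=30198 input_left_shift=13 for this scale; the
  // value stops just past 32767/2 so it still fits comfortably in int16.
  printf("input_multiplier=%d input_left_shift=%d\n",
         static_cast<int32_t>(multiplier), input_left_shift);
  return 0;
}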
@@ -77,10 +136,15 @@ TfLiteStatus TanhPrepare(TfLiteContext* context, TfLiteNode* node) {
OpData* data = static_cast<OpData*>(node->user_data);
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
MicroContext* micro_context = GetMicroContext(context);
TfLiteTensor* input =
micro_context->AllocateTempInputTensor(node, kInputTensor);
TF_LITE_ENSURE(context, input != nullptr);
data->input_zero_point = input->params.zero_point;
return CalculateArithmeticOpData(context, node, data);
TF_LITE_ENSURE_OK(context, CalculateArithmeticOpData(context, node, data));
micro_context->DeallocateTempTfLiteTensor(input);
return kTfLiteOk;
}
} // namespace

Some files were not shown because too many files have changed in this diff.