Rolling 20210420

2025-12-09 21:17:06 +03:00 · 2021-04-20 19:44:16 +02:00
parent 520f818adc
commit ea2305de47
156 changed files with 11095 additions and 8601 deletions
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/add_n.cc
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/add_n.cc
@@ -0,0 +1,119 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/kernels/internal/reference/add_n.h"
+
+#include <cstdint>
+
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+
+namespace tflite {
+namespace {
+
+constexpr int kInputTensor0 = 0;  // Index of the first input tensor.
+constexpr int kOutputTensor = 0;  // Index of the single output tensor.
+
+// Validates the node and reserves the scratch space that Eval relies on.
+//
+// Requires at least two inputs and exactly one output. The output must have
+// the first input's type, and every further input must match the first input
+// in both shape and type. For float32 a scratch buffer holding one data
+// pointer per input is requested and its index is stored in node->user_data;
+// any other type is logged and rejected with kTfLiteError.
+TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) {
+  const int num_inputs = NumInputs(node);
+  TF_LITE_ENSURE(context, num_inputs >= 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  const TfLiteTensor* input_tensor_first;
+  TF_LITE_ENSURE_OK(
+      context, GetInputSafe(context, node, kInputTensor0, &input_tensor_first));
+  TfLiteTensor* output;
+  TF_LITE_ENSURE_OK(context,
+                    GetOutputSafe(context, node, kOutputTensor, &output));
+
+  // The first input is the reference that all other tensors are compared to.
+  TF_LITE_ENSURE_TYPES_EQ(context, output->type, input_tensor_first->type);
+  for (int idx = kInputTensor0 + 1; idx < num_inputs; ++idx) {
+    const TfLiteTensor* input;
+    TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, idx, &input));
+    TF_LITE_ENSURE(context, HaveSameShapes(input_tensor_first, input));
+    TF_LITE_ENSURE_TYPES_EQ(context, input_tensor_first->type, input->type);
+  }
+
+  if (output->type != kTfLiteFloat32) {
+    TF_LITE_KERNEL_LOG(context, "ADD_N only supports FLOAT32, got %s.",
+                       TfLiteTypeGetName(output->type));
+    return kTfLiteError;
+  }
+
+  // Eval gathers one data pointer per input in this scratch buffer. The
+  // buffer's index (an int, not an address) is carried in node->user_data.
+  const size_t scratch_size = sizeof(float*) * num_inputs;
+  int scratch_index;
+  TF_LITE_ENSURE_OK(context, context->RequestScratchBufferInArena(
+                                 context, scratch_size, &scratch_index));
+  node->user_data = reinterpret_cast<decltype(node->user_data)>(scratch_index);
+  return kTfLiteOk;
+}
+
+// Prepare callback: all validation and scratch setup is done by
+// CalculateOpData, whose status is passed straight through.
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteStatus status = CalculateOpData(context, node);
+  return status;
+}
+
+// Hands every input of the node to reference_ops::AddN, writing to `output`.
+//
+// node->user_data holds the index of a scratch buffer (stored as an integer,
+// not a real pointer). That buffer is filled with one data pointer per input
+// and then passed to the reference implementation as the input array.
+template <typename T>
+void EvalAddN(TfLiteContext* context, TfLiteNode* node,
+              TfLiteEvalTensor* output) {
+  const int input_count = NumInputs(node);
+
+  const intptr_t stored_index = reinterpret_cast<intptr_t>(node->user_data);
+  const T** input_data = static_cast<const T**>(
+      context->GetScratchBuffer(context, static_cast<int>(stored_index)));
+
+  for (int idx = 0; idx < input_count; ++idx) {
+    input_data[idx] = tflite::micro::GetTensorData<T>(
+        tflite::micro::GetEvalInput(context, node, kInputTensor0 + idx));
+  }
+
+  reference_ops::AddN<T>(tflite::micro::GetTensorShape(output), input_count,
+                         input_data, tflite::micro::GetTensorData<T>(output));
+}
+
+// Invoke callback: runs EvalAddN for float32 outputs. Any other output type
+// is logged and reported as kTfLiteError.
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kOutputTensor);
+
+  if (output->type != kTfLiteFloat32) {
+    TF_LITE_KERNEL_LOG(context, "ADD_N only supports FLOAT32, got %s.",
+                       TfLiteTypeGetName(output->type));
+    return kTfLiteError;
+  }
+
+  EvalAddN<float>(context, node, output);
+  return kTfLiteOk;
+}
+
+}  // namespace
+
+// Builds the registration for ADD_N. Only the prepare and invoke callbacks
+// are set; every other field is null or zero.
+TfLiteRegistration Register_ADD_N() {
+  TfLiteRegistration registration = {/*init=*/nullptr,
+                                     /*free=*/nullptr,
+                                     /*prepare=*/Prepare,
+                                     /*invoke=*/Eval,
+                                     /*profiling_string=*/nullptr,
+                                     /*builtin_code=*/0,
+                                     /*custom_name=*/nullptr,
+                                     /*version=*/0};
+  return registration;
+}
+
+}  // namespace tflite
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/batch_to_space_nd.cc
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/batch_to_space_nd.cc
@@ -0,0 +1,111 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/kernels/internal/reference/batch_to_space_nd.h"
+
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/micro_utils.h"
+
+namespace tflite {
+
+namespace {
+
+constexpr int kInputTensor = 0;       // Input index of the data tensor.
+constexpr int kBlockShapeTensor = 1;  // Input index of the block-shape tensor.
+constexpr int kCropsTensor = 2;       // Input index of the crops tensor.
+constexpr int kOutputTensor = 0;      // Index of the single output tensor.
+
+// Currently, only 3D NHC and 4D NHWC input/output op_context are supported.
+// In case of 3D input, it will be extended to 4D NHWC by adding W=1.
+// The 4D array needs to have exactly 2 spatial dimensions.
+// TODO(b/149952582): Support arbitrary dimension in SpaceToBatchND.
+const int kInputOutputMinDimensionNum = 3;
+const int kInputOutputMaxDimensionNum = 4;
+
+// Prepare callback for BATCH_TO_SPACE_ND. Checks that the node has three
+// inputs and one output, that the input and output ranks both lie within
+// [kInputOutputMinDimensionNum, kInputOutputMaxDimensionNum], and that input
+// and output share a type. The block-shape and crops tensors are not
+// inspected here.
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 3);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  // Both tensors must be available before their dims/type are read below.
+  TF_LITE_ENSURE(context, input != nullptr && output != nullptr);
+
+  // Rank bounds: lower bounds first, then upper bounds.
+  TF_LITE_ENSURE(context, NumDimensions(input) >= kInputOutputMinDimensionNum);
+  TF_LITE_ENSURE(context, NumDimensions(output) >= kInputOutputMinDimensionNum);
+  TF_LITE_ENSURE(context, NumDimensions(input) <= kInputOutputMaxDimensionNum);
+  TF_LITE_ENSURE(context, NumDimensions(output) <= kInputOutputMaxDimensionNum);
+  // Eval dispatches on input->type only, so the output type must match it.
+  TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type);
+
+  return kTfLiteOk;
+}
+
+// Runs reference_ops::BatchToSpaceND with the input/output data viewed as T.
+// Block-shape and crops data are always read as int32_t.
+template <typename T>
+void EvalBatchToSpaceND(const TfLiteEvalTensor* input,
+                        const TfLiteEvalTensor* block_shape,
+                        const TfLiteEvalTensor* crops,
+                        TfLiteEvalTensor* output) {
+  reference_ops::BatchToSpaceND(
+      tflite::micro::GetTensorShape(input),
+      tflite::micro::GetTensorData<T>(input),
+      tflite::micro::GetTensorShape(block_shape),
+      tflite::micro::GetTensorData<int32_t>(block_shape),
+      tflite::micro::GetTensorShape(crops),
+      tflite::micro::GetTensorData<int32_t>(crops),
+      tflite::micro::GetTensorShape(output),
+      tflite::micro::GetTensorData<T>(output));
+}
+
+// Invoke callback: fetches the data, block-shape and crops inputs plus the
+// output, then dispatches on the input type. Float32 and int8 are handled;
+// any other type is logged and reported as kTfLiteError.
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteEvalTensor* input =
+      tflite::micro::GetEvalInput(context, node, kInputTensor);
+  const TfLiteEvalTensor* block_shape =
+      tflite::micro::GetEvalInput(context, node, kBlockShapeTensor);
+  const TfLiteEvalTensor* crops =
+      tflite::micro::GetEvalInput(context, node, kCropsTensor);
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kOutputTensor);
+
+  // Prepare already verified that input and output types are the same.
+  if (input->type == kTfLiteFloat32) {
+    EvalBatchToSpaceND<float>(input, block_shape, crops, output);
+  } else if (input->type == kTfLiteInt8) {
+    EvalBatchToSpaceND<int8_t>(input, block_shape, crops, output);
+  } else {
+    TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
+                       TfLiteTypeGetName(input->type), input->type);
+    return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace.
+
+// Builds the registration for BATCH_TO_SPACE_ND. Only the prepare and invoke
+// callbacks are set; every other field is null or zero.
+TfLiteRegistration Register_BATCH_TO_SPACE_ND() {
+  TfLiteRegistration registration = {/*init=*/nullptr,
+                                     /*free=*/nullptr,
+                                     /*prepare=*/Prepare,
+                                     /*invoke=*/Eval,
+                                     /*profiling_string=*/nullptr,
+                                     /*builtin_code=*/0,
+                                     /*custom_name=*/nullptr,
+                                     /*version=*/0};
+  return registration;
+}
+
+}  // namespace tflite
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/cast.cc
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/cast.cc
@@ -0,0 +1,96 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+
+namespace tflite {
+namespace {
+
+constexpr int kInputTensor = 0;   // Index of the single input tensor.
+constexpr int kOutputTensor = 0;  // Index of the single output tensor.
+
+// Prepare callback for CAST. Checks that the node has exactly one input and
+// one output and that both tensors can be fetched. No type or shape checks
+// are done here; Eval handles unsupported types.
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TF_LITE_ENSURE(context, input != nullptr);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  TF_LITE_ENSURE(context, output != nullptr);
+
+  return kTfLiteOk;
+}
+
+// Writes static_cast<ToT>(in[i]) to out[i] for each of the first
+// `num_elements` elements. `in` and `out` must each hold at least that many.
+template <typename FromT, typename ToT>
+void copyCast(const FromT* in, ToT* out, int num_elements) {
+  for (int idx = 0; idx < num_elements; ++idx) {
+    out[idx] = static_cast<ToT>(in[idx]);
+  }
+}
+
+// Casts `num_elements` values from `in` into the data buffer of `out`, with
+// the destination element type chosen from out->type.
+//
+// Returns kTfLiteOk after a successful copy. For an output type other than
+// int8 or float32 nothing is written, the problem is logged, and kTfLiteError
+// is returned so the caller does not mistake it for a completed cast.
+template <typename FromT>
+TfLiteStatus copyToTensor(TfLiteContext* context, const FromT* in,
+                          TfLiteEvalTensor* out, int num_elements) {
+  switch (out->type) {
+    case kTfLiteInt8:
+      copyCast(in, out->data.int8, num_elements);
+      break;
+    case kTfLiteFloat32:
+      copyCast(in, tflite::micro::GetTensorData<float>(out), num_elements);
+      break;
+    default:
+      // Unsupported type: fail instead of returning success with an output
+      // buffer that was never written.
+      TF_LITE_KERNEL_LOG(context, "Output type %s (%d) not supported.",
+                         TfLiteTypeGetName(out->type), out->type);
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+// Invoke callback for CAST: converts every element of the input tensor to the
+// output tensor's element type.
+//
+// The element count is taken from MatchingFlatSize over the input and output
+// shapes. Int8 and float32 inputs are forwarded to copyToTensor, whose status
+// is returned. Any other input type is logged and reported as kTfLiteError,
+// because the output has not been produced.
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteEvalTensor* input =
+      tflite::micro::GetEvalInput(context, node, kInputTensor);
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kOutputTensor);
+  int num_elements = MatchingFlatSize(tflite::micro::GetTensorShape(input),
+                                      tflite::micro::GetTensorShape(output));
+
+  switch (input->type) {
+    case kTfLiteInt8:
+      return copyToTensor(context, input->data.int8, output, num_elements);
+    case kTfLiteFloat32:
+      return copyToTensor(context, tflite::micro::GetTensorData<float>(input),
+                          output, num_elements);
+    default:
+      // Unsupported type: nothing was written to the output, so fail.
+      TF_LITE_KERNEL_LOG(context, "Input type %s (%d) not supported.",
+                         TfLiteTypeGetName(input->type), input->type);
+      return kTfLiteError;
+  }
+}
+}  // namespace
+
+// Builds the registration for CAST. Only the prepare and invoke callbacks are
+// set; every other field is null or zero.
+TfLiteRegistration Register_CAST() {
+  TfLiteRegistration registration = {/*init=*/nullptr,
+                                     /*free=*/nullptr,
+                                     /*prepare=*/Prepare,
+                                     /*invoke=*/Eval,
+                                     /*profiling_string=*/nullptr,
+                                     /*builtin_code=*/0,
+                                     /*custom_name=*/nullptr,
+                                     /*version=*/0};
+  return registration;
+}
+
+}  // namespace tflite
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/circular_buffer.cc
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/circular_buffer.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

+#define FLATBUFFERS_LOCALE_INDEPENDENT 0
+#include "flatbuffers/flexbuffers.h"
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/common.h"
 #include "tensorflow/lite/kernels/internal/compatibility.h"
@@ -55,7 +57,7 @@ constexpr int kInputTensor = 0;
 constexpr int kOutputTensor = 0;

 // TODO(b/149795762): Add this to TfLiteStatus enum.
-constexpr int kTfLiteAbort = -9;
+constexpr TfLiteStatus kTfLiteAbort = static_cast<TfLiteStatus>(-9);

 // These fields control the stride period of a strided streaming model. This op
 // returns kTfLiteAbort until cycles_until_run-- is zero.  At this time,
@@ -65,47 +67,64 @@ struct OpData {
  int cycles_max;
 };

-// These constants represent constants specific to the music detect model.
-// They exist until (b/132070898) is fixed.
-constexpr int kMaxOpDataSize = 7;
-int op_data_counter = 0;
-OpData op_data_array[kMaxOpDataSize];
-
 }  // namespace

-void Free(TfLiteContext* context, void* buffer) { op_data_counter = 0; }
+// Init callback: allocates this node's OpData from the persistent arena and
+// seeds its cycles_max field.
+//
+// `buffer` and `length` describe the op's custom options. When they are
+// present they are parsed as a flexbuffer map and cycles_max is read from its
+// "cycles_max" key; otherwise cycles_max is set to 0, which Prepare treats as
+// "not configured" and replaces with its shape-based fallback.
+//
+// Returns the OpData pointer (later available as node->user_data), or nullptr
+// if the persistent allocation failed.
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
+  OpData* op_data = static_cast<OpData*>(
+      context->AllocatePersistentBuffer(context, sizeof(OpData)));
+  if (op_data == nullptr) {
+    // Allocation failed: hand back nullptr instead of writing through it.
+    return nullptr;
+  }
+
+  if (buffer != nullptr && length > 0) {
+    const uint8_t* buffer_t = reinterpret_cast<const uint8_t*>(buffer);
+    const flexbuffers::Map& m = flexbuffers::GetRoot(buffer_t, length).AsMap();
+    op_data->cycles_max = m["cycles_max"].AsInt32();
+  } else {
+    op_data->cycles_max = 0;
+  }
+
+  return op_data;
+}

 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  TF_LITE_ENSURE(context, input != nullptr);
  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-  TF_LITE_ENSURE(context, output != nullptr);
+
+  TFLITE_DCHECK(node->user_data != nullptr);
+  OpData* op_data = static_cast<OpData*>(node->user_data);

  TF_LITE_ENSURE(context, input != nullptr);
  TF_LITE_ENSURE(context, output != nullptr);
-  TF_LITE_ENSURE_EQ(context, 1, output->dims->data[0]);
-  TF_LITE_ENSURE_EQ(context, 1, input->dims->data[0]);
+  TF_LITE_ENSURE_EQ(context, input->dims->data[0], output->dims->data[0]);
  TF_LITE_ENSURE_EQ(context, 1, input->dims->data[1]);
-  TF_LITE_ENSURE_EQ(context, 1, output->dims->data[2]);
-  TF_LITE_ENSURE_EQ(context, 1, input->dims->data[2]);
+  TF_LITE_ENSURE_EQ(context, input->dims->data[2], output->dims->data[2]);
  TF_LITE_ENSURE_EQ(context, output->dims->data[3], input->dims->data[3]);

  TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type);

-  // The circular buffer custom operator currently only supports int8_t.
+  // The circular buffer custom operator currently only supports int8.
  TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteInt8);

-  // TODO(b/132070898): Use statically slotted OpData structures until a
-  // scratch memory API is ready.
-  TFLITE_DCHECK_LE(op_data_counter, kMaxOpDataSize);
-  OpData* op_data = &op_data_array[op_data_counter++];
-  // The last circular buffer layer (length 5) simply accumulates outputs, and
-  // does not run periodically.
-  // TODO(b/150001379): Move this special case logic to the tflite flatbuffer.
-  if (output->dims->data[1] == 5) {
-    op_data->cycles_max = 1;
-  } else {
-    op_data->cycles_max = 2;
+  if (op_data->cycles_max <= 0) {
+    // The last circular buffer layer simply accumulates outputs, and does not
+    // run periodically.
+    // TODO(b/150001379): Move this special case logic to the tflite flatbuffer.
+    static int cb_prepare_count = 0;
+    cb_prepare_count++;
+    // These checks specifically work for the only two streaming models
+    // supported on TFLM. They use the shape of the output tensor along with the
+    // layer number to determine if the circular buffer period should be 1 or 2.
+
+    // These models are outlined in the following documents:
+    // https://docs.google.com/document/d/1lc_G2ZFhjiKFo02UHjBaljye1xsL0EkfybkaVELEE3Q/edit?usp=sharing
+    // https://docs.google.com/document/d/1pGc42PuWyrk-Jy1-9qeqtggvsmHr1ifz8Lmqfpr2rKA/edit?usp=sharing
+    if (output->dims->data[1] == 5 || output->dims->data[1] == 13 ||
+        (cb_prepare_count == 5 && output->dims->data[2] == 2 &&
+         output->dims->data[3] == 96)) {
+      op_data->cycles_max = 1;
+      cb_prepare_count = 0;
+    } else {
+      op_data->cycles_max = 2;
+    }
  }
  op_data->cycles_until_run = op_data->cycles_max;
  node->user_data = op_data;
@@ -127,10 +146,11 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
  TfLiteEvalTensor* output =
      tflite::micro::GetEvalOutput(context, node, kOutputTensor);

+  TFLITE_DCHECK(node->user_data != nullptr);
  OpData* data = reinterpret_cast<OpData*>(node->user_data);

  int num_slots = output->dims->data[1];
-  int depth = output->dims->data[3];
+  int depth = output->dims->data[2] * output->dims->data[3];

  if (input->type == kTfLiteInt8) {
    EvalInt8(tflite::micro::GetTensorData<int8_t>(input), num_slots, depth,
@@ -148,12 +168,6 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
    return static_cast<TfLiteStatus>(kTfLiteAbort);
  }

-  // If prepare is ever called more than one time (for example, when testing the
-  // ambient model, the interpreter is created a few times), this op data
-  // counter needs to be reset so that future instances do not overrun this op
-  // data array.
-  op_data_counter = 0;
-
  data->cycles_until_run = data->cycles_max;

  return kTfLiteOk;
@@ -162,8 +176,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 }  // namespace circular_buffer

 TfLiteRegistration* Register_CIRCULAR_BUFFER() {
-  static TfLiteRegistration r = {/*init=*/nullptr,
-                                 /*free=*/circular_buffer::Free,
+  static TfLiteRegistration r = {/*init=*/circular_buffer::Init,
+                                 /*free=*/nullptr,
                                 /*prepare=*/circular_buffer::Prepare,
                                 /*invoke=*/circular_buffer::Eval,
                                 /*profiling_string=*/nullptr,
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/circular_buffer_flexbuffers_generated_data.h
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/circular_buffer_flexbuffers_generated_data.h
@@ -0,0 +1,22 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_MICRO_KERNELS_CIRCULAR_BUFFER_FLEXBUFFERS_GENERATED_DATA_H
+#define TENSORFLOW_LITE_MICRO_KERNELS_CIRCULAR_BUFFER_FLEXBUFFERS_GENERATED_DATA_H
+
+// Declarations only; the definitions live in another translation unit.
+// NOTE(review): presumably the serialized flexbuffer options for the circular
+// buffer op and their size in bytes -- confirm against the generated source.
+extern const int g_gen_data_size_circular_buffer_config;
+extern const unsigned char g_gen_data_circular_buffer_config[];
+
+#endif  // TENSORFLOW_LITE_MICRO_KERNELS_CIRCULAR_BUFFER_FLEXBUFFERS_GENERATED_DATA_H
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/conv.cc
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/conv.cc
@@ -13,12 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#include "tensorflow/lite/kernels/internal/reference/conv.h"
+#include "tensorflow/lite/micro/kernels/conv.h"

 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/common.h"
 #include "tensorflow/lite/kernels/internal/common.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/conv.h"
 #include "tensorflow/lite/kernels/internal/reference/integer_ops/conv.h"
 #include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
@@ -28,294 +29,60 @@ limitations under the License.
 namespace tflite {
 namespace {

-constexpr int kInputTensor = 0;
-constexpr int kFilterTensor = 1;
-constexpr int kBiasTensor = 2;
-constexpr int kOutputTensor = 0;
-
-// Conv is quantized along dimension 0:
-// https://www.tensorflow.org/lite/performance/quantization_spec
-constexpr int kConvQuantizedDimension = 0;
-
-// This file has 2 implementation of Conv.
-
-struct OpData {
-  TfLitePaddingValues padding;
-
-  // Cached tensor zero point values for quantized operations.
-  int32_t input_zero_point;
-  int32_t filter_zero_point;
-  int32_t output_zero_point;
-
-  // The scaling factor from input to output (aka the 'real multiplier') can
-  // be represented as a fixed point multiplier plus a left shift.
-  int32_t output_multiplier;
-  int output_shift;
-
-  // Per channel output multiplier and shift.
-  int32_t* per_channel_output_multiplier;
-  int32_t* per_channel_output_shift;
-
-  // The range of the fused activation layer. For example for kNone and
-  // uint8_t these would be 0 and 255.
-  int32_t output_activation_min;
-  int32_t output_activation_max;
-};
-
-inline PaddingType RuntimePaddingType(TfLitePadding padding) {
-  switch (padding) {
-    case TfLitePadding::kTfLitePaddingSame:
-      return PaddingType::kSame;
-    case TfLitePadding::kTfLitePaddingValid:
-      return PaddingType::kValid;
-    case TfLitePadding::kTfLitePaddingUnknown:
-    default:
-      return PaddingType::kNone;
-  }
-}
-
-TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
-                             const TfLiteConvParams* params, int width,
-                             int height, int filter_width, int filter_height,
-                             int out_width, int out_height,
-                             const TfLiteType data_type, OpData* data) {
-  bool has_bias = node->inputs->size == 3;
-  // Check number of inputs/outputs
-  TF_LITE_ENSURE(context, has_bias || node->inputs->size == 2);
-  TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
-
-  // Matching GetWindowedOutputSize in TensorFlow.
-  auto padding = params->padding;
-  data->padding = ComputePaddingHeightWidth(
-      params->stride_height, params->stride_width,
-      params->dilation_height_factor, params->dilation_width_factor, height,
-      width, filter_height, filter_width, padding, &out_height, &out_width);
-
-  // Note that quantized inference requires that all tensors have their
-  // parameters set. This is usually done during quantized training.
-  if (data_type != kTfLiteFloat32) {
-    const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-    TF_LITE_ENSURE(context, input != nullptr);
-    const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
-    TF_LITE_ENSURE(context, filter != nullptr);
-    const TfLiteTensor* bias =
-        GetOptionalInputTensor(context, node, kBiasTensor);
-    TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-    TF_LITE_ENSURE(context, output != nullptr);
-    int output_channels = filter->dims->data[kConvQuantizedDimension];
-
-    TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams(
-        context, input, filter, bias, output, params->activation,
-        &data->output_multiplier, &data->output_shift,
-        &data->output_activation_min, &data->output_activation_max,
-        data->per_channel_output_multiplier,
-        reinterpret_cast<int*>(data->per_channel_output_shift),
-        output_channels));
-  }
-  return kTfLiteOk;
-}
-
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
-  return context->AllocatePersistentBuffer(context, sizeof(OpData));
-}
-
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  TFLITE_DCHECK(node->user_data != nullptr);
-  TFLITE_DCHECK(node->builtin_data != nullptr);
-
-  OpData* data = static_cast<OpData*>(node->user_data);
-  const auto params = static_cast<const TfLiteConvParams*>(node->builtin_data);
-
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-  TF_LITE_ENSURE(context, output != nullptr);
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  TF_LITE_ENSURE(context, input != nullptr);
-  const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
-  TF_LITE_ENSURE(context, filter != nullptr);
-
-  int input_width = input->dims->data[2];
-  int input_height = input->dims->data[1];
-  int filter_width = filter->dims->data[2];
-  int filter_height = filter->dims->data[1];
-  int output_width = output->dims->data[2];
-  int output_height = output->dims->data[1];
-
-  // Dynimically allocate per-channel quantization parameters.
-  const int num_channels = filter->dims->data[kConvQuantizedDimension];
-  data->per_channel_output_multiplier =
-      static_cast<int32_t*>(context->AllocatePersistentBuffer(
-          context, num_channels * sizeof(int32_t)));
-  data->per_channel_output_shift =
-      static_cast<int32_t*>(context->AllocatePersistentBuffer(
-          context, num_channels * sizeof(int32_t)));
-
-  // All per-channel quantized tensors need valid zero point and scale arrays.
-  if (input->type == kTfLiteInt8) {
-    TF_LITE_ENSURE_EQ(context, filter->quantization.type,
-                      kTfLiteAffineQuantization);
-
-    const auto* affine_quantization =
-        static_cast<TfLiteAffineQuantization*>(filter->quantization.params);
-    TF_LITE_ENSURE(context, affine_quantization);
-    TF_LITE_ENSURE(context, affine_quantization->scale);
-    TF_LITE_ENSURE(context, affine_quantization->zero_point);
-
-    TF_LITE_ENSURE(context,
-                   affine_quantization->scale->size == 1 ||
-                       affine_quantization->scale->size ==
-                           filter->dims->data[kConvQuantizedDimension]);
-    TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size,
-                      affine_quantization->zero_point->size);
-  }
-
-  TF_LITE_ENSURE_STATUS(CalculateOpData(
-      context, node, params, input_width, input_height, filter_width,
-      filter_height, output_width, output_height, input->type, data));
-
-  data->input_zero_point = input->params.zero_point;
-  data->filter_zero_point = filter->params.zero_point;
-  data->output_zero_point = output->params.zero_point;
-
-  return kTfLiteOk;
-}  // namespace conv
-
-void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
-                   TfLiteConvParams* params, const OpData& data,
-                   const TfLiteEvalTensor* input,
-                   const TfLiteEvalTensor* filter, const TfLiteEvalTensor* bias,
-                   TfLiteEvalTensor* im2col, TfLiteEvalTensor* hwcn_weights,
-                   TfLiteEvalTensor* output) {
-  const int32_t input_offset = -data.input_zero_point;
-  const int32_t filter_offset = -data.filter_zero_point;
-  const int32_t output_offset = data.output_zero_point;
-
-  // TODO(b/154032858): Investigate removing extra copies.
-  ConvParams op_params;
-  op_params.padding_type = RuntimePaddingType(params->padding);
-  op_params.padding_values.width = data.padding.width;
-  op_params.padding_values.height = data.padding.height;
-  op_params.stride_width = params->stride_width;
-  op_params.stride_height = params->stride_height;
-  op_params.dilation_width_factor = params->dilation_width_factor;
-  op_params.dilation_height_factor = params->dilation_height_factor;
-  op_params.input_offset = input_offset;
-  op_params.weights_offset = filter_offset;
-  op_params.output_offset = output_offset;
-  op_params.output_multiplier = data.output_multiplier;
-  op_params.output_shift = -data.output_shift;
-  op_params.quantized_activation_min = data.output_activation_min;
-  op_params.quantized_activation_max = data.output_activation_max;
-  reference_ops::Conv(op_params, tflite::micro::GetTensorShape(input),
-                      tflite::micro::GetTensorData<uint8_t>(input),
-                      tflite::micro::GetTensorShape(filter),
-                      tflite::micro::GetTensorData<uint8_t>(filter),
-                      tflite::micro::GetTensorShape(bias),
-                      tflite::micro::GetTensorData<int32_t>(bias),
-                      tflite::micro::GetTensorShape(output),
-                      tflite::micro::GetTensorData<uint8_t>(output),
-                      tflite::micro::GetTensorShape(im2col),
-                      tflite::micro::GetTensorData<uint8_t>(im2col), nullptr);
-}
-
-void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
-                             TfLiteConvParams* params, const OpData& data,
-                             const TfLiteEvalTensor* input,
-                             const TfLiteEvalTensor* filter,
-                             const TfLiteEvalTensor* bias,
-                             TfLiteEvalTensor* output,
-                             TfLiteEvalTensor* im2col) {
-  // TODO(b/154032858): Investigate removing extra copies.
-  ConvParams op_params;
-  op_params.input_offset = -data.input_zero_point;
-  op_params.output_offset = data.output_zero_point;
-  op_params.stride_height = params->stride_height;
-  op_params.stride_width = params->stride_width;
-  op_params.dilation_height_factor = params->dilation_height_factor;
-  op_params.dilation_width_factor = params->dilation_width_factor;
-  op_params.padding_values.height = data.padding.height;
-  op_params.padding_values.width = data.padding.width;
-  op_params.quantized_activation_min = data.output_activation_min;
-  op_params.quantized_activation_max = data.output_activation_max;
-
-  reference_integer_ops::ConvPerChannel(
-      op_params, data.per_channel_output_multiplier,
-      data.per_channel_output_shift, tflite::micro::GetTensorShape(input),
-      tflite::micro::GetTensorData<int8_t>(input),
-      tflite::micro::GetTensorShape(filter),
-      tflite::micro::GetTensorData<int8_t>(filter),
-      tflite::micro::GetTensorShape(bias),
-      tflite::micro::GetTensorData<int32_t>(bias),
-      tflite::micro::GetTensorShape(output),
-      tflite::micro::GetTensorData<int8_t>(output));
-}
-
-void EvalFloat(TfLiteContext* context, TfLiteNode* node,
-               TfLiteConvParams* params, const OpData& data,
-               const TfLiteEvalTensor* input, const TfLiteEvalTensor* filter,
-               const TfLiteEvalTensor* bias, TfLiteEvalTensor* im2col,
-               TfLiteEvalTensor* hwcn_weights, TfLiteEvalTensor* output) {
-  float output_activation_min, output_activation_max;
-  CalculateActivationRange(params->activation, &output_activation_min,
-                           &output_activation_max);
-  // TODO(b/154032858): Investigate removing extra copies.
-  ConvParams op_params;
-  op_params.padding_type = RuntimePaddingType(params->padding);
-  op_params.padding_values.width = data.padding.width;
-  op_params.padding_values.height = data.padding.height;
-  op_params.stride_width = params->stride_width;
-  op_params.stride_height = params->stride_height;
-  op_params.dilation_width_factor = params->dilation_width_factor;
-  op_params.dilation_height_factor = params->dilation_height_factor;
-  op_params.float_activation_min = output_activation_min;
-  op_params.float_activation_max = output_activation_max;
-
-  reference_ops::Conv(op_params, tflite::micro::GetTensorShape(input),
-                      tflite::micro::GetTensorData<float>(input),
-                      tflite::micro::GetTensorShape(filter),
-                      tflite::micro::GetTensorData<float>(filter),
-                      tflite::micro::GetTensorShape(bias),
-                      tflite::micro::GetTensorData<float>(bias),
-                      tflite::micro::GetTensorShape(output),
-                      tflite::micro::GetTensorData<float>(output),
-                      tflite::micro::GetTensorShape(im2col),
-                      tflite::micro::GetTensorData<float>(im2col));
+  return context->AllocatePersistentBuffer(context, sizeof(OpDataConv));
 }

 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
-
  const TfLiteEvalTensor* input =
-      tflite::micro::GetEvalInput(context, node, kInputTensor);
+      tflite::micro::GetEvalInput(context, node, kConvInputTensor);
  const TfLiteEvalTensor* filter =
-      tflite::micro::GetEvalInput(context, node, kFilterTensor);
+      tflite::micro::GetEvalInput(context, node, kConvWeightsTensor);
  const TfLiteEvalTensor* bias =
      (NumInputs(node) == 3)
-          ? tflite::micro::GetEvalInput(context, node, kBiasTensor)
+          ? tflite::micro::GetEvalInput(context, node, kConvBiasTensor)
          : nullptr;
  TfLiteEvalTensor* output =
-      tflite::micro::GetEvalOutput(context, node, kOutputTensor);
+      tflite::micro::GetEvalOutput(context, node, kConvOutputTensor);

+  TFLITE_DCHECK(node->builtin_data != nullptr);
+  const auto& params =
+      *(reinterpret_cast<TfLiteConvParams*>(node->builtin_data));
  TFLITE_DCHECK(node->user_data != nullptr);
-  const OpData& data = *(static_cast<const OpData*>(node->user_data));
+  const auto& data = *(static_cast<const OpDataConv*>(node->user_data));

  TF_LITE_ENSURE_EQ(context, input->type, output->type);
  TF_LITE_ENSURE_MSG(context, input->type == filter->type,
                     "Hybrid models are not supported on TFLite Micro.");

  switch (input->type) {  // Already know in/out types are same.
-    case kTfLiteFloat32:
-      EvalFloat(context, node, params, data, input, filter, bias, nullptr,
-                nullptr, output);
+    case kTfLiteFloat32: {
+      tflite::reference_ops::Conv(
+          ConvParamsFloat(params, data), tflite::micro::GetTensorShape(input),
+          tflite::micro::GetTensorData<float>(input),
+          tflite::micro::GetTensorShape(filter),
+          tflite::micro::GetTensorData<float>(filter),
+          tflite::micro::GetTensorShape(bias),
+          tflite::micro::GetTensorData<float>(bias),
+          tflite::micro::GetTensorShape(output),
+          tflite::micro::GetTensorData<float>(output),
+          tflite::micro::GetTensorShape(nullptr), nullptr);
      break;
-    case kTfLiteInt8:
-      EvalQuantizedPerChannel(context, node, params, data, input, filter, bias,
-                              output, nullptr);
-      break;
-    case kTfLiteUInt8:
-      EvalQuantized(context, node, params, data, input, filter, bias, nullptr,
-                    nullptr, output);
+    }
+    case kTfLiteInt8: {
+      reference_integer_ops::ConvPerChannel(
+          ConvParamsQuantized(params, data), data.per_channel_output_multiplier,
+          data.per_channel_output_shift, tflite::micro::GetTensorShape(input),
+          tflite::micro::GetTensorData<int8_t>(input),
+          tflite::micro::GetTensorShape(filter),
+          tflite::micro::GetTensorData<int8_t>(filter),
+          tflite::micro::GetTensorShape(bias),
+          tflite::micro::GetTensorData<int32_t>(bias),
+          tflite::micro::GetTensorShape(output),
+          tflite::micro::GetTensorData<int8_t>(output));
      break;
+    }
    default:
      TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
                         TfLiteTypeGetName(input->type), input->type);
@@ -329,7 +96,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 TfLiteRegistration Register_CONV_2D() {
  return {/*init=*/Init,
          /*free=*/nullptr,
-          /*prepare=*/Prepare,
+          /*prepare=*/ConvPrepare,
          /*invoke=*/Eval,
          /*profiling_string=*/nullptr,
          /*builtin_code=*/0,
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/conv.h
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/conv.h
@@ -0,0 +1,77 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_MICRO_KERNELS_CONV_H_
+#define TENSORFLOW_LITE_MICRO_KERNELS_CONV_H_
+
+#include <cstdint>
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+// Per-node CONV_2D state: written during prepare, read back during eval.
+struct OpDataConv {
+  TfLitePaddingValues padding;  // Set by CalculateOpDataConv.
+
+  // Cached tensor zero point values for quantized operations.
+  int32_t input_zero_point;
+  int32_t filter_zero_point;
+  int32_t output_zero_point;
+
+  // The scaling factor from input to output (aka the 'real multiplier') can
+  // be represented as a fixed point multiplier plus a left shift.
+  int32_t output_multiplier;
+  int output_shift;
+
+  // Per-channel output multiplier and shift arrays (allocated in ConvPrepare).
+  int32_t* per_channel_output_multiplier;
+  int32_t* per_channel_output_shift;
+
+  // The range of the fused activation layer. For example for kNone and
+  // uint8_t these would be 0 and 255.
+  int32_t output_activation_min;
+  int32_t output_activation_max;
+};
+
+extern const int kConvInputTensor;
+extern const int kConvWeightsTensor;
+extern const int kConvBiasTensor;
+extern const int kConvOutputTensor;
+extern const int kConvQuantizedDimension;
+
+// Returns a ConvParams struct with all the parameters needed for a
+// float computation.
+ConvParams ConvParamsFloat(const TfLiteConvParams& params,
+                           const OpDataConv& data);
+
+// Returns a ConvParams struct with all the parameters needed for a
+// quantized computation.
+ConvParams ConvParamsQuantized(const TfLiteConvParams& params,
+                               const OpDataConv& data);
+
+TfLiteStatus CalculateOpDataConv(TfLiteContext* context, TfLiteNode* node,
+                                 const TfLiteConvParams& params, int width,
+                                 int height, int filter_width,
+                                 int filter_height, int out_width,
+                                 int out_height, const TfLiteType data_type,
+                                 OpDataConv* data);
+
+TfLiteStatus ConvPrepare(TfLiteContext* context, TfLiteNode* node);
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_MICRO_KERNELS_CONV_H_
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/conv_common.cc
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/conv_common.cc
@@ -0,0 +1,182 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/conv.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/conv.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/padding.h"
+#include "tensorflow/lite/micro/kernels/conv.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+
+namespace tflite {
+// Indices passed to GetInput/GetOutput; the bias input is optional.
+const int kConvInputTensor = 0;
+const int kConvWeightsTensor = 1;
+const int kConvBiasTensor = 2;
+const int kConvOutputTensor = 0;
+
+// Conv is quantized along dimension 0:
+// https://www.tensorflow.org/lite/performance/quantization_spec
+const int kConvQuantizedDimension = 0;
+
+// Builds the ConvParams for a float computation: padding, strides, dilation
+// factors and the clamp range of the fused activation.
+ConvParams ConvParamsFloat(const TfLiteConvParams& params,
+                           const OpDataConv& data) {
+  ConvParams result;
+  result.padding_type = tflite::micro::RuntimePaddingType(params.padding);
+  result.padding_values.height = data.padding.height;
+  result.padding_values.width = data.padding.width;
+  result.stride_height = params.stride_height;
+  result.stride_width = params.stride_width;
+  result.dilation_height_factor = params.dilation_height_factor;
+  result.dilation_width_factor = params.dilation_width_factor;
+  CalculateActivationRange(params.activation, &result.float_activation_min,
+                           &result.float_activation_max);
+  return result;
+}
+
+// Builds the ConvParams for a quantized computation from the builtin
+// parameters and the values cached in `data`.
+ConvParams ConvParamsQuantized(const TfLiteConvParams& params,
+                               const OpDataConv& data) {
+  ConvParams quantized;
+  quantized.padding_type = tflite::micro::RuntimePaddingType(params.padding);
+  quantized.padding_values.width = data.padding.width;
+  quantized.padding_values.height = data.padding.height;
+  quantized.stride_width = params.stride_width;
+  quantized.stride_height = params.stride_height;
+  quantized.dilation_width_factor = params.dilation_width_factor;
+  quantized.dilation_height_factor = params.dilation_height_factor;
+  quantized.input_offset = -data.input_zero_point;
+  quantized.weights_offset = -data.filter_zero_point;
+  quantized.output_offset = data.output_zero_point;
+  quantized.output_multiplier = data.output_multiplier;
+  quantized.output_shift = -data.output_shift;  // Sign is flipped here.
+  quantized.quantized_activation_min = data.output_activation_min;
+  quantized.quantized_activation_max = data.output_activation_max;
+  return quantized;
+}
+
+// Checks the node's tensor counts, stores padding and zero points in `data`,
+// and for non-float types runs PopulateConvolutionQuantizationParams on it.
+TfLiteStatus CalculateOpDataConv(TfLiteContext* context, TfLiteNode* node,
+                                 const TfLiteConvParams& params, int width,
+                                 int height, int filter_width,
+                                 int filter_height, int out_width,
+                                 int out_height, const TfLiteType data_type,
+                                 OpDataConv* data) {
+  bool has_bias = node->inputs->size == 3;
+  // Check number of inputs/outputs
+  TF_LITE_ENSURE(context, has_bias || node->inputs->size == 2);
+  TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
+
+  // Matching GetWindowedOutputSize in TensorFlow.
+  auto padding = params.padding;
+  data->padding = ComputePaddingHeightWidth(
+      params.stride_height, params.stride_width, params.dilation_height_factor,
+      params.dilation_width_factor, height, width, filter_height, filter_width,
+      padding, &out_height, &out_width);  // Any writes land in local copies.
+
+  const TfLiteTensor* input = GetInput(context, node, kConvInputTensor);
+  TF_LITE_ENSURE(context, input != nullptr);
+  const TfLiteTensor* filter = GetInput(context, node, kConvWeightsTensor);
+  TF_LITE_ENSURE(context, filter != nullptr);
+  const TfLiteTensor* bias =
+      GetOptionalInputTensor(context, node, kConvBiasTensor);
+  TfLiteTensor* output = GetOutput(context, node, kConvOutputTensor);
+  TF_LITE_ENSURE(context, output != nullptr);
+
+  // Note that quantized inference requires that all tensors have their
+  // parameters set. This is usually done during quantized training.
+  if (data_type != kTfLiteFloat32) {
+    int output_channels = filter->dims->data[kConvQuantizedDimension];
+    TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams(
+        context, input, filter, bias, output, params.activation,
+        &data->output_multiplier, &data->output_shift,
+        &data->output_activation_min, &data->output_activation_max,
+        data->per_channel_output_multiplier,  // Allocated by ConvPrepare.
+        reinterpret_cast<int*>(data->per_channel_output_shift),
+        output_channels));
+  }
+
+  data->input_zero_point = input->params.zero_point;
+  data->filter_zero_point = filter->params.zero_point;
+  data->output_zero_point = output->params.zero_point;
+  return kTfLiteOk;
+}
+
+// Prepare step for CONV_2D: allocates the per-channel buffers, validates the
+// int8 filter quantization, then fills the node's OpDataConv.
+TfLiteStatus ConvPrepare(TfLiteContext* context, TfLiteNode* node) {
+  TFLITE_DCHECK(node->user_data != nullptr);
+  TFLITE_DCHECK(node->builtin_data != nullptr);
+  OpDataConv* data = static_cast<OpDataConv*>(node->user_data);
+  const auto& params =
+      *(static_cast<const TfLiteConvParams*>(node->builtin_data));
+
+  TfLiteTensor* output = GetOutput(context, node, kConvOutputTensor);
+  TF_LITE_ENSURE(context, output != nullptr);
+  const TfLiteTensor* input = GetInput(context, node, kConvInputTensor);
+  TF_LITE_ENSURE(context, input != nullptr);
+  const TfLiteTensor* filter = GetInput(context, node, kConvWeightsTensor);
+  TF_LITE_ENSURE(context, filter != nullptr);
+
+  const int input_width = input->dims->data[2];
+  const int input_height = input->dims->data[1];
+  const int filter_width = filter->dims->data[2];
+  const int filter_height = filter->dims->data[1];
+  const int output_width = output->dims->data[2];
+  const int output_height = output->dims->data[1];
+
+  // Dynamically allocate per-channel quantization parameters.
+  const int num_channels = filter->dims->data[kConvQuantizedDimension];
+  data->per_channel_output_multiplier =
+      static_cast<int32_t*>(context->AllocatePersistentBuffer(
+          context, num_channels * sizeof(int32_t)));
+  TF_LITE_ENSURE(context, data->per_channel_output_multiplier != nullptr);
+  data->per_channel_output_shift =
+      static_cast<int32_t*>(context->AllocatePersistentBuffer(
+          context, num_channels * sizeof(int32_t)));
+  TF_LITE_ENSURE(context, data->per_channel_output_shift != nullptr);
+
+  // All per-channel quantized tensors need valid zero point and scale arrays.
+  if (input->type == kTfLiteInt8) {
+    TF_LITE_ENSURE_EQ(context, filter->quantization.type,
+                      kTfLiteAffineQuantization);
+    const auto* affine_quantization =
+        static_cast<TfLiteAffineQuantization*>(filter->quantization.params);
+    // Checked with TF_LITE_ENSURE so release builds fail cleanly too.
+    TF_LITE_ENSURE(context, affine_quantization != nullptr);
+    TF_LITE_ENSURE(context, affine_quantization->scale != nullptr);
+    TF_LITE_ENSURE(context, affine_quantization->zero_point != nullptr);
+    TF_LITE_ENSURE(context,
+                   affine_quantization->scale->size == 1 ||
+                       affine_quantization->scale->size ==
+                           filter->dims->data[kConvQuantizedDimension]);
+    TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size,
+                      affine_quantization->zero_point->size);
+  }
+
+  return CalculateOpDataConv(
+      context, node, params, input_width, input_height, filter_width,
+      filter_height, output_width, output_height, input->type, data);
+}
+}  // namespace tflite
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/conv_test.h
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/conv_test.h
@@ -0,0 +1,94 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_MICRO_KERNELS_CONV_TEST_H_
+#define TENSORFLOW_LITE_MICRO_KERNELS_CONV_TEST_H_
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/micro/kernels/kernel_runner.h"
+#include "tensorflow/lite/micro/kernels/micro_ops.h"
+#include "tensorflow/lite/micro/test_helpers.h"
+#include "tensorflow/lite/micro/testing/micro_test.h"
+
+namespace tflite {
+namespace testing {
+// InvokeConv/ValidateConvGoldens overloads differ only in data element type.
+TfLiteStatus InvokeConv(TfLiteTensor* tensors, int tensors_size,
+                        int output_length, TfLiteConvParams* conv_params,
+                        TfLiteRegistration registration, float* output_data);
+
+TfLiteStatus InvokeConv(TfLiteTensor* tensors, int tensors_size,
+                        int output_length, TfLiteConvParams* conv_params,
+                        TfLiteRegistration registration, int8_t* output_data);
+
+TfLiteStatus InvokeConv(TfLiteTensor* tensors, int tensors_size,
+                        int output_length, TfLiteConvParams* conv_params,
+                        TfLiteRegistration registration, uint8_t* output_data);
+
+TfLiteStatus ValidateConvGoldens(TfLiteTensor* tensors, int tensors_size,
+                                 const float* expected_output_data,
+                                 int output_length,
+                                 TfLiteConvParams* conv_params,
+                                 TfLiteRegistration registration,
+                                 float* output_data, float tolerance = 1e-5);
+
+TfLiteStatus ValidateConvGoldens(TfLiteTensor* tensors, int tensors_size,
+                                 const int8_t* expected_output_data,
+                                 int output_length,
+                                 TfLiteConvParams* conv_params,
+                                 TfLiteRegistration registration,
+                                 int8_t* output_data, float tolerance = 1e-5);
+
+TfLiteStatus ValidateConvGoldens(TfLiteTensor* tensors, int tensors_size,
+                                 const uint8_t* expected_output_data,
+                                 int output_length,
+                                 TfLiteConvParams* conv_params,
+                                 TfLiteRegistration registration,
+                                 uint8_t* output_data, float tolerance = 1e-5);
+
+TfLiteStatus TestConvFloat(const int* input_dims_data, const float* input_data,
+                           const int* filter_dims_data,
+                           const float* filter_data, const int* bias_dims_data,
+                           const float* bias_data, const int* output_dims_data,
+                           const float* expected_output_data,
+                           TfLiteConvParams* conv_params,
+                           TfLiteRegistration registration, float* output_data);
+
+TfLiteStatus TestConvQuantizedPerLayer(
+    const int* input_dims_data, const float* input_data,
+    uint8_t* input_quantized, float input_scale, const int* filter_dims_data,
+    const float* filter_data, uint8_t* filter_quantized, float filter_scale,
+    const int* bias_dims_data, const float* bias_data, int32_t* bias_quantized,
+    const int* output_dims_data, const float* expected_output_data,
+    uint8_t* expected_output_quantized, float output_scale,
+    TfLiteConvParams* conv_params, TfLiteRegistration registration,
+    uint8_t* output_data);
+
+TfLiteStatus TestConvQuantizedPerChannel(
+    const int* input_dims_data, const float* input_data,
+    int8_t* input_quantized, float input_scale, int input_zero_point,
+    const int* filter_dims_data, const float* filter_data,
+    int8_t* filter_data_quantized, const int* bias_dims_data,
+    const float* bias_data, int32_t* bias_data_quantized, float* bias_scales,
+    int* bias_zero_points, const int* output_dims_data,
+    const float* expected_output_data, int8_t* expected_output_data_quantized,
+    float output_scale, int output_zero_point, TfLiteConvParams* conv_params,
+    TfLiteRegistration registration, int8_t* output_data);
+
+}  // namespace testing
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_MICRO_KERNELS_CONV_TEST_H_
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/depthwise_conv.cc
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/depthwise_conv.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#include "tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h"
+#include "tensorflow/lite/micro/kernels/depthwise_conv.h"

 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/common.h"
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
 #include "tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h"
 #include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h"
 #include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/kernels/padding.h"
@@ -29,279 +30,58 @@ limitations under the License.
 namespace tflite {
 namespace {

-constexpr int kInputTensor = 0;
-constexpr int kFilterTensor = 1;
-constexpr int kBiasTensor = 2;
-constexpr int kOutputTensor = 0;
-
-// Depthwise conv is quantized along dimension 3:
-// https://www.tensorflow.org/lite/performance/quantization_spec
-constexpr int kDepthwiseConvQuantizedDimension = 3;
-
-struct OpData {
-  TfLitePaddingValues padding;
-
-  // Cached tensor zero point values for quantized operations.
-  int32_t input_zero_point;
-  int32_t filter_zero_point;
-  int32_t output_zero_point;
-
-  // The scaling factor from input to output (aka the 'real multiplier') can
-  // be represented as a fixed point multiplier plus a left shift.
-  int32_t output_multiplier;
-  int output_shift;
-
-  // Per channel output multiplier and shift.
-  int32_t* per_channel_output_multiplier;
-  int32_t* per_channel_output_shift;
-  // The range of the fused activation layer. For example for kNone and
-  // uint8_t these would be 0 and 255.
-  int32_t output_activation_min;
-  int32_t output_activation_max;
-};
-
-TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
-                             TfLiteDepthwiseConvParams* params, int width,
-                             int height, int filter_width, int filter_height,
-                             const TfLiteType data_type, OpData* data) {
-  bool has_bias = node->inputs->size == 3;
-  // Check number of inputs/outputs
-  TF_LITE_ENSURE(context, has_bias || node->inputs->size == 2);
-  TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
-
-  int unused_output_height, unused_output_width;
-  data->padding = ComputePaddingHeightWidth(
-      params->stride_height, params->stride_width, 1, 1, height, width,
-      filter_height, filter_width, params->padding, &unused_output_height,
-      &unused_output_width);
-
-  // Note that quantized inference requires that all tensors have their
-  // parameters set. This is usually done during quantized training.
-  if (data_type != kTfLiteFloat32) {
-    const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-    TF_LITE_ENSURE(context, input != nullptr);
-    const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
-    TF_LITE_ENSURE(context, filter != nullptr);
-    const TfLiteTensor* bias =
-        GetOptionalInputTensor(context, node, kBiasTensor);
-    TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-    TF_LITE_ENSURE(context, output != nullptr);
-    int num_channels = filter->dims->data[kDepthwiseConvQuantizedDimension];
-
-    return tflite::PopulateConvolutionQuantizationParams(
-        context, input, filter, bias, output, params->activation,
-        &data->output_multiplier, &data->output_shift,
-        &data->output_activation_min, &data->output_activation_max,
-        data->per_channel_output_multiplier,
-        reinterpret_cast<int*>(data->per_channel_output_shift), num_channels);
-  }
-  return kTfLiteOk;
-}
-
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
-  return context->AllocatePersistentBuffer(context, sizeof(OpData));
-}
-
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  TFLITE_DCHECK(node->user_data != nullptr);
-  TFLITE_DCHECK(node->builtin_data != nullptr);
-
-  auto* params =
-      reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data);
-  OpData* data = static_cast<OpData*>(node->user_data);
-
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-  TF_LITE_ENSURE(context, output != nullptr);
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  TF_LITE_ENSURE(context, input != nullptr);
-  const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
-  TF_LITE_ENSURE(context, filter != nullptr);
-
-  const TfLiteType data_type = input->type;
-  int width = SizeOfDimension(input, 2);
-  int height = SizeOfDimension(input, 1);
-  int filter_width = SizeOfDimension(filter, 2);
-  int filter_height = SizeOfDimension(filter, 1);
-
-  // Per channel quantization is only needed for int8_t inference. For other
-  // quantized types, only a single scale and zero point is needed.
-  const int num_channels = filter->dims->data[kDepthwiseConvQuantizedDimension];
-  // Dynimically allocate per-channel quantization parameters.
-  data->per_channel_output_multiplier =
-      reinterpret_cast<int32_t*>(context->AllocatePersistentBuffer(
-          context, num_channels * sizeof(int32_t)));
-  data->per_channel_output_shift =
-      reinterpret_cast<int32_t*>(context->AllocatePersistentBuffer(
-          context, num_channels * sizeof(int32_t)));
-
-  // All per-channel quantized tensors need valid zero point and scale arrays.
-  if (input->type == kTfLiteInt8) {
-    TF_LITE_ENSURE_EQ(context, filter->quantization.type,
-                      kTfLiteAffineQuantization);
-
-    const auto* affine_quantization =
-        reinterpret_cast<TfLiteAffineQuantization*>(
-            filter->quantization.params);
-    TF_LITE_ENSURE(context, affine_quantization);
-    TF_LITE_ENSURE(context, affine_quantization->scale);
-    TF_LITE_ENSURE(context, affine_quantization->zero_point);
-    TF_LITE_ENSURE(
-        context, affine_quantization->scale->size == 1 ||
-                     affine_quantization->scale->size ==
-                         filter->dims->data[kDepthwiseConvQuantizedDimension]);
-    TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size,
-                      affine_quantization->zero_point->size);
-  }
-
-  TF_LITE_ENSURE_STATUS(CalculateOpData(context, node, params, width, height,
-                                        filter_width, filter_height, data_type,
-                                        data));
-
-  data->input_zero_point = input->params.zero_point;
-  data->filter_zero_point = filter->params.zero_point;
-  data->output_zero_point = output->params.zero_point;
-
-  return kTfLiteOk;
-}
-
-void EvalFloat(TfLiteContext* context, TfLiteNode* node,
-               TfLiteDepthwiseConvParams* params, const OpData& data,
-               const TfLiteEvalTensor* input, const TfLiteEvalTensor* filter,
-               const TfLiteEvalTensor* bias, TfLiteEvalTensor* output) {
-  float output_activation_min, output_activation_max;
-  CalculateActivationRange(params->activation, &output_activation_min,
-                           &output_activation_max);
-
-  tflite::DepthwiseParams op_params;
-  // Padding type is ignored, but still set.
-  op_params.padding_type = PaddingType::kSame;
-  op_params.padding_values.width = data.padding.width;
-  op_params.padding_values.height = data.padding.height;
-  op_params.stride_width = params->stride_width;
-  op_params.stride_height = params->stride_height;
-  op_params.dilation_width_factor = params->dilation_width_factor;
-  op_params.dilation_height_factor = params->dilation_height_factor;
-  op_params.depth_multiplier = params->depth_multiplier;
-  op_params.float_activation_min = output_activation_min;
-  op_params.float_activation_max = output_activation_max;
-
-  tflite::reference_ops::DepthwiseConv(
-      op_params, tflite::micro::GetTensorShape(input),
-      tflite::micro::GetTensorData<float>(input),
-      tflite::micro::GetTensorShape(filter),
-      tflite::micro::GetTensorData<float>(filter),
-      tflite::micro::GetTensorShape(bias),
-      tflite::micro::GetTensorData<float>(bias),
-      tflite::micro::GetTensorShape(output),
-      tflite::micro::GetTensorData<float>(output));
-}
-
-void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
-                             TfLiteDepthwiseConvParams* params,
-                             const OpData& data, const TfLiteEvalTensor* input,
-                             const TfLiteEvalTensor* filter,
-                             const TfLiteEvalTensor* bias,
-                             TfLiteEvalTensor* output) {
-  DepthwiseParams op_params;
-  op_params.padding_type = PaddingType::kSame;
-  op_params.padding_values.width = data.padding.width;
-  op_params.padding_values.height = data.padding.height;
-  op_params.stride_width = params->stride_width;
-  op_params.stride_height = params->stride_height;
-  op_params.dilation_width_factor = params->dilation_width_factor;
-  op_params.dilation_height_factor = params->dilation_height_factor;
-  op_params.depth_multiplier = params->depth_multiplier;
-  op_params.input_offset = -data.input_zero_point;
-  op_params.weights_offset = 0;
-  op_params.output_offset = data.output_zero_point;
-  // TODO(b/130439627): Use calculated value for clamping.
-  op_params.quantized_activation_min = std::numeric_limits<int8_t>::min();
-  op_params.quantized_activation_max = std::numeric_limits<int8_t>::max();
-
-  reference_integer_ops::DepthwiseConvPerChannel(
-      op_params, data.per_channel_output_multiplier,
-      data.per_channel_output_shift, tflite::micro::GetTensorShape(input),
-      tflite::micro::GetTensorData<int8_t>(input),
-      tflite::micro::GetTensorShape(filter),
-      tflite::micro::GetTensorData<int8_t>(filter),
-      tflite::micro::GetTensorShape(bias),
-      tflite::micro::GetTensorData<int32_t>(bias),
-      tflite::micro::GetTensorShape(output),
-      tflite::micro::GetTensorData<int8_t>(output));
-}
-
-void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
-                   TfLiteDepthwiseConvParams* params, const OpData& data,
-                   const TfLiteEvalTensor* input,
-                   const TfLiteEvalTensor* filter, const TfLiteEvalTensor* bias,
-                   TfLiteEvalTensor* output) {
-  const int32_t input_offset = -data.input_zero_point;
-  const int32_t filter_offset = -data.filter_zero_point;
-  const int32_t output_offset = data.output_zero_point;
-
-  tflite::DepthwiseParams op_params;
-  // Padding type is ignored, but still set.
-  op_params.padding_type = PaddingType::kSame;
-  op_params.padding_values.width = data.padding.width;
-  op_params.padding_values.height = data.padding.height;
-  op_params.stride_width = params->stride_width;
-  op_params.stride_height = params->stride_height;
-  op_params.dilation_width_factor = params->dilation_width_factor;
-  op_params.dilation_height_factor = params->dilation_height_factor;
-  op_params.depth_multiplier = params->depth_multiplier;
-  op_params.quantized_activation_min = data.output_activation_min;
-  op_params.quantized_activation_max = data.output_activation_max;
-  op_params.input_offset = input_offset;
-  op_params.weights_offset = filter_offset;
-  op_params.output_offset = output_offset;
-  op_params.output_multiplier = data.output_multiplier;
-  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
-  op_params.output_shift = -data.output_shift;
-
-  tflite::reference_ops::DepthwiseConv(
-      op_params, tflite::micro::GetTensorShape(input),
-      tflite::micro::GetTensorData<uint8_t>(input),
-      tflite::micro::GetTensorShape(filter),
-      tflite::micro::GetTensorData<uint8_t>(filter),
-      tflite::micro::GetTensorShape(bias),
-      tflite::micro::GetTensorData<int32_t>(bias),
-      tflite::micro::GetTensorShape(output),
-      tflite::micro::GetTensorData<uint8_t>(output));
+  return context->AllocatePersistentBuffer(context, sizeof(OpDataConv));
 }

 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
  TFLITE_DCHECK(node->user_data != nullptr);
  TFLITE_DCHECK(node->builtin_data != nullptr);

-  auto* params =
-      reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data);
-  const OpData& data = *(static_cast<const OpData*>(node->user_data));
+  auto& params =
+      *(reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data));
+  const OpDataConv& data = *(static_cast<const OpDataConv*>(node->user_data));

  TfLiteEvalTensor* output =
-      tflite::micro::GetEvalOutput(context, node, kOutputTensor);
+      tflite::micro::GetEvalOutput(context, node, kDepthwiseConvOutputTensor);
  const TfLiteEvalTensor* input =
-      tflite::micro::GetEvalInput(context, node, kInputTensor);
+      tflite::micro::GetEvalInput(context, node, kDepthwiseConvInputTensor);
  const TfLiteEvalTensor* filter =
-      tflite::micro::GetEvalInput(context, node, kFilterTensor);
+      tflite::micro::GetEvalInput(context, node, kDepthwiseConvWeightsTensor);
  const TfLiteEvalTensor* bias =
      (NumInputs(node) == 3)
-          ? tflite::micro::GetEvalInput(context, node, kBiasTensor)
+          ? tflite::micro::GetEvalInput(context, node, kDepthwiseConvBiasTensor)
          : nullptr;

-  // TODO(aselle): Consider whether float conv and quantized conv should be
-  // separate ops to avoid dispatch overhead here.
  switch (input->type) {  // Already know in/out types are same.
-    case kTfLiteFloat32:
-      EvalFloat(context, node, params, data, input, filter, bias, output);
+    case kTfLiteFloat32: {
+      tflite::reference_ops::DepthwiseConv(
+          DepthwiseConvParamsFloat(params, data),
+          tflite::micro::GetTensorShape(input),
+          tflite::micro::GetTensorData<float>(input),
+          tflite::micro::GetTensorShape(filter),
+          tflite::micro::GetTensorData<float>(filter),
+          tflite::micro::GetTensorShape(bias),
+          tflite::micro::GetTensorData<float>(bias),
+          tflite::micro::GetTensorShape(output),
+          tflite::micro::GetTensorData<float>(output));
      break;
-    case kTfLiteInt8:
-      EvalQuantizedPerChannel(context, node, params, data, input, filter, bias,
-                              output);
-      break;
-    case kTfLiteUInt8:
-      EvalQuantized(context, node, params, data, input, filter, bias, output);
+    }
+    case kTfLiteInt8: {
+      reference_integer_ops::DepthwiseConvPerChannel(
+          DepthwiseConvParamsQuantized(params, data),
+          data.per_channel_output_multiplier, data.per_channel_output_shift,
+          tflite::micro::GetTensorShape(input),
+          tflite::micro::GetTensorData<int8_t>(input),
+          tflite::micro::GetTensorShape(filter),
+          tflite::micro::GetTensorData<int8_t>(filter),
+          tflite::micro::GetTensorShape(bias),
+          tflite::micro::GetTensorData<int32_t>(bias),
+          tflite::micro::GetTensorShape(output),
+          tflite::micro::GetTensorData<int8_t>(output));
      break;
+    }
    default:
      TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
                         TfLiteTypeGetName(input->type), input->type);
@@ -315,7 +95,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 TfLiteRegistration Register_DEPTHWISE_CONV_2D() {
  return {/*init=*/Init,
          /*free=*/nullptr,
-          /*prepare=*/Prepare,
+          /*prepare=*/DepthwiseConvPrepare,
          /*invoke=*/Eval,
          /*profiling_string=*/nullptr,
          /*builtin_code=*/0,
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/depthwise_conv.h
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/depthwise_conv.h
@@ -0,0 +1,54 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_MICRO_KERNELS_DEPTHWISE_CONV_H_
+#define TENSORFLOW_LITE_MICRO_KERNELS_DEPTHWISE_CONV_H_
+
+#include <cstdint>
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+#include "tensorflow/lite/micro/kernels/conv.h"
+
+namespace tflite {
+
+extern const int kDepthwiseConvInputTensor;  // Input index: activations.
+extern const int kDepthwiseConvWeightsTensor;  // Input index: filter.
+extern const int kDepthwiseConvBiasTensor;  // Input index: optional bias.
+extern const int kDepthwiseConvOutputTensor;  // Output index.
+extern const int kDepthwiseConvQuantizedDimension;  // Filter dim holding the per-channel count.
+
+// Returns a DepthwiseParams struct with all the parameters needed for a
+// float computation (activation range, padding, stride, dilation, multiplier).
+DepthwiseParams DepthwiseConvParamsFloat(
+    const TfLiteDepthwiseConvParams& params, const OpDataConv& data);
+
+// Returns a DepthwiseParams struct with all the parameters needed for a
+// quantized computation (offsets, multiplier/shift, clamp range, geometry).
+DepthwiseParams DepthwiseConvParamsQuantized(
+    const TfLiteDepthwiseConvParams& params, const OpDataConv& data);
+
+TfLiteStatus CalculateOpDataDepthwiseConv(  // Fills *data: padding, zero points, quantization.
+    TfLiteContext* context, TfLiteNode* node,
+    const TfLiteDepthwiseConvParams& params, int width, int height,
+    int filter_width, int filter_height, int out_width, int out_height,
+    const TfLiteType data_type, OpDataConv* data);
+
+TfLiteStatus DepthwiseConvPrepare(TfLiteContext* context, TfLiteNode* node);  // Registered as the prepare callback.
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_MICRO_KERNELS_DEPTHWISE_CONV_H_
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/depthwise_conv_common.cc
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/depthwise_conv_common.cc
@@ -0,0 +1,188 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h"
+#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/padding.h"
+#include "tensorflow/lite/micro/kernels/depthwise_conv.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+
+namespace tflite {
+
+const int kDepthwiseConvInputTensor = 0;  // Index passed to GetInput / GetEvalInput.
+const int kDepthwiseConvWeightsTensor = 1;  // Filter tensor index.
+const int kDepthwiseConvBiasTensor = 2;  // Only read when the node has 3 inputs.
+const int kDepthwiseConvOutputTensor = 0;  // Index passed to GetOutput / GetEvalOutput.
+
+// DepthwiseConv is quantized along dimension 3:
+// https://www.tensorflow.org/lite/performance/quantization_spec
+const int kDepthwiseConvQuantizedDimension = 3;  // filter->dims->data[3] gives the channel count.
+
+// Builds the DepthwiseParams for the float kernel: activation clamp range,
+// padding (taken from `data`), strides, dilation and depth multiplier.
+DepthwiseParams DepthwiseConvParamsFloat(
+    const TfLiteDepthwiseConvParams& params, const OpDataConv& data) {
+  DepthwiseParams op_params;  // Quantization fields are not assigned in this function.
+  CalculateActivationRange(params.activation, &op_params.float_activation_min,
+                           &op_params.float_activation_max);  // Range derived from params.activation.
+  op_params.padding_type = tflite::micro::RuntimePaddingType(params.padding);
+  op_params.padding_values.width = data.padding.width;  // Computed in CalculateOpDataDepthwiseConv.
+  op_params.padding_values.height = data.padding.height;
+  op_params.stride_width = params.stride_width;
+  op_params.stride_height = params.stride_height;
+  op_params.dilation_width_factor = params.dilation_width_factor;
+  op_params.dilation_height_factor = params.dilation_height_factor;
+  op_params.depth_multiplier = params.depth_multiplier;
+  return op_params;
+}
+
+// Builds the DepthwiseParams for the quantized kernels from the builtin
+// parameters and the values cached in `data` during prepare.
+DepthwiseParams DepthwiseConvParamsQuantized(
+    const TfLiteDepthwiseConvParams& params, const OpDataConv& data) {
+  DepthwiseParams op_params;  // Float activation fields are not assigned in this function.
+  op_params.input_offset = -data.input_zero_point;  // Input and weights offsets are negated zero points.
+  op_params.weights_offset = -data.filter_zero_point;
+  op_params.output_offset = data.output_zero_point;  // Output offset is not negated.
+  op_params.output_multiplier = data.output_multiplier;
+  op_params.output_shift = -data.output_shift;  // Sign is flipped relative to the stored shift.
+  op_params.padding_type = tflite::micro::RuntimePaddingType(params.padding);
+  op_params.padding_values.height = data.padding.height;  // Computed in CalculateOpDataDepthwiseConv.
+  op_params.padding_values.width = data.padding.width;
+  op_params.stride_height = params.stride_height;
+  op_params.stride_width = params.stride_width;
+  op_params.dilation_height_factor = params.dilation_height_factor;
+  op_params.dilation_width_factor = params.dilation_width_factor;
+  op_params.depth_multiplier = params.depth_multiplier;
+  op_params.quantized_activation_min = data.output_activation_min;  // Clamp range cached in data.
+  op_params.quantized_activation_max = data.output_activation_max;
+  return op_params;
+}
+
+TfLiteStatus CalculateOpDataDepthwiseConv(  // Fills *data; returns kTfLiteOk unless a check fails.
+    TfLiteContext* context, TfLiteNode* node,
+    const TfLiteDepthwiseConvParams& params, int width, int height,
+    int filter_width, int filter_height, int out_width, int out_height,
+    const TfLiteType data_type, OpDataConv* data) {
+  bool has_bias = node->inputs->size == 3;
+  // The node must have 2 inputs (no bias) or 3 inputs (with bias), and 1 output.
+  TF_LITE_ENSURE(context, has_bias || node->inputs->size == 2);
+  TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
+
+  // Matching GetWindowedOutputSize in TensorFlow.
+  auto padding = params.padding;
+  data->padding = ComputePaddingHeightWidth(
+      params.stride_height, params.stride_width, params.dilation_height_factor,
+      params.dilation_width_factor, height, width, filter_height, filter_width,
+      padding, &out_height, &out_width);  // out_* are by-value parameters, so only local copies can change.
+
+  const TfLiteTensor* input = GetInput(context, node, kConvInputTensor);  // NOTE(review): kConv* indices, not kDepthwiseConv* - confirm they match.
+  TF_LITE_ENSURE(context, input != nullptr);
+  const TfLiteTensor* filter = GetInput(context, node, kConvWeightsTensor);
+  TF_LITE_ENSURE(context, filter != nullptr);
+  const TfLiteTensor* bias =
+      GetOptionalInputTensor(context, node, kConvBiasTensor);  // Not checked against nullptr here.
+  TfLiteTensor* output = GetOutput(context, node, kConvOutputTensor);
+  TF_LITE_ENSURE(context, output != nullptr);
+
+  // Note that quantized inference requires that all tensors have their
+  // parameters set. This is usually done during quantized training.
+  if (data_type != kTfLiteFloat32) {
+    int output_channels = filter->dims->data[kDepthwiseConvQuantizedDimension];
+
+    TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams(
+        context, input, filter, bias, output, params.activation,
+        &data->output_multiplier, &data->output_shift,
+        &data->output_activation_min, &data->output_activation_max,
+        data->per_channel_output_multiplier,
+        reinterpret_cast<int*>(data->per_channel_output_shift),  // Allocated as int32_t* in DepthwiseConvPrepare.
+        output_channels));
+  }
+
+  data->input_zero_point = input->params.zero_point;  // Zero points are cached for every data type.
+  data->filter_zero_point = filter->params.zero_point;
+  data->output_zero_point = output->params.zero_point;
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus DepthwiseConvPrepare(TfLiteContext* context, TfLiteNode* node) {  // Validates tensors and fills the OpDataConv in node->user_data.
+  TFLITE_DCHECK(node->user_data != nullptr);
+  TFLITE_DCHECK(node->builtin_data != nullptr);
+
+  OpDataConv* data = static_cast<OpDataConv*>(node->user_data);
+  const auto& params =
+      *(static_cast<const TfLiteDepthwiseConvParams*>(node->builtin_data));
+
+  TfLiteTensor* output = GetOutput(context, node, kDepthwiseConvOutputTensor);
+  TF_LITE_ENSURE(context, output != nullptr);
+  const TfLiteTensor* input =
+      GetInput(context, node, kDepthwiseConvInputTensor);
+  TF_LITE_ENSURE(context, input != nullptr);
+  const TfLiteTensor* filter =
+      GetInput(context, node, kDepthwiseConvWeightsTensor);
+  TF_LITE_ENSURE(context, filter != nullptr);
+
+  const int input_width = input->dims->data[2];  // NOTE(review): dims sizes are not checked before indexing.
+  const int input_height = input->dims->data[1];
+  const int filter_width = filter->dims->data[2];
+  const int filter_height = filter->dims->data[1];
+  const int output_width = output->dims->data[2];
+  const int output_height = output->dims->data[1];
+
+  // Allocate per-channel quantization arrays (done for every input type).
+  const int num_channels = filter->dims->data[kDepthwiseConvQuantizedDimension];
+  data->per_channel_output_multiplier =
+      static_cast<int32_t*>(context->AllocatePersistentBuffer(
+          context, num_channels * sizeof(int32_t)));  // NOTE(review): result is not null-checked.
+  data->per_channel_output_shift =
+      static_cast<int32_t*>(context->AllocatePersistentBuffer(
+          context, num_channels * sizeof(int32_t)));  // NOTE(review): result is not null-checked.
+
+  // All per-channel quantized tensors need valid zero point and scale arrays.
+  if (input->type == kTfLiteInt8) {
+    TF_LITE_ENSURE_EQ(context, filter->quantization.type,
+                      kTfLiteAffineQuantization);
+
+    const auto* affine_quantization =
+        static_cast<TfLiteAffineQuantization*>(filter->quantization.params);
+    TFLITE_DCHECK(affine_quantization != nullptr);
+    TFLITE_DCHECK(affine_quantization->scale != nullptr);
+    TFLITE_DCHECK(affine_quantization->zero_point != nullptr);
+
+    TF_LITE_ENSURE(  // Scale count must be 1 or equal to the channel count.
+        context, affine_quantization->scale->size == 1 ||
+                     affine_quantization->scale->size ==
+                         filter->dims->data[kDepthwiseConvQuantizedDimension]);
+
+    TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size,
+                      affine_quantization->zero_point->size);  // One zero point per scale.
+  }
+
+  TF_LITE_ENSURE_STATUS(CalculateOpDataDepthwiseConv(
+      context, node, params, input_width, input_height, filter_width,
+      filter_height, output_width, output_height, input->type, data));
+
+  return kTfLiteOk;
+}
+
+}  // namespace tflite
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/dequantize.cc
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/dequantize.cc
@@ -59,8 +59,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
  TF_LITE_ENSURE(context, input->type == kTfLiteUInt8 ||
                              input->type == kTfLiteInt8 ||
                              input->type == kTfLiteInt16);
-  TF_LITE_ENSURE(
-      context, output->type == kTfLiteFloat32 || output->type == kTfLiteInt32);
+  TF_LITE_ENSURE(context, output->type == kTfLiteFloat32);

  if (output->type == kTfLiteInt32) {
    const double effective_output_scale =
@@ -112,32 +111,6 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
                           TfLiteTypeGetName(output->type));
        return kTfLiteError;
    }
-  } else if (output->type == kTfLiteInt32) {
-    int flat_size = MatchingFlatSize(tflite::micro::GetTensorShape(input),
-                                     tflite::micro::GetTensorShape(output));
-    switch (input->type) {
-      case kTfLiteInt16: {
-        reference_ops::Requantize(
-            tflite::micro::GetTensorData<int16_t>(input), flat_size,
-            data->output_multiplier, data->output_shift,
-            data->quantization_params.zero_point, data->output_zero_point,
-            tflite::micro::GetTensorData<int32_t>(output));
-        break;
-      }
-      case kTfLiteInt8: {
-        reference_ops::Requantize(
-            tflite::micro::GetTensorData<int8_t>(input), flat_size,
-            data->output_multiplier, data->output_shift,
-            data->quantization_params.zero_point, data->output_zero_point,
-            tflite::micro::GetTensorData<int32_t>(output));
-        break;
-      }
-      default:
-        TF_LITE_KERNEL_LOG(context, "Input %s, output %s not supported.",
-                           TfLiteTypeGetName(input->type),
-                           TfLiteTypeGetName(output->type));
-        return kTfLiteError;
-    }
  } else {
    TF_LITE_KERNEL_LOG(context, "Input %s, output %s not supported.",
                       TfLiteTypeGetName(input->type),
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/detection_postprocess.cc
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/detection_postprocess.cc
@@ -0,0 +1,805 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <numeric>
+
+#define FLATBUFFERS_LOCALE_INDEPENDENT 0
+#include "flatbuffers/flexbuffers.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/micro_utils.h"
+
+namespace tflite {
+namespace {
+
+/**
+ * This version of detection_postprocess is specific to TFLite Micro. It
+ * differs from the TFLite version in the following ways:
+ *
+ * 1.) Temporaries (temporary tensors) - Micro uses scratch buffers instead.
+ * 2.) Output dimensions - this version does not support undefined output
+ * dimensions, so the model must have static output dimensions.
+ */
+
+// Input tensor indices
+constexpr int kInputTensorBoxEncodings = 0;
+constexpr int kInputTensorClassPredictions = 1;
+constexpr int kInputTensorAnchors = 2;
+
+// Output tensor indices
+constexpr int kOutputTensorDetectionBoxes = 0;
+constexpr int kOutputTensorDetectionClasses = 1;
+constexpr int kOutputTensorDetectionScores = 2;
+constexpr int kOutputTensorNumDetections = 3;
+
+constexpr int kNumCoordBox = 4;  // Float values per box or anchor.
+constexpr int kBatchSize = 1;  // DecodeCenterSizeBoxes requires dims[0] to equal this.
+
+constexpr int kNumDetectionsPerClass = 100;  // Used by Init when detections_per_class is absent.
+
+// Object Detection model produces axis-aligned boxes in two formats:
+// BoxCorner represents the lower left corner (xmin, ymin) and
+// the upper right corner (xmax, ymax).
+// CenterSize represents the center (xcenter, ycenter), height and width.
+// BoxCornerEncoding and CenterSizeEncoding are related as follows:
+// ycenter = y / y_scale * anchor.h + anchor.y;
+// xcenter = x / x_scale * anchor.w + anchor.x;
+// half_h = 0.5 * exp(h / h_scale) * anchor.h;
+// half_w = 0.5 * exp(w / w_scale) * anchor.w;
+// ymin = ycenter - half_h
+// ymax = ycenter + half_h
+// xmin = xcenter - half_w
+// xmax = xcenter + half_w
+struct BoxCornerEncoding {  // Corner form; DecodeCenterSizeBoxes writes these into the decoded_boxes scratch buffer.
+  float ymin;  // ycenter - half_h
+  float xmin;  // xcenter - half_w
+  float ymax;  // ycenter + half_h
+  float xmax;  // xcenter + half_w
+};
+
+struct CenterSizeEncoding {  // Center/size form; also used for anchors and for the four scale options.
+  float y;
+  float x;
+  float h;
+  float w;
+};
+// Both structs must be exactly kNumCoordBox floats in size (no padding).
+static_assert(sizeof(BoxCornerEncoding) == sizeof(float) * kNumCoordBox,
+              "Size of BoxCornerEncoding is 4 float values");
+static_assert(sizeof(CenterSizeEncoding) == sizeof(float) * kNumCoordBox,
+              "Size of CenterSizeEncoding is 4 float values");
+
+struct OpData {  // Per-node state: options parsed in Init plus values cached in Prepare.
+  int max_detections;
+  int max_classes_per_detection;  // Fast Non-Max-Suppression
+  int detections_per_class;       // Regular Non-Max-Suppression
+  float non_max_suppression_score_threshold;
+  float intersection_over_union_threshold;
+  int num_classes;
+  bool use_regular_non_max_suppression;  // False when the option is absent.
+  CenterSizeEncoding scale_values;  // Holds the y_scale, x_scale, h_scale, w_scale options.
+
+  // Scratch buffer indices, requested in Prepare
+  int active_candidate_idx;
+  int decoded_boxes_idx;
+  int scores_idx;
+  int score_buffer_idx;
+  int keep_scores_idx;
+  int scores_after_regular_non_max_suppression_idx;
+  int sorted_values_idx;
+  int keep_indices_idx;
+  int sorted_indices_idx;
+  int buffer_idx;
+  int selected_idx;
+
+  // Cached tensor scale and zero point values for quantized operations
+  TfLiteQuantizationParams input_box_encodings;
+  TfLiteQuantizationParams input_class_predictions;
+  TfLiteQuantizationParams input_anchors;
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {  // Parses the flexbuffer options into a persistent OpData.
+  OpData* op_data = nullptr;
+
+  const uint8_t* buffer_t = reinterpret_cast<const uint8_t*>(buffer);
+  const flexbuffers::Map& m = flexbuffers::GetRoot(buffer_t, length).AsMap();  // Options are read as a map keyed by name.
+
+  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
+  op_data = reinterpret_cast<OpData*>(
+      context->AllocatePersistentBuffer(context, sizeof(OpData)));  // NOTE(review): result is used below without a null check.
+
+  op_data->max_detections = m["max_detections"].AsInt32();
+  op_data->max_classes_per_detection = m["max_classes_per_detection"].AsInt32();
+  if (m["detections_per_class"].IsNull())  // Optional key: fall back to kNumDetectionsPerClass.
+    op_data->detections_per_class = kNumDetectionsPerClass;
+  else
+    op_data->detections_per_class = m["detections_per_class"].AsInt32();
+  if (m["use_regular_nms"].IsNull())  // Optional key: fall back to false.
+    op_data->use_regular_non_max_suppression = false;
+  else
+    op_data->use_regular_non_max_suppression = m["use_regular_nms"].AsBool();
+
+  op_data->non_max_suppression_score_threshold =
+      m["nms_score_threshold"].AsFloat();
+  op_data->intersection_over_union_threshold = m["nms_iou_threshold"].AsFloat();
+  op_data->num_classes = m["num_classes"].AsInt32();
+  op_data->scale_values.y = m["y_scale"].AsFloat();
+  op_data->scale_values.x = m["x_scale"].AsFloat();
+  op_data->scale_values.h = m["h_scale"].AsFloat();
+  op_data->scale_values.w = m["w_scale"].AsFloat();
+
+  return op_data;  // The scratch indices and cached quantization params are not set here.
+}
+
+void Free(TfLiteContext* context, void* buffer) {}  // Intentionally empty: nothing is released here.
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {  // Checks tensor ranks, caches quantization params, requests scratch buffers.
+  auto* op_data = static_cast<OpData*>(node->user_data);
+
+  // Inputs: box_encodings, scores, anchors
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 3);
+  const TfLiteTensor* input_box_encodings =
+      GetInput(context, node, kInputTensorBoxEncodings);  // NOTE(review): GetInput results are not null-checked here.
+  const TfLiteTensor* input_class_predictions =
+      GetInput(context, node, kInputTensorClassPredictions);
+  const TfLiteTensor* input_anchors =
+      GetInput(context, node, kInputTensorAnchors);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(input_box_encodings), 3);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(input_class_predictions), 3);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(input_anchors), 2);
+
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 4);
+  const int num_boxes = input_box_encodings->dims->data[1];
+  const int num_classes = op_data->num_classes;
+
+  op_data->input_box_encodings.scale = input_box_encodings->params.scale;  // Cached for use at eval time.
+  op_data->input_box_encodings.zero_point =
+      input_box_encodings->params.zero_point;
+  op_data->input_class_predictions.scale =
+      input_class_predictions->params.scale;
+  op_data->input_class_predictions.zero_point =
+      input_class_predictions->params.zero_point;
+  op_data->input_anchors.scale = input_anchors->params.scale;
+  op_data->input_anchors.zero_point = input_anchors->params.zero_point;
+
+  // Scratch buffers. NOTE(review): the request return values are ignored.
+  context->RequestScratchBufferInArena(context, num_boxes,  // Size is num_boxes with no sizeof factor.
+                                       &op_data->active_candidate_idx);
+  context->RequestScratchBufferInArena(context,
+                                       num_boxes * kNumCoordBox * sizeof(float),
+                                       &op_data->decoded_boxes_idx);
+  context->RequestScratchBufferInArena(
+      context,
+      input_class_predictions->dims->data[1] *
+          input_class_predictions->dims->data[2] * sizeof(float),
+      &op_data->scores_idx);
+
+  // Additional buffers
+  context->RequestScratchBufferInArena(context, num_boxes * sizeof(float),
+                                       &op_data->score_buffer_idx);
+  context->RequestScratchBufferInArena(context, num_boxes * sizeof(float),
+                                       &op_data->keep_scores_idx);
+  context->RequestScratchBufferInArena(
+      context, op_data->max_detections * num_boxes * sizeof(float),
+      &op_data->scores_after_regular_non_max_suppression_idx);
+  context->RequestScratchBufferInArena(
+      context, op_data->max_detections * num_boxes * sizeof(float),
+      &op_data->sorted_values_idx);
+  context->RequestScratchBufferInArena(context, num_boxes * sizeof(int),
+                                       &op_data->keep_indices_idx);
+  context->RequestScratchBufferInArena(
+      context, op_data->max_detections * num_boxes * sizeof(int),
+      &op_data->sorted_indices_idx);
+  int buffer_size = std::max(num_classes, op_data->max_detections);
+  context->RequestScratchBufferInArena(
+      context, buffer_size * num_boxes * sizeof(int), &op_data->buffer_idx);
+  buffer_size = std::min(num_boxes, op_data->max_detections);  // Same variable reused for the next request.
+  context->RequestScratchBufferInArena(
+      context, buffer_size * num_boxes * sizeof(int), &op_data->selected_idx);
+
+  // Outputs (by index): detection_boxes, detection_classes, detection_scores,
+  // num_detections
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 4);  // Repeats the check made earlier in this function.
+
+  return kTfLiteOk;
+}
+
+class Dequantizer {  // Functor converting a uint8 value to float with a fixed zero point and scale.
+ public:
+  Dequantizer(int zero_point, float scale)
+      : zero_point_(zero_point), scale_(scale) {}
+  float operator()(uint8_t x) {  // Returns (x - zero_point) * scale.
+    return (static_cast<float>(x) - zero_point_) * scale_;
+  }
+
+ private:
+  int zero_point_;
+  float scale_;
+};
+
+void DequantizeBoxEncodings(const TfLiteEvalTensor* input_box_encodings,  // Dequantizes entry idx of a uint8 tensor into *box_centersize.
+                            int idx, float quant_zero_point, float quant_scale,
+                            int length_box_encoding,
+                            CenterSizeEncoding* box_centersize) {
+  const uint8_t* boxes =
+      tflite::micro::GetTensorData<uint8_t>(input_box_encodings) +
+      length_box_encoding * idx;  // Start of entry idx; entries are length_box_encoding values apart.
+  Dequantizer dequantize(quant_zero_point, quant_scale);  // The float zero point is converted to int by the constructor.
+  // See definition of the KeyPointBoxCoder at
+  // https://github.com/tensorflow/models/blob/master/research/object_detection/box_coders/keypoint_box_coder.py
+  // The first four elements are the box coordinates, which is the same as the
+  // FasterRcnnBoxCoder at
+  // https://github.com/tensorflow/models/blob/master/research/object_detection/box_coders/faster_rcnn_box_coder.py
+  box_centersize->y = dequantize(boxes[0]);  // Only the first four values of the entry are read.
+  box_centersize->x = dequantize(boxes[1]);
+  box_centersize->h = dequantize(boxes[2]);
+  box_centersize->w = dequantize(boxes[3]);
+}
+
+template <class T>  // T is the target pointer type, e.g. const CenterSizeEncoding*.
+T ReInterpretTensor(const TfLiteEvalTensor* tensor) {  // Read-only overload.
+  const float* tensor_base = tflite::micro::GetTensorData<float>(tensor);
+  return reinterpret_cast<T>(tensor_base);  // Views the float data as T without copying.
+}
+
+template <class T>  // T is the target pointer type.
+T ReInterpretTensor(TfLiteEvalTensor* tensor) {  // Mutable overload.
+  float* tensor_base = tflite::micro::GetTensorData<float>(tensor);
+  return reinterpret_cast<T>(tensor_base);  // Views the float data as T without copying.
+}
+
+TfLiteStatus DecodeCenterSizeBoxes(TfLiteContext* context, TfLiteNode* node,  // Decodes every box against its anchor into the decoded_boxes scratch buffer.
+                                   OpData* op_data) {
+  // Parse input tensor boxencodings
+  const TfLiteEvalTensor* input_box_encodings =
+      tflite::micro::GetEvalInput(context, node, kInputTensorBoxEncodings);
+  TF_LITE_ENSURE_EQ(context, input_box_encodings->dims->data[0], kBatchSize);
+  const int num_boxes = input_box_encodings->dims->data[1];
+  TF_LITE_ENSURE(context, input_box_encodings->dims->data[2] >= kNumCoordBox);  // Only the first four values per box are read below.
+  const TfLiteEvalTensor* input_anchors =
+      tflite::micro::GetEvalInput(context, node, kInputTensorAnchors);
+
+  // Decode the boxes to get (ymin, xmin, ymax, xmax) based on the anchors
+  CenterSizeEncoding box_centersize;
+  CenterSizeEncoding scale_values = op_data->scale_values;
+  CenterSizeEncoding anchor;
+  for (int idx = 0; idx < num_boxes; ++idx) {
+    switch (input_box_encodings->type) {  // The anchors are read with the same type as the box encodings.
+        // Quantized
+      case kTfLiteUInt8:
+        DequantizeBoxEncodings(
+            input_box_encodings, idx,
+            static_cast<float>(op_data->input_box_encodings.zero_point),
+            static_cast<float>(op_data->input_box_encodings.scale),
+            input_box_encodings->dims->data[2], &box_centersize);
+        DequantizeBoxEncodings(
+            input_anchors, idx,
+            static_cast<float>(op_data->input_anchors.zero_point),
+            static_cast<float>(op_data->input_anchors.scale), kNumCoordBox,
+            &anchor);
+        break;
+        // Float
+      case kTfLiteFloat32: {
+        // Please see DequantizeBoxEncodings function for the support detail.
+        const int box_encoding_idx = idx * input_box_encodings->dims->data[2];
+        const float* boxes = &(tflite::micro::GetTensorData<float>(
+            input_box_encodings)[box_encoding_idx]);
+        box_centersize = *reinterpret_cast<const CenterSizeEncoding*>(boxes);  // Reads four consecutive floats as (y, x, h, w).
+        anchor =
+            ReInterpretTensor<const CenterSizeEncoding*>(input_anchors)[idx];
+        break;
+      }
+      default:
+        // Unsupported type. No message is logged on this path.
+        return kTfLiteError;
+    }
+
+    float ycenter = static_cast<float>(static_cast<double>(box_centersize.y) /  // Arithmetic is done in double, then narrowed to float.
+                                           static_cast<double>(scale_values.y) *
+                                           static_cast<double>(anchor.h) +
+                                       static_cast<double>(anchor.y));
+
+    float xcenter = static_cast<float>(static_cast<double>(box_centersize.x) /
+                                           static_cast<double>(scale_values.x) *
+                                           static_cast<double>(anchor.w) +
+                                       static_cast<double>(anchor.x));
+
+    float half_h =
+        static_cast<float>(0.5 *
+                           (std::exp(static_cast<double>(box_centersize.h) /
+                                     static_cast<double>(scale_values.h))) *
+                           static_cast<double>(anchor.h));
+    float half_w =
+        static_cast<float>(0.5 *
+                           (std::exp(static_cast<double>(box_centersize.w) /
+                                     static_cast<double>(scale_values.w))) *
+                           static_cast<double>(anchor.w));
+
+    float* decoded_boxes = reinterpret_cast<float*>(
+        context->GetScratchBuffer(context, op_data->decoded_boxes_idx));  // Looked up again on every iteration.
+    auto& box = reinterpret_cast<BoxCornerEncoding*>(decoded_boxes)[idx];
+    box.ymin = ycenter - half_h;
+    box.xmin = xcenter - half_w;
+    box.ymax = ycenter + half_h;
+    box.xmax = xcenter + half_w;
+  }
+  return kTfLiteOk;
+}
+
+// Fills `indices` with a permutation of [0, num_values) whose first
+// `num_to_sort` entries index `values` in decreasing order. The order of the
+// remaining entries is whatever std::partial_sort leaves behind.
+void DecreasingPartialArgSort(const float* values, int num_values,
+                              int num_to_sort, int* indices) {
+  int* const first = indices;
+  int* const middle = indices + num_to_sort;
+  int* const last = indices + num_values;
+
+  // Start from the identity permutation 0, 1, ..., num_values - 1.
+  int next_index = 0;
+  for (int* slot = first; slot != last; ++slot) {
+    *slot = next_index++;
+  }
+
+  const auto by_decreasing_value = [values](const int lhs, const int rhs) {
+    return values[lhs] > values[rhs];
+  };
+  std::partial_sort(first, middle, last, by_decreasing_value);
+}
+
+// Copies every entry of `values` that is >= `threshold` into `keep_values`
+// and records its position in `keep_indices`, preserving input order.
+// Returns the number of entries kept.
+int SelectDetectionsAboveScoreThreshold(const float* values, int size,
+                                        const float threshold,
+                                        float* keep_values, int* keep_indices) {
+  int num_kept = 0;
+  for (int candidate = 0; candidate < size; ++candidate) {
+    const float score = values[candidate];
+    if (!(score >= threshold)) {
+      continue;
+    }
+    keep_values[num_kept] = score;
+    keep_indices[num_kept] = candidate;
+    ++num_kept;
+  }
+  return num_kept;
+}
+
+// Returns false as soon as one of the first `num_boxes` corner-encoded boxes
+// has ymin >= ymax or xmin >= xmax; returns true otherwise.
+bool ValidateBoxes(const float* decoded_boxes, const int num_boxes) {
+  const BoxCornerEncoding* const boxes =
+      reinterpret_cast<const BoxCornerEncoding*>(decoded_boxes);
+  for (int box_idx = 0; box_idx < num_boxes; ++box_idx) {
+    const BoxCornerEncoding& candidate = boxes[box_idx];
+    const bool degenerate_height = candidate.ymin >= candidate.ymax;
+    const bool degenerate_width = candidate.xmin >= candidate.xmax;
+    if (degenerate_height || degenerate_width) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Returns the intersection-over-union of boxes `i` and `j` of the
+// corner-encoded `decoded_boxes` array, or 0 when either box has a
+// non-positive area.
+float ComputeIntersectionOverUnion(const float* decoded_boxes, const int i,
+                                   const int j) {
+  const BoxCornerEncoding* const boxes =
+      reinterpret_cast<const BoxCornerEncoding*>(decoded_boxes);
+  const BoxCornerEncoding& first = boxes[i];
+  const BoxCornerEncoding& second = boxes[j];
+
+  const float first_area =
+      (first.ymax - first.ymin) * (first.xmax - first.xmin);
+  const float second_area =
+      (second.ymax - second.ymin) * (second.xmax - second.xmin);
+  if (first_area <= 0 || second_area <= 0) {
+    return 0.0f;
+  }
+
+  // Corners of the overlapping rectangle (which may be empty).
+  const float overlap_ymin = std::max<float>(first.ymin, second.ymin);
+  const float overlap_xmin = std::max<float>(first.xmin, second.xmin);
+  const float overlap_ymax = std::min<float>(first.ymax, second.ymax);
+  const float overlap_xmax = std::min<float>(first.xmax, second.xmax);
+
+  // Negative extents mean the boxes do not overlap along that axis.
+  const float overlap_height =
+      std::max<float>(overlap_ymax - overlap_ymin, 0.0);
+  const float overlap_width = std::max<float>(overlap_xmax - overlap_xmin, 0.0);
+  const float overlap_area = overlap_height * overlap_width;
+
+  return overlap_area / (first_area + second_area - overlap_area);
+}
+
+// Greedy non-max suppression over the scores of a single class.
+//
+// Boxes scoring below op_data->non_max_suppression_score_threshold are
+// dropped. The remaining boxes are visited in decreasing score order; a
+// visited box that is still active is selected, and every later box whose IoU
+// with it exceeds op_data->intersection_over_union_threshold is deactivated.
+// At most `max_detections` box indices are written to `selected`, and their
+// count is stored in `*selected_size`.
+// Complexity is O(N^2) pairwise comparison between boxes.
+TfLiteStatus NonMaxSuppressionSingleClassHelper(
+    TfLiteContext* context, TfLiteNode* node, OpData* op_data,
+    const float* scores, int* selected, int* selected_size,
+    int max_detections) {
+  const TfLiteEvalTensor* input_box_encodings =
+      tflite::micro::GetEvalInput(context, node, kInputTensorBoxEncodings);
+  const int num_boxes = input_box_encodings->dims->data[1];
+  const float score_threshold = op_data->non_max_suppression_score_threshold;
+  const float intersection_over_union_threshold =
+      op_data->intersection_over_union_threshold;
+
+  // A negative detection budget is rejected; zero is accepted.
+  TF_LITE_ENSURE(context, (max_detections >= 0));
+  // The IoU threshold must lie in (0, 1].
+  TF_LITE_ENSURE(context, (intersection_over_union_threshold > 0.0f) &&
+                              (intersection_over_union_threshold <= 1.0f));
+
+  // Every decoded box must pass ValidateBoxes before suppression starts.
+  float* decoded_boxes = reinterpret_cast<float*>(
+      context->GetScratchBuffer(context, op_data->decoded_boxes_idx));
+  TF_LITE_ENSURE(context, ValidateBoxes(decoded_boxes, num_boxes));
+
+  // Keep only the boxes that reach the score threshold.
+  int* keep_indices = reinterpret_cast<int*>(
+      context->GetScratchBuffer(context, op_data->keep_indices_idx));
+  float* keep_scores = reinterpret_cast<float*>(
+      context->GetScratchBuffer(context, op_data->keep_scores_idx));
+  const int num_boxes_kept = SelectDetectionsAboveScoreThreshold(
+      scores, num_boxes, score_threshold, keep_scores, keep_indices);
+
+  // Rank all kept boxes by decreasing score.
+  int* sorted_indices = reinterpret_cast<int*>(
+      context->GetScratchBuffer(context, op_data->sorted_indices_idx));
+  DecreasingPartialArgSort(keep_scores, num_boxes_kept, num_boxes_kept,
+                           sorted_indices);
+
+  const int output_size = std::min(num_boxes_kept, max_detections);
+  *selected_size = 0;
+
+  // One flag per kept box, in rank order: 1 while the box may still be
+  // selected, 0 once it has been selected or suppressed.
+  uint8_t* active_box_candidate = reinterpret_cast<uint8_t*>(
+      context->GetScratchBuffer(context, op_data->active_candidate_idx));
+  std::fill_n(active_box_candidate, num_boxes_kept, static_cast<uint8_t>(1));
+  int num_active_candidate = num_boxes_kept;
+
+  for (int i = 0; i < num_boxes_kept; ++i) {
+    if (num_active_candidate == 0 || *selected_size >= output_size) break;
+    if (active_box_candidate[i] != 1) continue;
+
+    // Select the highest-ranked box that is still active.
+    selected[(*selected_size)++] = keep_indices[sorted_indices[i]];
+    active_box_candidate[i] = 0;
+    --num_active_candidate;
+
+    // Deactivate every lower-ranked box that overlaps it too much.
+    for (int j = i + 1; j < num_boxes_kept; ++j) {
+      if (active_box_candidate[j] != 1) continue;
+
+      const float overlap = ComputeIntersectionOverUnion(
+          decoded_boxes, keep_indices[sorted_indices[i]],
+          keep_indices[sorted_indices[j]]);
+      if (overlap > intersection_over_union_threshold) {
+        active_box_candidate[j] = 0;
+        --num_active_candidate;
+      }
+    }
+  }
+
+  return kTfLiteOk;
+}
+
+// This function implements a regular version of Non Maximal Suppression (NMS)
+// for multiple classes where
+// 1) we do NMS separately for each class across all anchors and
+// 2) keep only the highest anchor scores across all classes
+// 3) The worst runtime of the regular NMS is O(K*N^2)
+// where N is the number of anchors and K the number of
+// classes.
+//
+// `scores` is read as a row-major [num_boxes, num_classes_with_background]
+// array. The function writes the detection_boxes, detection_classes,
+// detection_scores and num_detections output tensors and returns kTfLiteOk,
+// or the failing status of a check or of the single-class helper.
+TfLiteStatus NonMaxSuppressionMultiClassRegularHelper(TfLiteContext* context,
+                                                      TfLiteNode* node,
+                                                      OpData* op_data,
+                                                      const float* scores) {
+  const TfLiteEvalTensor* input_box_encodings =
+      tflite::micro::GetEvalInput(context, node, kInputTensorBoxEncodings);
+  const TfLiteEvalTensor* input_class_predictions =
+      tflite::micro::GetEvalInput(context, node, kInputTensorClassPredictions);
+  TfLiteEvalTensor* detection_boxes =
+      tflite::micro::GetEvalOutput(context, node, kOutputTensorDetectionBoxes);
+  TfLiteEvalTensor* detection_classes = tflite::micro::GetEvalOutput(
+      context, node, kOutputTensorDetectionClasses);
+  TfLiteEvalTensor* detection_scores =
+      tflite::micro::GetEvalOutput(context, node, kOutputTensorDetectionScores);
+  TfLiteEvalTensor* num_detections =
+      tflite::micro::GetEvalOutput(context, node, kOutputTensorNumDetections);
+
+  const int num_boxes = input_box_encodings->dims->data[1];
+  const int num_classes = op_data->num_classes;
+  const int num_detections_per_class = op_data->detections_per_class;
+  const int max_detections = op_data->max_detections;
+  const int num_classes_with_background =
+      input_class_predictions->dims->data[2];
+  // The row index offset is 1 if background class is included and 0 otherwise.
+  int label_offset = num_classes_with_background - num_classes;
+  TF_LITE_ENSURE(context, num_detections_per_class > 0);
+
+  // For each class, perform non-max suppression.
+  // class_scores holds one class's column of `scores`; the other two buffers
+  // accumulate the flattened score index and score of every detection kept so
+  // far across classes.
+  float* class_scores = reinterpret_cast<float*>(
+      context->GetScratchBuffer(context, op_data->score_buffer_idx));
+  int* box_indices_after_regular_non_max_suppression = reinterpret_cast<int*>(
+      context->GetScratchBuffer(context, op_data->buffer_idx));
+  float* scores_after_regular_non_max_suppression =
+      reinterpret_cast<float*>(context->GetScratchBuffer(
+          context, op_data->scores_after_regular_non_max_suppression_idx));
+
+  // Number of detections currently retained in the two accumulation buffers.
+  int size_of_sorted_indices = 0;
+  // Note: the single-class helper looks up the same sorted_indices_idx
+  // scratch buffer, so its contents are only meaningful here after the
+  // DecreasingPartialArgSort call further down.
+  int* sorted_indices = reinterpret_cast<int*>(
+      context->GetScratchBuffer(context, op_data->sorted_indices_idx));
+  float* sorted_values = reinterpret_cast<float*>(
+      context->GetScratchBuffer(context, op_data->sorted_values_idx));
+
+  for (int col = 0; col < num_classes; col++) {
+    for (int row = 0; row < num_boxes; row++) {
+      // Get scores of boxes corresponding to all anchors for single class
+      class_scores[row] =
+          *(scores + row * num_classes_with_background + col + label_offset);
+    }
+    // Perform non-maximal suppression on single class
+    int selected_size = 0;
+    int* selected = reinterpret_cast<int*>(
+        context->GetScratchBuffer(context, op_data->selected_idx));
+    TF_LITE_ENSURE_STATUS(NonMaxSuppressionSingleClassHelper(
+        context, node, op_data, class_scores, selected, &selected_size,
+        num_detections_per_class));
+    // Add selected indices from non-max suppression of boxes in this class
+    // New entries are appended after the detections retained from earlier
+    // classes.
+    int output_index = size_of_sorted_indices;
+    for (int i = 0; i < selected_size; i++) {
+      int selected_index = selected[i];
+
+      // Store the flattened (box, class) position within `scores`; the box
+      // and class are recovered from it when the outputs are written.
+      box_indices_after_regular_non_max_suppression[output_index] =
+          (selected_index * num_classes_with_background + col + label_offset);
+      scores_after_regular_non_max_suppression[output_index] =
+          class_scores[selected_index];
+      output_index++;
+    }
+    // Sort the max scores among the selected indices
+    // Get the indices for top scores
+    int num_indices_to_sort = std::min(output_index, max_detections);
+    DecreasingPartialArgSort(scores_after_regular_non_max_suppression,
+                             output_index, num_indices_to_sort, sorted_indices);
+
+    // Copy values to temporary vectors
+    // sorted_indices[row] is replaced in place by the flattened score index
+    // it pointed at.
+    for (int row = 0; row < num_indices_to_sort; row++) {
+      int temp = sorted_indices[row];
+      sorted_indices[row] = box_indices_after_regular_non_max_suppression[temp];
+      sorted_values[row] = scores_after_regular_non_max_suppression[temp];
+    }
+    // Copy scores and indices from temporary vectors
+    for (int row = 0; row < num_indices_to_sort; row++) {
+      box_indices_after_regular_non_max_suppression[row] = sorted_indices[row];
+      scores_after_regular_non_max_suppression[row] = sorted_values[row];
+    }
+    // Only the top num_indices_to_sort detections survive to the next class.
+    size_of_sorted_indices = num_indices_to_sort;
+  }
+
+  // Fill the output tensors: real detections first, then zero padding up to
+  // max_detections.
+  for (int output_box_index = 0; output_box_index < max_detections;
+       output_box_index++) {
+    if (output_box_index < size_of_sorted_indices) {
+      // Both operands are ints, so the division already yields an integer;
+      // floor() only passes that integral value through floating point.
+      const int anchor_index = floor(
+          box_indices_after_regular_non_max_suppression[output_box_index] /
+          num_classes_with_background);
+      // Remainder within the row, shifted back by label_offset.
+      const int class_index =
+          box_indices_after_regular_non_max_suppression[output_box_index] -
+          anchor_index * num_classes_with_background - label_offset;
+      const float selected_score =
+          scores_after_regular_non_max_suppression[output_box_index];
+      // detection_boxes
+      float* decoded_boxes = reinterpret_cast<float*>(
+          context->GetScratchBuffer(context, op_data->decoded_boxes_idx));
+      ReInterpretTensor<BoxCornerEncoding*>(detection_boxes)[output_box_index] =
+          reinterpret_cast<BoxCornerEncoding*>(decoded_boxes)[anchor_index];
+      // detection_classes
+      tflite::micro::GetTensorData<float>(detection_classes)[output_box_index] =
+          class_index;
+      // detection_scores
+      tflite::micro::GetTensorData<float>(detection_scores)[output_box_index] =
+          selected_score;
+    } else {
+      // No detection for this slot: write an all-zero box, class and score.
+      ReInterpretTensor<BoxCornerEncoding*>(
+          detection_boxes)[output_box_index] = {0.0f, 0.0f, 0.0f, 0.0f};
+      // detection_classes
+      tflite::micro::GetTensorData<float>(detection_classes)[output_box_index] =
+          0.0f;
+      // detection_scores
+      tflite::micro::GetTensorData<float>(detection_scores)[output_box_index] =
+          0.0f;
+    }
+  }
+  // The detection count is stored as a float.
+  tflite::micro::GetTensorData<float>(num_detections)[0] =
+      size_of_sorted_indices;
+
+  return kTfLiteOk;
+}
+
+// This function implements a fast version of Non Maximal Suppression for
+// multiple classes where
+// 1) we keep the top-k scores for each anchor and
+// 2) during NMS, each anchor only uses the highest class score for sorting.
+// 3) Compared to standard NMS, the worst runtime of this version is O(N^2)
+// instead of O(KN^2) where N is the number of anchors and K the number of
+// classes.
+//
+// `scores` is read as a row-major [num_boxes, num_classes_with_background]
+// array. Each selected anchor contributes num_categories_per_anchor
+// consecutive entries to detection_boxes / detection_classes /
+// detection_scores, and num_detections receives the total number of entries
+// written. Returns kTfLiteOk, or the failing status of a check or of the
+// single-class helper.
+TfLiteStatus NonMaxSuppressionMultiClassFastHelper(TfLiteContext* context,
+                                                   TfLiteNode* node,
+                                                   OpData* op_data,
+                                                   const float* scores) {
+  const TfLiteEvalTensor* input_box_encodings =
+      tflite::micro::GetEvalInput(context, node, kInputTensorBoxEncodings);
+  const TfLiteEvalTensor* input_class_predictions =
+      tflite::micro::GetEvalInput(context, node, kInputTensorClassPredictions);
+  TfLiteEvalTensor* detection_boxes =
+      tflite::micro::GetEvalOutput(context, node, kOutputTensorDetectionBoxes);
+
+  TfLiteEvalTensor* detection_classes = tflite::micro::GetEvalOutput(
+      context, node, kOutputTensorDetectionClasses);
+  TfLiteEvalTensor* detection_scores =
+      tflite::micro::GetEvalOutput(context, node, kOutputTensorDetectionScores);
+  TfLiteEvalTensor* num_detections =
+      tflite::micro::GetEvalOutput(context, node, kOutputTensorNumDetections);
+
+  const int num_boxes = input_box_encodings->dims->data[1];
+  const int num_classes = op_data->num_classes;
+  const int max_categories_per_anchor = op_data->max_classes_per_detection;
+  const int num_classes_with_background =
+      input_class_predictions->dims->data[2];
+
+  // The row index offset is 1 if background class is included and 0 otherwise.
+  int label_offset = num_classes_with_background - num_classes;
+  TF_LITE_ENSURE(context, (max_categories_per_anchor > 0));
+  const int num_categories_per_anchor =
+      std::min(max_categories_per_anchor, num_classes);
+  float* max_scores = reinterpret_cast<float*>(
+      context->GetScratchBuffer(context, op_data->score_buffer_idx));
+  int* sorted_class_indices = reinterpret_cast<int*>(
+      context->GetScratchBuffer(context, op_data->buffer_idx));
+
+  // For every anchor, rank its classes by score and remember the best score.
+  for (int row = 0; row < num_boxes; row++) {
+    const float* box_scores =
+        scores + row * num_classes_with_background + label_offset;
+    int* class_indices = sorted_class_indices + row * num_classes;
+    DecreasingPartialArgSort(box_scores, num_classes, num_categories_per_anchor,
+                             class_indices);
+    max_scores[row] = box_scores[class_indices[0]];
+  }
+
+  // Perform non-maximal suppression on max scores
+  int selected_size = 0;
+  int* selected = reinterpret_cast<int*>(
+      context->GetScratchBuffer(context, op_data->selected_idx));
+  TF_LITE_ENSURE_STATUS(NonMaxSuppressionSingleClassHelper(
+      context, node, op_data, max_scores, selected, &selected_size,
+      op_data->max_detections));
+
+  // The decoded boxes do not change while the outputs are written, so the
+  // scratch buffer is looked up once.
+  const BoxCornerEncoding* decoded_boxes =
+      static_cast<const BoxCornerEncoding*>(
+          context->GetScratchBuffer(context, op_data->decoded_boxes_idx));
+
+  // Fill the output tensors. Entries are packed back to back, so the running
+  // entry counter is also the write offset of the next entry. (Scaling the
+  // per-entry counter by num_categories_per_anchor, as was done before, left
+  // gaps and overshot the outputs whenever more than one category per anchor
+  // was requested.)
+  int output_box_index = 0;
+
+  for (int i = 0; i < selected_size; i++) {
+    const int selected_index = selected[i];
+
+    const float* box_scores =
+        scores + selected_index * num_classes_with_background + label_offset;
+    const int* class_indices =
+        sorted_class_indices + selected_index * num_classes;
+
+    for (int col = 0; col < num_categories_per_anchor; ++col) {
+      const int box_offset = output_box_index;
+
+      // detection_boxes
+      ReInterpretTensor<BoxCornerEncoding*>(detection_boxes)[box_offset] =
+          decoded_boxes[selected_index];
+
+      // detection_classes
+      tflite::micro::GetTensorData<float>(detection_classes)[box_offset] =
+          class_indices[col];
+
+      // detection_scores
+      tflite::micro::GetTensorData<float>(detection_scores)[box_offset] =
+          box_scores[class_indices[col]];
+
+      output_box_index++;
+    }
+  }
+
+  // The entry count is stored as a float.
+  tflite::micro::GetTensorData<float>(num_detections)[0] = output_box_index;
+  return kTfLiteOk;
+}
+
+// Converts the uint8 class predictions into floats, writing
+// num_boxes * num_classes_with_background values to `scores` using the zero
+// point and scale stored in op_data->input_class_predictions.
+void DequantizeClassPredictions(const TfLiteEvalTensor* input_class_predictions,
+                                const int num_boxes,
+                                const int num_classes_with_background,
+                                float* scores, OpData* op_data) {
+  float zero_point =
+      static_cast<float>(op_data->input_class_predictions.zero_point);
+  float scale = static_cast<float>(op_data->input_class_predictions.scale);
+  Dequantizer to_float(zero_point, scale);
+
+  const uint8_t* quantized_scores =
+      tflite::micro::GetTensorData<uint8_t>(input_class_predictions);
+  const int num_scores = num_boxes * num_classes_with_background;
+  for (int i = 0; i < num_scores; ++i) {
+    scores[i] = to_float(quantized_scores[i]);
+  }
+}
+
+// Validates the class-prediction tensor, obtains float scores (dequantizing
+// uint8 input into a scratch buffer), and runs either the regular or the fast
+// multi-class NMS helper depending on
+// op_data->use_regular_non_max_suppression.
+// Returns kTfLiteError for unsupported prediction types.
+TfLiteStatus NonMaxSuppressionMultiClass(TfLiteContext* context,
+                                         TfLiteNode* node, OpData* op_data) {
+  // Get the input tensors
+  const TfLiteEvalTensor* input_box_encodings =
+      tflite::micro::GetEvalInput(context, node, kInputTensorBoxEncodings);
+  const TfLiteEvalTensor* input_class_predictions =
+      tflite::micro::GetEvalInput(context, node, kInputTensorClassPredictions);
+  const int num_boxes = input_box_encodings->dims->data[1];
+  const int num_classes = op_data->num_classes;
+
+  // Predictions must be shaped [kBatchSize, num_boxes, classes].
+  TF_LITE_ENSURE_EQ(context, input_class_predictions->dims->data[0],
+                    kBatchSize);
+  TF_LITE_ENSURE_EQ(context, input_class_predictions->dims->data[1], num_boxes);
+  const int num_classes_with_background =
+      input_class_predictions->dims->data[2];
+
+  // At most one extra (background) class is allowed.
+  TF_LITE_ENSURE(context, (num_classes_with_background - num_classes <= 1));
+  TF_LITE_ENSURE(context, (num_classes_with_background >= num_classes));
+
+  const float* scores = nullptr;
+  if (input_class_predictions->type == kTfLiteUInt8) {
+    float* dequantized_scores = reinterpret_cast<float*>(
+        context->GetScratchBuffer(context, op_data->scores_idx));
+    DequantizeClassPredictions(input_class_predictions, num_boxes,
+                               num_classes_with_background, dequantized_scores,
+                               op_data);
+    scores = dequantized_scores;
+  } else if (input_class_predictions->type == kTfLiteFloat32) {
+    scores = tflite::micro::GetTensorData<float>(input_class_predictions);
+  } else {
+    // Unsupported type.
+    return kTfLiteError;
+  }
+
+  return op_data->use_regular_non_max_suppression
+             ? NonMaxSuppressionMultiClassRegularHelper(context, node, op_data,
+                                                        scores)
+             : NonMaxSuppressionMultiClassFastHelper(context, node, op_data,
+                                                     scores);
+}
+
+// Invoke entry point of the detection-postprocess op: decodes the boxes, then
+// runs multi-class non-max suppression to fill the output tensors.
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE(context, (kBatchSize == 1));
+  OpData* const op_data = static_cast<OpData*>(node->user_data);
+
+  // The two stages below correspond to two blocks in the Object Detection
+  // model. They are kept in one custom op because the op takes quantized
+  // inputs but does all calculations in float, and mixed quantized/float
+  // calculations are currently not supported in TFLite.
+
+  // Stage 1: fill the temporary decoded_boxes by transforming
+  // input_box_encodings and input_anchors from CenterSizeEncodings to
+  // BoxCornerEncoding.
+  const TfLiteStatus decode_status =
+      DecodeCenterSizeBoxes(context, node, op_data);
+  if (decode_status != kTfLiteOk) {
+    return decode_status;
+  }
+
+  // Stage 2: fill the output tensors by choosing the effective set of decoded
+  // boxes via Non Maximal Suppression, i.e. selecting the highest scoring
+  // non-overlapping boxes.
+  return NonMaxSuppressionMultiClass(context, node, op_data);
+}
+}  // namespace
+
+// Returns a pointer to the registration for the detection-postprocess op.
+// The registration is a function-local static, so every call returns the
+// address of the same object.
+TfLiteRegistration* Register_DETECTION_POSTPROCESS() {
+  static TfLiteRegistration r = {/*init=*/Init,
+                                 /*free=*/Free,
+                                 /*prepare=*/Prepare,
+                                 /*invoke=*/Eval,
+                                 /*profiling_string=*/nullptr,
+                                 /*builtin_code=*/0,
+                                 /*custom_name=*/nullptr,
+                                 /*version=*/0};
+  return &r;
+}
+
+}  // namespace tflite
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/detection_postprocess_flexbuffers_generated_data.h
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/detection_postprocess_flexbuffers_generated_data.h
@@ -0,0 +1,25 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_MICRO_KERNELS_FLEXBUFFERS_GENERATED_DATA_H
+#define TENSORFLOW_LITE_MICRO_KERNELS_FLEXBUFFERS_GENERATED_DATA_H
+
+// Declarations only: the arrays and their sizes are defined in another
+// translation unit.
+// NOTE(review): the names suggest serialized flexbuffer op options for the
+// detection-postprocess kernel, one variant without and one with regular
+// NMS -- confirm against the defining source file.
+
+// Presumably the element count of g_gen_data_none_regular_nms -- TODO confirm.
+extern const int g_gen_data_size_none_regular_nms;
+extern const unsigned char g_gen_data_none_regular_nms[];
+
+// Presumably the element count of g_gen_data_regular_nms -- TODO confirm.
+extern const int g_gen_data_size_regular_nms;
+extern const unsigned char g_gen_data_regular_nms[];
+
+#endif
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/div.cc
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/div.cc
@@ -0,0 +1,206 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/kernels/internal/reference/div.h"
+
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+
+namespace tflite {
+namespace {
+
+constexpr int kInputTensor1 = 0;
+constexpr int kInputTensor2 = 1;
+constexpr int kOutputTensor = 0;
+
+// Per-node state for DIV. CalculateOpData fills these fields only when the
+// output tensor is int8; EvalQuantized copies them into ArithmeticParams.
+struct OpData {
+  // Parameters used in the quantized paths where the output is 8bit
+  // Zero points of the two inputs and the output. The input zero points are
+  // negated when turned into op-parameter offsets.
+  int32_t input1_zero_point;
+  int32_t input2_zero_point;
+  int32_t output_zero_point;
+  // Quantized activation range computed from the node's fused activation.
+  int32_t output_activation_min;
+  int32_t output_activation_max;
+
+  // Parameters used in all quantized paths
+  // Multiplier/shift pair produced by QuantizeMultiplier from
+  // input1_scale / (input2_scale * output_scale).
+  int32_t output_multiplier;
+  int output_shift;
+};
+
+// Checks that the node has two inputs and one output of a single common type
+// and, for int8 outputs, fills `data` with the activation range, the output
+// multiplier/shift and the tensor zero points. Other output types leave
+// `data` untouched.
+TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
+                             TfLiteDivParams* params, OpData* data) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  const TfLiteTensor* input1;
+  const TfLiteTensor* input2;
+  TfLiteTensor* output;
+  TF_LITE_ENSURE_OK(context,
+                    GetInputSafe(context, node, kInputTensor1, &input1));
+  TF_LITE_ENSURE_OK(context,
+                    GetInputSafe(context, node, kInputTensor2, &input2));
+  TF_LITE_ENSURE_OK(context,
+                    GetOutputSafe(context, node, kOutputTensor, &output));
+
+  TF_LITE_ENSURE_TYPES_EQ(context, input1->type, input2->type);
+  TF_LITE_ENSURE_TYPES_EQ(context, input1->type, output->type);
+
+  // Only the int8 path needs precomputed quantization parameters.
+  if (output->type != kTfLiteInt8) {
+    return kTfLiteOk;
+  }
+
+  TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
+      context, params->activation, output, &data->output_activation_min,
+      &data->output_activation_max));
+
+  // The scale ratio is evaluated in float and widened to double afterwards.
+  const float scale_ratio =
+      input1->params.scale / (input2->params.scale * output->params.scale);
+  const double real_multiplier = static_cast<double>(scale_ratio);
+  QuantizeMultiplier(real_multiplier, &data->output_multiplier,
+                     &data->output_shift);
+
+  data->input1_zero_point = input1->params.zero_point;
+  data->input2_zero_point = input2->params.zero_point;
+  data->output_zero_point = output->params.zero_point;
+  return kTfLiteOk;
+}
+
+// Reserves persistent storage for one OpData and returns it. `buffer` and
+// `length` are not used.
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
+  void* const op_data_storage =
+      context->AllocatePersistentBuffer(context, sizeof(OpData));
+  return op_data_storage;
+}
+
+// Validates the node and precomputes its quantization parameters by
+// forwarding the node's builtin params and OpData to CalculateOpData.
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  // Same null checks that Eval performs before casting these pointers.
+  TFLITE_DCHECK(node->builtin_data != nullptr);
+  TFLITE_DCHECK(node->user_data != nullptr);
+
+  auto* params = static_cast<TfLiteDivParams*>(node->builtin_data);
+  auto* data = static_cast<OpData*>(node->user_data);
+  return CalculateOpData(context, node, params, data);
+}
+
+// Float division: output = input1 / input2 with the fused activation from
+// `params` applied as a float range. Uses the broadcasting reference kernel
+// when ProcessBroadcastShapes reports that the input shapes require it.
+// `context`, `node` and `data` are not used.
+void EvalDiv(TfLiteContext* context, TfLiteNode* node, TfLiteDivParams* params,
+             const OpData* data, const TfLiteEvalTensor* input1,
+             const TfLiteEvalTensor* input2, TfLiteEvalTensor* output) {
+  tflite::ArithmeticParams op_params = {};
+
+  const bool requires_broadcast = reference_ops::ProcessBroadcastShapes(
+      tflite::micro::GetTensorShape(input1),
+      tflite::micro::GetTensorShape(input2), &op_params);
+
+  // Activation clamp bounds for the float path.
+  float output_activation_min, output_activation_max;
+  CalculateActivationRange(params->activation, &output_activation_min,
+                           &output_activation_max);
+  SetActivationParams(output_activation_min, output_activation_max,
+                      &op_params);
+
+  if (requires_broadcast) {
+    reference_ops::BroadcastDivSlow(
+        op_params, tflite::micro::GetTensorShape(input1),
+        tflite::micro::GetTensorData<float>(input1),
+        tflite::micro::GetTensorShape(input2),
+        tflite::micro::GetTensorData<float>(input2),
+        tflite::micro::GetTensorShape(output),
+        tflite::micro::GetTensorData<float>(output));
+  } else {
+    reference_ops::Div(op_params, tflite::micro::GetTensorShape(input1),
+                       tflite::micro::GetTensorData<float>(input1),
+                       tflite::micro::GetTensorShape(input2),
+                       tflite::micro::GetTensorData<float>(input2),
+                       tflite::micro::GetTensorShape(output),
+                       tflite::micro::GetTensorData<float>(output));
+  }
+}
+
+// Quantized division for the all-int8 case. Loads the precomputed values from
+// `data` into the op parameters and runs the plain or the broadcasting
+// reference kernel. Any other type combination is logged and reported as
+// kTfLiteError.
+TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
+                           TfLiteDivParams* params, const OpData* data,
+                           const TfLiteEvalTensor* input1,
+                           const TfLiteEvalTensor* input2,
+                           TfLiteEvalTensor* output) {
+  const bool all_int8 = input1->type == kTfLiteInt8 &&
+                        input2->type == kTfLiteInt8 &&
+                        output->type == kTfLiteInt8;
+  if (!all_int8) {
+    TF_LITE_KERNEL_LOG(
+        context, "Unsupported combination of input and output types in DIV.");
+    return kTfLiteError;
+  }
+
+  tflite::ArithmeticParams op_params = {};
+  SetActivationParams(data->output_activation_min, data->output_activation_max,
+                      &op_params);
+  // Input offsets are the negated zero points; the output offset is not.
+  op_params.input1_offset = -data->input1_zero_point;
+  op_params.input2_offset = -data->input2_zero_point;
+  op_params.output_offset = data->output_zero_point;
+  op_params.output_multiplier = data->output_multiplier;
+  op_params.output_shift = data->output_shift;
+
+  const bool requires_broadcast = reference_ops::ProcessBroadcastShapes(
+      tflite::micro::GetTensorShape(input1),
+      tflite::micro::GetTensorShape(input2), &op_params);
+
+  if (requires_broadcast) {
+    reference_ops::BroadcastDivSlow(
+        op_params, tflite::micro::GetTensorShape(input1),
+        tflite::micro::GetTensorData<int8_t>(input1),
+        tflite::micro::GetTensorShape(input2),
+        tflite::micro::GetTensorData<int8_t>(input2),
+        tflite::micro::GetTensorShape(output),
+        tflite::micro::GetTensorData<int8_t>(output));
+  } else {
+    reference_ops::Div(op_params, tflite::micro::GetTensorShape(input1),
+                       tflite::micro::GetTensorData<int8_t>(input1),
+                       tflite::micro::GetTensorShape(input2),
+                       tflite::micro::GetTensorData<int8_t>(input2),
+                       tflite::micro::GetTensorShape(output),
+                       tflite::micro::GetTensorData<int8_t>(output));
+  }
+
+  return kTfLiteOk;
+}
+
+// Invoke entry point of DIV: dispatches on the output tensor type to the
+// float or the int8 implementation, and logs an error for any other type.
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  TFLITE_DCHECK(node->builtin_data != nullptr);
+  TFLITE_DCHECK(node->user_data != nullptr);
+  TfLiteDivParams* const params =
+      static_cast<TfLiteDivParams*>(node->builtin_data);
+  OpData* const data = static_cast<OpData*>(node->user_data);
+
+  const TfLiteEvalTensor* input1 =
+      tflite::micro::GetEvalInput(context, node, kInputTensor1);
+  const TfLiteEvalTensor* input2 =
+      tflite::micro::GetEvalInput(context, node, kInputTensor2);
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kOutputTensor);
+
+  switch (output->type) {
+    case kTfLiteFloat32:
+      EvalDiv(context, node, params, data, input1, input2, output);
+      break;
+    case kTfLiteInt8:
+      TF_LITE_ENSURE_OK(context, EvalQuantized(context, node, params, data,
+                                               input1, input2, output));
+      break;
+    default:
+      TF_LITE_KERNEL_LOG(context,
+                         "DIV only supports FLOAT32, quantized INT8 "
+                         "now, got type %s (%d).",
+                         TfLiteTypeGetName(output->type), output->type);
+      return kTfLiteError;
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace
+
+// Returns, by value, the registration that wires Init, Prepare and Eval to
+// the DIV op. No free callback is registered.
+TfLiteRegistration Register_DIV() {
+  return {/*init=*/Init,
+          /*free=*/nullptr,
+          /*prepare=*/Prepare,
+          /*invoke=*/Eval,
+          /*profiling_string=*/nullptr,
+          /*builtin_code=*/0,
+          /*custom_name=*/nullptr,
+          /*version=*/0};
+}
+
+}  // namespace tflite
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/elu.cc
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/elu.cc
@@ -0,0 +1,151 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/kernels/internal/reference/elu.h"
+
+#include <algorithm>
+#include <limits>
+
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/cppmath.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+
+namespace tflite {
+namespace {
+
+// Input/output tensor index.
+constexpr int kInputTensor = 0;
+constexpr int kOutputTensor = 0;
+
+// OLD-TODO(b/142762739): We should figure out a multi-threading plan for most
+// of the activation ops below.
+
+struct OpData {
+  int8_t table[256];  // Output per int8 input, indexed by its uint8_t cast.
+};
+
+using TransformFunc = float (*)(float);
+
+// Fills data->table, for each value v of the 8-bit type T, with the quantized
+// transform(dequantized v), clamped to T; v's uint8_t cast is the index.
+template <typename T>
+void PopulateLookupTable(const TfLiteTensor* input, const TfLiteTensor* output,
+                         const TransformFunc transform, OpData* data) {
+  static_assert(sizeof(T) == 1, "Lookup table valid only for 8bit");
+  const float inverse_scale = 1 / output->params.scale;
+  const int32_t maxval = std::numeric_limits<T>::max();
+  const int32_t minval = std::numeric_limits<T>::min();
+  for (int32_t val = minval; val <= maxval; ++val) {
+    const float dequantized =
+        input->params.scale * (val - input->params.zero_point);
+    const float rescaled = TfLiteRound(transform(dequantized) * inverse_scale);
+    const int32_t quantized =
+        static_cast<int32_t>(rescaled + output->params.zero_point);
+    data->table[static_cast<uint8_t>(static_cast<T>(val))] =
+        static_cast<T>(std::max(std::min(maxval, quantized), minval));
+  }
+}
+
+// OLD-TODO(b/143696793): move this to optimized_ops.
+void EvalUsingLookupTable(const OpData* data, const TfLiteEvalTensor* input,
+                          TfLiteEvalTensor* output) {
+  // Maps each int8 input through the table, indexed by its uint8_t cast.
+  const int count = MatchingFlatSize(tflite::micro::GetTensorShape(input),
+                                     tflite::micro::GetTensorShape(output));
+  int8_t* dst = tflite::micro::GetTensorData<int8_t>(output);
+  const int8_t* src = tflite::micro::GetTensorData<int8_t>(input);
+  for (int remaining = count; remaining > 0; --remaining) {
+    *dst++ = data->table[static_cast<uint8_t>(*src++)];
+  }
+}
+
+// Validates the ELU node (one input, one output, matching types) and, for
+// int8 tensors, precomputes the lookup table stored in node->user_data.
+TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+  const TfLiteTensor* input;
+  TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kInputTensor, &input));
+  TfLiteTensor* output;
+  TF_LITE_ENSURE_OK(context,
+                    GetOutputSafe(context, node, kOutputTensor, &output));
+  TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type);
+  if (input->type != kTfLiteInt8) {
+    return kTfLiteOk;  // Only the quantized path needs the table.
+  }
+  const TransformFunc elu = [](float x) {
+    return x < 0.0f ? std::exp(x) - 1.0f : x;
+  };
+  PopulateLookupTable<int8_t>(input, output, elu,
+                              static_cast<OpData*>(node->user_data));
+  return kTfLiteOk;
+}
+
+void* EluInit(TfLiteContext* context, const char* buffer, size_t length) {
+  // `buffer`/`length` are ignored for this builtin op. The persistent OpData
+  // allocated here carries the lookup table from Prepare() to Eval().
+  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
+  void* op_data = context->AllocatePersistentBuffer(context, sizeof(OpData));
+  return op_data;
+}
+
+TfLiteStatus EluPrepare(TfLiteContext* context, TfLiteNode* node) {
+  return CalculateOpData(context, node);  // Checks + LUT setup live there.
+}
+
+// ELU invoke hook: float32 uses the reference kernel, int8 uses the lookup
+// table built in Prepare; any other input type is logged and rejected.
+TfLiteStatus EluEval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteEvalTensor* input =
+      tflite::micro::GetEvalInput(context, node, kInputTensor);
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kOutputTensor);
+
+  if (input->type == kTfLiteFloat32) {
+    reference_ops::Elu(tflite::micro::GetTensorShape(input),
+                       tflite::micro::GetTensorData<float>(input),
+                       tflite::micro::GetTensorShape(output),
+                       tflite::micro::GetTensorData<float>(output));
+    return kTfLiteOk;
+  }
+  if (input->type == kTfLiteInt8) {
+    EvalUsingLookupTable(static_cast<OpData*>(node->user_data), input, output);
+    return kTfLiteOk;
+  }
+
+  TF_LITE_KERNEL_LOG(context,
+                     "ELU only supports float32 and int8 currently, got %s.",
+                     TfLiteTypeGetName(input->type));
+  return kTfLiteError;
+}
+
+}  // namespace
+
+// Builds the TfLiteRegistration for the ELU kernel.
+TfLiteRegistration Register_ELU() {
+  // Only the init, prepare and invoke hooks are provided; value-initializing
+  // the struct leaves every other field null/zero.
+  TfLiteRegistration registration = {};
+  registration.init = EluInit;
+  registration.prepare = EluPrepare;
+  registration.invoke = EluEval;
+  return registration;
+}
+
+}  // namespace tflite
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/ethosu.cc
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/ethosu.cc
@@ -19,14 +19,9 @@ limitations under the License.
 #include "tensorflow/lite/c/common.h"

 namespace tflite {
-namespace ops {
-namespace micro {
-namespace custom {
+
 TfLiteRegistration* Register_ETHOSU() { return nullptr; }

 const char* GetString_ETHOSU() { return ""; }

-}  // namespace custom
-}  // namespace micro
-}  // namespace ops
 }  // namespace tflite
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/ethosu.h
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/ethosu.h
@@ -0,0 +1,28 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_MICRO_KERNELS_ETHOSU_H_
+#define TENSORFLOW_LITE_MICRO_KERNELS_ETHOSU_H_
+
+#include "tensorflow/lite/c/common.h"
+
+namespace tflite {
+
+// NOTE(review): the ethosu.cc definition in this change returns nullptr.
+TfLiteRegistration* Register_ETHOSU();
+// NOTE(review): the ethosu.cc definition in this change returns "".
+const char* GetString_ETHOSU();
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_MICRO_KERNELS_ETHOSU_H_
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/exp.cc
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/exp.cc
@@ -0,0 +1,78 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/kernels/internal/reference/exp.h"
+
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+
+namespace tflite {
+namespace {
+
+constexpr int kInputTensor = 0;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TF_LITE_ENSURE(context, input != nullptr);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  TF_LITE_ENSURE(context, output != nullptr);
+  TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteFloat32);  // float only
+  TF_LITE_ENSURE_TYPES_EQ(context, output->type, input->type);
+  TF_LITE_ENSURE_EQ(context, output->bytes, input->bytes);
+  TF_LITE_ENSURE_EQ(context, output->dims->size, input->dims->size);
+  for (int i = 0; i < output->dims->size; ++i) {  // shapes must match exactly
+    TF_LITE_ENSURE_EQ(context, output->dims->data[i], input->dims->data[i]);
+  }
+  return kTfLiteOk;
+}
+
+// EXP invoke hook: runs the reference Exp kernel; float32 input only.
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteEvalTensor* input =
+      tflite::micro::GetEvalInput(context, node, kInputTensor);
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kOutputTensor);
+  const int num_elements =
+      MatchingFlatSize(tflite::micro::GetTensorShape(input),
+                       tflite::micro::GetTensorShape(output));
+  if (input->type != kTfLiteFloat32) {
+    TF_LITE_KERNEL_LOG(context, "Type %s (%d) currently not supported by Exp.",
+                       TfLiteTypeGetName(input->type), input->type);
+    return kTfLiteError;
+  }
+  reference_ops::Exp(tflite::micro::GetTensorData<float>(input),
+                     static_cast<size_t>(num_elements),
+                     tflite::micro::GetTensorData<float>(output));
+  return kTfLiteOk;
+}
+}  // namespace
+
+// Builds the TfLiteRegistration for the EXP kernel.
+TfLiteRegistration Register_EXP() {
+  // EXP registers no init or free hook, so only prepare and invoke are set;
+  // value-initializing the struct leaves every other field null/zero.
+  TfLiteRegistration registration = {};
+
+  registration.prepare = Prepare;
+  registration.invoke = Eval;
+  return registration;
+}
+
+}  // namespace tflite
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/expand_dims.cc
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/expand_dims.cc
@@ -0,0 +1,152 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/micro_utils.h"
+
+namespace tflite {
+namespace {
+
+constexpr int kInputTensor = 0;
+constexpr int kAxisTensor = 1;
+constexpr int kOutputTensor = 0;
+
+// Writes the input shape, with a size-1 dimension inserted at `axis`, into
+// output->dims. Negative axes count from the end (-1 appends). Fails if the
+// normalized axis falls outside [0, input rank].
+TfLiteStatus ExpandTensorDim(TfLiteContext* context,
+                             const TfLiteEvalTensor* input, int32_t axis,
+                             TfLiteEvalTensor* output) {
+  const TfLiteIntArray* input_dims = input->dims;
+  TfLiteIntArray* output_dims = output->dims;
+  if (axis < 0) {
+    axis = input_dims->size + 1 + axis;
+  }
+  // A still-negative axis would make the loop read input_dims->data[-1].
+  TF_LITE_ENSURE(context, (axis >= 0) && (axis <= input_dims->size));
+
+  output_dims->size = input_dims->size + 1;
+  for (int i = 0; i < output_dims->size; ++i) {
+    // Entries after the inserted dimension come from one position earlier.
+    output_dims->data[i] =
+        (i == axis) ? 1 : input_dims->data[(i < axis) ? i : i - 1];
+  }
+  return kTfLiteOk;
+}
+
+// Reads the axis operand of Expand_Dims into *axis_value. The axis tensor
+// must have at most one dimension and be int32; only its first element is
+// read.
+TfLiteStatus GetAxisValueFromTensor(TfLiteContext* context,
+                                    const TfLiteEvalTensor* axis,
+                                    int32_t* axis_value) {
+  const int axis_dims = (tflite::micro::GetTensorShape(axis)).DimensionsCount();
+  if (axis_dims > 1) {
+    // The format string has no conversion, so no extra argument is passed.
+    TF_LITE_KERNEL_LOG(context, "Axis has only one element for Expand_Dims.");
+    return kTfLiteError;
+  }
+  if (kTfLiteInt32 != (axis->type)) {
+    TF_LITE_KERNEL_LOG(context,
+                       "Axis type %s (%d) not supported by Expand_Dims.",
+                       TfLiteTypeGetName(axis->type), axis->type);
+    return kTfLiteError;
+  }
+  *axis_value = tflite::micro::GetTensorData<int32_t>(axis)[0];
+  return kTfLiteOk;
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);  // data tensor + axis
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+  const TfLiteTensor* input;
+  TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kInputTensor, &input));
+  const TfLiteTensor* axis;
+  TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kAxisTensor, &axis));
+  TfLiteTensor* output;
+  TF_LITE_ENSURE_OK(context,
+                    GetOutputSafe(context, node, kOutputTensor, &output));
+  output->type = input->type;  // The output always takes the input's type.
+  if (IsDynamicTensor(axis)) {  // Eval reads the axis value from this tensor.
+    TF_LITE_KERNEL_LOG(context,
+                       "DynamicTensor is not yet supported by Expand_Dims.");
+    return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+// Copies the first num_elements values of `in` to `out`, in ascending order.
+template <typename T>
+void memCopyN(T* out, const T* in, const int num_elements) {
+  for (int left = num_elements; left > 0; --left) *out++ = *in++;
+}
+
+
+// Expand_Dims invoke hook: inserts a size-1 dim at `axis`, then copies data.
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteEvalTensor* input =
+      tflite::micro::GetEvalInput(context, node, kInputTensor);
+  const TfLiteEvalTensor* axis =
+      tflite::micro::GetEvalInput(context, node, kAxisTensor);
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kOutputTensor);
+  const int flat_size = ElementCount(*input->dims);
+  const int input_dims = input->dims->size;
+  int32_t axis_value;
+  TF_LITE_ENSURE_OK(context,
+                    GetAxisValueFromTensor(context, axis, &axis_value));
+  if ((axis_value > static_cast<int32_t>(input_dims)) ||
+      (axis_value < static_cast<int32_t>(-(input_dims + 1)))) {
+    TF_LITE_KERNEL_LOG(context, "Invalid Expand_Dims axis value (%d).",
+                       axis_value);
+    return kTfLiteError;
+  }
+  // Propagate a failed shape update instead of copying with a stale shape.
+  TF_LITE_ENSURE_OK(context,
+                    ExpandTensorDim(context, input, axis_value, output));
+  switch (input->type) {
+    case kTfLiteFloat32:
+      memCopyN(tflite::micro::GetTensorData<float>(output),
+               tflite::micro::GetTensorData<float>(input), flat_size);
+      return kTfLiteOk;
+    case kTfLiteInt8:
+      memCopyN(tflite::micro::GetTensorData<int8_t>(output),
+               tflite::micro::GetTensorData<int8_t>(input), flat_size);
+      return kTfLiteOk;
+    default:
+      TF_LITE_KERNEL_LOG(
+          context,
+          "Expand_Dims only currently supports int8 and float32, got %d.",
+          input->type);
+      return kTfLiteError;
+  }
+}
+}  // namespace
+
+// Builds the TfLiteRegistration for the EXPAND_DIMS kernel.
+TfLiteRegistration Register_EXPAND_DIMS() {
+  // No init or free hook is registered, so only prepare and invoke are set;
+  // value-initializing the struct leaves every other field null/zero.
+  TfLiteRegistration registration = {};
+
+  registration.prepare = Prepare;
+  registration.invoke = Eval;
+  return registration;
+}
+
+}  // namespace tflite
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/fill.cc
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/fill.cc
@@ -0,0 +1,131 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/kernels/internal/reference/fill.h"
+
+#include <stdint.h>
+
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+
+namespace tflite {
+
+namespace {
+
+template <typename T>
+TfLiteStatus EnsureEqImpl(TfLiteContext* context, const TfLiteIntArray* array,
+                          const TfLiteTensor* tensor) {
+  for (int i = 0; i < array->size; ++i) {  // EnsureEq checks lengths match.
+    TF_LITE_ENSURE_EQ(context, array->data[i], GetTensorData<T>(tensor)[i]);
+  }
+  return kTfLiteOk;  // Every element of `array` equalled the tensor's.
+}
+
+// Returns kTfLiteOk only if `tensor` is one-dimensional, has a supported
+// integer type, and holds exactly the values of `array`.
+TfLiteStatus EnsureEq(TfLiteContext* context, const TfLiteIntArray* array,
+                      const TfLiteTensor* tensor) {
+  TF_LITE_ENSURE_EQ(context, NumDimensions(tensor), 1);
+  const auto tensor_len = tensor->dims->data[0];
+  TF_LITE_ENSURE_EQ(context, array->size, tensor_len);
+
+  switch (tensor->type) {  // Read the tensor with its own element type.
+    case kTfLiteInt8:
+      return EnsureEqImpl<int8_t>(context, array, tensor);
+    case kTfLiteUInt8:
+      return EnsureEqImpl<uint8_t>(context, array, tensor);
+    case kTfLiteInt16:
+      return EnsureEqImpl<int16_t>(context, array, tensor);
+    case kTfLiteInt32:
+      return EnsureEqImpl<int32_t>(context, array, tensor);
+    case kTfLiteInt64:
+      return EnsureEqImpl<int64_t>(context, array, tensor);
+    default:  // Any non-integer tensor type is rejected here.
+      TF_LITE_KERNEL_LOG(context,
+                         "cannot compare int array to tensor of type %d.",
+                         tensor->type);
+      return kTfLiteError;
+  }
+}
+
+constexpr int kDimsTensor = 0;
+constexpr int kValueTensor = 1;
+constexpr int kOutputTensor = 0;
+
+// Validates FILL at prepare time: both inputs and the output must exist, the
+// value must be a scalar of the output's type, and the dims tensor must equal
+// the output shape.
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  // Ensure inputs and outputs exist.
+  const TfLiteTensor* dims;
+  TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kDimsTensor, &dims));
+  const TfLiteTensor* value;
+  TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kValueTensor, &value));
+  TfLiteTensor* output;
+  TF_LITE_ENSURE_OK(context,
+                    GetOutputSafe(context, node, kOutputTensor, &output));
+  // The value tensor must be a scalar.
+  TF_LITE_ENSURE_EQ(context, NumDimensions(value), 0);
+  // The value type and output type must match. TF_LITE_ENSURE_TYPES_EQ is
+  // the type-specific check and logs the type names on mismatch.
+  TF_LITE_ENSURE_TYPES_EQ(context, value->type, output->type);
+  // The dims tensor must match the output tensor shape. As a byproduct,
+  // ensures the dims tensor is of an integer type.
+  TF_LITE_ENSURE_OK(context, EnsureEq(context, output->dims, dims));
+  return kTfLiteOk;
+}
+
+template <typename T>  // T: element type used to read both tensors' data.
+void FillImpl(const TfLiteEvalTensor* value, TfLiteEvalTensor* output) {
+  reference_ops::Fill(
+      micro::GetTensorShape(value), micro::GetTensorData<T>(value),
+      micro::GetTensorShape(output), micro::GetTensorData<T>(output));
+}
+
+// FILL invoke hook: fills the output from the value tensor (float32 only).
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteEvalTensor* value =
+      micro::GetEvalInput(context, node, kValueTensor);
+  TfLiteEvalTensor* output = micro::GetEvalOutput(context, node, kOutputTensor);
+  switch (value->type) {
+    case kTfLiteFloat32:
+      FillImpl<float>(value, output);
+      break;
+    default:
+      // TfLiteTypeGetName() yields a C string, so it is printed with %s.
+      TF_LITE_KERNEL_LOG(
+          context, "Fill only currently supports float32 for input 1, got %s.",
+          TfLiteTypeGetName(value->type));
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace
+
+// Builds the TfLiteRegistration for the FILL kernel.
+TfLiteRegistration Register_FILL() {
+  // FILL registers no init or free hook, so only prepare and invoke are set;
+  // value-initializing the struct leaves every other field null/zero.
+  TfLiteRegistration registration = {};
+
+  registration.prepare = Prepare;
+  registration.invoke = Eval;
+  return registration;
+}
+
+}  // namespace tflite
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/fully_connected.cc
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/fully_connected.cc
@@ -28,176 +28,37 @@ limitations under the License.
 namespace tflite {
 namespace {

-struct OpData {
-  // The scaling factor from input to output (aka the 'real multiplier') can
-  // be represented as a fixed point multiplier plus a left shift.
-  int32_t output_multiplier;
-  int output_shift;
-  // The range of the fused activation layer. For example for kNone and
-  // uint8_t these would be 0 and 255.
-  int32_t output_activation_min;
-  int32_t output_activation_max;
-  // The index of the temporary tensor where the quantized inputs are cached.
-  int input_quantized_index;
-  // Cached zero point values of tensors.
-  int32_t input_zero_point;
-  int32_t filter_zero_point;
-  int32_t output_zero_point;
-};
-
-constexpr int kInputTensor = 0;
-constexpr int kWeightsTensor = 1;
-constexpr int kBiasTensor = 2;
-constexpr int kOutputTensor = 0;
-
-TfLiteStatus CalculateOpData(TfLiteContext* context,
-                             TfLiteFusedActivation activation,
-                             TfLiteType data_type, const TfLiteTensor* input,
-                             const TfLiteTensor* filter,
-                             const TfLiteTensor* bias, TfLiteTensor* output,
-                             OpData* data) {
-  TfLiteStatus status = kTfLiteOk;
-  if (data_type != kTfLiteFloat32) {
-    double real_multiplier = 0.0;
-    TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
-        context, input, filter, bias, output, &real_multiplier));
-    int exponent;
-    QuantizeMultiplier(real_multiplier, &data->output_multiplier, &exponent);
-    data->output_shift = -exponent;
-    TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
-        context, activation, output, &data->output_activation_min,
-        &data->output_activation_max));
-
-    data->input_zero_point = input->params.zero_point;
-    data->filter_zero_point = filter->params.zero_point;
-    data->output_zero_point = output->params.zero_point;
-  }
-  return status;
-}
-
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
-  return context->AllocatePersistentBuffer(context, sizeof(OpData));
+  return context->AllocatePersistentBuffer(context,
+                                           sizeof(OpDataFullyConnected));
 }

 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
  TFLITE_DCHECK(node->user_data != nullptr);
  TFLITE_DCHECK(node->builtin_data != nullptr);

-  OpData* data = static_cast<OpData*>(node->user_data);
+  auto* data = static_cast<OpDataFullyConnected*>(node->user_data);
  const auto params =
      static_cast<const TfLiteFullyConnectedParams*>(node->builtin_data);

-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* input =
+      GetInput(context, node, kFullyConnectedInputTensor);
  TF_LITE_ENSURE(context, input != nullptr);
-  const TfLiteTensor* filter = GetInput(context, node, kWeightsTensor);
+  const TfLiteTensor* filter =
+      GetInput(context, node, kFullyConnectedWeightsTensor);
  TF_LITE_ENSURE(context, filter != nullptr);
-  const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  const TfLiteTensor* bias =
+      GetOptionalInputTensor(context, node, kFullyConnectedBiasTensor);
+  TfLiteTensor* output = GetOutput(context, node, kFullyConnectedOutputTensor);
  TF_LITE_ENSURE(context, output != nullptr);

  TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type);
  TF_LITE_ENSURE_MSG(context, input->type == filter->type,
                     "Hybrid models are not supported on TFLite Micro.");

-  return CalculateOpData(context, params->activation, input->type, input,
-                         filter, bias, output, data);
-}
-
-TfLiteStatus EvalQuantizedInt8(TfLiteContext* context, TfLiteNode* node,
-                               const OpData& data,
-                               const TfLiteEvalTensor* input,
-                               const TfLiteEvalTensor* filter,
-                               const TfLiteEvalTensor* bias,
-                               TfLiteEvalTensor* output) {
-  tflite::FullyConnectedParams op_params;
-  op_params.input_offset = -data.input_zero_point;
-  op_params.weights_offset = -data.filter_zero_point;
-  op_params.output_offset = data.output_zero_point;
-  op_params.output_multiplier = data.output_multiplier;
-  // TODO(b/138810107): Figure out whether output shift should be inverted
-  op_params.output_shift = -data.output_shift;
-  op_params.quantized_activation_min = data.output_activation_min;
-  op_params.quantized_activation_max = data.output_activation_max;
-
-  reference_integer_ops::FullyConnected(
-      op_params, tflite::micro::GetTensorShape(input),
-      tflite::micro::GetTensorData<int8_t>(input),
-      tflite::micro::GetTensorShape(filter),
-      tflite::micro::GetTensorData<int8_t>(filter),
-      tflite::micro::GetTensorShape(bias),
-      tflite::micro::GetTensorData<int32_t>(bias),
-      tflite::micro::GetTensorShape(output),
-      tflite::micro::GetTensorData<int8_t>(output));
-  return kTfLiteOk;
-}
-
-TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
-                           const OpData& data, const TfLiteEvalTensor* input,
-                           const TfLiteEvalTensor* filter,
-                           const TfLiteEvalTensor* bias,
-                           TfLiteEvalTensor* output) {
-  const int32_t input_offset = -data.input_zero_point;
-  const int32_t filter_offset = -data.filter_zero_point;
-  const int32_t output_offset = data.output_zero_point;
-
-  tflite::FullyConnectedParams op_params;
-  op_params.input_offset = input_offset;
-  op_params.weights_offset = filter_offset;
-  op_params.output_offset = output_offset;
-  op_params.output_multiplier = data.output_multiplier;
-  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
-  op_params.output_shift = -data.output_shift;
-  op_params.quantized_activation_min = data.output_activation_min;
-  op_params.quantized_activation_max = data.output_activation_max;
-
-#define TF_LITE_FULLY_CONNECTED(output_data_type)      \
-  reference_ops::FullyConnected(                       \
-      op_params, tflite::micro::GetTensorShape(input), \
-      tflite::micro::GetTensorData<uint8_t>(input),    \
-      tflite::micro::GetTensorShape(filter),           \
-      tflite::micro::GetTensorData<uint8_t>(filter),   \
-      tflite::micro::GetTensorShape(bias),             \
-      tflite::micro::GetTensorData<int32_t>(bias),     \
-      tflite::micro::GetTensorShape(output),           \
-      tflite::micro::GetTensorData<output_data_type>(output))
-  switch (output->type) {
-    case kTfLiteUInt8:
-      TF_LITE_FULLY_CONNECTED(uint8_t);
-      break;
-    case kTfLiteInt16:
-      TF_LITE_FULLY_CONNECTED(int16_t);
-      break;
-    default:
-      TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
-                         TfLiteTypeGetName(output->type), output->type);
-      return kTfLiteError;
-  }
-
-  return kTfLiteOk;
-}
-
-TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
-                       TfLiteFusedActivation activation,
-                       const TfLiteEvalTensor* input,
-                       const TfLiteEvalTensor* filter,
-                       const TfLiteEvalTensor* bias, TfLiteEvalTensor* output) {
-  float output_activation_min, output_activation_max;
-  CalculateActivationRange(activation, &output_activation_min,
-                           &output_activation_max);
-  tflite::FullyConnectedParams op_params;
-  op_params.float_activation_min = output_activation_min;
-  op_params.float_activation_max = output_activation_max;
-  tflite::reference_ops::FullyConnected(
-      op_params, tflite::micro::GetTensorShape(input),
-      tflite::micro::GetTensorData<float>(input),
-      tflite::micro::GetTensorShape(filter),
-      tflite::micro::GetTensorData<float>(filter),
-      tflite::micro::GetTensorShape(bias),
-      tflite::micro::GetTensorData<float>(bias),
-      tflite::micro::GetTensorShape(output),
-      tflite::micro::GetTensorData<float>(output));
-  return kTfLiteOk;
+  return CalculateOpDataFullyConnected(context, params->activation, input->type,
+                                       input, filter, bias, output, data);
 }

 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
@@ -206,33 +67,66 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
      static_cast<const TfLiteFullyConnectedParams*>(node->builtin_data);

  const TfLiteEvalTensor* input =
-      tflite::micro::GetEvalInput(context, node, kInputTensor);
+      tflite::micro::GetEvalInput(context, node, kFullyConnectedInputTensor);
  const TfLiteEvalTensor* filter =
-      tflite::micro::GetEvalInput(context, node, kWeightsTensor);
+      tflite::micro::GetEvalInput(context, node, kFullyConnectedWeightsTensor);
  const TfLiteEvalTensor* bias =
-      tflite::micro::GetEvalInput(context, node, kBiasTensor);
+      tflite::micro::GetEvalInput(context, node, kFullyConnectedBiasTensor);
  TfLiteEvalTensor* output =
-      tflite::micro::GetEvalOutput(context, node, kOutputTensor);
+      tflite::micro::GetEvalOutput(context, node, kFullyConnectedOutputTensor);

  TFLITE_DCHECK(node->user_data != nullptr);
-  const OpData& data = *(static_cast<const OpData*>(node->user_data));
+  const auto& data =
+      *(static_cast<const OpDataFullyConnected*>(node->user_data));

  // Checks in Prepare ensure input, output and filter types are all the same.
  switch (input->type) {
-    case kTfLiteFloat32:
-      return EvalFloat(context, node, params->activation, input, filter, bias,
-                       output);
-    case kTfLiteInt8:
-      return EvalQuantizedInt8(context, node, data, input, filter, bias,
-                               output);
+    case kTfLiteFloat32: {
+      tflite::reference_ops::FullyConnected(
+          FullyConnectedParamsFloat(params->activation),
+          tflite::micro::GetTensorShape(input),
+          tflite::micro::GetTensorData<float>(input),
+          tflite::micro::GetTensorShape(filter),
+          tflite::micro::GetTensorData<float>(filter),
+          tflite::micro::GetTensorShape(bias),
+          tflite::micro::GetTensorData<float>(bias),
+          tflite::micro::GetTensorShape(output),
+          tflite::micro::GetTensorData<float>(output));
+      break;
+    }

-    case kTfLiteUInt8:
-      return EvalQuantized(context, node, data, input, filter, bias, output);
+    case kTfLiteInt8: {
+      tflite::reference_integer_ops::FullyConnected(
+          FullyConnectedParamsQuantized(data),
+          tflite::micro::GetTensorShape(input),
+          tflite::micro::GetTensorData<int8_t>(input),
+          tflite::micro::GetTensorShape(filter),
+          tflite::micro::GetTensorData<int8_t>(filter),
+          tflite::micro::GetTensorShape(bias),
+          tflite::micro::GetTensorData<int32_t>(bias),
+          tflite::micro::GetTensorShape(output),
+          tflite::micro::GetTensorData<int8_t>(output));
+      break;
+    }

-    default:
+    case kTfLiteUInt8: {
+      tflite::reference_ops::FullyConnected(
+          FullyConnectedParamsQuantized(data),
+          tflite::micro::GetTensorShape(input),
+          tflite::micro::GetTensorData<uint8_t>(input),
+          tflite::micro::GetTensorShape(filter),
+          tflite::micro::GetTensorData<uint8_t>(filter),
+          tflite::micro::GetTensorShape(bias),
+          tflite::micro::GetTensorData<int32_t>(bias),
+          tflite::micro::GetTensorShape(output),
+          tflite::micro::GetTensorData<uint8_t>(output));
+      break;
+    }
+    default: {
      TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
                         TfLiteTypeGetName(input->type), input->type);
      return kTfLiteError;
+    }
  }
  return kTfLiteOk;
 }
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/fully_connected.h
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/fully_connected.h
@@ -15,10 +15,51 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_MICRO_KERNELS_FULLY_CONNECTED_H_
 #define TENSORFLOW_LITE_MICRO_KERNELS_FULLY_CONNECTED_H_

+#include <cstdint>
+
+#include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/types.h"

 namespace tflite {

+// Values cached for a FULLY_CONNECTED node. CalculateOpDataFullyConnected()
+// fills in the multiplier, shift, activation range and zero points for
+// non-float types; FullyConnectedParamsQuantized() copies them into a
+// FullyConnectedParams.
+struct OpDataFullyConnected {
+  // The scaling factor from input to output (aka the 'real multiplier') can
+  // be represented as a fixed point multiplier plus a left shift.
+  int32_t output_multiplier;
+  int output_shift;
+  // The range of the fused activation layer. For example for kNone and
+  // uint8_t these would be 0 and 255.
+  int32_t output_activation_min;
+  int32_t output_activation_max;
+  // The index of the temporary tensor where the quantized inputs are cached.
+  // NOTE(review): not written by CalculateOpDataFullyConnected() -- confirm
+  // which kernel variants set and read it.
+  int input_quantized_index;
+  // Cached zero point values of tensors, copied from each tensor's
+  // params.zero_point.
+  int32_t input_zero_point;
+  int32_t filter_zero_point;
+  int32_t output_zero_point;
+};
+
+// Tensor index constants for FULLY_CONNECTED. Their values are defined in
+// fully_connected_common.cc.
+extern const int kFullyConnectedInputTensor;
+extern const int kFullyConnectedWeightsTensor;
+extern const int kFullyConnectedBiasTensor;
+extern const int kFullyConnectedOutputTensor;
+
+// Returns a FullyConnectedParams struct with all the parameters needed for a
+// float computation.
+FullyConnectedParams FullyConnectedParamsFloat(
+    TfLiteFusedActivation activation);
+
+// Returns a FullyConnectedParams struct with all the parameters needed for a
+// quantized computation.
+FullyConnectedParams FullyConnectedParamsQuantized(
+    const OpDataFullyConnected& op_data);
+
+// Fills `data` with the quantization parameters of a fully connected node.
+// When `data_type` is kTfLiteFloat32, `data` is left untouched and kTfLiteOk
+// is returned. For any other type the output multiplier/shift, the tensor
+// zero points and the quantized activation range are computed from the
+// given tensors; a failure reported by the multiplier or activation-range
+// helper is returned to the caller.
+TfLiteStatus CalculateOpDataFullyConnected(
+    TfLiteContext* context, TfLiteFusedActivation activation,
+    TfLiteType data_type, const TfLiteTensor* input, const TfLiteTensor* filter,
+    const TfLiteTensor* bias, TfLiteTensor* output, OpDataFullyConnected* data);
+
+
 // This is the most generic TfLiteRegistration. The actual supported types may
 // still be target dependent. The only requirement is that every implementation
 // (reference or optimized) must define this function.
@@ -30,7 +71,7 @@ TfLiteRegistration Register_FULLY_CONNECTED();
 // part of the build. As a result, we use defined(ARDUINO) as proxy for the
 // CMSIS kernels for this one special case.

-// Returns a TfLiteRegistration struct for cmsis-nn kernel variant that only
+// Returns a TfLiteRegistration struct for cmsis_nn kernel variant that only
 // supports int8.
 TfLiteRegistration Register_FULLY_CONNECTED_INT8();

--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/fully_connected_common.cc
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/fully_connected_common.cc
@@ -0,0 +1,78 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/fully_connected.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/fully_connected.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+
+namespace tflite {
+
+// Definitions of the FULLY_CONNECTED tensor index constants that
+// fully_connected.h declares `extern`.
+const int kFullyConnectedInputTensor = 0;
+const int kFullyConnectedWeightsTensor = 1;
+const int kFullyConnectedBiasTensor = 2;
+const int kFullyConnectedOutputTensor = 0;
+
+// Translates the values cached in `op_data` into the FullyConnectedParams
+// that is handed to the quantized FullyConnected reference kernels.
+FullyConnectedParams FullyConnectedParamsQuantized(
+    const OpDataFullyConnected& op_data) {
+  FullyConnectedParams quantized_params;
+
+  // Fixed-point output scaling and the fused activation clamp.
+  quantized_params.output_multiplier = op_data.output_multiplier;
+  quantized_params.output_shift = op_data.output_shift;
+  quantized_params.quantized_activation_min = op_data.output_activation_min;
+  quantized_params.quantized_activation_max = op_data.output_activation_max;
+
+  // Input and weights offsets are the negated zero points; the output offset
+  // is the output zero point itself.
+  quantized_params.input_offset = -op_data.input_zero_point;
+  quantized_params.weights_offset = -op_data.filter_zero_point;
+  quantized_params.output_offset = op_data.output_zero_point;
+
+  return quantized_params;
+}
+
+// Returns a FullyConnectedParams whose float activation bounds are derived
+// from the fused `activation`. No other field is assigned here.
+FullyConnectedParams FullyConnectedParamsFloat(
+    TfLiteFusedActivation activation) {
+  FullyConnectedParams float_params;
+  auto* const activation_min = &float_params.float_activation_min;
+  auto* const activation_max = &float_params.float_activation_max;
+  CalculateActivationRange(activation, activation_min, activation_max);
+  return float_params;
+}
+
+// Fills `data` with the quantization parameters of a fully connected node.
+//
+// For kTfLiteFloat32 nothing is computed and kTfLiteOk is returned. For every
+// other `data_type` the output multiplier/shift, the tensor zero points and
+// the quantized activation range are stored in `data`. A failure from
+// GetQuantizedConvolutionMultipler or CalculateActivationRangeQuantized is
+// returned to the caller.
+TfLiteStatus CalculateOpDataFullyConnected(
+    TfLiteContext* context, TfLiteFusedActivation activation,
+    TfLiteType data_type, const TfLiteTensor* input, const TfLiteTensor* filter,
+    const TfLiteTensor* bias, TfLiteTensor* output,
+    OpDataFullyConnected* data) {
+  // The float path uses none of the cached quantization values.
+  if (data_type == kTfLiteFloat32) {
+    return kTfLiteOk;
+  }
+
+  // Real-valued multiplier, then its fixed-point multiplier + shift form.
+  double effective_scale = 0.0;
+  TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
+      context, input, filter, bias, output, &effective_scale));
+  QuantizeMultiplier(effective_scale, &data->output_multiplier,
+                     &data->output_shift);
+
+  data->input_zero_point = input->params.zero_point;
+  data->filter_zero_point = filter->params.zero_point;
+  data->output_zero_point = output->params.zero_point;
+
+  return CalculateActivationRangeQuantized(context, activation, output,
+                                           &data->output_activation_min,
+                                           &data->output_activation_max);
+}
+
+}  // namespace tflite
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/kernel_runner.cc
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/kernel_runner.cc
@@ -15,6 +15,8 @@ limitations under the License.

 #include "tensorflow/lite/micro/kernels/kernel_runner.h"

+#include "tensorflow/lite/micro/micro_error_reporter.h"
+
 namespace tflite {
 namespace micro {

@@ -30,12 +32,12 @@ uint8_t KernelRunner::kKernelRunnerBuffer_[];
 KernelRunner::KernelRunner(const TfLiteRegistration& registration,
                           TfLiteTensor* tensors, int tensors_size,
                           TfLiteIntArray* inputs, TfLiteIntArray* outputs,
-                           void* builtin_data, ErrorReporter* error_reporter)
-    : allocator_(SimpleMemoryAllocator::Create(
-          error_reporter, kKernelRunnerBuffer_, kKernelRunnerBufferSize_)),
+                           void* builtin_data)
+    : allocator_(SimpleMemoryAllocator::Create(GetMicroErrorReporter(),
+                                               kKernelRunnerBuffer_,
+                                               kKernelRunnerBufferSize_)),
      registration_(registration),
-      tensors_(tensors),
-      error_reporter_(error_reporter) {
+      tensors_(tensors) {
  // Prepare TfLiteContext:
  context_.impl_ = static_cast<void*>(this);
  context_.ReportError = ReportOpError;
@@ -52,9 +54,10 @@ KernelRunner::KernelRunner(const TfLiteRegistration& registration,
  node_.builtin_data = builtin_data;
 }

-TfLiteStatus KernelRunner::InitAndPrepare(const char* init_data) {
+TfLiteStatus KernelRunner::InitAndPrepare(const char* init_data,
+                                          size_t length) {
  if (registration_.init) {
-    node_.user_data = registration_.init(&context_, init_data, /*length=*/0);
+    node_.user_data = registration_.init(&context_, init_data, length);
  }
  if (registration_.prepare) {
    TF_LITE_ENSURE_STATUS(registration_.prepare(&context_, &node_));
@@ -64,8 +67,7 @@ TfLiteStatus KernelRunner::InitAndPrepare(const char* init_data) {

 TfLiteStatus KernelRunner::Invoke() {
  if (registration_.invoke == nullptr) {
-    TF_LITE_REPORT_ERROR(error_reporter_,
-                         "TfLiteRegistration missing invoke function pointer!");
+    MicroPrintf("TfLiteRegistration missing invoke function pointer!");
    return kTfLiteError;
  }
  return registration_.invoke(&context_, &node_);
@@ -118,10 +120,8 @@ TfLiteStatus KernelRunner::RequestScratchBufferInArena(TfLiteContext* context,
  TFLITE_DCHECK(runner != nullptr);

  if (runner->scratch_buffer_count_ == kNumScratchBuffers_) {
-    TF_LITE_REPORT_ERROR(
-        runner->error_reporter_,
-        "Exceeded the maximum number of scratch tensors allowed (%d).",
-        kNumScratchBuffers_);
+    MicroPrintf("Exceeded the maximum number of scratch tensors allowed (%d).",
+                kNumScratchBuffers_);
    return kTfLiteError;
  }

@@ -151,13 +151,9 @@ void* KernelRunner::GetScratchBuffer(TfLiteContext* context, int buffer_index) {

 void KernelRunner::ReportOpError(struct TfLiteContext* context,
                                 const char* format, ...) {
-  TFLITE_DCHECK(context != nullptr);
-  KernelRunner* runner = reinterpret_cast<KernelRunner*>(context->impl_);
-  TFLITE_DCHECK(runner != nullptr);
-
  va_list args;
  va_start(args, format);
-  TF_LITE_REPORT_ERROR(runner->error_reporter_, format, args);
+  GetMicroErrorReporter()->Report(format, args);
  va_end(args);
 }

--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/kernel_runner.h
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/kernel_runner.h
@@ -23,23 +23,22 @@ limitations under the License.
 namespace tflite {
 namespace micro {

-// Helper class to perform a simulated kernel (i.e. TfLiteRegistration) lifecyle
-// (init, prepare, invoke). All internal allocations are handled by this class.
-// Simply pass in the registration, list of required tensors, inputs array,
-// outputs array, and any pre-builtin data. Calling Invoke() will automatically
-// walk the kernl and outputs will be ready on the the TfLiteTensor output
-// provided during construction.
+// Helper class to perform a simulated kernel (i.e. TfLiteRegistration)
+// lifecycle (init, prepare, invoke). All internal allocations are handled by
+// this class. Simply pass in the registration, list of required tensors, inputs
+// array, outputs array, and any pre-builtin data. Calling Invoke() will
+// automatically walk the kernel and outputs will be ready on the TfLiteTensor
+// output provided during construction.
 class KernelRunner {
 public:
  KernelRunner(const TfLiteRegistration& registration, TfLiteTensor* tensors,
               int tensors_size, TfLiteIntArray* inputs,
-               TfLiteIntArray* outputs, void* builtin_data,
-               ErrorReporter* error_reporter);
+               TfLiteIntArray* outputs, void* builtin_data);

  // Calls init and prepare on the kernel (i.e. TfLiteRegistration) struct. Any
-  // exceptions will be reported through the error_reporter and returned as a
-  // status code here.
-  TfLiteStatus InitAndPrepare(const char* init_data = nullptr);
+  // exceptions will be DebugLog'd and returned as a status code.
+  TfLiteStatus InitAndPrepare(const char* init_data = nullptr,
+                              size_t length = 0);

  // Calls init, prepare, and invoke on a given TfLiteRegistration pointer.
  // After successful invoke, results will be available in the output tensor as
@@ -60,7 +59,7 @@ class KernelRunner {
                            ...);

 private:
-  static constexpr int kNumScratchBuffers_ = 5;
+  static constexpr int kNumScratchBuffers_ = 12;

  static constexpr int kKernelRunnerBufferSize_ = 10000;
  static uint8_t kKernelRunnerBuffer_[kKernelRunnerBufferSize_];
@@ -68,7 +67,6 @@ class KernelRunner {
  SimpleMemoryAllocator* allocator_ = nullptr;
  const TfLiteRegistration& registration_;
  TfLiteTensor* tensors_ = nullptr;
-  ErrorReporter* error_reporter_ = nullptr;

  TfLiteContext context_ = {};
  TfLiteNode node_ = {};
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/kernel_util.cc
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/kernel_util.cc
@@ -37,5 +37,17 @@ const RuntimeShape GetTensorShape(const TfLiteEvalTensor* tensor) {
  return RuntimeShape(dims_size, dims_data);
 }

+// Maps a TfLitePadding value onto the matching PaddingType: Same -> kSame,
+// Valid -> kValid, and anything else (including Unknown) -> kNone.
+PaddingType RuntimePaddingType(TfLitePadding padding) {
+  if (padding == TfLitePadding::kTfLitePaddingSame) {
+    return PaddingType::kSame;
+  }
+  if (padding == TfLitePadding::kTfLitePaddingValid) {
+    return PaddingType::kValid;
+  }
+  // kTfLitePaddingUnknown and any unexpected value.
+  return PaddingType::kNone;
+}
+
 }  // namespace micro
 }  // namespace tflite
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/kernel_util.h
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/kernel_util.h
@@ -18,6 +18,7 @@ limitations under the License.

 #include <cstdint>

+#include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/common.h"
 #include "tensorflow/lite/kernels/internal/compatibility.h"
 #include "tensorflow/lite/kernels/internal/types.h"
@@ -69,6 +70,8 @@ const RuntimeShape GetTensorShape(const TfLiteEvalTensor* tensor);
 bool HaveSameShapes(const TfLiteEvalTensor* input1,
                    const TfLiteEvalTensor* input2);

+PaddingType RuntimePaddingType(TfLitePadding padding);
+
 }  // namespace micro
 }  // namespace tflite

--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/l2_pool_2d.cc
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/l2_pool_2d.cc
@@ -0,0 +1,137 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <stddef.h>
+#include <stdint.h>
+
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/reference/pooling.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/padding.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+
+namespace tflite {
+namespace {
+
+// Index of the single input and the single output tensor of the node.
+constexpr int kInputTensor = 0;
+constexpr int kOutputTensor = 0;
+
+// Required rank of the input and output tensor shapes.
+constexpr int kTensorShapeRank = 4;
+
+// Position of each dimension within the rank-4 shape, in the order
+// batch, height, width, channel.
+enum { kBatchRank = 0, kHeightRank, kWidthRank, kChannelRank };
+
+// Prepare step for L2_POOL_2D: validates the node, computes the padding and
+// writes the pooled output shape into the output tensor's dims.
+//
+// Returns kTfLiteOk on success. Returns early with an error status when the
+// node does not have exactly one input and one output, a tensor cannot be
+// fetched, either tensor is not rank 4, the input and output types differ,
+// or the input type is not float32.
+TfLiteStatus L2Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TfLitePoolParams* pool_params =
+      static_cast<TfLitePoolParams*>(node->builtin_data);
+
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  TfLiteTensor* output;
+  TF_LITE_ENSURE_OK(context,
+                    GetOutputSafe(context, node, kOutputTensor, &output));
+  const TfLiteTensor* input;
+  TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kInputTensor, &input));
+
+  TF_LITE_ENSURE_EQ(context, NumDimensions(input), kTensorShapeRank);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(output), kTensorShapeRank);
+  TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type);
+
+  // Read every input extent up front, before any output dim is written.
+  const int input_batches = SizeOfDimension(input, kBatchRank);
+  const int input_height = SizeOfDimension(input, kHeightRank);
+  const int input_width = SizeOfDimension(input, kWidthRank);
+  const int input_channels = SizeOfDimension(input, kChannelRank);
+
+  // Matching GetWindowedOutputSize in TensorFlow.
+  int pooled_height;
+  int pooled_width;
+  pool_params->computed.padding = ComputePaddingHeightWidth(
+      pool_params->stride_height, pool_params->stride_width,
+      /*dilation_rate_height=*/1, /*dilation_rate_width=*/1, input_height,
+      input_width, pool_params->filter_height, pool_params->filter_width,
+      pool_params->padding, &pooled_height, &pooled_width);
+
+  // We currently don't have a quantized implementation of L2Pool
+  TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteFloat32);
+
+  // We must update the output tensor dimensions.
+  // The dims storage is expected to be the same area in memory
+  // for both TfLiteTensor and TfLiteEvalTensor.  This is important
+  // because TfLiteTensor in the MicroInterpreter is a temporary
+  // allocation.
+  int* const output_dims = output->dims->data;
+  output_dims[kBatchRank] = input_batches;
+  output_dims[kHeightRank] = pooled_height;
+  output_dims[kWidthRank] = pooled_width;
+  output_dims[kChannelRank] = input_channels;
+
+  return kTfLiteOk;
+}
+
+// Float implementation of L2_POOL_2D. Stores the activation range derived
+// from `params.activation` in `op_params`, then runs the reference L2Pool
+// kernel from `input` into `output`.
+void L2EvalFloat(const TfLitePoolParams& params, const TfLiteEvalTensor& input,
+                 tflite::PoolParams* op_params, TfLiteEvalTensor* output) {
+  // Write the fused activation bounds straight into the kernel parameters.
+  CalculateActivationRange(params.activation,
+                           &op_params->float_activation_min,
+                           &op_params->float_activation_max);
+
+  const float* input_data = tflite::micro::GetTensorData<float>(&input);
+  float* output_data = tflite::micro::GetTensorData<float>(output);
+  reference_ops::L2Pool(*op_params, tflite::micro::GetTensorShape(&input),
+                        input_data, tflite::micro::GetTensorShape(output),
+                        output_data);
+}
+
+// Invoke step for L2_POOL_2D. Builds the PoolParams from the node's builtin
+// data (including the padding stored by L2Prepare) and runs the float
+// kernel. Any input type other than float32 is logged and reported as
+// kTfLiteError.
+TfLiteStatus L2Eval(TfLiteContext* context, TfLiteNode* node) {
+  const auto* pool_params =
+      static_cast<const TfLitePoolParams*>(node->builtin_data);
+
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kOutputTensor);
+  const TfLiteEvalTensor* input =
+      tflite::micro::GetEvalInput(context, node, kInputTensor);
+
+  // Already know in/out types are same, so only the input type is checked.
+  if (input->type != kTfLiteFloat32) {
+    TF_LITE_KERNEL_LOG(context,
+                       "L2_POOL_2D only supports float32 currently, got %s.",
+                       TfLiteTypeGetName(input->type));
+    return kTfLiteError;
+  }
+
+  tflite::PoolParams op_params;
+  op_params.filter_height = pool_params->filter_height;
+  op_params.filter_width = pool_params->filter_width;
+  op_params.stride_height = pool_params->stride_height;
+  op_params.stride_width = pool_params->stride_width;
+  op_params.padding_values.height = pool_params->computed.padding.height;
+  op_params.padding_values.width = pool_params->computed.padding.width;
+
+  L2EvalFloat(*pool_params, *input, &op_params, output);
+  return kTfLiteOk;
+}
+
+}  // namespace
+
+// Returns the registration for the L2_POOL_2D kernel. Only the prepare and
+// invoke callbacks are set; every other field is left null/zero.
+TfLiteRegistration Register_L2_POOL_2D() {
+  TfLiteRegistration registration = {};
+  registration.prepare = L2Prepare;
+  registration.invoke = L2Eval;
+  return registration;
+}
+
+}  // namespace tflite
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/leaky_relu.cc
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/leaky_relu.cc
@@ -0,0 +1,153 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/kernels/internal/reference/leaky_relu.h"
+
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+
+namespace tflite {
+namespace {
+
+// Index of the single input and the single output tensor of the node.
+constexpr int kInputTensor = 0;
+constexpr int kOutputTensor = 0;
+
+// Per-node data for LEAKY_RELU. LeakyReluInit allocates it; CalculateOpData
+// fills it in only when the output tensor is int8.
+struct LeakyReluOpData {
+  // quantization parameters
+  // Fixed-point form of input_scale * alpha / output_scale.
+  int32_t output_multiplier_alpha;
+  int32_t output_shift_alpha;
+  // Fixed-point form of input_scale / output_scale.
+  int32_t output_multiplier_identity;
+  int32_t output_shift_identity;
+  // Zero points copied from the input and output tensors.
+  int32_t input_zero_point;
+  int32_t output_zero_point;
+};
+
+// Runs the quantized reference LeakyRelu kernel from `input` into `output`.
+// T is the element type used to access both tensors' data; the offsets and
+// multiplier/shift pairs are copied from `data`.
+template <typename T>
+void QuantizeLeakyRelu(const LeakyReluOpData& data,
+                       const TfLiteEvalTensor* input,
+                       TfLiteEvalTensor* output) {
+  LeakyReluParams quant_params = {};
+
+  // Multiplier/shift pairs for the alpha and identity paths.
+  quant_params.output_multiplier_alpha = data.output_multiplier_alpha;
+  quant_params.output_shift_alpha = data.output_shift_alpha;
+  quant_params.output_multiplier_identity = data.output_multiplier_identity;
+  quant_params.output_shift_identity = data.output_shift_identity;
+
+  // Zero points of the two tensors.
+  quant_params.input_offset = data.input_zero_point;
+  quant_params.output_offset = data.output_zero_point;
+
+  const T* input_data = tflite::micro::GetTensorData<T>(input);
+  T* output_data = tflite::micro::GetTensorData<T>(output);
+  reference_ops::QuantizeLeakyRelu(
+      quant_params, tflite::micro::GetTensorShape(input), input_data,
+      tflite::micro::GetTensorShape(output), output_data);
+}
+
+// Validates a LEAKY_RELU node and, for int8 outputs, caches the zero points
+// and the fixed-point multiplier/shift pairs in the node's LeakyReluOpData.
+//
+// Returns early with an error status when the node does not have exactly one
+// input and one output, a tensor cannot be fetched, or the input and output
+// types differ. Returns kTfLiteOk otherwise; for non-int8 outputs nothing is
+// cached.
+TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  const TfLiteTensor* input;
+  TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kInputTensor, &input));
+  TfLiteTensor* output;
+  TF_LITE_ENSURE_OK(context,
+                    GetOutputSafe(context, node, kOutputTensor, &output));
+  TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type);
+
+  // Only the int8 path uses cached quantization parameters.
+  if (output->type != kTfLiteInt8) {
+    return kTfLiteOk;
+  }
+
+  auto* op_data = static_cast<LeakyReluOpData*>(node->user_data);
+  const auto* leaky_params =
+      static_cast<TfLiteLeakyReluParams*>(node->builtin_data);
+
+  op_data->input_zero_point = input->params.zero_point;
+  op_data->output_zero_point = output->params.zero_point;
+
+  // QuantizeMultiplier receives the shift as a plain int, which is then
+  // converted to the int32_t field of the op data.
+  int alpha_shift = 0;
+  const double alpha_multiplier = static_cast<double>(
+      input->params.scale * leaky_params->alpha / output->params.scale);
+  QuantizeMultiplier(alpha_multiplier, &op_data->output_multiplier_alpha,
+                     &alpha_shift);
+  op_data->output_shift_alpha = static_cast<int32_t>(alpha_shift);
+
+  int identity_shift = 0;
+  const double identity_multiplier =
+      static_cast<double>(input->params.scale / output->params.scale);
+  QuantizeMultiplier(identity_multiplier,
+                     &op_data->output_multiplier_identity, &identity_shift);
+  op_data->output_shift_identity = static_cast<int32_t>(identity_shift);
+
+  return kTfLiteOk;
+}
+
+// Init callback: requests a persistent buffer sized for one LeakyReluOpData
+// from the context and returns it. `buffer` and `length` are unused.
+void* LeakyReluInit(TfLiteContext* context, const char* buffer, size_t length) {
+  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
+  void* const op_data =
+      context->AllocatePersistentBuffer(context, sizeof(LeakyReluOpData));
+  return op_data;
+}
+
+// Prepare callback: forwards to CalculateOpData and returns its status.
+TfLiteStatus LeakyReluPrepare(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteStatus status = CalculateOpData(context, node);
+  return status;
+}
+
+// Invoke callback for LEAKY_RELU.
+//
+// float32 inputs run the float reference kernel with `alpha` taken from the
+// node's builtin data; int8 inputs run the quantized kernel with the values
+// cached in the node's LeakyReluOpData. Any other input type is logged and
+// reported as kTfLiteError.
+TfLiteStatus LeakyReluEval(TfLiteContext* context, TfLiteNode* node) {
+  // user_data is dereferenced below, so it must have been set up by Init.
+  TFLITE_DCHECK(node->user_data != nullptr);
+  const LeakyReluOpData& data = *static_cast<LeakyReluOpData*>(node->user_data);
+
+  const TfLiteEvalTensor* input =
+      tflite::micro::GetEvalInput(context, node, kInputTensor);
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kOutputTensor);
+
+  switch (input->type) {
+    case kTfLiteFloat32: {
+      LeakyReluParams op_params = {};
+      const auto* params =
+          static_cast<TfLiteLeakyReluParams*>(node->builtin_data);
+
+      op_params.alpha = params->alpha;
+      reference_ops::LeakyRelu(op_params, tflite::micro::GetTensorShape(input),
+                               tflite::micro::GetTensorData<float>(input),
+                               tflite::micro::GetTensorShape(output),
+                               tflite::micro::GetTensorData<float>(output));
+      break;
+    }
+    case kTfLiteInt8: {
+      QuantizeLeakyRelu<int8_t>(data, input, output);
+      break;
+    }
+    default:
+      TF_LITE_KERNEL_LOG(
+          context, "Only float32, int8 are supported by LEAKY_RELU, got %s.",
+          TfLiteTypeGetName(input->type));
+      return kTfLiteError;
+  }
+
+  // Both supported types fall through to here after running their kernel.
+  return kTfLiteOk;
+}
+
+}  // namespace
+
+// Returns the registration for the LEAKY_RELU kernel. The init, prepare and
+// invoke callbacks are set; every other field is left null/zero.
+TfLiteRegistration Register_LEAKY_RELU() {
+  TfLiteRegistration registration = {};
+  registration.init = LeakyReluInit;
+  registration.prepare = LeakyReluPrepare;
+  registration.invoke = LeakyReluEval;
+  return registration;
+}
+
+}  // namespace tflite
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/micro_ops.h
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/micro_ops.h
@@ -31,12 +31,26 @@ namespace tflite {
 // (https://abseil.io/tips/130). Any new ops (or cleanup of existing ops should
 // have their Register function declarations in the tflite namespace.

+TfLiteRegistration Register_ADD_N();
+TfLiteRegistration Register_BATCH_TO_SPACE_ND();
+TfLiteRegistration Register_CAST();
 TfLiteRegistration Register_CONV_2D();
 TfLiteRegistration Register_DEPTHWISE_CONV_2D();
+TfLiteRegistration Register_DIV();
+TfLiteRegistration Register_ELU();
+TfLiteRegistration Register_EXP();
+TfLiteRegistration Register_EXPAND_DIMS();
+TfLiteRegistration Register_FILL();
+TfLiteRegistration Register_L2_POOL_2D();
+TfLiteRegistration Register_LEAKY_RELU();
 TfLiteRegistration Register_QUANTIZE();
 TfLiteRegistration Register_SHAPE();
 TfLiteRegistration Register_SOFTMAX();
+TfLiteRegistration Register_SPACE_TO_BATCH_ND();
+TfLiteRegistration Register_SQUEEZE();
 TfLiteRegistration Register_SVDF();
+TfLiteRegistration Register_TRANSPOSE_CONV();
+TfLiteRegistration Register_ZEROS_LIKE();

 namespace ops {
 namespace micro {
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/micro_utils.h
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/micro_utils.h
@@ -1,8 +1,11 @@
 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
+
    http://www.apache.org/licenses/LICENSE-2.0
+
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/quantize.cc
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/quantize.cc
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/lite/kernels/internal/reference/quantize.h"
+
+#include "tensorflow/lite/micro/kernels/quantize.h"

 #include "tensorflow/lite/c/common.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
-#include "tensorflow/lite/kernels/internal/reference/requantize.h"
 #include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/micro/kernels/kernel_util.h"
@@ -25,160 +25,10 @@ limitations under the License.
 namespace tflite {
 namespace {

-struct OpData {
-  tflite::QuantizationParams quantization_params;
-  // The scaling factor from input to output (aka the 'real multiplier') can
-  // be represented as a fixed point multiplier plus a left shift.
-  int32_t output_multiplier;
-  int output_shift;
-
-  int32_t input_zero_point;
-};
-
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
-  return context->AllocatePersistentBuffer(context, sizeof(OpData));
-}
-
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  TFLITE_DCHECK(node->user_data != nullptr);
-  OpData* data = static_cast<OpData*>(node->user_data);
-
-  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
-  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
-
-  const TfLiteTensor* input = GetInput(context, node, 0);
-  TF_LITE_ENSURE(context, input != nullptr);
-  TfLiteTensor* output = GetOutput(context, node, 0);
-  TF_LITE_ENSURE(context, output != nullptr);
-
-  // TODO(b/128934713): Add support for fixed-point per-channel quantization.
-  // Currently this only support affine per-layer quantization.
-  TF_LITE_ENSURE_EQ(context, output->quantization.type,
-                    kTfLiteAffineQuantization);
-  const auto* affine_quantization =
-      reinterpret_cast<TfLiteAffineQuantization*>(output->quantization.params);
-  TF_LITE_ENSURE(context, affine_quantization);
-  TF_LITE_ENSURE(context, affine_quantization->scale);
-  TF_LITE_ENSURE(context, affine_quantization->scale->size == 1);
-
-  TF_LITE_ENSURE(context, input->type == kTfLiteFloat32 ||
-                              input->type == kTfLiteInt16 ||
-                              input->type == kTfLiteInt8);
-  TF_LITE_ENSURE(context, output->type == kTfLiteUInt8 ||
-                              output->type == kTfLiteInt8 ||
-                              output->type == kTfLiteInt16 ||
-                              output->type == kTfLiteInt32);
-
-  if (((input->type == kTfLiteInt16 || input->type == kTfLiteInt8) &&
-       output->type == kTfLiteInt8) ||
-      (input->type == kTfLiteInt16 && output->type == kTfLiteInt16)) {
-    double effective_scale = static_cast<double>(input->params.scale) /
-                             static_cast<double>(output->params.scale);
-
-    QuantizeMultiplier(effective_scale, &data->output_multiplier,
-                       &data->output_shift);
-  }
-
-  data->quantization_params.zero_point = output->params.zero_point;
-  data->quantization_params.scale = static_cast<double>(output->params.scale);
-
-  data->input_zero_point = input->params.zero_point;
-  return kTfLiteOk;
-}
-
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  TFLITE_DCHECK(node->user_data != nullptr);
-  OpData* data = static_cast<OpData*>(node->user_data);
-
-  const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, 0);
-  TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0);
-
-  if (input->type == kTfLiteFloat32) {
-    switch (output->type) {
-      case kTfLiteInt8:
-        reference_ops::AffineQuantize(
-            data->quantization_params, tflite::micro::GetTensorShape(input),
-            tflite::micro::GetTensorData<float>(input),
-            tflite::micro::GetTensorShape(output),
-            tflite::micro::GetTensorData<int8_t>(output));
-        break;
-      case kTfLiteUInt8:
-        reference_ops::AffineQuantize(
-            data->quantization_params, tflite::micro::GetTensorShape(input),
-            tflite::micro::GetTensorData<float>(input),
-            tflite::micro::GetTensorShape(output),
-            tflite::micro::GetTensorData<uint8_t>(output));
-        break;
-      case kTfLiteInt16:
-        reference_ops::AffineQuantize(
-            data->quantization_params, tflite::micro::GetTensorShape(input),
-            tflite::micro::GetTensorData<float>(input),
-            tflite::micro::GetTensorShape(output),
-            tflite::micro::GetTensorData<int16_t>(output));
-        return kTfLiteOk;
-      default:
-        TF_LITE_KERNEL_LOG(context, "Input %s, output %s not supported.",
-                           TfLiteTypeGetName(input->type),
-                           TfLiteTypeGetName(output->type));
-        return kTfLiteError;
-    }
-  } else if (input->type == kTfLiteInt16) {
-    size_t size = ElementCount(*input->dims);
-    switch (output->type) {
-      case kTfLiteInt8:
-        reference_ops::Requantize(tflite::micro::GetTensorData<int16_t>(input),
-                                  size, data->output_multiplier,
-                                  data->output_shift, data->input_zero_point,
-                                  data->quantization_params.zero_point,
-                                  tflite::micro::GetTensorData<int8_t>(output));
-        break;
-      case kTfLiteInt16:
-        reference_ops::Requantize(
-            tflite::micro::GetTensorData<int16_t>(input), size,
-            data->output_multiplier, data->output_shift, data->input_zero_point,
-            data->quantization_params.zero_point,
-            tflite::micro::GetTensorData<int16_t>(output));
-        return kTfLiteOk;
-      case kTfLiteInt32:
-        reference_ops::Requantize(
-            tflite::micro::GetTensorData<int16_t>(input), size,
-            data->output_multiplier, data->output_shift, data->input_zero_point,
-            data->quantization_params.zero_point,
-            tflite::micro::GetTensorData<int32_t>(output));
-        return kTfLiteOk;
-      default:
-        TF_LITE_KERNEL_LOG(context, "Input %s, output %s not supported.",
-                           TfLiteTypeGetName(input->type),
-                           TfLiteTypeGetName(output->type));
-        return kTfLiteError;
-    }
-  } else if (input->type == kTfLiteInt8) {
-    // Int8 to Int8 requantization, required if the input and output tensors
-    // have different scales and/or zero points.
-    size_t size = ElementCount(*input->dims);
-    switch (output->type) {
-      case kTfLiteInt8:
-        reference_ops::Requantize(tflite::micro::GetTensorData<int8_t>(input),
-                                  size, data->output_multiplier,
-                                  data->output_shift, data->input_zero_point,
-                                  data->quantization_params.zero_point,
-                                  tflite::micro::GetTensorData<int8_t>(output));
-        break;
-      default:
-        TF_LITE_KERNEL_LOG(context, "Input %s, output %s not supported.",
-                           TfLiteTypeGetName(input->type),
-                           TfLiteTypeGetName(output->type));
-        return kTfLiteError;
-    }
-  } else {
-    TF_LITE_KERNEL_LOG(context, "Input %s, output %s not supported.",
-                       TfLiteTypeGetName(input->type),
-                       TfLiteTypeGetName(output->type));
-    return kTfLiteError;
-  }
-
-  return kTfLiteOk;
+  return context->AllocatePersistentBuffer(context,
+                                           sizeof(OpDataQuantizeReference));
 }

 }  // namespace
@@ -186,8 +36,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 TfLiteRegistration Register_QUANTIZE() {
  return {/*init=*/Init,
          /*free=*/nullptr,
-          /*prepare=*/Prepare,
-          /*invoke=*/Eval,
+          /*prepare=*/PrepareQuantizeReference,
+          /*invoke=*/EvalQuantizeReference,
          /*profiling_string=*/nullptr,
          /*builtin_code=*/0,
          /*custom_name=*/nullptr,
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/quantize.h
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/quantize.h
@@ -0,0 +1,37 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_MICRO_KERNELS_QUANTIZE_H_
+#define TENSORFLOW_LITE_MICRO_KERNELS_QUANTIZE_H_
+
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+
+// Per-node state for the reference QUANTIZE kernel. PrepareQuantizeReference
+// fills it in and EvalQuantizeReference reads it back through
+// node->user_data.
+struct OpDataQuantizeReference {
+  // Scale and zero point of the output tensor.
+  tflite::QuantizationParams quantization_params;
+  // The scaling factor from input to output (aka the 'real multiplier') can
+  // be represented as a fixed point multiplier plus a left shift.
+  int32_t requantize_output_multiplier;
+  int requantize_output_shift;
+
+  // Zero point of the input tensor; passed to the requantize paths.
+  int32_t input_zero_point;
+};
+
+// Runs the reference QUANTIZE kernel for `node`, using the
+// OpDataQuantizeReference stored in node->user_data.
+TfLiteStatus EvalQuantizeReference(TfLiteContext* context, TfLiteNode* node);
+// Validates the node's input/output tensors and fills the
+// OpDataQuantizeReference stored in node->user_data.
+TfLiteStatus PrepareQuantizeReference(TfLiteContext* context, TfLiteNode* node);
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_MICRO_KERNELS_QUANTIZE_H_
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/quantize_common.cc
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/quantize_common.cc
@@ -0,0 +1,171 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/quantize.h"
+#include "tensorflow/lite/kernels/internal/reference/requantize.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/quantize.h"
+#include "tensorflow/lite/micro/micro_utils.h"
+
+namespace tflite {
+
+// Validates the node's single input/output and caches the quantization
+// parameters that EvalQuantizeReference reads from node->user_data.
+TfLiteStatus PrepareQuantizeReference(TfLiteContext* context,
+                                      TfLiteNode* node) {
+  TFLITE_DCHECK(node->user_data != nullptr);
+  auto* op_data = static_cast<OpDataQuantizeReference*>(node->user_data);
+
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+  const TfLiteTensor* input = GetInput(context, node, 0);
+  TF_LITE_ENSURE(context, input != nullptr);
+  TfLiteTensor* output = GetOutput(context, node, 0);
+  TF_LITE_ENSURE(context, output != nullptr);
+
+  // TODO(b/128934713): Add support for fixed-point per-channel quantization.
+  // Currently this only supports affine per-layer quantization.
+  TF_LITE_ENSURE_EQ(context, output->quantization.type,
+                    kTfLiteAffineQuantization);
+  const auto* affine_quantization =
+      reinterpret_cast<TfLiteAffineQuantization*>(output->quantization.params);
+  TF_LITE_ENSURE(context, affine_quantization);
+  TF_LITE_ENSURE(context, affine_quantization->scale);
+  TF_LITE_ENSURE(context, affine_quantization->scale->size == 1);
+
+  TF_LITE_ENSURE(context, input->type == kTfLiteFloat32 ||
+                              input->type == kTfLiteInt16 ||
+                              input->type == kTfLiteInt8);
+  TF_LITE_ENSURE(context, output->type == kTfLiteInt8 ||
+                              output->type == kTfLiteInt16 ||
+                              output->type == kTfLiteInt32);
+
+  const double output_scale = static_cast<double>(output->params.scale);
+  // Integer inputs are rescaled by input_scale / output_scale. After the
+  // checks above, the only integer pair left out is int8 -> int16.
+  const bool int8_to_int16 =
+      input->type == kTfLiteInt8 && output->type == kTfLiteInt16;
+  if (input->type != kTfLiteFloat32 && !int8_to_int16) {
+    const double input_scale = static_cast<double>(input->params.scale);
+    QuantizeMultiplier(input_scale / output_scale,
+                       &op_data->requantize_output_multiplier,
+                       &op_data->requantize_output_shift);
+  }
+
+  op_data->input_zero_point = input->params.zero_point;
+  op_data->quantization_params.zero_point = output->params.zero_point;
+  op_data->quantization_params.scale = output_scale;
+  return kTfLiteOk;
+}
+
+// Quantizes (float32 input) or requantizes (int8/int16 input) the node's
+// input tensor into its output tensor, using the parameters cached in
+// node->user_data. Supported input -> output types:
+//   float32 -> int8, int16
+//   int16   -> int8, int16, int32
+//   int8    -> int8, int32
+// Any other pair is logged and reported as kTfLiteError.
+TfLiteStatus EvalQuantizeReference(TfLiteContext* context, TfLiteNode* node) {
+  TFLITE_DCHECK(node->user_data != nullptr);
+  const auto* params =
+      static_cast<const OpDataQuantizeReference*>(node->user_data);
+
+  const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, 0);
+  TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0);
+
+  switch (input->type) {
+    case kTfLiteFloat32:
+      if (output->type == kTfLiteInt8) {
+        reference_ops::AffineQuantize(
+            params->quantization_params, tflite::micro::GetTensorShape(input),
+            tflite::micro::GetTensorData<float>(input),
+            tflite::micro::GetTensorShape(output),
+            tflite::micro::GetTensorData<int8_t>(output));
+        return kTfLiteOk;
+      }
+      if (output->type == kTfLiteInt16) {
+        reference_ops::AffineQuantize(
+            params->quantization_params, tflite::micro::GetTensorShape(input),
+            tflite::micro::GetTensorData<float>(input),
+            tflite::micro::GetTensorShape(output),
+            tflite::micro::GetTensorData<int16_t>(output));
+        return kTfLiteOk;
+      }
+      break;
+    case kTfLiteInt16: {
+      const size_t size = ElementCount(*input->dims);
+      const int16_t* input_data = tflite::micro::GetTensorData<int16_t>(input);
+      if (output->type == kTfLiteInt8) {
+        reference_ops::Requantize(
+            input_data, size, params->requantize_output_multiplier,
+            params->requantize_output_shift, params->input_zero_point,
+            params->quantization_params.zero_point,
+            tflite::micro::GetTensorData<int8_t>(output));
+        return kTfLiteOk;
+      }
+      if (output->type == kTfLiteInt16) {
+        reference_ops::Requantize(
+            input_data, size, params->requantize_output_multiplier,
+            params->requantize_output_shift, params->input_zero_point,
+            params->quantization_params.zero_point,
+            tflite::micro::GetTensorData<int16_t>(output));
+        return kTfLiteOk;
+      }
+      if (output->type == kTfLiteInt32) {
+        reference_ops::Requantize(
+            input_data, size, params->requantize_output_multiplier,
+            params->requantize_output_shift, params->input_zero_point,
+            params->quantization_params.zero_point,
+            tflite::micro::GetTensorData<int32_t>(output));
+        return kTfLiteOk;
+      }
+      break;
+    }
+    case kTfLiteInt8: {
+      // Int8 requantization is needed whenever the input and output tensors
+      // have different scales and/or zero points.
+      const size_t size = ElementCount(*input->dims);
+      const int8_t* input_data = tflite::micro::GetTensorData<int8_t>(input);
+      if (output->type == kTfLiteInt8) {
+        reference_ops::Requantize(
+            input_data, size, params->requantize_output_multiplier,
+            params->requantize_output_shift, params->input_zero_point,
+            params->quantization_params.zero_point,
+            tflite::micro::GetTensorData<int8_t>(output));
+        return kTfLiteOk;
+      }
+      if (output->type == kTfLiteInt32) {
+        reference_ops::Requantize(
+            input_data, size, params->requantize_output_multiplier,
+            params->requantize_output_shift, params->input_zero_point,
+            params->quantization_params.zero_point,
+            tflite::micro::GetTensorData<int32_t>(output));
+        return kTfLiteOk;
+      }
+      break;
+    }
+    default:
+      break;
+  }
+
+  // Every supported pair returned above; what is left is unsupported.
+  TF_LITE_KERNEL_LOG(context, "Input %s, output %s not supported.",
+                     TfLiteTypeGetName(input->type),
+                     TfLiteTypeGetName(output->type));
+  return kTfLiteError;
+}
+
+}  // namespace tflite
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/softmax.cc
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/softmax.cc
@@ -1,4 +1,4 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,12 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#include "tensorflow/lite/kernels/internal/reference/softmax.h"
+#include "tensorflow/lite/micro/kernels/softmax.h"

 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/common.h"
 #include "tensorflow/lite/kernels/internal/common.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/softmax.h"
 #include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/kernels/op_macros.h"
@@ -27,86 +28,9 @@ limitations under the License.
 namespace tflite {
 namespace {

-// Softmax parameter data that persists in user_data
-static constexpr int kInt16LUTArraySize = 513;
-
-TfLiteStatus CalculateSoftmaxParams(TfLiteContext* context,
-                                    const TfLiteTensor* input,
-                                    TfLiteTensor* output,
-                                    const TfLiteSoftmaxParams* params,
-                                    SoftmaxParams* op_data) {
-  if (input->type == kTfLiteUInt8 || input->type == kTfLiteInt8 ||
-      input->type == kTfLiteInt16) {
-    if (input->type == kTfLiteUInt8) {
-      TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteUInt8);
-      TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
-    } else if (input->type == kTfLiteInt16) {
-      TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
-      TF_LITE_ENSURE_NEAR(context, output->params.scale, 1.f / 32768,
-                          (0.001f * 1.f / 32768));
-    } else {  // input->type == kTfLiteInt8
-      TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteInt8);
-      if (output->type == kTfLiteInt16) {
-        TF_LITE_ENSURE_EQ(context, output->params.zero_point, -32768);
-        TF_LITE_ENSURE_NEAR(context, output->params.scale, 1.f / 65536,
-                            (0.001f * 1.f / 65536));
-      } else {  // output->type == kTfLiteint8
-        TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteInt8);
-        TF_LITE_ENSURE_EQ(context, output->params.zero_point, -128);
-        TF_LITE_ENSURE(context, output->params.scale == 1.f / 256);
-      }
-    }
-
-    static const int kScaledDiffIntegerBits = 5;
-
-    // Calculate input_multiplier and input_left_shift
-    if (input->type == kTfLiteInt16) {
-      int input_left_shift;
-      double input_scale_beta_rescale =
-          static_cast<double>(input->params.scale) *
-          static_cast<double>(params->beta) /
-          (10.0 / 65535.0);  // scale the input_diff such that [-65535, 0]
-                             // correspond to [-10.0, 0.0]
-      QuantizeMultiplier(input_scale_beta_rescale, &op_data->input_multiplier,
-                         &input_left_shift);
-      op_data->input_left_shift = input_left_shift;
-    } else {
-      int input_left_shift;
-      tflite::PreprocessSoftmaxScaling(
-          static_cast<double>(params->beta),
-          static_cast<double>(input->params.scale), kScaledDiffIntegerBits,
-          &op_data->input_multiplier, &input_left_shift);
-      op_data->input_left_shift = input_left_shift;
-      op_data->diff_min =
-          -1.0 * tflite::CalculateInputRadius(kScaledDiffIntegerBits,
-                                              op_data->input_left_shift);
-    }
-  } else {
-    TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteFloat32);
-    TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteFloat32);
-    op_data->beta = static_cast<double>(params->beta);
-  }
-  return kTfLiteOk;
-}
-
-// Takes a tensor and performs softmax along the last dimension.
-void SoftmaxFloat(const TfLiteEvalTensor* input, TfLiteEvalTensor* output,
-                  const SoftmaxParams& op_data) {
-  tflite::reference_ops::Softmax(op_data, tflite::micro::GetTensorShape(input),
-                                 tflite::micro::GetTensorData<float>(input),
-                                 tflite::micro::GetTensorShape(output),
-                                 tflite::micro::GetTensorData<float>(output));
-}
-
 void SoftmaxQuantized(const TfLiteEvalTensor* input, TfLiteEvalTensor* output,
                      const SoftmaxParams& op_data) {
-  if (input->type == kTfLiteUInt8) {
-    tflite::reference_ops::Softmax(
-        op_data, tflite::micro::GetTensorShape(input),
-        tflite::micro::GetTensorData<uint8_t>(input),
-        tflite::micro::GetTensorShape(output),
-        tflite::micro::GetTensorData<uint8_t>(output));
-  } else if (input->type == kTfLiteInt8) {
+  if (input->type == kTfLiteInt8) {
    if (output->type == kTfLiteInt16) {
      tflite::reference_ops::Softmax(
          op_data, tflite::micro::GetTensorShape(input),
@@ -129,60 +53,6 @@ void SoftmaxQuantized(const TfLiteEvalTensor* input, TfLiteEvalTensor* output,
  }
 }

-void* SoftmaxInit(TfLiteContext* context, const char* buffer, size_t length) {
-  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
-  return context->AllocatePersistentBuffer(context, sizeof(SoftmaxParams));
-}
-
-TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) {
-  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
-  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
-  const TfLiteTensor* input = GetInput(context, node, 0);
-  TF_LITE_ENSURE(context, input != nullptr);
-  TF_LITE_ENSURE(context, NumDimensions(input) >= 1);
-  TfLiteTensor* output = GetOutput(context, node, 0);
-  TF_LITE_ENSURE(context, output != nullptr);
-
-  TF_LITE_ENSURE(context, node->user_data != nullptr);
-  SoftmaxParams* op_data = static_cast<SoftmaxParams*>(node->user_data);
-  // Only allocate LUTs for KTfLiteInt16 data type
-  if (input->type == kTfLiteInt16) {
-    void* raw_exp_lut = context->AllocatePersistentBuffer(
-        context, sizeof(int16_t) * kInt16LUTArraySize);
-    TF_LITE_ENSURE(context, raw_exp_lut != nullptr);
-    op_data->exp_lut = reinterpret_cast<int16_t*>(raw_exp_lut);
-    void* one_over_one_plus_x_lut = context->AllocatePersistentBuffer(
-        context, sizeof(int16_t) * kInt16LUTArraySize);
-    TF_LITE_ENSURE(context, one_over_one_plus_x_lut != nullptr);
-    op_data->one_over_one_plus_x_lut =
-        reinterpret_cast<int16_t*>(one_over_one_plus_x_lut);
-  }
-
-  if (output->type == kTfLiteInt16) {
-    TF_LITE_ENSURE(context, input->type == kTfLiteInt8 ||
-                                input->type == kTfLiteUInt8 ||
-                                input->type == kTfLiteInt16);
-  } else {
-    TF_LITE_ENSURE_EQ(context, input->type, output->type);
-  }
-
-  // Populate LUT if required
-  if (input->type == kTfLiteInt16) {
-    TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
-    // exp LUT only used on negative values
-    // we consider exp(-10.0) is insignificant to accumulation
-    gen_lut([](float value) { return std::exp(value); }, -10.0f, 0.0f,
-            op_data->exp_lut, kInt16LUTArraySize);
-    gen_lut([](float value) { return 1.0f / (1.0f + value); }, 0.0f, 1.0f,
-            op_data->one_over_one_plus_x_lut, kInt16LUTArraySize);
-    op_data->zero_point = output->params.zero_point;
-    op_data->scale = output->params.scale;
-  }
-
-  auto* params = static_cast<TfLiteSoftmaxParams*>(node->builtin_data);
-  return CalculateSoftmaxParams(context, input, output, params, op_data);
-}
-
 TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
  const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, 0);
  TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0);
@@ -192,11 +62,14 @@ TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) {

  switch (input->type) {
    case kTfLiteFloat32: {
-      SoftmaxFloat(input, output, op_data);
+      tflite::reference_ops::Softmax(
+          op_data, tflite::micro::GetTensorShape(input),
+          tflite::micro::GetTensorData<float>(input),
+          tflite::micro::GetTensorShape(output),
+          tflite::micro::GetTensorData<float>(output));
      return kTfLiteOk;
    }
    case kTfLiteInt8:
-    case kTfLiteUInt8:
    case kTfLiteInt16: {
      SoftmaxQuantized(input, output, op_data);
      return kTfLiteOk;
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/softmax.h
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/softmax.h
@@ -0,0 +1,30 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_MICRO_KERNELS_SOFTMAX_H_
+#define TENSORFLOW_LITE_MICRO_KERNELS_SOFTMAX_H_
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+
+// Allocates persistent storage for a node's SoftmaxParams and returns it.
+void* SoftmaxInit(TfLiteContext* context, const char* buffer, size_t length);
+
+// Validates the node's tensors and fills the SoftmaxParams held in
+// node->user_data (including the lookup tables used for int16 input).
+TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node);
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_MICRO_KERNELS_SOFTMAX_H_
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/softmax_common.cc
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/softmax_common.cc
@@ -0,0 +1,140 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+#include "tensorflow/lite/micro/kernels/softmax.h"
+
+namespace tflite {
+
+namespace {
+// Number of int16_t entries in each lookup table (exp and 1 / (1 + x)) that
+// SoftmaxPrepare allocates and fills for int16 inputs.
+const int kInt16LUTArraySize = 513;
+
+// Checks that the input/output types and the output quantization match what
+// the softmax kernels expect, then fills *op_data: beta for float32, or the
+// fixed-point input multiplier and shift (plus diff_min for int8) otherwise.
+TfLiteStatus CalculateSoftmaxParams(TfLiteContext* context,
+                                    const TfLiteTensor* input,
+                                    TfLiteTensor* output,
+                                    const TfLiteSoftmaxParams* params,
+                                    SoftmaxParams* op_data) {
+  // Float path: anything that is not int8/int16 must be float32 in and out.
+  if (input->type != kTfLiteInt8 && input->type != kTfLiteInt16) {
+    TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteFloat32);
+    TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteFloat32);
+    op_data->beta = static_cast<double>(params->beta);
+    return kTfLiteOk;
+  }
+
+  int input_left_shift;
+  if (input->type == kTfLiteInt16) {
+    TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
+    TF_LITE_ENSURE_NEAR(context, output->params.scale, 1.f / 32768,
+                        (0.001f * 1.f / 32768));
+
+    // Scale the input_diff such that [-65535, 0] corresponds to [-10.0, 0.0].
+    const double input_scale_beta_rescale =
+        static_cast<double>(input->params.scale) *
+        static_cast<double>(params->beta) / (10.0 / 65535.0);
+    QuantizeMultiplier(input_scale_beta_rescale, &op_data->input_multiplier,
+                       &input_left_shift);
+    op_data->input_left_shift = input_left_shift;
+    return kTfLiteOk;
+  }
+
+  // Remaining case: input->type == kTfLiteInt8.
+  TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteInt8);
+  if (output->type == kTfLiteInt16) {
+    TF_LITE_ENSURE_EQ(context, output->params.zero_point, -32768);
+    TF_LITE_ENSURE_NEAR(context, output->params.scale, 1.f / 65536,
+                        (0.001f * 1.f / 65536));
+  } else {  // output->type == kTfLiteInt8
+    TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteInt8);
+    TF_LITE_ENSURE_EQ(context, output->params.zero_point, -128);
+    TF_LITE_ENSURE(context, output->params.scale == 1.f / 256);
+  }
+
+  static const int kScaledDiffIntegerBits = 5;
+  tflite::PreprocessSoftmaxScaling(
+      static_cast<double>(params->beta),
+      static_cast<double>(input->params.scale), kScaledDiffIntegerBits,
+      &op_data->input_multiplier, &input_left_shift);
+  op_data->input_left_shift = input_left_shift;
+  op_data->diff_min =
+      -1.0 * tflite::CalculateInputRadius(kScaledDiffIntegerBits,
+                                          op_data->input_left_shift);
+  return kTfLiteOk;
+}
+
+}  // namespace
+
+// Allocates the persistent SoftmaxParams block for a softmax node and returns
+// it; `buffer` and `length` are not used.
+void* SoftmaxInit(TfLiteContext* context, const char* buffer, size_t length) {
+  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
+  void* op_data =
+      context->AllocatePersistentBuffer(context, sizeof(SoftmaxParams));
+  return op_data;
+}
+
+// Prepares a softmax node: validates tensor counts, rank and types, allocates
+// and fills the two int16 lookup tables when the input is int16, then derives
+// the remaining parameters through CalculateSoftmaxParams.
+TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+  const TfLiteTensor* input = GetInput(context, node, 0);
+  TF_LITE_ENSURE(context, input != nullptr);
+  TF_LITE_ENSURE(context, NumDimensions(input) >= 1);
+  TfLiteTensor* output = GetOutput(context, node, 0);
+  TF_LITE_ENSURE(context, output != nullptr);
+
+  TF_LITE_ENSURE(context, node->user_data != nullptr);
+  auto* op_data = static_cast<SoftmaxParams*>(node->user_data);
+  const bool int16_input = (input->type == kTfLiteInt16);
+  const size_t lut_bytes = sizeof(int16_t) * kInt16LUTArraySize;
+
+  // Only allocate LUTs for the kTfLiteInt16 data type.
+  if (int16_input) {
+    void* raw_exp_lut = context->AllocatePersistentBuffer(context, lut_bytes);
+    TF_LITE_ENSURE(context, raw_exp_lut != nullptr);
+    op_data->exp_lut = reinterpret_cast<int16_t*>(raw_exp_lut);
+    void* one_over_one_plus_x_lut =
+        context->AllocatePersistentBuffer(context, lut_bytes);
+    TF_LITE_ENSURE(context, one_over_one_plus_x_lut != nullptr);
+    op_data->one_over_one_plus_x_lut =
+        reinterpret_cast<int16_t*>(one_over_one_plus_x_lut);
+  }
+
+  // An int16 output accepts int8 or int16 input; otherwise types must match.
+  if (output->type == kTfLiteInt16) {
+    TF_LITE_ENSURE(context,
+                   input->type == kTfLiteInt8 || input->type == kTfLiteInt16);
+  } else {
+    TF_LITE_ENSURE_EQ(context, input->type, output->type);
+  }
+
+  // Populate the LUTs if required.
+  if (int16_input) {
+    TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
+    // The exp LUT is only used on negative values; exp(-10.0) is considered
+    // insignificant to the accumulation.
+    gen_lut([](float value) { return std::exp(value); }, -10.0f, 0.0f,
+            op_data->exp_lut, kInt16LUTArraySize);
+    gen_lut([](float value) { return 1.0f / (1.0f + value); }, 0.0f, 1.0f,
+            op_data->one_over_one_plus_x_lut, kInt16LUTArraySize);
+    op_data->zero_point = output->params.zero_point;
+    op_data->scale = output->params.scale;
+  }
+
+  auto* params = static_cast<TfLiteSoftmaxParams*>(node->builtin_data);
+  return CalculateSoftmaxParams(context, input, output, params, op_data);
+}
+
+}  // namespace tflite
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/space_to_batch_nd.cc
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/space_to_batch_nd.cc
@@ -0,0 +1,121 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/kernels/internal/reference/space_to_batch_nd.h"
+
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/micro_utils.h"
+
+namespace tflite {
+
+namespace {
+
+// Indices of the node's input and output tensors.
+constexpr int kInputTensor = 0;
+constexpr int kBlockShapeTensor = 1;
+// NOTE(review): for SPACE_TO_BATCH_ND the third input is normally the
+// paddings tensor; this name looks carried over from BATCH_TO_SPACE_ND --
+// confirm before renaming.
+constexpr int kCropsTensor = 2;
+constexpr int kOutputTensor = 0;
+
+// Currently, only 3D NHC and 4D NHWC input/output op_context are supported.
+// In case of 3D input, it will be extended to 4D NHWC by adding W=1.
+// The 4D array needs to have exactly 2 spatial dimensions.
+// TODO(b/149952582): Support arbitrary dimension in SpaceToBatchND.
+const int kInputOutputMinDimensionNum = 3;
+const int kInputOutputMaxDimensionNum = 4;
+
+// Allocates the persistent SpaceToBatchParams block for a node and returns
+// it. The block's contents are not initialized here; `buffer` and `length`
+// are not used.
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
+  void* op_params =
+      context->AllocatePersistentBuffer(context, sizeof(SpaceToBatchParams));
+  return op_params;
+}
+
+// Validates a SPACE_TO_BATCH_ND node and records its padding value.
+//
+// Checks that the node has three inputs and one output, that the input and
+// output tensors are 3-D or 4-D, and that their types match. It then writes
+// output_offset in the SpaceToBatchParams held in node->user_data. Eval
+// passes that struct to reference_ops::SpaceToBatchND, which fills padded
+// cells with output_offset; Init only allocates the struct, so without this
+// write the pad value would be read uninitialized.
+//
+// Returns kTfLiteOk on success, or the error status raised by the first
+// failing TF_LITE_ENSURE* check.
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 3);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  TF_LITE_ENSURE(context, input != nullptr && output != nullptr);
+
+  TF_LITE_ENSURE(context, NumDimensions(input) >= kInputOutputMinDimensionNum);
+  TF_LITE_ENSURE(context, NumDimensions(output) >= kInputOutputMinDimensionNum);
+  TF_LITE_ENSURE(context, NumDimensions(input) <= kInputOutputMaxDimensionNum);
+  TF_LITE_ENSURE(context, NumDimensions(output) <= kInputOutputMaxDimensionNum);
+  TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type);
+
+  // user_data is whatever Init returned; reject a null allocation here rather
+  // than dereferencing it.
+  TF_LITE_ENSURE(context, node->user_data != nullptr);
+  auto* params = static_cast<SpaceToBatchParams*>(node->user_data);
+  // Quantized (int8) tensors pad with the output zero point; float tensors
+  // pad with 0.
+  params->output_offset =
+      (output->type == kTfLiteInt8) ? output->params.zero_point : 0;
+
+  return kTfLiteOk;
+}
+
+// Runs SPACE_TO_BATCH_ND by forwarding the node's three input tensors and its
+// output tensor to reference_ops::SpaceToBatchND. Only float32 and int8 data
+// are handled; any other type is logged and returns kTfLiteError.
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  TFLITE_DCHECK(node->user_data != nullptr);
+  const auto* op_params =
+      static_cast<const SpaceToBatchParams*>(node->user_data);
+
+  const TfLiteEvalTensor* input =
+      tflite::micro::GetEvalInput(context, node, kInputTensor);
+  const TfLiteEvalTensor* block_shape =
+      tflite::micro::GetEvalInput(context, node, kBlockShapeTensor);
+  const TfLiteEvalTensor* crops =
+      tflite::micro::GetEvalInput(context, node, kCropsTensor);
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kOutputTensor);
+
+  // Prepare already checked that in/out types match, so dispatch on the input.
+  if (input->type == kTfLiteFloat32) {
+    reference_ops::SpaceToBatchND(
+        *op_params, tflite::micro::GetTensorShape(input),
+        tflite::micro::GetTensorData<float>(input),
+        tflite::micro::GetTensorShape(block_shape),
+        tflite::micro::GetTensorData<int32_t>(block_shape),
+        tflite::micro::GetTensorShape(crops),
+        tflite::micro::GetTensorData<int32_t>(crops),
+        tflite::micro::GetTensorShape(output),
+        tflite::micro::GetTensorData<float>(output));
+    return kTfLiteOk;
+  }
+  if (input->type == kTfLiteInt8) {
+    reference_ops::SpaceToBatchND(
+        *op_params, tflite::micro::GetTensorShape(input),
+        tflite::micro::GetTensorData<int8_t>(input),
+        tflite::micro::GetTensorShape(block_shape),
+        tflite::micro::GetTensorData<int32_t>(block_shape),
+        tflite::micro::GetTensorShape(crops),
+        tflite::micro::GetTensorData<int32_t>(crops),
+        tflite::micro::GetTensorShape(output),
+        tflite::micro::GetTensorData<int8_t>(output));
+    return kTfLiteOk;
+  }
+
+  TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
+                     TfLiteTypeGetName(input->type), input->type);
+  return kTfLiteError;
+}
+
+}  // namespace.
+
+// Returns the kernel registration for SPACE_TO_BATCH_ND: Init, Prepare and
+// Eval are hooked up, and every other field is left null/zero.
+TfLiteRegistration Register_SPACE_TO_BATCH_ND() {
+  TfLiteRegistration registration = {};
+  registration.init = Init;
+  registration.prepare = Prepare;
+  registration.invoke = Eval;
+  return registration;
+}
+
+}  // namespace tflite
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/squeeze.cc
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/squeeze.cc
@@ -0,0 +1,111 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/memory_helpers.h"
+
+namespace tflite {
+namespace {
+
+// Gathers what the SQUEEZE kernel needs from a node: its builtin parameters
+// plus the tensors at input index 0 and output index 0.
+struct SqueezeContext {
+  // Squeeze parameters taken from node->builtin_data.
+  TfLiteSqueezeParams* params;
+  // Tensor to be squeezed.
+  const TfLiteTensor* const input;
+  // Tensor receiving the squeezed data.
+  TfLiteTensor* output;
+
+  // Looks up the parameters and tensors for `node`. Nothing is validated
+  // here; callers are responsible for any checks.
+  SqueezeContext(TfLiteContext* context, TfLiteNode* node)
+      : params(static_cast<TfLiteSqueezeParams*>(node->builtin_data)),
+        input(GetInput(context, node, 0)),
+        output(GetOutput(context, node, 0)) {}
+};
+
+// Validates a SQUEEZE node ahead of inference.
+//
+// Checks that the node has exactly one input and one output, works out which
+// input dimensions get squeezed, and verifies that every remaining input
+// dimension fits into the matching output dimension. No tensor is resized
+// here; the output shape is only checked.
+//
+// Returns kTfLiteOk when the node is consistent, or kTfLiteError (after
+// reporting through `context`) when any check fails.
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  SqueezeContext op_context(context, node);
+  // Report an error instead of dereferencing a missing parameter block or
+  // tensor further down.
+  TF_LITE_ENSURE(context, op_context.params != nullptr);
+  TF_LITE_ENSURE(context, op_context.input != nullptr);
+  TF_LITE_ENSURE(context, op_context.output != nullptr);
+
+  const int input_num_dims = NumDimensions(op_context.input);
+  const int num_squeeze_dims = op_context.params->num_squeeze_dims;
+
+  // Determines number of dimensions of output tensor after squeeze.
+  const TfLiteIntArray* input_dims = op_context.input->dims;
+  const TfLiteIntArray* output_dims = op_context.output->dims;
+  const int* squeeze_dims = op_context.params->squeeze_dims;
+
+  // should_squeeze[] is a fixed-size stack array, so the input rank is capped.
+  constexpr int max_squeeze_dims = 8;
+  TF_LITE_ENSURE(context, input_num_dims <= max_squeeze_dims);
+  bool should_squeeze[max_squeeze_dims] = {};
+
+  if (num_squeeze_dims == 0) {
+    // No explicit axes were given: squeeze every dimension of extent 1.
+    for (int idx = 0; idx < input_num_dims; ++idx) {
+      if (input_dims->data[idx] == 1) {
+        should_squeeze[idx] = true;
+      }
+    }
+  } else {
+    for (int idx = 0; idx < num_squeeze_dims; ++idx) {
+      // Negative axes count back from the last dimension.
+      int current = squeeze_dims[idx] < 0 ? squeeze_dims[idx] + input_num_dims
+                                          : squeeze_dims[idx];
+      // Each requested axis must exist and have extent 1.
+      TF_LITE_ENSURE(context, current >= 0 && current < input_num_dims &&
+                                  input_dims->data[current] == 1);
+      should_squeeze[current] = true;
+    }
+  }
+
+  // Ensure output dimensions are big enough. The output index is bounds
+  // checked first so a model whose output has too few dimensions is rejected
+  // rather than read past the end of output_dims->data, and failures are
+  // returned as errors instead of aborting.
+  int out_idx = 0;
+  for (int in_idx = 0; in_idx < input_num_dims; ++in_idx) {
+    if (should_squeeze[in_idx]) {
+      continue;
+    }
+    TF_LITE_ENSURE(context, out_idx < output_dims->size);
+    TF_LITE_ENSURE(context,
+                   output_dims->data[out_idx] >= input_dims->data[in_idx]);
+    ++out_idx;
+  }
+
+  return kTfLiteOk;
+}
+
+// Runs SQUEEZE: the element data is unchanged by the op, so the input bytes
+// are copied verbatim into the output. String tensors are rejected, and the
+// input and output byte sizes must match.
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  SqueezeContext op_context(context, node);
+  const bool is_string_tensor = (op_context.input->type == kTfLiteString);
+
+  if (!is_string_tensor) {
+    TF_LITE_ENSURE_EQ(context, op_context.input->bytes, op_context.output->bytes);
+    memcpy(op_context.output->data.raw, op_context.input->data.raw,
+           op_context.input->bytes);
+    return kTfLiteOk;
+  }
+
+  // Only the string type reaches this point.
+  TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
+                     TfLiteTypeGetName(op_context.input->type),
+                     op_context.input->type);
+  return kTfLiteError;
+}
+
+}  // namespace
+
+// Returns the registration record for the SQUEEZE kernel. Only the prepare
+// and invoke callbacks are provided; every other field is left null / zero.
+TfLiteRegistration Register_SQUEEZE() {
+  TfLiteRegistration registration = {};
+  registration.init = nullptr;
+  registration.free = nullptr;
+  registration.prepare = Prepare;
+  registration.invoke = Eval;
+  registration.profiling_string = nullptr;
+  registration.builtin_code = 0;
+  registration.custom_name = nullptr;
+  registration.version = 0;
+  return registration;
+}
+
+}  // namespace tflite
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/svdf.cc
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/svdf.cc
@@ -1,4 +1,4 @@
-/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

+#include "tensorflow/lite/micro/kernels/svdf.h"
+
 #include <math.h>

 #include "tensorflow/lite/c/builtin_op_data.h"
@@ -29,496 +31,44 @@ limitations under the License.
 namespace tflite {
 namespace {

-struct OpData {
-  int32_t effective_scale_1_a;
-  int32_t effective_scale_2_a;
-  // b versions of each scale are kept at int since the numbers are just the
-  // shift value - typically between [-32, 32].
-  int effective_scale_1_b;
-  int effective_scale_2_b;
-  int scratch_tensor_index;
-  int scratch_output_tensor_index;
-
-  // Cached tensor zero point values for quantized operations.
-  int input_zero_point;
-  int output_zero_point;
-};
-
-// Input tensors.
-constexpr int kInputTensor = 0;
-constexpr int kWeightsFeatureTensor = 1;
-constexpr int kWeightsTimeTensor = 2;
-constexpr int kBiasTensor = 3;
-// This is a variable tensor, and will be modified by this op.
-constexpr int kInputActivationStateTensor = 4;
-
-// Output tensor.
-constexpr int kOutputTensor = 0;
-
-/**
- * This version of SVDF is specific to TFLite Micro. It contains the following
- * differences between the TFLite version:
- *
- * 1.) Scratch tensor allocation - scratch tensors must be known ahead of time
- * for the Micro interpreter.
- * 2.) Output dimensions - the TFLite version determines output size and runtime
- * and resizes the output tensor. Micro runtime does not support tensor
- * resizing.
- */
-static inline void ApplyTimeWeightsBiasAndActivation(
-    int batch_size, int memory_size, int num_filters, int num_units, int rank,
-    const float* const __restrict__ weights_time_ptr,
-    const float* const __restrict__ bias_ptr, TfLiteFusedActivation activation,
-    float* const __restrict__ state_ptr, float* const __restrict__ scratch_ptr,
-    float* const __restrict__ output_ptr) {
-  // Compute matmul(activation_state, weights_time).
-  for (int b = 0; b < batch_size; ++b) {
-    // Perform batched vector dot product:
-    float* scratch_ptr_batch = scratch_ptr + b * num_filters;
-    const float* vector1_ptr = weights_time_ptr;
-    const float* vector2_ptr = state_ptr + b * memory_size * num_filters;
-    for (int i = 0; i < num_filters; ++i) {
-      *scratch_ptr_batch = 0.f;
-      for (int j = 0; j < memory_size; ++j) {
-        *scratch_ptr_batch += *vector1_ptr++ * *vector2_ptr++;
-      }
-      scratch_ptr_batch++;
-    }
-  }
-
-  // Initialize output with bias if provided.
-  if (bias_ptr) {
-    // VectorBatchVectorAssign
-    for (int i = 0; i < batch_size; ++i) {
-      float* output_data = output_ptr + i * num_units;
-      const float* bias_data = bias_ptr;
-      for (int j = 0; j < num_units; ++j) {
-        *output_data++ = *bias_data++;
-      }
-    }
-  } else {
-    float* output_data = output_ptr;
-    for (int i = 0; i < batch_size * num_units; ++i) {
-      *output_data++ = 0.0f;
-    }
-  }
-
-  // Reduction sum.
-  for (int b = 0; b < batch_size; ++b) {
-    float* output_ptr_batch = output_ptr + b * num_units;
-    float* scratch_ptr_batch = scratch_ptr + b * num_filters;
-
-    // Reduction sum vector
-    for (int i = 0; i < num_units; ++i) {
-      for (int j = 0; j < rank; j++) {
-        output_ptr_batch[i] += *scratch_ptr_batch++;
-      }
-    }
-  }
-
-  // Apply activation.
-  for (int b = 0; b < batch_size; ++b) {
-    float* output_ptr_batch = output_ptr + b * num_units;
-    for (int i = 0; i < num_units; ++i) {
-      *output_ptr_batch =
-          tflite::ops::micro::ActivationValFloat(activation, *output_ptr_batch);
-      ++output_ptr_batch;
-    }
-  }
-}
-
-inline void EvalFloatSVDF(
-    TfLiteContext* context, TfLiteNode* node, const TfLiteEvalTensor* input,
-    const TfLiteEvalTensor* weights_feature,
-    const TfLiteEvalTensor* weights_time, const TfLiteEvalTensor* bias,
-    const TfLiteSVDFParams* params, int scratch_tensor_index,
-    TfLiteEvalTensor* activation_state, TfLiteEvalTensor* output) {
-  const int rank = params->rank;
-  const int batch_size = input->dims->data[0];
-  const int input_size = input->dims->data[1];
-  const int num_filters = weights_feature->dims->data[0];
-  const int num_units = num_filters / rank;
-  const int memory_size = weights_time->dims->data[1];
-
-  const float* weights_feature_ptr =
-      tflite::micro::GetTensorData<float>(weights_feature);
-  const float* weights_time_ptr =
-      tflite::micro::GetTensorData<float>(weights_time);
-  const float* bias_ptr = tflite::micro::GetTensorData<float>(bias);
-  const float* input_ptr = tflite::micro::GetTensorData<float>(input);
-
-  float* state_ptr = tflite::micro::GetTensorData<float>(activation_state);
-
-  TFLITE_DCHECK(context != nullptr);
-  TFLITE_DCHECK(context->GetScratchBuffer != nullptr);
-
-  float* scratch_ptr = static_cast<float*>(
-      context->GetScratchBuffer(context, scratch_tensor_index));
-
-  float* output_ptr = tflite::micro::GetTensorData<float>(output);
-
-  // Left shift the activation_state.
-  {
-    float* new_state_start = state_ptr;
-    const float* old_state_start = state_ptr + 1;
-    const float* old_state_end =
-        state_ptr + batch_size * num_filters * memory_size;
-    while (old_state_start != old_state_end) {
-      *new_state_start++ = *old_state_start++;
-    }
-  }
-
-  // Note: no need to clear the latest activation, matmul is not accumulative.
-
-  // Compute conv1d(inputs, weights_feature).
-  // The activation_state's rightmost column is used to save current cycle
-  // activation. This is achieved by starting at state_ptr[memory_size - 1] and
-  // having the stride equal to memory_size.
-
-  // Perform batched matrix vector multiply operation:
-  {
-    const float* matrix = weights_feature_ptr;
-    const float* vector = input_ptr;
-    float* result = &state_ptr[memory_size - 1];
-    float* result_in_batch = result;
-    for (int i = 0; i < batch_size; ++i) {
-      const float* matrix_ptr = matrix;
-      for (int j = 0; j < num_filters; ++j) {
-        float dot_prod = 0.0f;
-        const float* vector_in_batch = vector + i * input_size;
-        for (int k = 0; k < input_size; ++k) {
-          dot_prod += *matrix_ptr++ * *vector_in_batch++;
-        }
-        *result_in_batch = dot_prod;
-        result_in_batch += memory_size;
-      }
-    }
-  }
-
-  ApplyTimeWeightsBiasAndActivation(
-      batch_size, memory_size, num_filters, num_units, rank, weights_time_ptr,
-      bias_ptr, params->activation, state_ptr, scratch_ptr, output_ptr);
-}
-
-void EvalIntegerSVDF(TfLiteContext* context, TfLiteNode* node,
-                     const TfLiteEvalTensor* input_tensor,
-                     const TfLiteEvalTensor* weights_feature_tensor,
-                     const TfLiteEvalTensor* weights_time_tensor,
-                     const TfLiteEvalTensor* bias_tensor,
-                     const TfLiteSVDFParams* params,
-                     TfLiteEvalTensor* activation_state_tensor,
-                     TfLiteEvalTensor* output_tensor, const OpData& data) {
-  const int n_rank = params->rank;
-  const int n_batch = input_tensor->dims->data[0];
-  const int n_input = input_tensor->dims->data[1];
-  const int n_filter = weights_feature_tensor->dims->data[0];
-  const int n_unit = n_filter / n_rank;
-  const int n_memory = weights_time_tensor->dims->data[1];
-
-  TFLITE_DCHECK(context != nullptr);
-  TFLITE_DCHECK(context->GetScratchBuffer != nullptr);
-
-  int32_t* scratch_tensor = static_cast<int32_t*>(
-      context->GetScratchBuffer(context, data.scratch_tensor_index));
-  int32_t* scratch_output_tensor = static_cast<int32_t*>(
-      context->GetScratchBuffer(context, data.scratch_output_tensor_index));
-
-  // Shift states.
-  int16_t* const state_ptr =
-      tflite::micro::GetTensorData<int16_t>(activation_state_tensor);
-
-  // Left shift the activation_state.
-  {
-    int16_t* new_state_start = state_ptr;
-    const int16_t* old_state_start = state_ptr + 1;
-    const int16_t* old_state_end = state_ptr + n_batch * n_filter * n_memory;
-    while (old_state_start != old_state_end) {
-      *new_state_start++ = *old_state_start++;
-    }
-  }
-
-  // Note: no need to clear the latest activation, matmul is not accumulative.
-
-  // Feature matmul.
-  {
-    int16_t* state =
-        tflite::micro::GetTensorData<int16_t>(activation_state_tensor);
-    const int8_t* input = tflite::micro::GetTensorData<int8_t>(input_tensor);
-    const int8_t* weight_feature =
-        tflite::micro::GetTensorData<int8_t>(weights_feature_tensor);
-    const int32_t output_max = std::numeric_limits<int16_t>::max();
-    const int32_t output_min = std::numeric_limits<int16_t>::min();
-    int16_t* result_in_batch = state + (n_memory - 1);
-    for (int b = 0; b < n_batch; b++) {
-      const int8_t* matrix_ptr = weight_feature;
-      for (int r = 0; r < n_filter; r++) {
-        int32_t dot_prod = 0;
-        const int8_t* vector_in_batch = input + b * n_input;
-        for (int c = 0; c < n_input; c++) {
-          dot_prod +=
-              *matrix_ptr++ * (*vector_in_batch++ - data.input_zero_point);
-        }
-        dot_prod = MultiplyByQuantizedMultiplier(
-            dot_prod, data.effective_scale_1_a, data.effective_scale_1_b);
-        dot_prod = std::min(std::max(output_min, dot_prod), output_max);
-        // This assumes state is symmetrically quantized. Otherwise last bit of
-        // state should be initialized to its zero point and accumulate the
-        // dot_prod.
-        // Equivalent as the following:
-        //     result_in_batch = zero point, which happens to be zero.
-        //     result_in_batch += dot_prod_56.
-        *result_in_batch = dot_prod;
-        result_in_batch += n_memory;
-      }
-    }
-  }
-
-  // Time.
-  {
-    for (int b = 0; b < n_batch; ++b) {
-      int32_t* scratch_ptr_batch = scratch_tensor + b * n_filter;
-
-      // Perform batched vector dot product:
-      const int16_t* vector1_ptr =
-          tflite::micro::GetTensorData<int16_t>(weights_time_tensor);
-      const int16_t* vector2_ptr =
-          tflite::micro::GetTensorData<int16_t>(activation_state_tensor) +
-          b * n_memory * n_filter;
-
-      for (int i = 0; i < n_filter; i++) {
-        *scratch_ptr_batch = 0;
-        for (int j = 0; j < n_memory; j++) {
-          *scratch_ptr_batch += *vector1_ptr++ * *vector2_ptr++;
-        }
-        scratch_ptr_batch++;
-      }
-    }
-  }
-
-  // Reduce, add bias, rescale, activation.
-  {
-    // Add bias.
-    if (bias_tensor) {
-      // Vector batch assign:
-      const int32_t* bias_data =
-          tflite::micro::GetTensorData<int32_t>(bias_tensor);
-      for (int i = 0; i < n_batch; ++i) {
-        int32_t* output_ptr = scratch_output_tensor + i * n_unit;
-        const int32_t* bias_ptr = bias_data;
-        for (int j = 0; j < n_unit; ++j) {
-          *output_ptr++ = *bias_ptr++;
-        }
-      }
-    } else {
-      int32_t* output_ptr = scratch_output_tensor;
-      for (int i = 0; i < n_batch * n_unit; ++i) {
-        *output_ptr++ = 0;
-      }
-    }
-
-    // Reduce.
-    for (int b = 0; b < n_batch; ++b) {
-      int32_t* output_temp_ptr = scratch_output_tensor + b * n_unit;
-      int32_t* scratch_ptr_batch = scratch_tensor + b * n_filter;
-
-      // Reduction sum vector
-      for (int i = 0; i < n_unit; ++i) {
-        for (int j = 0; j < n_rank; ++j) {
-          output_temp_ptr[i] += *scratch_ptr_batch++;
-        }
-      }
-    }
-
-    // Rescale.
-    const int32_t output_max = std::numeric_limits<int8_t>::max();
-    const int32_t output_min = std::numeric_limits<int8_t>::min();
-    for (int i = 0; i < n_batch * n_unit; ++i) {
-      int32_t x1 = scratch_output_tensor[i];
-      int32_t x2 = MultiplyByQuantizedMultiplier(x1, data.effective_scale_2_a,
-                                                 data.effective_scale_2_b);
-      int32_t x3 = x2 + data.output_zero_point;
-      int32_t x4 = std::min(std::max(output_min, x3), output_max);
-      tflite::micro::GetTensorData<int8_t>(output_tensor)[i] =
-          static_cast<int8_t>(x4);
-    }
-  }
-}
-
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
  return context->AllocatePersistentBuffer(context, sizeof(OpData));
 }

-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  TFLITE_DCHECK(node->builtin_data != nullptr);
-
-  const auto* params = static_cast<const TfLiteSVDFParams*>(node->builtin_data);
-
-  // Validate Tensor Inputs (dtype depends on quantization):
-  // [0] = Input, {2, batch_size, input_size}
-  // [1] = Weights Feature, {2, num_filters, input_size}
-  // [2] = Weights Time, {2, num_filters, memory_size}
-  // [3] = Bias (optional), {1, num_units}
-  // [4] = Activation State (variable),
-  //         {2, batch_size, memory_size * num_filters}
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  TF_LITE_ENSURE(context, input != nullptr);
-  const TfLiteTensor* weights_feature =
-      GetInput(context, node, kWeightsFeatureTensor);
-  TF_LITE_ENSURE(context, weights_feature != nullptr);
-  const TfLiteTensor* weights_time =
-      GetInput(context, node, kWeightsTimeTensor);
-  TF_LITE_ENSURE(context, weights_time != nullptr);
-  const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
-  const TfLiteTensor* activation_state =
-      GetInput(context, node, kInputActivationStateTensor);
-  TF_LITE_ENSURE(context, activation_state != nullptr);
-
-  // Define input constants based on input tensor definition above:
-  const int rank = params->rank;
-  const int input_size = input->dims->data[1];
-  const int batch_size = input->dims->data[0];
-  const int num_filters = weights_feature->dims->data[0];
-  TF_LITE_ENSURE_EQ(context, num_filters % rank, 0);
-  const int num_units = num_filters / rank;
-  const int memory_size = weights_time->dims->data[1];
-
-  // Validate Input Tensor:
-  TF_LITE_ENSURE(context,
-                 input->type == kTfLiteFloat32 || input->type == kTfLiteInt8);
-  TF_LITE_ENSURE_EQ(context, NumDimensions(input), 2);
-
-  // Validate Tensor Output:
-  // [0] = float/int8_t, {2, batch_size, num_units}
-  TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-  TF_LITE_ENSURE(context, output != nullptr);
-  TF_LITE_ENSURE_EQ(context, NumDimensions(output), 2);
-  TF_LITE_ENSURE_EQ(context, output->dims->data[0], batch_size);
-  TF_LITE_ENSURE_EQ(context, output->dims->data[1], num_units);
-
-  // Validate Weights Feature Input Tensor:
-  TF_LITE_ENSURE_EQ(context, NumDimensions(weights_feature), 2);
-  TF_LITE_ENSURE_EQ(context, weights_feature->dims->data[1], input_size);
-
-  // Validate Weights Time Input Tensor:
-  TF_LITE_ENSURE_EQ(context, NumDimensions(weights_time), 2);
-  TF_LITE_ENSURE_EQ(context, weights_time->dims->data[0], num_filters);
-  TF_LITE_ENSURE_EQ(context, weights_time->dims->data[1], memory_size);
-
-  // Validate Optional Bias Input Tensor:
-  if (bias != nullptr) {
-    TF_LITE_ENSURE_EQ(context, bias->dims->data[0], num_units);
-  }
-
-  // Validate Activation State Input Tensor:
-  TF_LITE_ENSURE_EQ(context, NumDimensions(activation_state), 2);
-  TF_LITE_ENSURE_EQ(context, activation_state->dims->data[0], batch_size);
-  TF_LITE_ENSURE_EQ(context, activation_state->dims->data[1],
-                    memory_size * num_filters);
-  // Since is_variable is not part of TFLiteEvalTensor, check is_variable here.
-  TF_LITE_ENSURE_EQ(context, activation_state->is_variable, true);
-
-  TF_LITE_ENSURE_EQ(context, node->inputs->size, 5);
-
-  TFLITE_DCHECK(node->user_data != nullptr);
-  OpData* data = static_cast<OpData*>(node->user_data);
-
-  if (input->type == kTfLiteInt8) {
-    TF_LITE_ENSURE_EQ(context, weights_feature->type, kTfLiteInt8);
-    TF_LITE_ENSURE_EQ(context, weights_time->type, kTfLiteInt16);
-    TF_LITE_ENSURE_EQ(context, activation_state->type, kTfLiteInt16);
-    if (bias != nullptr) {
-      TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt32);
-    }
-
-    TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteInt8);
-
-    const double effective_scale_1 = static_cast<double>(
-        input->params.scale * weights_feature->params.scale /
-        activation_state->params.scale);
-    const double effective_scale_2 =
-        static_cast<double>(activation_state->params.scale *
-                            weights_time->params.scale / output->params.scale);
-
-    // TODO(b/162018098): Use TF_LITE_ENSURE_NEAR when it is ready.
-    TF_LITE_ENSURE(
-        context,
-        std::abs(static_cast<double>(bias->params.scale) -
-                 static_cast<double>(activation_state->params.scale *
-                                     weights_time->params.scale)) < 1e-5);
-
-    QuantizeMultiplier(effective_scale_1, &(data->effective_scale_1_a),
-                       &(data->effective_scale_1_b));
-    QuantizeMultiplier(effective_scale_2, &(data->effective_scale_2_a),
-                       &(data->effective_scale_2_b));
-
-    data->input_zero_point = input->params.zero_point;
-    data->output_zero_point = output->params.zero_point;
-
-    TFLITE_DCHECK(context->RequestScratchBufferInArena != nullptr);
-
-    const TfLiteStatus scratch_status = context->RequestScratchBufferInArena(
-        context, batch_size * num_filters * sizeof(int32_t),
-        &(data->scratch_tensor_index));
-    TF_LITE_ENSURE_OK(context, scratch_status);
-
-    const TfLiteStatus scratch_output_status =
-        context->RequestScratchBufferInArena(
-            context, batch_size * num_units * sizeof(int32_t),
-            &(data->scratch_output_tensor_index));
-    TF_LITE_ENSURE_OK(context, scratch_output_status);
-  } else {
-    TF_LITE_ENSURE_EQ(context, weights_feature->type, kTfLiteFloat32);
-    TF_LITE_ENSURE_EQ(context, weights_time->type, kTfLiteFloat32);
-    TF_LITE_ENSURE_EQ(context, activation_state->type, kTfLiteFloat32);
-    if (bias != nullptr) {
-      TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteFloat32);
-    }
-    TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteFloat32);
-
-    TFLITE_DCHECK(context->RequestScratchBufferInArena != nullptr);
-    const TfLiteStatus scratch_status = context->RequestScratchBufferInArena(
-        context, batch_size * num_filters * sizeof(float),
-        &(data->scratch_tensor_index));
-    TF_LITE_ENSURE_OK(context, scratch_status);
-  }
-
-  return kTfLiteOk;
-}
-
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
  auto* params = reinterpret_cast<TfLiteSVDFParams*>(node->builtin_data);
  TFLITE_DCHECK(node->user_data != nullptr);
  const OpData& data = *(static_cast<const OpData*>(node->user_data));

  const TfLiteEvalTensor* input =
-      tflite::micro::GetEvalInput(context, node, kInputTensor);
+      tflite::micro::GetEvalInput(context, node, kSvdfInputTensor);
  const TfLiteEvalTensor* weights_feature =
-      tflite::micro::GetEvalInput(context, node, kWeightsFeatureTensor);
+      tflite::micro::GetEvalInput(context, node, kSvdfWeightsFeatureTensor);
  const TfLiteEvalTensor* weights_time =
-      tflite::micro::GetEvalInput(context, node, kWeightsTimeTensor);
+      tflite::micro::GetEvalInput(context, node, kSvdfWeightsTimeTensor);
  const TfLiteEvalTensor* bias =
      (NumInputs(node) == 5)
-          ? tflite::micro::GetEvalInput(context, node, kBiasTensor)
+          ? tflite::micro::GetEvalInput(context, node, kSvdfBiasTensor)
          : nullptr;
  TfLiteEvalTensor* activation_state = tflite::micro::GetMutableEvalInput(
-      context, node, kInputActivationStateTensor);
+      context, node, kSvdfInputActivationStateTensor);
  TfLiteEvalTensor* output =
-      tflite::micro::GetEvalOutput(context, node, kOutputTensor);
+      tflite::micro::GetEvalOutput(context, node, kSvdfOutputTensor);

  switch (weights_feature->type) {
    case kTfLiteFloat32: {
-      EvalFloatSVDF(context, node, input, weights_feature, weights_time, bias,
-                    params, data.scratch_tensor_index, activation_state,
-                    output);
+      EvalFloatSvdfReference(
+          context, node, input, weights_feature, weights_time, bias, params,
+          data.scratch_tensor_index, activation_state, output);
      return kTfLiteOk;
      break;
    }

    case kTfLiteInt8: {
-      EvalIntegerSVDF(context, node, input, weights_feature, weights_time, bias,
-                      params, activation_state, output, data);
+      EvalIntegerSvdfReference(context, node, input, weights_feature,
+                               weights_time, bias, params, activation_state,
+                               output, data);
      return kTfLiteOk;
      break;
    }
@@ -536,7 +86,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 TfLiteRegistration Register_SVDF() {
  return {/*init=*/Init,
          /*free=*/nullptr,
-          /*prepare=*/Prepare,
+          /*prepare=*/PrepareSvdf,
          /*invoke=*/Eval,
          /*profiling_string=*/nullptr,
          /*builtin_code=*/0,
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/svdf.h
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/svdf.h
@@ -0,0 +1,71 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_MICRO_KERNELS_SVDF_H_
+#define TENSORFLOW_LITE_MICRO_KERNELS_SVDF_H_
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/common.h"
+
+namespace tflite {
+
+// Per-node data block for the SVDF kernel. Init in svdf.cc allocates one
+// instance with AllocatePersistentBuffer, and Eval reads it back through
+// node->user_data.
+struct OpData {
+  // NOTE(review): presumably the fixed-point multipliers that pair with the
+  // shift values below - confirm against PrepareSvdf.
+  int32_t effective_scale_1_a;
+  int32_t effective_scale_2_a;
+  // b versions of each scale are kept at int since the numbers are just the
+  // shift value - typically between [-32, 32].
+  int effective_scale_1_b;
+  int effective_scale_2_b;
+  // Scratch buffer handles. Eval passes scratch_tensor_index explicitly to
+  // the float implementation; the integer implementation receives the whole
+  // OpData.
+  int scratch_tensor_index;
+  int scratch_output_tensor_index;
+
+  // Cached tensor zero point values for quantized operations.
+  int input_zero_point;
+  int output_zero_point;
+};
+
+// Input tensors. These are the indices Eval in svdf.cc hands to
+// GetEvalInput / GetMutableEvalInput; they are declared extern here so that
+// several translation units can share a single definition.
+extern const int kSvdfInputTensor;
+extern const int kSvdfWeightsFeatureTensor;
+extern const int kSvdfWeightsTimeTensor;
+// Eval treats the bias as absent unless the node has five inputs.
+extern const int kSvdfBiasTensor;
+// This is a variable tensor, and will be modified by this op.
+extern const int kSvdfInputActivationStateTensor;
+
+// Output tensor. Index handed to GetEvalOutput by Eval in svdf.cc.
+extern const int kSvdfOutputTensor;
+
+// TensorflowLite Micro-specific reference implementation for Integer SVDF.
+// Eval in svdf.cc calls this when the weights-feature tensor is kTfLiteInt8.
+//
+// bias_tensor may be nullptr: Eval passes nullptr when the node does not have
+// five inputs. activation_state_tensor and output_tensor are the only
+// non-const tensors. `data` is the node's OpData taken from node->user_data.
+void EvalIntegerSvdfReference(TfLiteContext* context, TfLiteNode* node,
+                              const TfLiteEvalTensor* input_tensor,
+                              const TfLiteEvalTensor* weights_feature_tensor,
+                              const TfLiteEvalTensor* weights_time_tensor,
+                              const TfLiteEvalTensor* bias_tensor,
+                              const TfLiteSVDFParams* params,
+                              TfLiteEvalTensor* activation_state_tensor,
+                              TfLiteEvalTensor* output_tensor,
+                              const OpData& data);
+
+// TensorflowLite Micro-specific reference implementation for float SVDF.
+// Eval in svdf.cc calls this when the weights-feature tensor is
+// kTfLiteFloat32.
+//
+// bias may be nullptr: Eval passes nullptr when the node does not have five
+// inputs. scratch_tensor_index is the value Eval reads from
+// OpData::scratch_tensor_index. activation_state and output are the only
+// non-const tensors.
+void EvalFloatSvdfReference(
+    TfLiteContext* context, TfLiteNode* node, const TfLiteEvalTensor* input,
+    const TfLiteEvalTensor* weights_feature,
+    const TfLiteEvalTensor* weights_time, const TfLiteEvalTensor* bias,
+    const TfLiteSVDFParams* params, int scratch_tensor_index,
+    TfLiteEvalTensor* activation_state, TfLiteEvalTensor* output);
+
+// Prepare callback for the SVDF kernel; Register_SVDF in svdf.cc installs it
+// in the `prepare` slot of the registration.
+TfLiteStatus PrepareSvdf(TfLiteContext* context, TfLiteNode* node);
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_MICRO_KERNELS_SVDF_H_
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/svdf_common.cc
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/svdf_common.cc
@@ -0,0 +1,469 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <math.h>
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+#include "tensorflow/lite/micro/kernels/activation_utils.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/svdf.h"
+#include "tensorflow/lite/micro/micro_utils.h"
+
+namespace tflite {
+
+/**
+ * This version of SVDF is specific to TFLite Micro. It differs from the
+ * TFLite version in the following ways:
+ *
+ * 1.) Scratch tensor allocation - scratch tensors must be known ahead of time
+ * for the Micro interpreter.
+ * 2.) Output dimensions - the TFLite version determines output size at runtime
+ * and resizes the output tensor. Micro runtime does not support tensor
+ * resizing.
+ */
+
+// Positions of the SVDF tensors in the node's input list (declared extern in
+// svdf.h).
+const int kSvdfInputTensor = 0;
+const int kSvdfWeightsFeatureTensor = 1;
+const int kSvdfWeightsTimeTensor = 2;
+// The bias is optional; PrepareSvdf fetches it with GetOptionalInputTensor.
+const int kSvdfBiasTensor = 3;
+const int kSvdfInputActivationStateTensor =
+    4;  // This is a variable tensor, and will be modified by this op.
+// Position of the SVDF result in the node's output list.
+const int kSvdfOutputTensor = 0;
+
+// Reference implementation of one integer SVDF invocation.
+//
+// Steps, in order:
+//   1. Shift the int16 activation state left by one element.
+//   2. Feature matmul: for every batch and filter, dot the int8 input (minus
+//      data.input_zero_point) with the int8 feature weights, rescale with
+//      effective_scale_1, clamp to the int16 range and store the value in the
+//      last slot of that filter's n_memory-long state row.
+//   3. Time: dot every state row with the int16 time weights into the first
+//      scratch buffer (int32, one value per batch and filter).
+//   4. Seed the second scratch buffer with the int32 bias (or zero), add the
+//      n_rank filter sums belonging to each unit, rescale with
+//      effective_scale_2, add data.output_zero_point, clamp to the int8 range
+//      and write the int8 output.
+//
+// Sizes come from the tensors: n_batch/n_input from the input dims, n_filter
+// from weights_feature dims[0], n_memory from weights_time dims[1], and
+// n_unit = n_filter / params->rank. `node` is unused. `bias_tensor` may be
+// null.
+void EvalIntegerSvdfReference(TfLiteContext* context, TfLiteNode* node,
+                              const TfLiteEvalTensor* input_tensor,
+                              const TfLiteEvalTensor* weights_feature_tensor,
+                              const TfLiteEvalTensor* weights_time_tensor,
+                              const TfLiteEvalTensor* bias_tensor,
+                              const TfLiteSVDFParams* params,
+                              TfLiteEvalTensor* activation_state_tensor,
+                              TfLiteEvalTensor* output_tensor,
+                              const OpData& data) {
+  const int n_rank = params->rank;
+  const int n_batch = input_tensor->dims->data[0];
+  const int n_input = input_tensor->dims->data[1];
+  const int n_filter = weights_feature_tensor->dims->data[0];
+  const int n_unit = n_filter / n_rank;
+  const int n_memory = weights_time_tensor->dims->data[1];
+
+  TFLITE_DCHECK(context != nullptr);
+  TFLITE_DCHECK(context->GetScratchBuffer != nullptr);
+
+  // scratch_tensor holds one int32 per (batch, filter); scratch_output_tensor
+  // holds one int32 per (batch, unit).
+  int32_t* scratch_tensor = static_cast<int32_t*>(
+      context->GetScratchBuffer(context, data.scratch_tensor_index));
+  int32_t* scratch_output_tensor = static_cast<int32_t*>(
+      context->GetScratchBuffer(context, data.scratch_output_tensor_index));
+
+  // Shift states.
+  int16_t* const state_ptr =
+      tflite::micro::GetTensorData<int16_t>(activation_state_tensor);
+
+  // Left shift the activation_state.
+  // The whole buffer is shifted as one flat array, so the last slot of each
+  // row receives the first element of the next row; those last slots are
+  // overwritten by the feature matmul below.
+  {
+    int16_t* new_state_start = state_ptr;
+    const int16_t* old_state_start = state_ptr + 1;
+    const int16_t* old_state_end = state_ptr + n_batch * n_filter * n_memory;
+    while (old_state_start != old_state_end) {
+      *new_state_start++ = *old_state_start++;
+    }
+  }
+
+  // Note: no need to clear the latest activation, matmul is not accumulative.
+
+  // Feature matmul.
+  {
+    int16_t* state =
+        tflite::micro::GetTensorData<int16_t>(activation_state_tensor);
+    const int8_t* input = tflite::micro::GetTensorData<int8_t>(input_tensor);
+    const int8_t* weight_feature =
+        tflite::micro::GetTensorData<int8_t>(weights_feature_tensor);
+    const int32_t output_max = std::numeric_limits<int16_t>::max();
+    const int32_t output_min = std::numeric_limits<int16_t>::min();
+    // Points at the newest (last) slot of the first row; advanced by n_memory
+    // after every filter so it always addresses a row's last slot.
+    int16_t* result_in_batch = state + (n_memory - 1);
+    for (int b = 0; b < n_batch; b++) {
+      // The feature weights are reused from the start for every batch.
+      const int8_t* matrix_ptr = weight_feature;
+      for (int r = 0; r < n_filter; r++) {
+        int32_t dot_prod = 0;
+        const int8_t* vector_in_batch = input + b * n_input;
+        for (int c = 0; c < n_input; c++) {
+          dot_prod +=
+              *matrix_ptr++ * (*vector_in_batch++ - data.input_zero_point);
+        }
+        dot_prod = MultiplyByQuantizedMultiplier(
+            dot_prod, data.effective_scale_1_a, data.effective_scale_1_b);
+        dot_prod = std::min(std::max(output_min, dot_prod), output_max);
+        // This assumes state is symmetrically quantized. Otherwise last bit of
+        // state should be initialized to its zero point and accumulate the
+        // dot_prod.
+        // Equivalent as the following:
+        //     result_in_batch = zero point, which happens to be zero.
+        //     result_in_batch += dot_prod.
+        *result_in_batch = dot_prod;
+        result_in_batch += n_memory;
+      }
+    }
+  }
+
+  // Time.
+  {
+    for (int b = 0; b < n_batch; ++b) {
+      int32_t* scratch_ptr_batch = scratch_tensor + b * n_filter;
+
+      // Perform batched vector dot product:
+      // vector1_ptr walks the time weights (restarted for every batch),
+      // vector2_ptr walks this batch's slice of the activation state.
+      const int16_t* vector1_ptr =
+          tflite::micro::GetTensorData<int16_t>(weights_time_tensor);
+      const int16_t* vector2_ptr =
+          tflite::micro::GetTensorData<int16_t>(activation_state_tensor) +
+          b * n_memory * n_filter;
+
+      for (int i = 0; i < n_filter; i++) {
+        *scratch_ptr_batch = 0;
+        for (int j = 0; j < n_memory; j++) {
+          *scratch_ptr_batch += *vector1_ptr++ * *vector2_ptr++;
+        }
+        scratch_ptr_batch++;
+      }
+    }
+  }
+
+  // Reduce, add bias, rescale, activation.
+  {
+    // Add bias.
+    if (bias_tensor) {
+      // Vector batch assign:
+      // every batch row of the output scratch starts as a copy of the bias.
+      const int32_t* bias_data =
+          tflite::micro::GetTensorData<int32_t>(bias_tensor);
+      for (int i = 0; i < n_batch; ++i) {
+        int32_t* output_ptr = scratch_output_tensor + i * n_unit;
+        const int32_t* bias_ptr = bias_data;
+        for (int j = 0; j < n_unit; ++j) {
+          *output_ptr++ = *bias_ptr++;
+        }
+      }
+    } else {
+      // No bias: start the accumulation from zero.
+      int32_t* output_ptr = scratch_output_tensor;
+      for (int i = 0; i < n_batch * n_unit; ++i) {
+        *output_ptr++ = 0;
+      }
+    }
+
+    // Reduce.
+    for (int b = 0; b < n_batch; ++b) {
+      int32_t* output_temp_ptr = scratch_output_tensor + b * n_unit;
+      int32_t* scratch_ptr_batch = scratch_tensor + b * n_filter;
+
+      // Reduction sum vector
+      // Each unit sums n_rank consecutive filter results.
+      for (int i = 0; i < n_unit; ++i) {
+        for (int j = 0; j < n_rank; ++j) {
+          output_temp_ptr[i] += *scratch_ptr_batch++;
+        }
+      }
+    }
+
+    // Rescale.
+    const int32_t output_max = std::numeric_limits<int8_t>::max();
+    const int32_t output_min = std::numeric_limits<int8_t>::min();
+    for (int i = 0; i < n_batch * n_unit; ++i) {
+      int32_t x1 = scratch_output_tensor[i];
+      int32_t x2 = MultiplyByQuantizedMultiplier(x1, data.effective_scale_2_a,
+                                                 data.effective_scale_2_b);
+      int32_t x3 = x2 + data.output_zero_point;
+      int32_t x4 = std::min(std::max(output_min, x3), output_max);
+      tflite::micro::GetTensorData<int8_t>(output_tensor)[i] =
+          static_cast<int8_t>(x4);
+    }
+  }
+}
+// Finishes a float SVDF step once the activation state has been updated:
+//   1. scratch[b][f] = dot(weights_time row f, state row (b, f)), each row
+//      being memory_size long;
+//   2. output[b][u] = bias[u], or 0 when bias_ptr is null;
+//   3. output[b][u] += the `rank` consecutive scratch values of unit u;
+//   4. output[b][u] = ActivationValFloat(activation, output[b][u]).
+// scratch_ptr must hold batch_size * num_filters floats and output_ptr
+// batch_size * num_units floats.
+static inline void ApplyTimeWeightsBiasAndActivation(
+    int batch_size, int memory_size, int num_filters, int num_units, int rank,
+    const float* const __restrict__ weights_time_ptr,
+    const float* const __restrict__ bias_ptr, TfLiteFusedActivation activation,
+    float* const __restrict__ state_ptr, float* const __restrict__ scratch_ptr,
+    float* const __restrict__ output_ptr) {
+  // Step 1: matmul(activation_state, weights_time) into the scratch buffer.
+  for (int batch = 0; batch < batch_size; ++batch) {
+    float* const filter_sums = scratch_ptr + batch * num_filters;
+    const float* const batch_state =
+        state_ptr + batch * memory_size * num_filters;
+    for (int f = 0; f < num_filters; ++f) {
+      const float* const taps = weights_time_ptr + f * memory_size;
+      const float* const history = batch_state + f * memory_size;
+      filter_sums[f] = 0.f;
+      for (int t = 0; t < memory_size; ++t) {
+        filter_sums[f] += taps[t] * history[t];
+      }
+    }
+  }
+
+  // Step 2: seed the output with the bias when there is one, else with zero.
+  if (bias_ptr) {
+    for (int batch = 0; batch < batch_size; ++batch) {
+      float* const unit_out = output_ptr + batch * num_units;
+      for (int u = 0; u < num_units; ++u) {
+        unit_out[u] = bias_ptr[u];
+      }
+    }
+  } else {
+    const int total_outputs = batch_size * num_units;
+    for (int idx = 0; idx < total_outputs; ++idx) {
+      output_ptr[idx] = 0.0f;
+    }
+  }
+
+  // Step 3: fold the `rank` filter sums of every unit into its output.
+  for (int batch = 0; batch < batch_size; ++batch) {
+    float* const unit_out = output_ptr + batch * num_units;
+    const float* const filter_sums = scratch_ptr + batch * num_filters;
+    for (int u = 0; u < num_units; ++u) {
+      for (int r = 0; r < rank; ++r) {
+        unit_out[u] += filter_sums[u * rank + r];
+      }
+    }
+  }
+
+  // Step 4: fused activation, element by element.
+  for (int batch = 0; batch < batch_size; ++batch) {
+    float* const unit_out = output_ptr + batch * num_units;
+    for (int u = 0; u < num_units; ++u) {
+      unit_out[u] =
+          tflite::ops::micro::ActivationValFloat(activation, unit_out[u]);
+    }
+  }
+}
+
+// Reference implementation of one float SVDF invocation.
+//
+// Shifts the activation state left by one element, stores the new
+// conv1d(inputs, weights_feature) results in the last slot of every
+// memory_size-long state row, then hands over to
+// ApplyTimeWeightsBiasAndActivation to produce `output`. The scratch buffer
+// is fetched from `context` with `scratch_tensor_index`. `node` is unused.
+void EvalFloatSvdfReference(
+    TfLiteContext* context, TfLiteNode* node, const TfLiteEvalTensor* input,
+    const TfLiteEvalTensor* weights_feature,
+    const TfLiteEvalTensor* weights_time, const TfLiteEvalTensor* bias,
+    const TfLiteSVDFParams* params, int scratch_tensor_index,
+    TfLiteEvalTensor* activation_state, TfLiteEvalTensor* output) {
+  TFLITE_DCHECK(context != nullptr);
+  TFLITE_DCHECK(context->GetScratchBuffer != nullptr);
+
+  // Problem sizes, all taken from the tensor shapes and the op parameters.
+  const int batch_size = input->dims->data[0];
+  const int input_size = input->dims->data[1];
+  const int num_filters = weights_feature->dims->data[0];
+  const int memory_size = weights_time->dims->data[1];
+  const int rank = params->rank;
+  const int num_units = num_filters / rank;
+
+  // Raw float views of the tensors and of the scratch buffer.
+  const float* const in_data = tflite::micro::GetTensorData<float>(input);
+  const float* const feature_weights =
+      tflite::micro::GetTensorData<float>(weights_feature);
+  const float* const time_weights =
+      tflite::micro::GetTensorData<float>(weights_time);
+  const float* const bias_data = tflite::micro::GetTensorData<float>(bias);
+  float* const state = tflite::micro::GetTensorData<float>(activation_state);
+  float* const out_data = tflite::micro::GetTensorData<float>(output);
+  float* const scratch = static_cast<float*>(
+      context->GetScratchBuffer(context, scratch_tensor_index));
+
+  // Left shift the activation_state by one element, treating it as one flat
+  // array.
+  {
+    const float* const state_end =
+        state + batch_size * num_filters * memory_size;
+    float* dst = state;
+    for (const float* src = state + 1; src != state_end; ++src, ++dst) {
+      *dst = *src;
+    }
+  }
+
+  // Note: no need to clear the latest activation, matmul is not accumulative.
+
+  // Compute conv1d(inputs, weights_feature). Each result lands in the
+  // rightmost column of the activation state: the write position starts at
+  // state[memory_size - 1] and moves on by memory_size per filter.
+  {
+    float* newest = state + (memory_size - 1);
+    for (int b = 0; b < batch_size; ++b) {
+      const float* const in_row = in_data + b * input_size;
+      for (int f = 0; f < num_filters; ++f) {
+        const float* const weight_row = feature_weights + f * input_size;
+        float acc = 0.0f;
+        for (int k = 0; k < input_size; ++k) {
+          acc += weight_row[k] * in_row[k];
+        }
+        *newest = acc;
+        newest += memory_size;
+      }
+    }
+  }
+
+  ApplyTimeWeightsBiasAndActivation(batch_size, memory_size, num_filters,
+                                    num_units, rank, time_weights, bias_data,
+                                    params->activation, state, scratch,
+                                    out_data);
+}
+
+// Validates an SVDF node and prepares its per-node data.
+//
+// Checks the number, rank, shape and type of the node's tensors, then fills
+// the OpData that `node->user_data` points to:
+//   - int8 input: quantized multipliers/shifts for both rescaling stages, the
+//     input/output zero points, and two int32 scratch buffers
+//     (batch_size * num_filters and batch_size * num_units elements);
+//   - float input: one float scratch buffer (batch_size * num_filters).
+//
+// `node->builtin_data` must point to a TfLiteSVDFParams. Returns kTfLiteOk on
+// success, or an error status as soon as a check or a scratch request fails.
+TfLiteStatus PrepareSvdf(TfLiteContext* context, TfLiteNode* node) {
+  TFLITE_DCHECK(node->builtin_data != nullptr);
+
+  const auto* params = static_cast<const TfLiteSVDFParams*>(node->builtin_data);
+
+  // The node must carry all five input slots and exactly one output. This is
+  // checked up front, before any tensor is looked up by index.
+  TF_LITE_ENSURE_EQ(context, node->inputs->size, 5);
+  TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
+
+  // Validate Tensor Inputs (dtype depends on quantization):
+  // [0] = Input, {2, batch_size, input_size}
+  // [1] = Weights Feature, {2, num_filters, input_size}
+  // [2] = Weights Time, {2, num_filters, memory_size}
+  // [3] = Bias (optional), {1, num_units}
+  // [4] = Activation State (variable),
+  //         {2, batch_size, memory_size * num_filters}
+  const TfLiteTensor* input = GetInput(context, node, kSvdfInputTensor);
+  TF_LITE_ENSURE(context, input != nullptr);
+  const TfLiteTensor* weights_feature =
+      GetInput(context, node, kSvdfWeightsFeatureTensor);
+  TF_LITE_ENSURE(context, weights_feature != nullptr);
+  const TfLiteTensor* weights_time =
+      GetInput(context, node, kSvdfWeightsTimeTensor);
+  TF_LITE_ENSURE(context, weights_time != nullptr);
+  const TfLiteTensor* bias =
+      GetOptionalInputTensor(context, node, kSvdfBiasTensor);
+  const TfLiteTensor* activation_state =
+      GetInput(context, node, kSvdfInputActivationStateTensor);
+  TF_LITE_ENSURE(context, activation_state != nullptr);
+
+  // Validate Tensor Output:
+  // [0] = float/int8_t, {2, batch_size, num_units}
+  TfLiteTensor* output = GetOutput(context, node, kSvdfOutputTensor);
+  TF_LITE_ENSURE(context, output != nullptr);
+
+  // Validate Input Tensor:
+  TF_LITE_ENSURE(context,
+                 input->type == kTfLiteFloat32 || input->type == kTfLiteInt8);
+
+  // Every tensor indexed below must be two-dimensional; verify that before
+  // reading dims->data[1] from any of them.
+  TF_LITE_ENSURE_EQ(context, NumDimensions(input), 2);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(weights_feature), 2);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(weights_time), 2);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(output), 2);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(activation_state), 2);
+
+  // Define input constants based on input tensor definition above:
+  const int rank = params->rank;
+  // rank is used as a divisor below, so it has to be positive.
+  TF_LITE_ENSURE(context, rank > 0);
+  const int input_size = input->dims->data[1];
+  const int batch_size = input->dims->data[0];
+  const int num_filters = weights_feature->dims->data[0];
+  TF_LITE_ENSURE_EQ(context, num_filters % rank, 0);
+  const int num_units = num_filters / rank;
+  const int memory_size = weights_time->dims->data[1];
+
+  // Validate Tensor Output shape:
+  TF_LITE_ENSURE_EQ(context, output->dims->data[0], batch_size);
+  TF_LITE_ENSURE_EQ(context, output->dims->data[1], num_units);
+
+  // Validate Weights Feature Input Tensor:
+  TF_LITE_ENSURE_EQ(context, weights_feature->dims->data[1], input_size);
+
+  // Validate Weights Time Input Tensor:
+  TF_LITE_ENSURE_EQ(context, weights_time->dims->data[0], num_filters);
+  TF_LITE_ENSURE_EQ(context, weights_time->dims->data[1], memory_size);
+
+  // Validate Optional Bias Input Tensor:
+  if (bias != nullptr) {
+    TF_LITE_ENSURE_EQ(context, bias->dims->data[0], num_units);
+  }
+
+  // Validate Activation State Input Tensor:
+  TF_LITE_ENSURE_EQ(context, activation_state->dims->data[0], batch_size);
+  TF_LITE_ENSURE_EQ(context, activation_state->dims->data[1],
+                    memory_size * num_filters);
+  // Since is_variable is not part of TFLiteEvalTensor, check is_variable here.
+  TF_LITE_ENSURE_EQ(context, activation_state->is_variable, true);
+
+  TFLITE_DCHECK(node->user_data != nullptr);
+  OpData* data = static_cast<OpData*>(node->user_data);
+
+  if (input->type == kTfLiteInt8) {
+    TF_LITE_ENSURE_EQ(context, weights_feature->type, kTfLiteInt8);
+    TF_LITE_ENSURE_EQ(context, weights_time->type, kTfLiteInt16);
+    TF_LITE_ENSURE_EQ(context, activation_state->type, kTfLiteInt16);
+    if (bias != nullptr) {
+      TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt32);
+    }
+
+    TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteInt8);
+
+    // Stage 1 rescales the feature matmul into the state's scale; stage 2
+    // rescales the time-weighted sum into the output's scale.
+    const double effective_scale_1 = static_cast<double>(
+        input->params.scale * weights_feature->params.scale /
+        activation_state->params.scale);
+    const double effective_scale_2 =
+        static_cast<double>(activation_state->params.scale *
+                            weights_time->params.scale / output->params.scale);
+
+    // The bias is optional, so its scale can only be checked when it exists.
+    if (bias != nullptr) {
+      // TODO(b/162018098): Use TF_LITE_ENSURE_NEAR when it is ready.
+      TF_LITE_ENSURE(
+          context,
+          std::abs(static_cast<double>(bias->params.scale) -
+                   static_cast<double>(activation_state->params.scale *
+                                       weights_time->params.scale)) < 1e-5);
+    }
+
+    QuantizeMultiplier(effective_scale_1, &(data->effective_scale_1_a),
+                       &(data->effective_scale_1_b));
+    QuantizeMultiplier(effective_scale_2, &(data->effective_scale_2_a),
+                       &(data->effective_scale_2_b));
+
+    data->input_zero_point = input->params.zero_point;
+    data->output_zero_point = output->params.zero_point;
+
+    TFLITE_DCHECK(context->RequestScratchBufferInArena != nullptr);
+
+    // One int32 per (batch, filter) for the time-weighted sums.
+    const TfLiteStatus scratch_status = context->RequestScratchBufferInArena(
+        context, batch_size * num_filters * sizeof(int32_t),
+        &(data->scratch_tensor_index));
+    TF_LITE_ENSURE_OK(context, scratch_status);
+
+    // One int32 per (batch, unit) for the reduced, not yet rescaled output.
+    const TfLiteStatus scratch_output_status =
+        context->RequestScratchBufferInArena(
+            context, batch_size * num_units * sizeof(int32_t),
+            &(data->scratch_output_tensor_index));
+    TF_LITE_ENSURE_OK(context, scratch_output_status);
+  } else {
+    TF_LITE_ENSURE_EQ(context, weights_feature->type, kTfLiteFloat32);
+    TF_LITE_ENSURE_EQ(context, weights_time->type, kTfLiteFloat32);
+    TF_LITE_ENSURE_EQ(context, activation_state->type, kTfLiteFloat32);
+    if (bias != nullptr) {
+      TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteFloat32);
+    }
+    TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteFloat32);
+
+    TFLITE_DCHECK(context->RequestScratchBufferInArena != nullptr);
+    // One float per (batch, filter) for the time-weighted sums.
+    const TfLiteStatus scratch_status = context->RequestScratchBufferInArena(
+        context, batch_size * num_filters * sizeof(float),
+        &(data->scratch_tensor_index));
+    TF_LITE_ENSURE_OK(context, scratch_status);
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace tflite
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/transpose_conv.cc
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/transpose_conv.cc
@@ -0,0 +1,269 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/kernels/internal/reference/transpose_conv.h"
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/transpose_conv.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/padding.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+
+namespace tflite {
+namespace {
+
+// For the TfLite transpose_conv implementation, input tensor 0 corresponds to
+// the OutputShapeTensor. However, since TFLM does not support dynamic tensors,
+// the TFLM implementation ignores input tensor 0 and the only inputs we care
+// about are kFilterTensor, kInputTensor and kBiasTensor.
+// The bias is only read when the node has four inputs.
+constexpr int kFilterTensor = 1;
+constexpr int kInputTensor = 2;
+constexpr int kBiasTensor = 3;
+// Index of the single output tensor.
+constexpr int kOutputTensor = 0;
+
+// Conv is quantized along dimension 0:
+// https://www.tensorflow.org/lite/performance/quantization_spec
+// The filter's size along this dimension is used as the channel count.
+constexpr int kConvQuantizedDimension = 0;
+
+// Per-node state of the TRANSPOSE_CONV kernel. Init allocates it from the
+// persistent arena, Prepare fills it, and Eval reads it.
+struct OpData {
+  // Padding, offsets, strides, dilation and activation limits handed to the
+  // reference TransposeConv implementations.
+  ConvParams params;
+
+  // A scratch buffer is required for quantized implementations.
+  // Index returned by RequestScratchBufferInArena; only set for uint8/int8
+  // inputs.
+  int scratch_buffer_index;
+
+  // Multiplier and shift arrays are required for the int8 implementation.
+  // Prepare allocates one entry per filter->dims[kConvQuantizedDimension].
+  int32_t* per_channel_output_multiplier;
+  int32_t* per_channel_output_shift;
+};
+
+// Translates the C API padding enum into the runtime PaddingType. Unknown or
+// unrecognized values map to PaddingType::kNone.
+inline PaddingType RuntimePaddingType(TfLitePadding padding) {
+  if (padding == TfLitePadding::kTfLitePaddingSame) {
+    return PaddingType::kSame;
+  }
+  if (padding == TfLitePadding::kTfLitePaddingValid) {
+    return PaddingType::kValid;
+  }
+  // kTfLitePaddingUnknown and anything else.
+  return PaddingType::kNone;
+}
+
+// Fills `data->params` with the padding of a TRANSPOSE_CONV node and, for any
+// data type other than float32, with its quantization parameters (including
+// the per-channel multiplier and shift arrays referenced by `data`).
+// Returns kTfLiteOk, or an error status when a check fails.
+TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
+                             const TfLiteConvParams* params, int width,
+                             int height, int filter_width, int filter_height,
+                             int out_width, int out_height,
+                             const TfLiteType data_type, OpData* data) {
+  // Four inputs means a bias is present, three means there is none; exactly
+  // one output is expected.
+  bool has_bias = node->inputs->size == 4;
+  TF_LITE_ENSURE(context, has_bias || node->inputs->size == 3);
+  TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
+
+  // Matching GetWindowedOutputSize in TensorFlow.
+  const TfLitePaddingValues computed_padding = ComputePaddingHeightWidth(
+      params->stride_height, params->stride_width,
+      params->dilation_height_factor, params->dilation_width_factor, height,
+      width, filter_height, filter_width, params->padding, &out_height,
+      &out_width);
+  data->params.padding_type = RuntimePaddingType(params->padding);
+  data->params.padding_values.width = computed_padding.width;
+  data->params.padding_values.height = computed_padding.height;
+
+  // Float kernels need nothing beyond the padding.
+  if (data_type == kTfLiteFloat32) {
+    return kTfLiteOk;
+  }
+
+  // Note that quantized inference requires that all tensors have their
+  // parameters set. This is usually done during quantized training.
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TF_LITE_ENSURE(context, input != nullptr);
+  const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
+  TF_LITE_ENSURE(context, filter != nullptr);
+  const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  TF_LITE_ENSURE(context, output != nullptr);
+
+  const int channel_count = filter->dims->data[kConvQuantizedDimension];
+  int* const channel_shifts =
+      reinterpret_cast<int*>(data->per_channel_output_shift);
+  TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams(
+      context, input, filter, bias, output, params->activation,
+      &data->params.output_multiplier, &data->params.output_shift,
+      &data->params.quantized_activation_min,
+      &data->params.quantized_activation_max,
+      data->per_channel_output_multiplier, channel_shifts, channel_count));
+  return kTfLiteOk;
+}
+
+// Allocates this node's OpData from the persistent arena and returns it.
+// `buffer` and `length` are not used.
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
+  void* const op_data =
+      context->AllocatePersistentBuffer(context, sizeof(OpData));
+  return op_data;
+}
+
+// Prepares a TRANSPOSE_CONV node.
+//
+// Allocates the per-channel multiplier/shift arrays, requests the int32
+// scratch buffer for uint8/int8 inputs, validates the filter's affine
+// quantization for int8 inputs, and fills the OpData in `node->user_data`
+// (padding and quantization via CalculateOpData, then offsets, strides,
+// dilation and float activation limits).
+//
+// Returns kTfLiteOk on success, or an error status when a check fails, a
+// required allocation is unavailable, or the scratch buffer request fails.
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TFLITE_DCHECK(node->user_data != nullptr);
+  TFLITE_DCHECK(node->builtin_data != nullptr);
+
+  OpData* data = static_cast<OpData*>(node->user_data);
+  const auto params = static_cast<const TfLiteConvParams*>(node->builtin_data);
+
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  TF_LITE_ENSURE(context, output != nullptr);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TF_LITE_ENSURE(context, input != nullptr);
+  const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
+  TF_LITE_ENSURE(context, filter != nullptr);
+
+  // Height is read from dimension 1 and width from dimension 2.
+  int input_width = input->dims->data[2];
+  int input_height = input->dims->data[1];
+  int filter_width = filter->dims->data[2];
+  int filter_height = filter->dims->data[1];
+  int output_width = output->dims->data[2];
+  int output_height = output->dims->data[1];
+
+  // Dynamically allocate per-channel quantization parameters.
+  const int num_channels = filter->dims->data[kConvQuantizedDimension];
+  data->per_channel_output_multiplier =
+      static_cast<int32_t*>(context->AllocatePersistentBuffer(
+          context, num_channels * sizeof(int32_t)));
+  data->per_channel_output_shift =
+      static_cast<int32_t*>(context->AllocatePersistentBuffer(
+          context, num_channels * sizeof(int32_t)));
+  // CalculateOpData hands these arrays to the quantization set-up for every
+  // non-float input type, so they must exist in that case.
+  if (input->type != kTfLiteFloat32) {
+    TF_LITE_ENSURE(context, data->per_channel_output_multiplier != nullptr);
+    TF_LITE_ENSURE(context, data->per_channel_output_shift != nullptr);
+  }
+
+  // Quantized kernels use an int32 scratch buffer.
+  if (input->type == kTfLiteUInt8 || input->type == kTfLiteInt8) {
+    TFLITE_DCHECK(context->RequestScratchBufferInArena != nullptr);
+    // Return a failed request to the caller instead of only asserting on it.
+    TF_LITE_ENSURE_OK(
+        context,
+        context->RequestScratchBufferInArena(
+            context, GetTensorShape(output).FlatSize() * sizeof(int32_t),
+            &(data->scratch_buffer_index)));
+  }
+
+  // All per-channel quantized tensors need valid zero point and scale arrays.
+  if (input->type == kTfLiteInt8) {
+    TF_LITE_ENSURE_EQ(context, filter->quantization.type,
+                      kTfLiteAffineQuantization);
+
+    const auto* affine_quantization =
+        static_cast<TfLiteAffineQuantization*>(filter->quantization.params);
+    TF_LITE_ENSURE(context, affine_quantization);
+    TF_LITE_ENSURE(context, affine_quantization->scale);
+    TF_LITE_ENSURE(context, affine_quantization->zero_point);
+
+    // Either one scale for the whole filter or one scale per channel.
+    TF_LITE_ENSURE(context,
+                   affine_quantization->scale->size == 1 ||
+                       affine_quantization->scale->size ==
+                           filter->dims->data[kConvQuantizedDimension]);
+    TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size,
+                      affine_quantization->zero_point->size);
+  }
+
+  TF_LITE_ENSURE_STATUS(CalculateOpData(
+      context, node, params, input_width, input_height, filter_width,
+      filter_height, output_width, output_height, input->type, data));
+
+  // Offsets (zero points)
+  data->params.input_offset = -input->params.zero_point;
+  data->params.weights_offset = -filter->params.zero_point;
+  data->params.output_offset = output->params.zero_point;
+
+  // Stride + dilation
+  data->params.stride_width = params->stride_width;
+  data->params.stride_height = params->stride_height;
+  data->params.dilation_width_factor = params->dilation_width_factor;
+  data->params.dilation_height_factor = params->dilation_height_factor;
+
+  float output_activation_min, output_activation_max;
+  CalculateActivationRange(params->activation, &output_activation_min,
+                           &output_activation_max);
+  data->params.float_activation_min = output_activation_min;
+  data->params.float_activation_max = output_activation_max;
+  return kTfLiteOk;
+}
+
+// Runs TRANSPOSE_CONV with the reference kernels.
+//
+// Float32 inputs use reference_ops::TransposeConv; int8 inputs use
+// reference_integer_ops::TransposeConv together with the per-channel
+// parameters and the scratch buffer recorded in the node's OpData. Any other
+// input type is logged and rejected with kTfLiteError. The bias is passed
+// only when the node has four inputs, otherwise a null tensor is used.
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteEvalTensor* input =
+      tflite::micro::GetEvalInput(context, node, kInputTensor);
+  const TfLiteEvalTensor* filter =
+      tflite::micro::GetEvalInput(context, node, kFilterTensor);
+  const TfLiteEvalTensor* bias = nullptr;
+  if (NumInputs(node) == 4) {
+    bias = tflite::micro::GetEvalInput(context, node, kBiasTensor);
+  }
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kOutputTensor);
+
+  TFLITE_DCHECK(node->user_data != nullptr);
+  const auto* op_data = static_cast<const OpData*>(node->user_data);
+
+  TF_LITE_ENSURE_EQ(context, input->type, output->type);
+  TF_LITE_ENSURE_MSG(context, input->type == filter->type,
+                     "Hybrid models are not supported on TFLite Micro.");
+
+  // Input and output types are known to match at this point.
+  if (input->type == kTfLiteFloat32) {
+    reference_ops::TransposeConv(
+        op_data->params, tflite::micro::GetTensorShape(input),
+        tflite::micro::GetTensorData<float>(input),
+        tflite::micro::GetTensorShape(filter),
+        tflite::micro::GetTensorData<float>(filter),
+        tflite::micro::GetTensorShape(bias),
+        tflite::micro::GetTensorData<float>(bias),
+        tflite::micro::GetTensorShape(output),
+        tflite::micro::GetTensorData<float>(output),
+        tflite::micro::GetTensorShape(nullptr), nullptr);
+    return kTfLiteOk;
+  }
+
+  if (input->type == kTfLiteInt8) {
+    int32_t* const accumulators = static_cast<int32_t*>(
+        context->GetScratchBuffer(context, op_data->scratch_buffer_index));
+    reference_integer_ops::TransposeConv(
+        op_data->params, op_data->per_channel_output_multiplier,
+        op_data->per_channel_output_shift,
+        tflite::micro::GetTensorShape(input),
+        tflite::micro::GetTensorData<int8_t>(input),
+        tflite::micro::GetTensorShape(filter),
+        tflite::micro::GetTensorData<int8_t>(filter),
+        tflite::micro::GetTensorShape(bias),
+        tflite::micro::GetTensorData<int32_t>(bias),
+        tflite::micro::GetTensorShape(output),
+        tflite::micro::GetTensorData<int8_t>(output),
+        tflite::micro::GetTensorShape(nullptr), nullptr, accumulators);
+    return kTfLiteOk;
+  }
+
+  TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
+                     TfLiteTypeGetName(input->type), input->type);
+  return kTfLiteError;
+}
+
+}  // namespace
+
+// Builds the registration for the TRANSPOSE_CONV kernel from this file's
+// Init, Prepare and Eval functions; the remaining fields are left null/zero.
+TfLiteRegistration Register_TRANSPOSE_CONV() {
+  TfLiteRegistration registration = {/*init=*/Init,
+                                     /*free=*/nullptr,
+                                     /*prepare=*/Prepare,
+                                     /*invoke=*/Eval,
+                                     /*profiling_string=*/nullptr,
+                                     /*builtin_code=*/0,
+                                     /*custom_name=*/nullptr,
+                                     /*version=*/0};
+  return registration;
+}
+
+}  // namespace tflite
--- a/code/components/tfmicro/tensorflow/lite/micro/kernels/zeros_like.cc
+++ b/code/components/tfmicro/tensorflow/lite/micro/kernels/zeros_like.cc
@@ -0,0 +1,89 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+
+namespace tflite {
+namespace {
+
+constexpr int kInputTensor = 0;   // Index of the node's only input.
+constexpr int kOutputTensor = 0;  // Index of the node's only output.
+
+// Checks for exactly one input and one output, then gives the output the
+// input's element type.
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+  const TfLiteTensor* src = nullptr;
+  TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kInputTensor, &src));
+  TfLiteTensor* dst = nullptr;
+  TF_LITE_ENSURE_OK(context, GetOutputSafe(context, node, kOutputTensor, &dst));
+  dst->type = src->type;
+  return kTfLiteOk;
+}
+
+// Stores a T-typed zero into each of out[0] .. out[num_elements - 1].
+template <typename T>
+void resetZeros(T* out, const int num_elements) {
+  T* cursor = out;
+  for (int left = num_elements; left > 0; --left) *cursor++ = static_cast<T>(0);
+}
+
+// Zero-fills the output; fails with kTfLiteError on unsupported input types.
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteEvalTensor* input =
+      tflite::micro::GetEvalInput(context, node, kInputTensor);
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kOutputTensor);
+  const int flat_size = MatchingFlatSize(tflite::micro::GetTensorShape(input),
+                                         tflite::micro::GetTensorShape(output));
+  switch (input->type) {
+    case kTfLiteInt64:
+      resetZeros(tflite::micro::GetTensorData<int64_t>(output), flat_size);
+      break;
+    case kTfLiteInt32:
+      resetZeros(tflite::micro::GetTensorData<int32_t>(output), flat_size);
+      break;
+    case kTfLiteInt8:
+      resetZeros(tflite::micro::GetTensorData<int8_t>(output), flat_size);
+      break;
+    case kTfLiteFloat32:
+      resetZeros(tflite::micro::GetTensorData<float>(output), flat_size);
+      break;
+    default:
+      TF_LITE_KERNEL_LOG(context,
+                         "ZerosLike only currently supports int64, int32, "
+                         "int8, and float32, got %d.", input->type);
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+}  // namespace
+
+// Builds the ZEROS_LIKE kernel registration, which supplies only the
+// prepare and invoke callbacks.
+TfLiteRegistration Register_ZEROS_LIKE() {
+  // Value-initialization leaves init, free, profiling_string and custom_name
+  // null and builtin_code and version zero.
+  TfLiteRegistration registration = {};
+  registration.prepare = Prepare;
+  registration.invoke = Eval;
+  return registration;
+}
+
+}  // namespace tflite